Switch YouTube transcript fetching from Jsoup to Playwright headed mode

Jsoup requests were blocked by YouTube's bot detection. Transcript fetching now
uses Playwright with headed Chromium on an Xvfb virtual display to bypass the restrictions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-01 07:11:52 +00:00
parent 9798cda41e
commit 1bfe55d5a8
2 changed files with 36 additions and 12 deletions

View File

@@ -10,6 +10,13 @@ export JAVA_HOME
export PLAYWRIGHT_BROWSERS_PATH=/home/opc/.cache/ms-playwright
export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
# Xvfb virtual display for Playwright head mode (YouTube transcript)
export DISPLAY=:99
if ! pgrep -x Xvfb > /dev/null; then
Xvfb :99 -screen 0 1280x720x24 -nolisten tcp &
sleep 1
fi
# Playwright driver-bundle requires exploded classpath (fat JAR extraction fails)
BACKEND_DIR=/home/opc/sundol/sundol-backend
exec $JAVA_HOME/bin/java -cp "$BACKEND_DIR/target/classes:$BACKEND_DIR/target/dependency/*" com.sundol.SundolApplication

View File

@@ -1,7 +1,7 @@
package com.sundol.service;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.microsoft.playwright.*;
import com.microsoft.playwright.options.WaitUntilState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
@@ -11,6 +11,7 @@ import java.io.IOException;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -45,18 +46,34 @@ public class YouTubeTranscriptService {
String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
log.info("Fetching YouTube transcript for: {}", watchUrl);
// YouTube 페이지 HTML 가져오기
// Playwright head 모드로 YouTube 페이지 HTML 가져오기
String html;
try {
Document doc = Jsoup.connect(watchUrl)
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
.header("Accept-Language", "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7")
.timeout(15_000)
.maxBodySize(0)
.get();
html = doc.html();
try (Playwright playwright = Playwright.create()) {
BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
.setHeadless(false)
.setArgs(List.of(
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-dev-shm-usage"
));
try (Browser browser = playwright.chromium().launch(launchOptions)) {
BrowserContext context = browser.newContext(new Browser.NewContextOptions()
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
.setLocale("ko-KR"));
Page page = context.newPage();
page.navigate(watchUrl, new Page.NavigateOptions()
.setTimeout(30_000)
.setWaitUntil(WaitUntilState.NETWORKIDLE));
html = page.content();
log.info("Playwright fetched YouTube page: {} chars", html.length());
context.close();
}
} catch (Exception e) {
throw new IOException("YouTube 페이지를 가져올 수 없습니다: " + e.getMessage(), e);
throw new IOException("YouTube 페이지를 가져올 수 없습니다 (Playwright): " + e.getMessage(), e);
}
// captionTracks JSON 추출