From 1bfe55d5a80342cb71059951ebcae5bfbd38cb9d Mon Sep 17 00:00:00 2001 From: joungmin Date: Wed, 1 Apr 2026 07:11:52 +0000 Subject: [PATCH] Switch YouTube transcript fetching from Jsoup to Playwright head mode Jsoup was blocked by YouTube bot detection. Now uses Playwright with headed Chromium via Xvfb virtual display to bypass restrictions. Co-Authored-By: Claude Opus 4.6 (1M context) --- start-backend.sh | 7 ++++ .../service/YouTubeTranscriptService.java | 41 +++++++++++++------ 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/start-backend.sh b/start-backend.sh index 57dcce0..de2138b 100755 --- a/start-backend.sh +++ b/start-backend.sh @@ -10,6 +10,13 @@ export JAVA_HOME export PLAYWRIGHT_BROWSERS_PATH=/home/opc/.cache/ms-playwright export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 +# Xvfb virtual display for Playwright head mode (YouTube transcript) +export DISPLAY=:99 +if ! pgrep -x Xvfb > /dev/null; then + Xvfb :99 -screen 0 1280x720x24 -nolisten tcp & + sleep 1 +fi + # Playwright driver-bundle requires exploded classpath (fat JAR extraction fails) BACKEND_DIR=/home/opc/sundol/sundol-backend exec $JAVA_HOME/bin/java -cp "$BACKEND_DIR/target/classes:$BACKEND_DIR/target/dependency/*" com.sundol.SundolApplication diff --git a/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java b/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java index db09521..f691226 100644 --- a/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java +++ b/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java @@ -1,7 +1,7 @@ package com.sundol.service; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; +import com.microsoft.playwright.*; +import com.microsoft.playwright.options.WaitUntilState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.stereotype.Service; @@ -11,6 +11,7 @@ import java.io.IOException; import java.net.URLDecoder; import java.nio.charset.StandardCharsets; import java.time.Duration; +import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -45,18 +46,34 @@ public class YouTubeTranscriptService { String watchUrl = "https://www.youtube.com/watch?v=" + videoId; log.info("Fetching YouTube transcript for: {}", watchUrl); - // YouTube 페이지 HTML 가져오기 + // Playwright head 모드로 YouTube 페이지 HTML 가져오기 String html; - try { - Document doc = Jsoup.connect(watchUrl) - .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36") - .header("Accept-Language", "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7") - .timeout(15_000) - .maxBodySize(0) - .get(); - html = doc.html(); + try (Playwright playwright = Playwright.create()) { + BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions() + .setHeadless(false) + .setArgs(List.of( + "--no-sandbox", + "--disable-setuid-sandbox", + "--disable-dev-shm-usage" + )); + + try (Browser browser = playwright.chromium().launch(launchOptions)) { + BrowserContext context = browser.newContext(new Browser.NewContextOptions() + .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36") + .setLocale("ko-KR")); + Page page = context.newPage(); + + page.navigate(watchUrl, new Page.NavigateOptions() + .setTimeout(30_000) + .setWaitUntil(WaitUntilState.NETWORKIDLE)); + + html = page.content(); + log.info("Playwright fetched YouTube page: {} chars", html.length()); + + context.close(); + } } catch (Exception e) { - throw new IOException("YouTube 페이지를 가져올 수 없습니다: " + e.getMessage(), e); + throw new IOException("YouTube 페이지를 가져올 수 없습니다 (Playwright): " + e.getMessage(), e); } // captionTracks JSON 추출