diff --git a/start-backend.sh b/start-backend.sh index 57dcce0..de2138b 100755 --- a/start-backend.sh +++ b/start-backend.sh @@ -10,6 +10,13 @@ export JAVA_HOME export PLAYWRIGHT_BROWSERS_PATH=/home/opc/.cache/ms-playwright export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 +# Xvfb virtual display for Playwright headed mode (YouTube transcript) +export DISPLAY=:99 +if ! pgrep -x Xvfb > /dev/null; then + Xvfb :99 -screen 0 1280x720x24 -nolisten tcp & + sleep 1 +fi + # Playwright driver-bundle requires exploded classpath (fat JAR extraction fails) BACKEND_DIR=/home/opc/sundol/sundol-backend exec $JAVA_HOME/bin/java -cp "$BACKEND_DIR/target/classes:$BACKEND_DIR/target/dependency/*" com.sundol.SundolApplication diff --git a/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java b/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java index db09521..f691226 100644 --- a/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java +++ b/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java @@ -1,7 +1,7 @@ package com.sundol.service; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; +import com.microsoft.playwright.*; +import com.microsoft.playwright.options.WaitUntilState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.stereotype.Service; @@ -11,6 +11,7 @@ import java.io.IOException; import java.net.URLDecoder; import java.nio.charset.StandardCharsets; import java.time.Duration; +import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -45,18 +46,34 @@ public class YouTubeTranscriptService { String watchUrl = "https://www.youtube.com/watch?v=" + videoId; log.info("Fetching YouTube transcript for: {}", watchUrl); - // YouTube 페이지 HTML 가져오기 + // Fetch the YouTube page HTML with Playwright in headed mode String html; - try { - Document doc = Jsoup.connect(watchUrl) - .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36") - .header("Accept-Language", "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7") - .timeout(15_000) - .maxBodySize(0) - .get(); - html = doc.html(); + try (Playwright playwright = Playwright.create()) { + BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions() + .setHeadless(false) + .setArgs(List.of( + "--no-sandbox", + "--disable-setuid-sandbox", + "--disable-dev-shm-usage" + )); + + try (Browser browser = playwright.chromium().launch(launchOptions)) { + BrowserContext context = browser.newContext(new Browser.NewContextOptions() + .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36") + .setLocale("ko-KR")); + Page page = context.newPage(); + + page.navigate(watchUrl, new Page.NavigateOptions() + .setTimeout(30_000) + .setWaitUntil(WaitUntilState.NETWORKIDLE)); + + html = page.content(); + log.info("Playwright fetched YouTube page: {} chars", html.length()); + + context.close(); + } } catch (Exception e) { - throw new IOException("YouTube 페이지를 가져올 수 없습니다: " + e.getMessage(), e); + throw new IOException("YouTube 페이지를 가져올 수 없습니다 (Playwright): " + e.getMessage(), e); } // captionTracks JSON 추출