Switch YouTube transcript fetching from Jsoup to Playwright headed mode
Jsoup was blocked by YouTube's bot detection. Transcript fetching now uses Playwright with headed Chromium, rendered on an Xvfb virtual display, to bypass the restrictions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -10,6 +10,13 @@ export JAVA_HOME
|
||||
export PLAYWRIGHT_BROWSERS_PATH=/home/opc/.cache/ms-playwright
|
||||
export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
|
||||
|
||||
# Xvfb virtual display for Playwright headed mode (used for YouTube transcript fetching)
|
||||
export DISPLAY=:99
|
||||
if ! pgrep -x Xvfb > /dev/null; then
|
||||
Xvfb :99 -screen 0 1280x720x24 -nolisten tcp &
|
||||
sleep 1
|
||||
fi
|
||||
|
||||
# Playwright driver-bundle requires exploded classpath (fat JAR extraction fails)
|
||||
BACKEND_DIR=/home/opc/sundol/sundol-backend
|
||||
exec $JAVA_HOME/bin/java -cp "$BACKEND_DIR/target/classes:$BACKEND_DIR/target/dependency/*" com.sundol.SundolApplication
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
package com.sundol.service;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import com.microsoft.playwright.*;
|
||||
import com.microsoft.playwright.options.WaitUntilState;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Service;
|
||||
@@ -11,6 +11,7 @@ import java.io.IOException;
|
||||
import java.net.URLDecoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Duration;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@@ -45,18 +46,34 @@ public class YouTubeTranscriptService {
|
||||
String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
|
||||
log.info("Fetching YouTube transcript for: {}", watchUrl);
|
||||
|
||||
// YouTube 페이지 HTML 가져오기
|
||||
// Playwright head 모드로 YouTube 페이지 HTML 가져오기
|
||||
String html;
|
||||
try {
|
||||
Document doc = Jsoup.connect(watchUrl)
|
||||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
|
||||
.header("Accept-Language", "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7")
|
||||
.timeout(15_000)
|
||||
.maxBodySize(0)
|
||||
.get();
|
||||
html = doc.html();
|
||||
try (Playwright playwright = Playwright.create()) {
|
||||
BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
|
||||
.setHeadless(false)
|
||||
.setArgs(List.of(
|
||||
"--no-sandbox",
|
||||
"--disable-setuid-sandbox",
|
||||
"--disable-dev-shm-usage"
|
||||
));
|
||||
|
||||
try (Browser browser = playwright.chromium().launch(launchOptions)) {
|
||||
BrowserContext context = browser.newContext(new Browser.NewContextOptions()
|
||||
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
|
||||
.setLocale("ko-KR"));
|
||||
Page page = context.newPage();
|
||||
|
||||
page.navigate(watchUrl, new Page.NavigateOptions()
|
||||
.setTimeout(30_000)
|
||||
.setWaitUntil(WaitUntilState.NETWORKIDLE));
|
||||
|
||||
html = page.content();
|
||||
log.info("Playwright fetched YouTube page: {} chars", html.length());
|
||||
|
||||
context.close();
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new IOException("YouTube 페이지를 가져올 수 없습니다: " + e.getMessage(), e);
|
||||
throw new IOException("YouTube 페이지를 가져올 수 없습니다 (Playwright): " + e.getMessage(), e);
|
||||
}
|
||||
|
||||
// captionTracks JSON 추출
|
||||
|
||||
Reference in New Issue
Block a user