Switch YouTube transcript fetching from Jsoup to Playwright headed mode
Jsoup was blocked by YouTube's bot detection. Transcript fetching now uses Playwright with headed Chromium on an Xvfb virtual display to get past that block.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -10,6 +10,13 @@ export JAVA_HOME
|
|||||||
export PLAYWRIGHT_BROWSERS_PATH=/home/opc/.cache/ms-playwright
|
export PLAYWRIGHT_BROWSERS_PATH=/home/opc/.cache/ms-playwright
|
||||||
export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
|
export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
|
||||||
|
|
||||||
|
# Xvfb virtual display for Playwright headed mode (YouTube transcript)
|
||||||
|
export DISPLAY=:99
|
||||||
|
if ! pgrep -x Xvfb > /dev/null; then
|
||||||
|
Xvfb :99 -screen 0 1280x720x24 -nolisten tcp &
|
||||||
|
sleep 1
|
||||||
|
fi
|
||||||
|
|
||||||
# Playwright driver-bundle requires exploded classpath (fat JAR extraction fails)
|
# Playwright driver-bundle requires exploded classpath (fat JAR extraction fails)
|
||||||
BACKEND_DIR=/home/opc/sundol/sundol-backend
|
BACKEND_DIR=/home/opc/sundol/sundol-backend
|
||||||
exec $JAVA_HOME/bin/java -cp "$BACKEND_DIR/target/classes:$BACKEND_DIR/target/dependency/*" com.sundol.SundolApplication
|
exec $JAVA_HOME/bin/java -cp "$BACKEND_DIR/target/classes:$BACKEND_DIR/target/dependency/*" com.sundol.SundolApplication
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
package com.sundol.service;
|
package com.sundol.service;
|
||||||
|
|
||||||
import org.jsoup.Jsoup;
|
import com.microsoft.playwright.*;
|
||||||
import org.jsoup.nodes.Document;
|
import com.microsoft.playwright.options.WaitUntilState;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
@@ -11,6 +11,7 @@ import java.io.IOException;
|
|||||||
import java.net.URLDecoder;
|
import java.net.URLDecoder;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
|
import java.util.List;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
@@ -45,18 +46,34 @@ public class YouTubeTranscriptService {
|
|||||||
String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
|
String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
|
||||||
log.info("Fetching YouTube transcript for: {}", watchUrl);
|
log.info("Fetching YouTube transcript for: {}", watchUrl);
|
||||||
|
|
||||||
// YouTube 페이지 HTML 가져오기
|
// Playwright head 모드로 YouTube 페이지 HTML 가져오기
|
||||||
String html;
|
String html;
|
||||||
try {
|
try (Playwright playwright = Playwright.create()) {
|
||||||
Document doc = Jsoup.connect(watchUrl)
|
BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
|
||||||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
|
.setHeadless(false)
|
||||||
.header("Accept-Language", "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7")
|
.setArgs(List.of(
|
||||||
.timeout(15_000)
|
"--no-sandbox",
|
||||||
.maxBodySize(0)
|
"--disable-setuid-sandbox",
|
||||||
.get();
|
"--disable-dev-shm-usage"
|
||||||
html = doc.html();
|
));
|
||||||
|
|
||||||
|
try (Browser browser = playwright.chromium().launch(launchOptions)) {
|
||||||
|
BrowserContext context = browser.newContext(new Browser.NewContextOptions()
|
||||||
|
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
|
||||||
|
.setLocale("ko-KR"));
|
||||||
|
Page page = context.newPage();
|
||||||
|
|
||||||
|
page.navigate(watchUrl, new Page.NavigateOptions()
|
||||||
|
.setTimeout(30_000)
|
||||||
|
.setWaitUntil(WaitUntilState.NETWORKIDLE));
|
||||||
|
|
||||||
|
html = page.content();
|
||||||
|
log.info("Playwright fetched YouTube page: {} chars", html.length());
|
||||||
|
|
||||||
|
context.close();
|
||||||
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new IOException("YouTube 페이지를 가져올 수 없습니다: " + e.getMessage(), e);
|
throw new IOException("YouTube 페이지를 가져올 수 없습니다 (Playwright): " + e.getMessage(), e);
|
||||||
}
|
}
|
||||||
|
|
||||||
// captionTracks JSON 추출
|
// captionTracks JSON 추출
|
||||||
|
|||||||
Reference in New Issue
Block a user