Refactor Playwright to singleton browser with tab-based crawling

- Add PlaywrightBrowserService: singleton Chromium browser with auto-recovery
- Refactor WebCrawlerService/YouTubeTranscriptService to use shared browser tabs
- Fix YouTube transcript: extract from DOM panel + fmt=json3 fallback
- Keep browser window alive (about:blank instead of page.close)
- Add docs: X Window setup, operation manual, crawling guide

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-09 19:18:33 +00:00
parent db4155c36d
commit afc9cdcde6
7 changed files with 934 additions and 169 deletions

View File

@@ -0,0 +1,168 @@
package com.sundol.service;
import com.microsoft.playwright.*;
import com.microsoft.playwright.options.WaitUntilState;
import jakarta.annotation.PostConstruct;
import jakarta.annotation.PreDestroy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
/**
 * Service that keeps a single Playwright Chromium browser for the application.
 *
 * <p>The browser is launched once at startup ({@link #init()}); each job is served by
 * opening a new tab ({@link Page}) in the one shared {@link BrowserContext}. If the browser
 * is found disconnected when a tab is requested, it is torn down and relaunched.
 *
 * <p>NOTE(review): Playwright for Java documents its objects as not thread-safe. This class
 * only serializes start/stop/restart; calls made on the returned {@link Page} are not
 * synchronized here. Confirm how callers are threaded.
 */
@Service
public class PlaywrightBrowserService {

    private static final Logger log = LoggerFactory.getLogger(PlaywrightBrowserService.class);

    /** Navigation timeout used by {@link #openPage(String)}. */
    private static final int DEFAULT_NAVIGATION_TIMEOUT_MS = 30_000;

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";

    private static final String LOCALE = "ko-KR";

    /** Chromium flags shared by the initial launch and every relaunch. */
    private static final List<String> LAUNCH_ARGS = List.of(
            "--no-sandbox",
            "--disable-setuid-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu");

    /** Prefix that cookies.txt exporters (curl convention) put on HttpOnly cookie rows. */
    private static final String HTTP_ONLY_PREFIX = "#HttpOnly_";

    /** Minimum number of tab-separated fields in a usable cookies.txt row. */
    private static final int COOKIE_FIELD_COUNT = 7;

    private Playwright playwright;
    private Browser browser;
    private BrowserContext context;

    /**
     * Launches the shared browser at application startup.
     *
     * <p>A launch failure is logged, not rethrown, so the application still starts; the next
     * {@link #openPage(String, int)} call will try to launch again. Anything that was created
     * before the failure is released so no half-initialized state is left behind.
     */
    @PostConstruct
    public synchronized void init() {
        try {
            launchBrowser();
            log.info("Playwright 브라우저 싱글톤 초기화 완료");
        } catch (Exception e) {
            log.error("Playwright 브라우저 초기화 실패: {}", e.getMessage(), e);
            releaseResources();
        }
    }

    /**
     * Closes the context, the browser and the Playwright driver.
     *
     * <p>Each resource is closed independently, so a failure closing one (for example the
     * context of an already crashed browser) does not prevent the others from being closed.
     */
    @PreDestroy
    public synchronized void destroy() {
        if (releaseResources()) {
            log.info("Playwright 브라우저 종료 완료");
        }
    }

    /**
     * Opens a new tab and navigates it to {@code url}, waiting for network idle.
     *
     * <p>The caller owns the returned page and must {@code close()} it when done; a page that
     * is only navigated elsewhere (e.g. to {@code about:blank}) stays open in the shared context.
     *
     * @param url       address to load
     * @param timeoutMs navigation timeout in milliseconds
     * @return the loaded page
     * @throws IOException if the browser cannot be (re)started, or the tab cannot be opened
     *                     or navigated; the tab is closed before this is thrown
     */
    public Page openPage(String url, int timeoutMs) throws IOException {
        // Take the context from under the lock so a concurrent restart cannot hand us a stale one.
        BrowserContext liveContext = ensureBrowserAlive();
        Page page = null;
        try {
            page = liveContext.newPage();
            page.navigate(url, new Page.NavigateOptions()
                    .setTimeout(timeoutMs)
                    .setWaitUntil(WaitUntilState.NETWORKIDLE));
            return page;
        } catch (Exception e) {
            if (page != null) {
                try {
                    page.close();
                } catch (Exception closeFailure) {
                    // Keep the navigation error as the primary cause.
                    e.addSuppressed(closeFailure);
                }
            }
            throw new IOException("페이지 로드 실패 (" + url + "): " + e.getMessage(), e);
        }
    }

    /**
     * Opens a new tab and navigates it to {@code url} with the default 30 second timeout.
     *
     * @see #openPage(String, int)
     */
    public Page openPage(String url) throws IOException {
        return openPage(url, DEFAULT_NAVIGATION_TIMEOUT_MS);
    }

    /**
     * Fetches {@code url} by running JavaScript {@code fetch} inside {@code page}, with
     * {@code credentials: 'include'} so the request is made from the browser's own session.
     *
     * @param page page in which the script is evaluated
     * @param url  address to fetch
     * @return the response body as text, or an empty string if the script produced no value
     * @throws IOException if evaluation fails, including a non-OK HTTP status
     */
    public String fetchInPage(Page page, String url) throws IOException {
        try {
            Object result = page.evaluate("async (url) => {" +
                    "  const res = await fetch(url, { credentials: 'include' });" +
                    "  if (!res.ok) throw new Error('HTTP ' + res.status);" +
                    "  return await res.text();" +
                    "}", url);
            return result != null ? result.toString() : "";
        } catch (Exception e) {
            throw new IOException("브라우저 내 fetch 실패 (" + url + "): " + e.getMessage(), e);
        }
    }

    /**
     * Returns a usable browser context, relaunching the browser if it is missing or disconnected.
     *
     * @return the current live context
     * @throws IOException if the relaunch fails
     */
    private synchronized BrowserContext ensureBrowserAlive() throws IOException {
        // Also require a context: a connected browser without one cannot open tabs.
        if (browser != null && browser.isConnected() && context != null) {
            return context;
        }
        log.warn("Playwright 브라우저가 죽어있습니다. 재시작합니다.");
        destroy();
        try {
            launchBrowser();
            log.info("Playwright 브라우저 재시작 완료");
            return context;
        } catch (Exception e) {
            releaseResources();
            throw new IOException("Playwright 브라우저 재시작 실패: " + e.getMessage(), e);
        }
    }

    /** Creates the driver, launches headed Chromium, opens the shared context and loads cookies. */
    private void launchBrowser() {
        playwright = Playwright.create();
        browser = playwright.chromium().launch(new BrowserType.LaunchOptions()
                .setHeadless(false)
                .setArgs(LAUNCH_ARGS));
        context = browser.newContext(new Browser.NewContextOptions()
                .setUserAgent(USER_AGENT)
                .setLocale(LOCALE));
        loadCookies(context);
    }

    /**
     * Closes context, browser and driver (in that order), each in isolation, then clears the fields.
     *
     * @return {@code true} if every close succeeded
     */
    private boolean releaseResources() {
        boolean clean = true;
        if (context != null) {
            try {
                context.close();
            } catch (Exception e) {
                clean = false;
                log.warn("Playwright 브라우저 종료 중 오류: {}", e.getMessage());
            }
        }
        if (browser != null) {
            try {
                browser.close();
            } catch (Exception e) {
                clean = false;
                log.warn("Playwright 브라우저 종료 중 오류: {}", e.getMessage());
            }
        }
        if (playwright != null) {
            try {
                playwright.close();
            } catch (Exception e) {
                clean = false;
                log.warn("Playwright 브라우저 종료 중 오류: {}", e.getMessage());
            }
        }
        // Drop the references so dead handles are never reused.
        context = null;
        browser = null;
        playwright = null;
        return clean;
    }

    /**
     * Loads cookies from {@code cookies.txt} in the working directory into {@code ctx}.
     *
     * <p>Rows are tab-separated; field 0 is used as the domain, 2 as the path, 3 as the secure
     * flag, 5 as the name and 6 as the value. Only domains containing "youtube" or "google"
     * are loaded. Rows prefixed with {@code #HttpOnly_} are treated as HttpOnly cookies rather
     * than comments. Any failure is logged and otherwise ignored (best effort).
     */
    private void loadCookies(BrowserContext ctx) {
        Path cookieFile = Path.of(System.getProperty("user.dir"), "cookies.txt");
        if (!Files.exists(cookieFile)) {
            log.warn("cookies.txt not found at: {}", cookieFile);
            return;
        }
        try {
            List<String> lines = Files.readAllLines(cookieFile);
            List<com.microsoft.playwright.options.Cookie> cookies = new ArrayList<>();
            for (String rawLine : lines) {
                String line = rawLine;
                boolean httpOnly = false;
                if (line.startsWith(HTTP_ONLY_PREFIX)) {
                    // Not a comment: the prefix marks an HttpOnly cookie row.
                    httpOnly = true;
                    line = line.substring(HTTP_ONLY_PREFIX.length());
                } else if (line.startsWith("#") || line.isBlank()) {
                    continue;
                }
                String[] parts = line.split("\t");
                if (parts.length < COOKIE_FIELD_COUNT) {
                    continue;
                }
                String domain = parts[0];
                if (!domain.contains("youtube") && !domain.contains("google")) {
                    continue;
                }
                cookies.add(new com.microsoft.playwright.options.Cookie(parts[5], parts[6])
                        .setDomain(domain)
                        .setPath(parts[2])
                        .setSecure("TRUE".equalsIgnoreCase(parts[3]))
                        .setHttpOnly(httpOnly));
            }
            if (!cookies.isEmpty()) {
                ctx.addCookies(cookies);
                log.info("Loaded {} YouTube cookies", cookies.size());
            }
        } catch (Exception e) {
            log.warn("Failed to load cookies: {}", e.getMessage());
        }
    }
}

View File

@@ -1,9 +1,6 @@
package com.sundol.service;
import com.microsoft.playwright.Browser;
import com.microsoft.playwright.BrowserType;
import com.microsoft.playwright.Page;
import com.microsoft.playwright.Playwright;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -31,11 +28,13 @@ public class WebCrawlerService {
);
private final WebClient webClient;
private final PlaywrightBrowserService browserService;
@Value("${jina.reader.api-key:}")
private String jinaApiKey;
public WebCrawlerService() {
public WebCrawlerService(PlaywrightBrowserService browserService) {
this.browserService = browserService;
this.webClient = WebClient.builder()
.codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024))
.build();
@@ -66,7 +65,7 @@ public class WebCrawlerService {
log.warn("Jina Reader failed for {}: {}, falling back to Playwright", url, e.getMessage());
}
// 3차: Playwright headless browser (최후의 수단)
// 3차: Playwright (싱글톤 브라우저 탭)
String playwrightText = crawlWithPlaywright(url);
if (!isValidContent(playwrightText)) {
throw new IOException("All crawl methods failed for " + url + " (error page detected from all 3 sources)");
@@ -141,47 +140,29 @@ public class WebCrawlerService {
private String crawlWithPlaywright(String url) throws IOException {
log.info("Crawling with Playwright: {}", url);
try (Playwright playwright = Playwright.create()) {
BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
.setHeadless(true)
.setArgs(java.util.List.of(
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-dev-shm-usage",
"--disable-gpu"
));
Page page = browserService.openPage(url);
try {
// JS 실행으로 본문 텍스트 추출
String text = page.evaluate("() => {" +
" ['nav','footer','header','script','style','.ad','#cookie-banner','.sidebar','.comments']" +
" .forEach(sel => document.querySelectorAll(sel).forEach(el => el.remove()));" +
" const article = document.querySelector('article, main, .post-content, .article-body, .entry-content');" +
" return (article || document.body).innerText;" +
"}").toString();
try (Browser browser = playwright.chromium().launch(launchOptions)) {
Browser.NewContextOptions contextOptions = new Browser.NewContextOptions()
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36");
log.info("Playwright crawled {} - {} chars", url, text.length());
var context = browser.newContext(contextOptions);
Page page = context.newPage();
page.navigate(url, new Page.NavigateOptions()
.setTimeout(30_000)
.setWaitUntil(com.microsoft.playwright.options.WaitUntilState.NETWORKIDLE));
// JS 실행으로 본문 텍스트 추출
String text = page.evaluate("() => {" +
" ['nav','footer','header','script','style','.ad','#cookie-banner','.sidebar','.comments']" +
" .forEach(sel => document.querySelectorAll(sel).forEach(el => el.remove()));" +
" const article = document.querySelector('article, main, .post-content, .article-body, .entry-content');" +
" return (article || document.body).innerText;" +
"}").toString();
log.info("Playwright crawled {} - {} chars", url, text.length());
if (text == null || text.isBlank()) {
throw new IOException("Playwright returned empty content for: " + url);
}
return text;
if (text == null || text.isBlank()) {
throw new IOException("Playwright returned empty content for: " + url);
}
return text;
} finally {
try {
page.navigate("about:blank");
} catch (Exception ignored) {
page.close();
}
} catch (IOException e) {
throw e;
} catch (Exception e) {
throw new IOException("Playwright failed for " + url + ": " + e.getMessage(), e);
}
}
@@ -205,17 +186,16 @@ public class WebCrawlerService {
return firstLine.length() > 80 ? firstLine.substring(0, 77) + "..." : firstLine;
} catch (Exception e2) {
log.warn("Jina Reader title extraction also failed, trying Playwright", e2);
// Playwright로 제목 추출
try (Playwright playwright = Playwright.create()) {
try (Browser browser = playwright.chromium().launch(
new BrowserType.LaunchOptions().setHeadless(true)
.setArgs(java.util.List.of("--no-sandbox", "--disable-setuid-sandbox")))) {
Page page = browser.newPage();
page.navigate(url, new Page.NavigateOptions().setTimeout(30_000));
return page.title();
// Playwright 싱글톤 브라우저로 제목 추출
Page page = browserService.openPage(url);
try {
return page.title();
} finally {
try {
page.navigate("about:blank");
} catch (Exception ignored2) {
page.close();
}
} catch (Exception e3) {
throw new IOException("All title extraction methods failed for: " + url, e3);
}
}
}

View File

@@ -1,7 +1,6 @@
package com.sundol.service;
import com.microsoft.playwright.*;
import com.microsoft.playwright.options.WaitUntilState;
import com.microsoft.playwright.Page;
import io.github.thoroldvix.api.TranscriptApiFactory;
import io.github.thoroldvix.api.TranscriptContent;
import io.github.thoroldvix.api.TranscriptList;
@@ -14,10 +13,6 @@ import org.springframework.stereotype.Service;
import java.io.IOException;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@@ -39,6 +34,11 @@ public class YouTubeTranscriptService {
Pattern.compile("<text[^>]*>(.*?)</text>", Pattern.DOTALL);
private final YoutubeTranscriptApi transcriptApi = TranscriptApiFactory.createDefault();
private final PlaywrightBrowserService browserService;
public YouTubeTranscriptService(PlaywrightBrowserService browserService) {
this.browserService = browserService;
}
public String fetchTranscript(String youtubeUrl) throws IOException {
String videoId = extractVideoId(youtubeUrl);
@@ -59,7 +59,7 @@ public class YouTubeTranscriptService {
log.warn("youtube-transcript-api failed for {}: {}", videoId, e.getMessage());
}
// 2차 fallback: Playwright head 모드
// 2차 fallback: Playwright head 모드 (싱글톤 브라우저 탭)
log.info("Falling back to Playwright for videoId: {}", videoId);
return fetchWithPlaywright(videoId);
}
@@ -107,85 +107,157 @@ public class YouTubeTranscriptService {
private String fetchWithPlaywright(String videoId) throws IOException {
String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
String html;
try (Playwright playwright = Playwright.create()) {
BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
.setHeadless(false)
.setArgs(List.of(
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-dev-shm-usage"
));
try (Browser browser = playwright.chromium().launch(launchOptions)) {
BrowserContext context = browser.newContext(new Browser.NewContextOptions()
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
.setLocale("ko-KR"));
// YouTube 쿠키 로딩 (봇 차단 우회)
loadCookies(context);
Page page = context.newPage();
page.navigate(watchUrl, new Page.NavigateOptions()
.setTimeout(30_000)
.setWaitUntil(WaitUntilState.NETWORKIDLE));
html = page.content();
log.info("Playwright fetched YouTube page: {} chars", html.length());
context.close();
}
} catch (Exception e) {
throw new IOException("YouTube 페이지를 가져올 수 없습니다 (Playwright): " + e.getMessage(), e);
}
// captionTracks JSON 추출
Matcher captionMatcher = CAPTION_TRACK_PATTERN.matcher(html);
if (!captionMatcher.find()) {
throw new IOException("이 영상에는 자막(caption)이 없습니다.");
}
String captionTracksJson = captionMatcher.group(1);
String captionUrl = selectCaptionUrl(captionTracksJson);
if (captionUrl == null) {
throw new IOException("자막 트랙 URL을 추출할 수 없습니다.");
}
captionUrl = captionUrl.replace("\\u0026", "&");
log.info("Fetching caption XML from: {}", captionUrl);
// 자막 XML 가져오기
log.info("Attempting to fetch caption XML...");
String xml;
Page page = browserService.openPage(watchUrl);
try {
var conn = (java.net.HttpURLConnection) new java.net.URI(captionUrl).toURL().openConnection();
conn.setRequestProperty("User-Agent", "Mozilla/5.0");
conn.setConnectTimeout(15_000);
conn.setReadTimeout(15_000);
int responseCode = conn.getResponseCode();
log.info("Caption XML response code: {}", responseCode);
if (responseCode != 200) {
String errorBody = new String(conn.getErrorStream().readAllBytes(), StandardCharsets.UTF_8);
log.error("Caption XML error response: {}", errorBody);
throw new IOException("자막 XML 응답 코드: " + responseCode);
log.info("Playwright fetched YouTube page for videoId: {}", videoId);
// 방법 1: YouTube 페이지 내 JS로 자막 패널 열어서 텍스트 추출
String transcript = extractTranscriptFromPanel(page);
if (transcript != null && !transcript.isBlank()) {
log.info("Successfully fetched transcript via panel: {} chars", transcript.length());
return transcript;
}
xml = new String(conn.getInputStream().readAllBytes(), StandardCharsets.UTF_8);
log.info("Caption XML fetched: {} chars", xml.length());
// 방법 2: ytInitialPlayerResponse에서 caption URL 추출 후 fmt=json3로 시도
String html = page.content();
Matcher captionMatcher = CAPTION_TRACK_PATTERN.matcher(html);
if (!captionMatcher.find()) {
throw new IOException("이 영상에는 자막(caption)이 없습니다.");
}
String captionTracksJson = captionMatcher.group(1);
String captionUrl = selectCaptionUrl(captionTracksJson);
if (captionUrl == null) {
throw new IOException("자막 트랙 URL을 추출할 수 없습니다.");
}
captionUrl = captionUrl.replace("\\u0026", "&");
// fmt=json3 추가하여 JSON 형식으로 요청
if (!captionUrl.contains("fmt=")) {
captionUrl += "&fmt=json3";
}
log.info("Fetching caption JSON from: {}", captionUrl);
String json = browserService.fetchInPage(page, captionUrl);
log.info("Caption JSON fetched: {} chars", json.length());
if (json.length() > 0) {
transcript = parseTranscriptJson(json);
if (transcript != null && !transcript.isBlank()) {
log.info("Successfully fetched transcript via JSON: {} chars", transcript.length());
return transcript;
}
}
throw new IOException("자막 텍스트를 가져올 수 없습니다.");
} finally {
// 탭을 닫지 않고 빈 페이지로 이동 (브라우저 창 유지)
try {
page.navigate("about:blank");
} catch (Exception ignored) {
page.close();
}
}
}
/**
* YouTube 페이지에서 '자막 표시' 패널을 열고 텍스트를 추출한다.
*/
private String extractTranscriptFromPanel(Page page) {
try {
// '더보기' 또는 '...더보기' 버튼 클릭하여 설명 패널 열기
page.waitForTimeout(2000);
// 자막 패널 열기: '스크립트 표시' 버튼 찾기
Object result = page.evaluate("() => {" +
" // 방법 1: 동영상 설명 아래 '스크립트 표시' 버튼 클릭" +
" const buttons = document.querySelectorAll('button, ytd-button-renderer');" +
" for (const btn of buttons) {" +
" const text = btn.innerText || btn.textContent || '';" +
" if (text.includes('스크립트 표시') || text.includes('Show transcript') || text.includes('자막')) {" +
" btn.click();" +
" return 'clicked';" +
" }" +
" }" +
" // 방법 2: 메뉴에서 스크립트 열기" +
" const menuBtn = document.querySelector('#button-shape button[aria-label=\"더보기\"], button.ytp-subtitles-button');" +
" if (menuBtn) { menuBtn.click(); return 'menu_clicked'; }" +
" return 'not_found';" +
"}");
log.info("Transcript panel button result: {}", result);
if ("not_found".equals(result)) {
// 대안: 설명란 펼치기 → 스크립트 표시
page.evaluate("() => {" +
" const expander = document.querySelector('#expand, tp-yt-paper-button#expand');" +
" if (expander) expander.click();" +
"}");
page.waitForTimeout(1000);
page.evaluate("() => {" +
" const buttons = document.querySelectorAll('button, ytd-button-renderer');" +
" for (const btn of buttons) {" +
" const text = btn.innerText || btn.textContent || '';" +
" if (text.includes('스크립트 표시') || text.includes('Show transcript')) {" +
" btn.click();" +
" return 'clicked';" +
" }" +
" }" +
" return 'not_found';" +
"}");
}
// 자막 패널이 로드될 때까지 대기
page.waitForTimeout(3000);
// 자막 텍스트 추출
Object transcriptObj = page.evaluate("() => {" +
" // 스크립트 패널의 자막 세그먼트 추출" +
" const segments = document.querySelectorAll(" +
" 'ytd-transcript-segment-renderer .segment-text," +
" yt-formatted-string.segment-text," +
" #segments-container yt-formatted-string'" +
" );" +
" if (segments.length > 0) {" +
" return Array.from(segments).map(s => s.textContent.trim()).filter(t => t.length > 0).join(' ');" +
" }" +
" return '';" +
"}");
String transcript = transcriptObj != null ? transcriptObj.toString() : "";
log.info("Transcript from panel: {} chars", transcript.length());
return transcript.isBlank() ? null : transcript;
} catch (Exception e) {
log.error("Failed to fetch caption XML: {}", e.getMessage(), e);
throw new IOException("자막 XML을 가져올 수 없습니다: " + e.getMessage(), e);
log.warn("Failed to extract transcript from panel: {}", e.getMessage());
return null;
}
}
String transcript = parseTranscriptXml(xml);
log.info("Parsed transcript: {} chars (blank={})", transcript.length(), transcript.isBlank());
if (transcript.isBlank()) {
log.error("Transcript XML content (first 500 chars): {}", xml.substring(0, Math.min(500, xml.length())));
throw new IOException("자막 텍스트를 파싱할 수 없습니다.");
/**
* YouTube timedtext API의 fmt=json3 응답을 파싱한다.
*/
private String parseTranscriptJson(String json) {
try {
StringBuilder sb = new StringBuilder();
// json3 형식: {"events":[{"segs":[{"utf8":"text"}]},...]}
Pattern segPattern = Pattern.compile("\"utf8\":\\s*\"(.*?)\"");
Matcher matcher = segPattern.matcher(json);
while (matcher.find()) {
String text = matcher.group(1)
.replace("\\n", " ")
.replace("\\\"", "\"")
.trim();
if (!text.isEmpty() && !text.equals("\n")) {
if (sb.length() > 0) sb.append(" ");
sb.append(text);
}
}
return sb.toString();
} catch (Exception e) {
log.warn("Failed to parse transcript JSON: {}", e.getMessage());
return null;
}
log.info("Successfully fetched transcript via Playwright: {} chars", transcript.length());
return transcript;
}
private String selectCaptionUrl(String captionTracksJson) {
@@ -236,37 +308,6 @@ public class YouTubeTranscriptService {
return sb.toString();
}
private void loadCookies(BrowserContext context) {
Path cookieFile = Path.of(System.getProperty("user.dir"), "cookies.txt");
if (!Files.exists(cookieFile)) {
log.warn("cookies.txt not found at: {}", cookieFile);
return;
}
try {
List<String> lines = Files.readAllLines(cookieFile);
List<com.microsoft.playwright.options.Cookie> cookies = new ArrayList<>();
for (String line : lines) {
if (line.startsWith("#") || line.isBlank()) continue;
String[] parts = line.split("\t");
if (parts.length < 7) continue;
String domain = parts[0];
if (!domain.contains("youtube") && !domain.contains("google")) continue;
cookies.add(new com.microsoft.playwright.options.Cookie(parts[5], parts[6])
.setDomain(domain)
.setPath(parts[2])
.setSecure("TRUE".equalsIgnoreCase(parts[3]))
.setHttpOnly(false));
}
if (!cookies.isEmpty()) {
context.addCookies(cookies);
log.info("Loaded {} YouTube cookies", cookies.size());
}
} catch (Exception e) {
log.warn("Failed to load cookies: {}", e.getMessage());
}
}
private String extractVideoId(String url) {
if (url == null || url.isBlank()) return null;
try {