Add Playwright headless browser as 3rd crawling fallback

Crawl chain: Jsoup → Jina Reader → Playwright (headless Chromium).
Error page detection (403, Access Denied, etc.) triggers the next fallback.
Switch to an exploded classpath for Playwright driver-bundle compatibility.
Fix Next.js standalone static file serving with a symlink.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-30 22:36:24 +00:00
parent 0cc84354f5
commit f0f7b62e3d
4 changed files with 132 additions and 10 deletions

View File

@@ -7,6 +7,7 @@ module.exports = {
cwd: "/home/opc/sundol",
env: {
JAVA_HOME: "/usr/lib/jvm/java-21",
PLAYWRIGHT_NODEJS_PATH: "/home/opc/.playwright-driver/driver/linux/node",
},
},
{

View File

@@ -6,4 +6,10 @@ set +a
JAVA_HOME=${JAVA_HOME:-/usr/lib/jvm/java-21}
export JAVA_HOME
exec $JAVA_HOME/bin/java -jar /home/opc/sundol/sundol-backend/target/sundol-backend-0.0.1-SNAPSHOT.jar
# Playwright: use pre-installed browsers, skip auto-download
export PLAYWRIGHT_BROWSERS_PATH=/home/opc/.cache/ms-playwright
export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
# Playwright driver-bundle requires exploded classpath (fat JAR extraction fails)
BACKEND_DIR=/home/opc/sundol/sundol-backend
exec $JAVA_HOME/bin/java -cp "$BACKEND_DIR/target/classes:$BACKEND_DIR/target/dependency/*" com.sundol.SundolApplication

View File

@@ -104,6 +104,18 @@
<version>1.18.3</version>
</dependency>
<!-- Playwright (headless browser, driver-bundle includes node runtime) -->
<dependency>
<groupId>com.microsoft.playwright</groupId>
<artifactId>playwright</artifactId>
<version>1.51.0</version>
</dependency>
<dependency>
<groupId>com.microsoft.playwright</groupId>
<artifactId>driver-bundle</artifactId>
<version>1.51.0</version>
</dependency>
<!-- Jackson -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>

View File

@@ -1,5 +1,9 @@
package com.sundol.service;
import com.microsoft.playwright.Browser;
import com.microsoft.playwright.BrowserType;
import com.microsoft.playwright.Page;
import com.microsoft.playwright.Playwright;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -18,6 +22,13 @@ public class WebCrawlerService {
private static final Logger log = LoggerFactory.getLogger(WebCrawlerService.class);
private static final String JINA_READER_BASE = "https://r.jina.ai/";
private static final int MIN_CONTENT_LENGTH = 100;
// Substrings that mark crawled text as an error/block page rather than real content.
// isValidContent lower-cases the start of the text before matching, so every entry
// here must be lower case. Entries are checked in list order and the first match is
// the one reported in the warning log.
private static final java.util.List<String> ERROR_PATTERNS = java.util.List.of(
"access denied", "403 forbidden", "you don't have permission",
"error 403", "error 401", "unauthorized", "captcha",
"please enable javascript", "checking your browser",
"attention required", "just a moment",
"technical difficulty", "page not found", "404 not found"
);
private final WebClient webClient;
@@ -34,17 +45,48 @@ public class WebCrawlerService {
// 1차: Jsoup 시도
try {
String text = crawlWithJsoup(url);
if (text != null && text.length() >= MIN_CONTENT_LENGTH) {
if (isValidContent(text)) {
return text;
}
log.warn("Jsoup returned insufficient content ({} chars), falling back to Jina Reader",
log.warn("Jsoup returned invalid content ({} chars), falling back to Jina Reader",
text != null ? text.length() : 0);
} catch (Exception e) {
log.warn("Jsoup crawl failed for {}: {}, falling back to Jina Reader", url, e.getMessage());
}
// 2차: Jina Reader fallback
return crawlWithJinaReader(url);
try {
String text = crawlWithJinaReader(url);
if (isValidContent(text)) {
return text;
}
log.warn("Jina Reader returned invalid content ({} chars), falling back to Playwright",
text != null ? text.length() : 0);
} catch (Exception e) {
log.warn("Jina Reader failed for {}: {}, falling back to Playwright", url, e.getMessage());
}
// 3차: Playwright headless browser (최후의 수단)
String playwrightText = crawlWithPlaywright(url);
if (!isValidContent(playwrightText)) {
throw new IOException("All crawl methods failed for " + url + " (error page detected from all 3 sources)");
}
return playwrightText;
}
/**
 * Decides whether crawled text looks like usable page content.
 *
 * <p>Text is rejected when it is {@code null}, shorter than {@code MIN_CONTENT_LENGTH},
 * or when its first 500 characters contain one of {@code ERROR_PATTERNS}
 * (compared case-insensitively).
 *
 * @param text the crawled text, may be {@code null}
 * @return {@code true} if the text passes the length and error-pattern checks
 */
private boolean isValidContent(String text) {
    if (text == null || text.length() < MIN_CONTENT_LENGTH) {
        return false;
    }
    // Detect error pages by inspecting only the first 500 characters.
    // Lower-case with Locale.ROOT so matching does not depend on the JVM default
    // locale (e.g. under a Turkish locale "DENIED" lower-cases with a dotless i
    // and would never match the ASCII patterns).
    String preview = text.substring(0, Math.min(text.length(), 500))
            .toLowerCase(java.util.Locale.ROOT);
    for (String pattern : ERROR_PATTERNS) {
        if (preview.contains(pattern)) {
            log.warn("Error page detected: content contains '{}'", pattern);
            return false;
        }
    }
    return true;
}
private String crawlWithJsoup(String url) throws IOException {
@@ -97,6 +139,52 @@ public class WebCrawlerService {
}
}
/**
 * Loads the page in headless Chromium via Playwright and returns its visible text.
 *
 * <p>After navigation, an in-page script removes navigation/footer/ad-style elements
 * and returns the {@code innerText} of the first matching article-like container,
 * or of {@code document.body} when none matches.
 *
 * @param url the page to crawl
 * @return the extracted text, never {@code null} or blank
 * @throws IOException if the extracted text is missing or blank, or if Playwright
 *                     fails for any other reason (the original exception is the cause)
 */
private String crawlWithPlaywright(String url) throws IOException {
    log.info("Crawling with Playwright: {}", url);

    try (Playwright playwright = Playwright.create()) {
        BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
                .setHeadless(true)
                .setArgs(java.util.List.of(
                        "--no-sandbox",
                        "--disable-setuid-sandbox",
                        "--disable-dev-shm-usage",
                        "--disable-gpu"
                ));

        try (Browser browser = playwright.chromium().launch(launchOptions)) {
            // Present a desktop Chrome user agent instead of the headless default.
            Browser.NewContextOptions contextOptions = new Browser.NewContextOptions()
                    .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36");
            var context = browser.newContext(contextOptions);
            Page page = context.newPage();

            page.navigate(url, new Page.NavigateOptions()
                    .setTimeout(30_000)
                    .setWaitUntil(com.microsoft.playwright.options.WaitUntilState.NETWORKIDLE));

            // Extract the main body text by running JavaScript inside the page.
            Object extracted = page.evaluate("() => {" +
                    "  ['nav','footer','header','script','style','.ad','#cookie-banner','.sidebar','.comments']" +
                    "    .forEach(sel => document.querySelectorAll(sel).forEach(el => el.remove()));" +
                    "  const article = document.querySelector('article, main, .post-content, .article-body, .entry-content');" +
                    "  return (article || document.body).innerText;" +
                    "}");

            // Validate before dereferencing: a null evaluation result must surface as
            // the "empty content" error, not as a NullPointerException.
            String text = extracted == null ? null : extracted.toString();
            if (text == null || text.isBlank()) {
                throw new IOException("Playwright returned empty content for: " + url);
            }

            log.info("Playwright crawled {} - {} chars", url, text.length());
            return text;
        }
    } catch (IOException e) {
        // Already the exception type callers expect; do not re-wrap.
        throw e;
    } catch (Exception e) {
        throw new IOException("Playwright failed for " + url + ": " + e.getMessage(), e);
    }
}
public String extractTitle(String url) throws IOException {
// Jsoup으로 제목만 가져오기 (가벼움)
try {
@@ -108,13 +196,28 @@ public class WebCrawlerService {
} catch (Exception e) {
log.warn("Title extraction via Jsoup failed for {}, trying Jina Reader", url);
// Jina Reader 응답에서 첫 줄을 제목으로 사용
String content = crawlWithJinaReader(url);
String firstLine = content.strip().split("\\r?\\n", 2)[0].strip();
// Jina Reader는 "Title: ..." 형태로 제목을 포함하는 경우가 있음
if (firstLine.startsWith("Title:")) {
return firstLine.substring(6).strip();
try {
String content = crawlWithJinaReader(url);
String firstLine = content.strip().split("\\r?\\n", 2)[0].strip();
if (firstLine.startsWith("Title:")) {
return firstLine.substring(6).strip();
}
return firstLine.length() > 80 ? firstLine.substring(0, 77) + "..." : firstLine;
} catch (Exception e2) {
log.warn("Jina Reader title extraction also failed, trying Playwright", e2);
// Playwright로 제목 추출
try (Playwright playwright = Playwright.create()) {
try (Browser browser = playwright.chromium().launch(
new BrowserType.LaunchOptions().setHeadless(true)
.setArgs(java.util.List.of("--no-sandbox", "--disable-setuid-sandbox")))) {
Page page = browser.newPage();
page.navigate(url, new Page.NavigateOptions().setTimeout(30_000));
return page.title();
}
} catch (Exception e3) {
throw new IOException("All title extraction methods failed for: " + url, e3);
}
}
return firstLine.length() > 80 ? firstLine.substring(0, 77) + "..." : firstLine;
}
}
}