Add Playwright headless browser as 3rd crawling fallback
Crawl chain: Jsoup → Jina Reader → Playwright (headless Chromium). Error page detection (403, Access Denied, etc.) triggers next fallback. Switch to exploded classpath for Playwright driver-bundle compatibility. Fix Next.js standalone static file serving with symlink. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@ module.exports = {
|
||||
cwd: "/home/opc/sundol",
|
||||
env: {
|
||||
JAVA_HOME: "/usr/lib/jvm/java-21",
|
||||
PLAYWRIGHT_NODEJS_PATH: "/home/opc/.playwright-driver/driver/linux/node",
|
||||
},
|
||||
},
|
||||
{
|
||||
|
||||
@@ -6,4 +6,10 @@ set +a
|
||||
JAVA_HOME=${JAVA_HOME:-/usr/lib/jvm/java-21}
|
||||
export JAVA_HOME
|
||||
|
||||
exec $JAVA_HOME/bin/java -jar /home/opc/sundol/sundol-backend/target/sundol-backend-0.0.1-SNAPSHOT.jar
|
||||
# Playwright: use pre-installed browsers, skip auto-download
|
||||
export PLAYWRIGHT_BROWSERS_PATH=/home/opc/.cache/ms-playwright
|
||||
export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
|
||||
|
||||
# Playwright driver-bundle requires exploded classpath (fat JAR extraction fails)
|
||||
BACKEND_DIR=/home/opc/sundol/sundol-backend
|
||||
exec $JAVA_HOME/bin/java -cp "$BACKEND_DIR/target/classes:$BACKEND_DIR/target/dependency/*" com.sundol.SundolApplication
|
||||
|
||||
@@ -104,6 +104,18 @@
|
||||
<version>1.18.3</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Playwright (headless browser, driver-bundle includes node runtime) -->
|
||||
<dependency>
|
||||
<groupId>com.microsoft.playwright</groupId>
|
||||
<artifactId>playwright</artifactId>
|
||||
<version>1.51.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.microsoft.playwright</groupId>
|
||||
<artifactId>driver-bundle</artifactId>
|
||||
<version>1.51.0</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Jackson -->
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
package com.sundol.service;
|
||||
|
||||
import com.microsoft.playwright.Browser;
|
||||
import com.microsoft.playwright.BrowserType;
|
||||
import com.microsoft.playwright.Page;
|
||||
import com.microsoft.playwright.Playwright;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
@@ -18,6 +22,13 @@ public class WebCrawlerService {
|
||||
private static final Logger log = LoggerFactory.getLogger(WebCrawlerService.class);
|
||||
private static final String JINA_READER_BASE = "https://r.jina.ai/";
|
||||
private static final int MIN_CONTENT_LENGTH = 100;
|
||||
private static final java.util.List<String> ERROR_PATTERNS = java.util.List.of(
|
||||
"access denied", "403 forbidden", "you don't have permission",
|
||||
"error 403", "error 401", "unauthorized", "captcha",
|
||||
"please enable javascript", "checking your browser",
|
||||
"attention required", "just a moment",
|
||||
"technical difficulty", "page not found", "404 not found"
|
||||
);
|
||||
|
||||
private final WebClient webClient;
|
||||
|
||||
@@ -34,17 +45,48 @@ public class WebCrawlerService {
|
||||
// 1차: Jsoup 시도
|
||||
try {
|
||||
String text = crawlWithJsoup(url);
|
||||
if (text != null && text.length() >= MIN_CONTENT_LENGTH) {
|
||||
if (isValidContent(text)) {
|
||||
return text;
|
||||
}
|
||||
log.warn("Jsoup returned insufficient content ({} chars), falling back to Jina Reader",
|
||||
log.warn("Jsoup returned invalid content ({} chars), falling back to Jina Reader",
|
||||
text != null ? text.length() : 0);
|
||||
} catch (Exception e) {
|
||||
log.warn("Jsoup crawl failed for {}: {}, falling back to Jina Reader", url, e.getMessage());
|
||||
}
|
||||
|
||||
// 2차: Jina Reader fallback
|
||||
return crawlWithJinaReader(url);
|
||||
try {
|
||||
String text = crawlWithJinaReader(url);
|
||||
if (isValidContent(text)) {
|
||||
return text;
|
||||
}
|
||||
log.warn("Jina Reader returned invalid content ({} chars), falling back to Playwright",
|
||||
text != null ? text.length() : 0);
|
||||
} catch (Exception e) {
|
||||
log.warn("Jina Reader failed for {}: {}, falling back to Playwright", url, e.getMessage());
|
||||
}
|
||||
|
||||
// 3차: Playwright headless browser (최후의 수단)
|
||||
String playwrightText = crawlWithPlaywright(url);
|
||||
if (!isValidContent(playwrightText)) {
|
||||
throw new IOException("All crawl methods failed for " + url + " (error page detected from all 3 sources)");
|
||||
}
|
||||
return playwrightText;
|
||||
}
|
||||
|
||||
private boolean isValidContent(String text) {
|
||||
if (text == null || text.length() < MIN_CONTENT_LENGTH) {
|
||||
return false;
|
||||
}
|
||||
// 에러 페이지 패턴 감지 (앞 500자만 검사)
|
||||
String preview = text.substring(0, Math.min(text.length(), 500)).toLowerCase();
|
||||
for (String pattern : ERROR_PATTERNS) {
|
||||
if (preview.contains(pattern)) {
|
||||
log.warn("Error page detected: content contains '{}'", pattern);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private String crawlWithJsoup(String url) throws IOException {
|
||||
@@ -97,6 +139,52 @@ public class WebCrawlerService {
|
||||
}
|
||||
}
|
||||
|
||||
private String crawlWithPlaywright(String url) throws IOException {
|
||||
log.info("Crawling with Playwright: {}", url);
|
||||
try (Playwright playwright = Playwright.create()) {
|
||||
BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
|
||||
.setHeadless(true)
|
||||
.setArgs(java.util.List.of(
|
||||
"--no-sandbox",
|
||||
"--disable-setuid-sandbox",
|
||||
"--disable-dev-shm-usage",
|
||||
"--disable-gpu"
|
||||
));
|
||||
|
||||
try (Browser browser = playwright.chromium().launch(launchOptions)) {
|
||||
Browser.NewContextOptions contextOptions = new Browser.NewContextOptions()
|
||||
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36");
|
||||
|
||||
var context = browser.newContext(contextOptions);
|
||||
Page page = context.newPage();
|
||||
|
||||
page.navigate(url, new Page.NavigateOptions()
|
||||
.setTimeout(30_000)
|
||||
.setWaitUntil(com.microsoft.playwright.options.WaitUntilState.NETWORKIDLE));
|
||||
|
||||
// JS 실행으로 본문 텍스트 추출
|
||||
String text = page.evaluate("() => {" +
|
||||
" ['nav','footer','header','script','style','.ad','#cookie-banner','.sidebar','.comments']" +
|
||||
" .forEach(sel => document.querySelectorAll(sel).forEach(el => el.remove()));" +
|
||||
" const article = document.querySelector('article, main, .post-content, .article-body, .entry-content');" +
|
||||
" return (article || document.body).innerText;" +
|
||||
"}").toString();
|
||||
|
||||
log.info("Playwright crawled {} - {} chars", url, text.length());
|
||||
|
||||
if (text == null || text.isBlank()) {
|
||||
throw new IOException("Playwright returned empty content for: " + url);
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw e;
|
||||
} catch (Exception e) {
|
||||
throw new IOException("Playwright failed for " + url + ": " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
public String extractTitle(String url) throws IOException {
|
||||
// Jsoup으로 제목만 가져오기 (가벼움)
|
||||
try {
|
||||
@@ -108,13 +196,28 @@ public class WebCrawlerService {
|
||||
} catch (Exception e) {
|
||||
log.warn("Title extraction via Jsoup failed for {}, trying Jina Reader", url);
|
||||
// Jina Reader 응답에서 첫 줄을 제목으로 사용
|
||||
try {
|
||||
String content = crawlWithJinaReader(url);
|
||||
String firstLine = content.strip().split("\\r?\\n", 2)[0].strip();
|
||||
// Jina Reader는 "Title: ..." 형태로 제목을 포함하는 경우가 있음
|
||||
if (firstLine.startsWith("Title:")) {
|
||||
return firstLine.substring(6).strip();
|
||||
}
|
||||
return firstLine.length() > 80 ? firstLine.substring(0, 77) + "..." : firstLine;
|
||||
} catch (Exception e2) {
|
||||
log.warn("Jina Reader title extraction also failed, trying Playwright", e2);
|
||||
// Playwright로 제목 추출
|
||||
try (Playwright playwright = Playwright.create()) {
|
||||
try (Browser browser = playwright.chromium().launch(
|
||||
new BrowserType.LaunchOptions().setHeadless(true)
|
||||
.setArgs(java.util.List.of("--no-sandbox", "--disable-setuid-sandbox")))) {
|
||||
Page page = browser.newPage();
|
||||
page.navigate(url, new Page.NavigateOptions().setTimeout(30_000));
|
||||
return page.title();
|
||||
}
|
||||
} catch (Exception e3) {
|
||||
throw new IOException("All title extraction methods failed for: " + url, e3);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user