diff --git a/ecosystem.config.cjs b/ecosystem.config.cjs
index f7d9ea3..2b5d190 100644
--- a/ecosystem.config.cjs
+++ b/ecosystem.config.cjs
@@ -7,6 +7,7 @@ module.exports = {
cwd: "/home/opc/sundol",
env: {
JAVA_HOME: "/usr/lib/jvm/java-21",
+ PLAYWRIGHT_NODEJS_PATH: "/home/opc/.playwright-driver/driver/linux/node",
},
},
{
diff --git a/start-backend.sh b/start-backend.sh
index 177241e..57dcce0 100755
--- a/start-backend.sh
+++ b/start-backend.sh
@@ -6,4 +6,10 @@ set +a
JAVA_HOME=${JAVA_HOME:-/usr/lib/jvm/java-21}
export JAVA_HOME
-exec $JAVA_HOME/bin/java -jar /home/opc/sundol/sundol-backend/target/sundol-backend-0.0.1-SNAPSHOT.jar
+# Playwright: use the browsers already installed on this host and skip the automatic download.
+export PLAYWRIGHT_BROWSERS_PATH=/home/opc/.cache/ms-playwright
+export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
+# Run from the exploded classpath: Playwright's driver-bundle fails to extract from the fat JAR.
+# JAVA_HOME is quoted so a path containing spaces is not word-split by the shell.
+BACKEND_DIR=/home/opc/sundol/sundol-backend
+exec "$JAVA_HOME/bin/java" -cp "$BACKEND_DIR/target/classes:$BACKEND_DIR/target/dependency/*" com.sundol.SundolApplication
diff --git a/sundol-backend/pom.xml b/sundol-backend/pom.xml
index e4c704d..eb4f3cc 100644
--- a/sundol-backend/pom.xml
+++ b/sundol-backend/pom.xml
@@ -104,6 +104,18 @@
<version>1.18.3</version>
+
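+ <dependency>
+     <groupId>com.microsoft.playwright</groupId>
+     <artifactId>playwright</artifactId>
+     <version>1.51.0</version>
+ </dependency>
+ <dependency>
+     <groupId>com.microsoft.playwright</groupId>
+     <artifactId>driver-bundle</artifactId>
+     <version>1.51.0</version>
+ </dependency>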
+
<groupId>com.fasterxml.jackson.core</groupId>
diff --git a/sundol-backend/src/main/java/com/sundol/service/WebCrawlerService.java b/sundol-backend/src/main/java/com/sundol/service/WebCrawlerService.java
index 26ad0be..5ebb40a 100644
--- a/sundol-backend/src/main/java/com/sundol/service/WebCrawlerService.java
+++ b/sundol-backend/src/main/java/com/sundol/service/WebCrawlerService.java
@@ -1,5 +1,9 @@
package com.sundol.service;
+import com.microsoft.playwright.Browser;
+import com.microsoft.playwright.BrowserType;
+import com.microsoft.playwright.Page;
+import com.microsoft.playwright.Playwright;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -18,6 +22,13 @@ public class WebCrawlerService {
private static final Logger log = LoggerFactory.getLogger(WebCrawlerService.class);
private static final String JINA_READER_BASE = "https://r.jina.ai/";
private static final int MIN_CONTENT_LENGTH = 100;
+ // Lowercase phrases that mark a block/error page; isValidContent looks for them in lowercased text.
+ private static final java.util.List<String> ERROR_PATTERNS = java.util.List.of(
+     "access denied", "403 forbidden", "you don't have permission", "error 403",
+     "error 401", "unauthorized", "captcha", "please enable javascript",
+     "checking your browser", "attention required", "just a moment",
+     "technical difficulty", "page not found", "404 not found"
+ );
private final WebClient webClient;
@@ -34,17 +45,48 @@ public class WebCrawlerService {
// 1차: Jsoup 시도
try {
String text = crawlWithJsoup(url);
- if (text != null && text.length() >= MIN_CONTENT_LENGTH) {
+ if (isValidContent(text)) {
return text;
}
- log.warn("Jsoup returned insufficient content ({} chars), falling back to Jina Reader",
+ log.warn("Jsoup returned invalid content ({} chars), falling back to Jina Reader",
text != null ? text.length() : 0);
} catch (Exception e) {
log.warn("Jsoup crawl failed for {}: {}, falling back to Jina Reader", url, e.getMessage());
}
// 2차: Jina Reader fallback
- return crawlWithJinaReader(url);
+ try {
+ String text = crawlWithJinaReader(url);
+ if (isValidContent(text)) {
+ return text;
+ }
+ log.warn("Jina Reader returned invalid content ({} chars), falling back to Playwright",
+ text != null ? text.length() : 0);
+ } catch (Exception e) {
+ log.warn("Jina Reader failed for {}: {}, falling back to Playwright", url, e.getMessage());
+ }
+
+ // 3차: Playwright headless browser (최후의 수단)
+ String playwrightText = crawlWithPlaywright(url);
+ if (!isValidContent(playwrightText)) {
+ throw new IOException("All crawl methods failed for " + url + " (error page detected from all 3 sources)");
+ }
+ return playwrightText;
+ }
+
+ /** Valid means at least MIN_CONTENT_LENGTH chars and no ERROR_PATTERNS entry within the first 500, compared lowercase via Locale.ROOT so the JVM default locale cannot change the result. */
+ private boolean isValidContent(String text) {
+     if (text == null || text.length() < MIN_CONTENT_LENGTH) {
+         return false;
+     }
+     String preview = text.substring(0, Math.min(text.length(), 500)).toLowerCase(java.util.Locale.ROOT);
+     for (String pattern : ERROR_PATTERNS) {
+         if (preview.contains(pattern)) {
+             log.warn("Error page detected: content contains '{}'", pattern);
+             return false;
+         }
+     }
+     return true;
+ }
private String crawlWithJsoup(String url) throws IOException {
@@ -97,6 +139,52 @@ public class WebCrawlerService {
}
}
+ /**
+  * Last-resort crawler: renders {@code url} in headless Chromium and returns the page's visible text.
+  *
+  * @param url page to load
+  * @return non-blank innerText of the first matching content element, or of the body when none matches
+  * @throws IOException if Playwright raises any exception or the extracted text is null or blank
+  */
+ private String crawlWithPlaywright(String url) throws IOException {
+     log.info("Crawling with Playwright: {}", url);
+     try (Playwright playwright = Playwright.create()) {
+         BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
+                 .setHeadless(true)
+                 .setArgs(java.util.List.of(
+                         "--no-sandbox", "--disable-setuid-sandbox",
+                         "--disable-dev-shm-usage", "--disable-gpu"));
+         Browser.NewContextOptions contextOptions = new Browser.NewContextOptions()
+                 .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36");
+         // The context is closed explicitly, before the browser, so its pages are released on every path.
+         try (Browser browser = playwright.chromium().launch(launchOptions);
+              var context = browser.newContext(contextOptions)) {
+             Page page = context.newPage();
+             page.navigate(url, new Page.NavigateOptions()
+                     .setTimeout(30_000)
+                     .setWaitUntil(com.microsoft.playwright.options.WaitUntilState.NETWORKIDLE));
+             // Remove navigation/ads/comments in the page itself, then read the main content element's text.
+             Object extracted = page.evaluate("() => {" +
+                     " ['nav','footer','header','script','style','.ad','#cookie-banner','.sidebar','.comments']" +
+                     " .forEach(sel => document.querySelectorAll(sel).forEach(el => el.remove()));" +
+                     " const article = document.querySelector('article, main, .post-content, .article-body, .entry-content');" +
+                     " return (article || document.body).innerText;" +
+                     "}");
+             // evaluate() can return null, so convert and validate before touching the text.
+             String text = extracted == null ? null : extracted.toString();
+             if (text == null || text.isBlank()) {
+                 throw new IOException("Playwright returned empty content for: " + url);
+             }
+             log.info("Playwright crawled {} - {} chars", url, text.length());
+             return text;
+         }
+     } catch (IOException e) {
+         throw e;
+     } catch (Exception e) {
+         throw new IOException("Playwright failed for " + url + ": " + e.getMessage(), e);
+     }
+ }
+
public String extractTitle(String url) throws IOException {
// Jsoup으로 제목만 가져오기 (가벼움)
try {
@@ -108,13 +196,28 @@ public class WebCrawlerService {
} catch (Exception e) {
log.warn("Title extraction via Jsoup failed for {}, trying Jina Reader", url);
// Jina Reader 응답에서 첫 줄을 제목으로 사용
- String content = crawlWithJinaReader(url);
- String firstLine = content.strip().split("\\r?\\n", 2)[0].strip();
- // Jina Reader는 "Title: ..." 형태로 제목을 포함하는 경우가 있음
- if (firstLine.startsWith("Title:")) {
- return firstLine.substring(6).strip();
+ try {
+ String content = crawlWithJinaReader(url);
+ String firstLine = content.strip().split("\\r?\\n", 2)[0].strip();
+ if (firstLine.startsWith("Title:")) {
+ return firstLine.substring(6).strip();
+ }
+ return firstLine.length() > 80 ? firstLine.substring(0, 77) + "..." : firstLine;
+ } catch (Exception e2) {
+ log.warn("Jina Reader title extraction also failed, trying Playwright", e2);
+ // Playwright로 제목 추출
+ try (Playwright playwright = Playwright.create()) {
+ try (Browser browser = playwright.chromium().launch(
+ new BrowserType.LaunchOptions().setHeadless(true)
+ .setArgs(java.util.List.of("--no-sandbox", "--disable-setuid-sandbox")))) {
+ Page page = browser.newPage();
+ page.navigate(url, new Page.NavigateOptions().setTimeout(30_000));
+ return page.title();
+ }
+ } catch (Exception e3) {
+ throw new IOException("All title extraction methods failed for: " + url, e3);
+ }
}
- return firstLine.length() > 80 ? firstLine.substring(0, 77) + "..." : firstLine;
}
}
}