diff --git a/ecosystem.config.cjs b/ecosystem.config.cjs index f7d9ea3..2b5d190 100644 --- a/ecosystem.config.cjs +++ b/ecosystem.config.cjs @@ -7,6 +7,7 @@ module.exports = { cwd: "/home/opc/sundol", env: { JAVA_HOME: "/usr/lib/jvm/java-21", + PLAYWRIGHT_NODEJS_PATH: "/home/opc/.playwright-driver/driver/linux/node", }, }, { diff --git a/start-backend.sh b/start-backend.sh index 177241e..57dcce0 100755 --- a/start-backend.sh +++ b/start-backend.sh @@ -6,4 +6,10 @@ set +a JAVA_HOME=${JAVA_HOME:-/usr/lib/jvm/java-21} export JAVA_HOME -exec $JAVA_HOME/bin/java -jar /home/opc/sundol/sundol-backend/target/sundol-backend-0.0.1-SNAPSHOT.jar +# Playwright: use pre-installed browsers, skip auto-download +export PLAYWRIGHT_BROWSERS_PATH=/home/opc/.cache/ms-playwright +export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 + +# Playwright driver-bundle requires exploded classpath (fat JAR extraction fails) +BACKEND_DIR=/home/opc/sundol/sundol-backend +exec $JAVA_HOME/bin/java -cp "$BACKEND_DIR/target/classes:$BACKEND_DIR/target/dependency/*" com.sundol.SundolApplication diff --git a/sundol-backend/pom.xml b/sundol-backend/pom.xml index e4c704d..eb4f3cc 100644 --- a/sundol-backend/pom.xml +++ b/sundol-backend/pom.xml @@ -104,6 +104,18 @@ 1.18.3 + + + com.microsoft.playwright + playwright + 1.51.0 + + + com.microsoft.playwright + driver-bundle + 1.51.0 + + com.fasterxml.jackson.core diff --git a/sundol-backend/src/main/java/com/sundol/service/WebCrawlerService.java b/sundol-backend/src/main/java/com/sundol/service/WebCrawlerService.java index 26ad0be..5ebb40a 100644 --- a/sundol-backend/src/main/java/com/sundol/service/WebCrawlerService.java +++ b/sundol-backend/src/main/java/com/sundol/service/WebCrawlerService.java @@ -1,5 +1,9 @@ package com.sundol.service; +import com.microsoft.playwright.Browser; +import com.microsoft.playwright.BrowserType; +import com.microsoft.playwright.Page; +import com.microsoft.playwright.Playwright; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; 
import org.jsoup.nodes.Element;
@@ -18,6 +22,15 @@ public class WebCrawlerService {
     private static final Logger log = LoggerFactory.getLogger(WebCrawlerService.class);
     private static final String JINA_READER_BASE = "https://r.jina.ai/";
     private static final int MIN_CONTENT_LENGTH = 100;
+    // Lower-case phrases whose presence near the start of a crawled text marks it as an
+    // error / bot-check page rather than real content (matched by isValidContent).
+    private static final java.util.List<String> ERROR_PATTERNS = java.util.List.of(
+            "access denied", "403 forbidden", "you don't have permission",
+            "error 403", "error 401", "unauthorized", "captcha",
+            "please enable javascript", "checking your browser",
+            "attention required", "just a moment",
+            "technical difficulty", "page not found", "404 not found"
+    );
 
     private final WebClient webClient;
 
@@ -34,17 +47,58 @@ public class WebCrawlerService {
         // 1차: Jsoup 시도
         try {
             String text = crawlWithJsoup(url);
-            if (text != null && text.length() >= MIN_CONTENT_LENGTH) {
+            if (isValidContent(text)) {
                 return text;
             }
-            log.warn("Jsoup returned insufficient content ({} chars), falling back to Jina Reader",
+            log.warn("Jsoup returned invalid content ({} chars), falling back to Jina Reader",
                     text != null ? text.length() : 0);
         } catch (Exception e) {
             log.warn("Jsoup crawl failed for {}: {}, falling back to Jina Reader", url, e.getMessage());
         }
 
         // 2차: Jina Reader fallback
-        return crawlWithJinaReader(url);
+        try {
+            String text = crawlWithJinaReader(url);
+            if (isValidContent(text)) {
+                return text;
+            }
+            log.warn("Jina Reader returned invalid content ({} chars), falling back to Playwright",
+                    text != null ? text.length() : 0);
+        } catch (Exception e) {
+            log.warn("Jina Reader failed for {}: {}, falling back to Playwright", url, e.getMessage());
+        }
+
+        // Step 3: Playwright headless browser (last resort)
+        String playwrightText = crawlWithPlaywright(url);
+        if (!isValidContent(playwrightText)) {
+            throw new IOException("All crawl methods failed for " + url + " (error page detected from all 3 sources)");
+        }
+        return playwrightText;
+    }
+
+    /**
+     * Decides whether crawled text is usable page content.
+     *
+     * @param text text returned by one of the crawl strategies; may be null
+     * @return false when text is null, shorter than MIN_CONTENT_LENGTH, or one of
+     *         ERROR_PATTERNS occurs in its first 500 characters; true otherwise
+     */
+    private boolean isValidContent(String text) {
+        if (text == null || text.length() < MIN_CONTENT_LENGTH) {
+            return false;
+        }
+        // Only the first 500 characters are scanned for error-page phrases.
+        // Locale.ROOT keeps the case folding independent of the JVM default locale
+        // (under a Turkish locale, for example, "I" does not lower-case to "i").
+        String preview = text.substring(0, Math.min(text.length(), 500))
+                .toLowerCase(java.util.Locale.ROOT);
+        for (String pattern : ERROR_PATTERNS) {
+            if (preview.contains(pattern)) {
+                log.warn("Error page detected: content contains '{}'", pattern);
+                return false;
+            }
+        }
+        return true;
     }
 
     private String crawlWithJsoup(String url) throws IOException {
@@ -97,6 +151,64 @@ public class WebCrawlerService {
         }
     }
 
+    /** Launch options shared by every headless Chromium session this service starts. */
+    private static BrowserType.LaunchOptions headlessChromiumOptions() {
+        return new BrowserType.LaunchOptions()
+                .setHeadless(true)
+                .setArgs(java.util.List.of(
+                        "--no-sandbox",
+                        "--disable-setuid-sandbox",
+                        "--disable-dev-shm-usage",
+                        "--disable-gpu"
+                ));
+    }
+
+    /**
+     * Loads the page in headless Chromium through Playwright and returns its text.
+     *
+     * @param url page to load
+     * @return innerText of the first element matching the article/main selectors, or of
+     *         document.body when none matches, after nav/footer/ad-style elements are removed
+     * @throws IOException if the extracted text is null or blank, or if Playwright fails
+     *                     (the underlying exception is attached as the cause)
+     */
+    private String crawlWithPlaywright(String url) throws IOException {
+        log.info("Crawling with Playwright: {}", url);
+        try (Playwright playwright = Playwright.create();
+             Browser browser = playwright.chromium().launch(headlessChromiumOptions())) {
+            Browser.NewContextOptions contextOptions = new Browser.NewContextOptions()
+                    .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36");
+
+            var context = browser.newContext(contextOptions);
+            Page page = context.newPage();
+
+            page.navigate(url, new Page.NavigateOptions()
+                    .setTimeout(30_000)
+                    .setWaitUntil(com.microsoft.playwright.options.WaitUntilState.NETWORKIDLE));
+
+            // Remove navigation/ad/comment elements in the page, then read the main content's text.
+            Object extracted = page.evaluate("() => {"
+                    + "  ['nav','footer','header','script','style','.ad','#cookie-banner','.sidebar','.comments']"
+                    + "    .forEach(sel => document.querySelectorAll(sel).forEach(el => el.remove()));"
+                    + "  const article = document.querySelector('article, main, .post-content, .article-body, .entry-content');"
+                    + "  return (article || document.body).innerText;"
+                    + "}");
+
+            // Validate before dereferencing: evaluate() returns Object and can be null.
+            String text = extracted == null ? null : extracted.toString();
+            if (text == null || text.isBlank()) {
+                throw new IOException("Playwright returned empty content for: " + url);
+            }
+
+            log.info("Playwright crawled {} - {} chars", url, text.length());
+            return text;
+        } catch (IOException e) {
+            throw e;
+        } catch (Exception e) {
+            throw new IOException("Playwright failed for " + url + ": " + e.getMessage(), e);
+        }
+    }
+
     public String extractTitle(String url) throws IOException {
         // Jsoup으로 제목만 가져오기 (가벼움)
         try {
@@ -108,13 +220,26 @@ public class WebCrawlerService {
         } catch (Exception e) {
             log.warn("Title extraction via Jsoup failed for {}, trying Jina Reader", url);
             // Jina Reader 응답에서 첫 줄을 제목으로 사용
-            String content = crawlWithJinaReader(url);
-            String firstLine = content.strip().split("\\r?\\n", 2)[0].strip();
-            // Jina Reader는 "Title: ..." 형태로 제목을 포함하는 경우가 있음
-            if (firstLine.startsWith("Title:")) {
-                return firstLine.substring(6).strip();
+            try {
+                String content = crawlWithJinaReader(url);
+                String firstLine = content.strip().split("\\r?\\n", 2)[0].strip();
+                // Jina Reader sometimes carries the title as a leading "Title: ..." line.
+                if (firstLine.startsWith("Title:")) {
+                    return firstLine.substring(6).strip();
+                }
+                return firstLine.length() > 80 ? firstLine.substring(0, 77) + "..." : firstLine;
+            } catch (Exception e2) {
+                log.warn("Jina Reader title extraction also failed, trying Playwright", e2);
+                // Last resort: read the document title from a page loaded in headless Chromium.
+                try (Playwright playwright = Playwright.create();
+                     Browser browser = playwright.chromium().launch(headlessChromiumOptions())) {
+                    Page page = browser.newPage();
+                    page.navigate(url, new Page.NavigateOptions().setTimeout(30_000));
+                    return page.title();
+                } catch (Exception e3) {
+                    throw new IOException("All title extraction methods failed for: " + url, e3);
+                }
             }
-            return firstLine.length() > 80 ? firstLine.substring(0, 77) + "..." : firstLine;
         }
     }
 }