From 0cc84354f5caf911a681621b6eb91d7040a9945d Mon Sep 17 00:00:00 2001 From: joungmin Date: Mon, 30 Mar 2026 22:03:09 +0000 Subject: [PATCH] Add Jina Reader API fallback for web crawling Jsoup fails on bot-blocked sites (403). Now tries Jsoup first, then falls back to Jina Reader (r.jina.ai) for better coverage. Supports optional API key via JINA_READER_API_KEY env var. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../com/sundol/service/WebCrawlerService.java | 94 +++++++++++++++++-- .../src/main/resources/application.yml | 4 + 2 files changed, 88 insertions(+), 10 deletions(-) diff --git a/sundol-backend/src/main/java/com/sundol/service/WebCrawlerService.java b/sundol-backend/src/main/java/com/sundol/service/WebCrawlerService.java index 384a701..26ad0be 100644 --- a/sundol-backend/src/main/java/com/sundol/service/WebCrawlerService.java +++ b/sundol-backend/src/main/java/com/sundol/service/WebCrawlerService.java @@ -5,19 +5,52 @@ import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; +import org.springframework.web.reactive.function.client.WebClient; import java.io.IOException; +import java.time.Duration; @Service public class WebCrawlerService { private static final Logger log = LoggerFactory.getLogger(WebCrawlerService.class); + private static final String JINA_READER_BASE = "https://r.jina.ai/"; + private static final int MIN_CONTENT_LENGTH = 100; + + private final WebClient webClient; + + @Value("${jina.reader.api-key:}") + private String jinaApiKey; + + public WebCrawlerService() { + this.webClient = WebClient.builder() + .codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024)) + .build(); + } public String crawl(String url) throws IOException { - log.info("Crawling URL: {}", url); + // Step 1: try Jsoup + try { + String text = crawlWithJsoup(url); + if (text != 
null && text.length() >= MIN_CONTENT_LENGTH) { + return text; + } + log.warn("Jsoup returned insufficient content ({} chars), falling back to Jina Reader", + text != null ? text.length() : 0); + } catch (Exception e) { + log.warn("Jsoup crawl failed for {}: {}, falling back to Jina Reader", url, e.getMessage()); + } + + // Step 2: Jina Reader fallback + return crawlWithJinaReader(url); + } + + private String crawlWithJsoup(String url) throws IOException { + log.info("Crawling with Jsoup: {}", url); Document doc = Jsoup.connect(url) - .userAgent("Mozilla/5.0 (compatible; SUNDOL-bot/1.0)") + .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36") .timeout(15_000) .followRedirects(true) .get(); @@ -29,18 +62,59 @@ public class WebCrawlerService { Element article = doc.selectFirst("article, main, .post-content, .article-body, .entry-content"); String text = (article != null ? article : doc.body()).text(); - // Extract title if available String title = doc.title(); - log.info("Crawled '{}' - {} chars", title, text.length()); - + log.info("Jsoup crawled '{}' - {} chars", title, text.length()); return text; } + private String crawlWithJinaReader(String url) throws IOException { + log.info("Crawling with Jina Reader: {}", url); + try { + WebClient.RequestHeadersSpec request = webClient.get() + .uri(JINA_READER_BASE + url) + .header("Accept", "text/plain"); + + if (jinaApiKey != null && !jinaApiKey.isBlank()) { + request = request.header("Authorization", "Bearer " + jinaApiKey); + } + + String result = ((WebClient.RequestHeadersSpec) request) + .retrieve() + .bodyToMono(String.class) + .timeout(Duration.ofSeconds(30)) + .block(); + + if (result == null || result.isBlank()) { + throw new IOException("Jina Reader returned empty response for: " + url); + } + + log.info("Jina Reader crawled {} - {} chars", url, result.length()); + return result; + } catch (IOException e) { + throw e; + } catch (Exception e) { + 
throw new IOException("Jina Reader failed for " + url + ": " + e.getMessage(), e); + } + } + public String extractTitle(String url) throws IOException { - Document doc = Jsoup.connect(url) - .userAgent("Mozilla/5.0 (compatible; SUNDOL-bot/1.0)") - .timeout(10_000) - .get(); - return doc.title(); + // Fetch only the title with Jsoup (lightweight) + try { + Document doc = Jsoup.connect(url) + .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36") + .timeout(10_000) + .get(); + return doc.title(); + } catch (Exception e) { + log.warn("Title extraction via Jsoup failed for {}, trying Jina Reader", url); + // Use the first line of the Jina Reader response as the title + String content = crawlWithJinaReader(url); + String firstLine = content.strip().split("\\r?\\n", 2)[0].strip(); + // Jina Reader sometimes includes the title in the form "Title: ..." + if (firstLine.startsWith("Title:")) { + return firstLine.substring(6).strip(); + } + return firstLine.length() > 80 ? firstLine.substring(0, 77) + "..." : firstLine; + } } } diff --git a/sundol-backend/src/main/resources/application.yml b/sundol-backend/src/main/resources/application.yml index f652beb..ae70c0f 100644 --- a/sundol-backend/src/main/resources/application.yml +++ b/sundol-backend/src/main/resources/application.yml @@ -29,6 +29,10 @@ oci: model: ${OCI_GENAI_MODEL:google.gemini-2.5-flash} base-url: ${OCI_GENAI_BASE_URL:https://inference.generativeai.us-chicago-1.oci.oraclecloud.com/20231130/actions} +jina: + reader: + api-key: ${JINA_READER_API_KEY:} + logging: level: com.sundol: DEBUG