Add Jina Reader API fallback for web crawling

Jsoup fails on bot-blocked sites (403). Now tries Jsoup first,
then falls back to Jina Reader (r.jina.ai) for better coverage.
Supports optional API key via JINA_READER_API_KEY env var.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-30 22:03:09 +00:00
parent 9929322de0
commit 0cc84354f5
2 changed files with 88 additions and 10 deletions

View File

@@ -5,19 +5,52 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.springframework.web.reactive.function.client.WebClient;
import java.io.IOException; import java.io.IOException;
import java.time.Duration;
@Service @Service
public class WebCrawlerService { public class WebCrawlerService {
private static final Logger log = LoggerFactory.getLogger(WebCrawlerService.class); private static final Logger log = LoggerFactory.getLogger(WebCrawlerService.class);
private static final String JINA_READER_BASE = "https://r.jina.ai/";
private static final int MIN_CONTENT_LENGTH = 100;
private final WebClient webClient;
@Value("${jina.reader.api-key:}")
private String jinaApiKey;
public WebCrawlerService() {
this.webClient = WebClient.builder()
.codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024))
.build();
}
/**
 * Fetches the text content of {@code url}, trying Jsoup first and falling back to
 * Jina Reader.
 *
 * <p>The Jsoup result is returned only when it is non-null and at least
 * {@code MIN_CONTENT_LENGTH} characters long. If Jsoup throws, or returns less than
 * that, the failure is logged at WARN level and the URL is fetched through
 * {@code crawlWithJinaReader} instead.
 *
 * @param url the page to crawl
 * @return the extracted text from Jsoup, or the Jina Reader response on fallback
 * @throws IOException if the Jina Reader fallback fails
 */
public String crawl(String url) throws IOException { public String crawl(String url) throws IOException {
log.info("Crawling URL: {}", url); // Step 1: try Jsoup
try {
String text = crawlWithJsoup(url);
if (text != null && text.length() >= MIN_CONTENT_LENGTH) {
return text;
}
// Too little text to be useful; report how much was found and fall through.
log.warn("Jsoup returned insufficient content ({} chars), falling back to Jina Reader",
text != null ? text.length() : 0);
} catch (Exception e) {
// Any Jsoup failure is treated as recoverable: log it and fall through.
log.warn("Jsoup crawl failed for {}: {}, falling back to Jina Reader", url, e.getMessage());
}
// Step 2: Jina Reader fallback
return crawlWithJinaReader(url);
}
private String crawlWithJsoup(String url) throws IOException {
log.info("Crawling with Jsoup: {}", url);
Document doc = Jsoup.connect(url) Document doc = Jsoup.connect(url)
.userAgent("Mozilla/5.0 (compatible; SUNDOL-bot/1.0)") .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
.timeout(15_000) .timeout(15_000)
.followRedirects(true) .followRedirects(true)
.get(); .get();
@@ -29,18 +62,59 @@ public class WebCrawlerService {
Element article = doc.selectFirst("article, main, .post-content, .article-body, .entry-content"); Element article = doc.selectFirst("article, main, .post-content, .article-body, .entry-content");
String text = (article != null ? article : doc.body()).text(); String text = (article != null ? article : doc.body()).text();
// Extract title if available
String title = doc.title(); String title = doc.title();
log.info("Crawled '{}' - {} chars", title, text.length()); log.info("Jsoup crawled '{}' - {} chars", title, text.length());
return text; return text;
} }
/**
 * Fetches the readable text of {@code url} through the Jina Reader proxy.
 *
 * <p>The target URL is appended to {@code JINA_READER_BASE} and requested with
 * {@code Accept: text/plain}. When {@code jinaApiKey} is non-blank it is sent as a
 * bearer token in the {@code Authorization} header. The call blocks the calling
 * thread for up to 30 seconds.
 *
 * @param url absolute URL of the page to read; must not be null or blank
 * @return the non-blank response body returned by Jina Reader
 * @throws IOException if {@code url} is null or blank, the response is empty, or the
 *     request fails or times out (the underlying failure is kept as the cause)
 */
private String crawlWithJinaReader(String url) throws IOException {
    // Without this guard a null URL would be requested as "https://r.jina.ai/null".
    if (url == null || url.isBlank()) {
        throw new IOException("Jina Reader requires a non-blank URL");
    }
    log.info("Crawling with Jina Reader: {}", url);

    try {
        String readerUrl = JINA_READER_BASE + url;

        WebClient.RequestHeadersSpec<?> request;
        try {
            // Hand WebClient a ready-made URI: the String overload treats its argument
            // as a URI template and pre-encodes it, which turns an already
            // percent-encoded target (e.g. "%20") into "%2520".
            request = webClient.get().uri(java.net.URI.create(readerUrl));
        } catch (IllegalArgumentException notStrictUri) {
            // The target contains characters a strict URI rejects (e.g. spaces).
            // Fall back to the template form so WebClient encodes them itself.
            request = webClient.get().uri(readerUrl);
        }

        request = request.header("Accept", "text/plain");
        if (jinaApiKey != null && !jinaApiKey.isBlank()) {
            request = request.header("Authorization", "Bearer " + jinaApiKey);
        }

        String result = request
                .retrieve()
                .bodyToMono(String.class)
                .timeout(Duration.ofSeconds(30))
                .block();

        if (result == null || result.isBlank()) {
            throw new IOException("Jina Reader returned empty response for: " + url);
        }

        log.info("Jina Reader crawled {} - {} chars", url, result.length());
        return result;
    } catch (IOException e) {
        // Already the declared exception type; do not wrap it a second time.
        throw e;
    } catch (Exception e) {
        throw new IOException("Jina Reader failed for " + url + ": " + e.getMessage(), e);
    }
}
/**
 * Returns a title for the page at {@code url}.
 *
 * <p>The title is read from the document fetched with Jsoup. If that throws, the
 * page is fetched through {@code crawlWithJinaReader} and the first line of the
 * response is used instead: a leading {@code "Title:"} prefix is removed when
 * present, otherwise a line longer than 80 characters is cut to 77 characters
 * plus {@code "..."}.
 *
 * @param url the page whose title is wanted
 * @return the Jsoup document title, or a title derived from the Jina Reader response
 * @throws IOException if the Jina Reader fallback fails
 */
public String extractTitle(String url) throws IOException { public String extractTitle(String url) throws IOException {
// Fetch only the title with Jsoup (lightweight)
try {
Document doc = Jsoup.connect(url) Document doc = Jsoup.connect(url)
.userAgent("Mozilla/5.0 (compatible; SUNDOL-bot/1.0)") .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
.timeout(10_000) .timeout(10_000)
.get(); .get();
return doc.title(); return doc.title();
} catch (Exception e) {
log.warn("Title extraction via Jsoup failed for {}, trying Jina Reader", url);
// Use the first line of the Jina Reader response as the title
String content = crawlWithJinaReader(url);
String firstLine = content.strip().split("\\r?\\n", 2)[0].strip();
// Jina Reader sometimes includes the title in the form "Title: ..."
if (firstLine.startsWith("Title:")) {
// 6 is the length of the "Title:" prefix being dropped.
return firstLine.substring(6).strip();
}
// Cap the fallback title at 80 characters: 77 kept plus a three-character ellipsis.
return firstLine.length() > 80 ? firstLine.substring(0, 77) + "..." : firstLine;
}
} }
} }

View File

@@ -29,6 +29,10 @@ oci:
model: ${OCI_GENAI_MODEL:google.gemini-2.5-flash} model: ${OCI_GENAI_MODEL:google.gemini-2.5-flash}
base-url: ${OCI_GENAI_BASE_URL:https://inference.generativeai.us-chicago-1.oci.oraclecloud.com/20231130/actions} base-url: ${OCI_GENAI_BASE_URL:https://inference.generativeai.us-chicago-1.oci.oraclecloud.com/20231130/actions}
jina:
reader:
api-key: ${JINA_READER_API_KEY:}
logging: logging:
level: level:
com.sundol: DEBUG com.sundol: DEBUG