Add Jina Reader API fallback for web crawling
Jsoup fails on bot-blocked sites (403), so the crawler now tries Jsoup first and falls back to Jina Reader (r.jina.ai) for better coverage. An optional API key is supported via the JINA_READER_API_KEY env var.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -5,19 +5,52 @@ import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.reactive.function.client.WebClient;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.time.Duration;
|
||||
|
||||
@Service
|
||||
public class WebCrawlerService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(WebCrawlerService.class);
|
||||
private static final String JINA_READER_BASE = "https://r.jina.ai/";
|
||||
private static final int MIN_CONTENT_LENGTH = 100;
|
||||
|
||||
private final WebClient webClient;
|
||||
|
||||
@Value("${jina.reader.api-key:}")
|
||||
private String jinaApiKey;
|
||||
|
||||
public WebCrawlerService() {
|
||||
this.webClient = WebClient.builder()
|
||||
.codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024))
|
||||
.build();
|
||||
}
|
||||
|
||||
public String crawl(String url) throws IOException {
|
||||
log.info("Crawling URL: {}", url);
|
||||
// 1차: Jsoup 시도
|
||||
try {
|
||||
String text = crawlWithJsoup(url);
|
||||
if (text != null && text.length() >= MIN_CONTENT_LENGTH) {
|
||||
return text;
|
||||
}
|
||||
log.warn("Jsoup returned insufficient content ({} chars), falling back to Jina Reader",
|
||||
text != null ? text.length() : 0);
|
||||
} catch (Exception e) {
|
||||
log.warn("Jsoup crawl failed for {}: {}, falling back to Jina Reader", url, e.getMessage());
|
||||
}
|
||||
|
||||
// 2차: Jina Reader fallback
|
||||
return crawlWithJinaReader(url);
|
||||
}
|
||||
|
||||
private String crawlWithJsoup(String url) throws IOException {
|
||||
log.info("Crawling with Jsoup: {}", url);
|
||||
Document doc = Jsoup.connect(url)
|
||||
.userAgent("Mozilla/5.0 (compatible; SUNDOL-bot/1.0)")
|
||||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
|
||||
.timeout(15_000)
|
||||
.followRedirects(true)
|
||||
.get();
|
||||
@@ -29,18 +62,59 @@ public class WebCrawlerService {
|
||||
Element article = doc.selectFirst("article, main, .post-content, .article-body, .entry-content");
|
||||
String text = (article != null ? article : doc.body()).text();
|
||||
|
||||
// Extract title if available
|
||||
String title = doc.title();
|
||||
log.info("Crawled '{}' - {} chars", title, text.length());
|
||||
|
||||
log.info("Jsoup crawled '{}' - {} chars", title, text.length());
|
||||
return text;
|
||||
}
|
||||
|
||||
private String crawlWithJinaReader(String url) throws IOException {
|
||||
log.info("Crawling with Jina Reader: {}", url);
|
||||
try {
|
||||
WebClient.RequestHeadersSpec<?> request = webClient.get()
|
||||
.uri(JINA_READER_BASE + url)
|
||||
.header("Accept", "text/plain");
|
||||
|
||||
if (jinaApiKey != null && !jinaApiKey.isBlank()) {
|
||||
request = request.header("Authorization", "Bearer " + jinaApiKey);
|
||||
}
|
||||
|
||||
String result = ((WebClient.RequestHeadersSpec<?>) request)
|
||||
.retrieve()
|
||||
.bodyToMono(String.class)
|
||||
.timeout(Duration.ofSeconds(30))
|
||||
.block();
|
||||
|
||||
if (result == null || result.isBlank()) {
|
||||
throw new IOException("Jina Reader returned empty response for: " + url);
|
||||
}
|
||||
|
||||
log.info("Jina Reader crawled {} - {} chars", url, result.length());
|
||||
return result;
|
||||
} catch (IOException e) {
|
||||
throw e;
|
||||
} catch (Exception e) {
|
||||
throw new IOException("Jina Reader failed for " + url + ": " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
public String extractTitle(String url) throws IOException {
|
||||
Document doc = Jsoup.connect(url)
|
||||
.userAgent("Mozilla/5.0 (compatible; SUNDOL-bot/1.0)")
|
||||
.timeout(10_000)
|
||||
.get();
|
||||
return doc.title();
|
||||
// Jsoup으로 제목만 가져오기 (가벼움)
|
||||
try {
|
||||
Document doc = Jsoup.connect(url)
|
||||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
|
||||
.timeout(10_000)
|
||||
.get();
|
||||
return doc.title();
|
||||
} catch (Exception e) {
|
||||
log.warn("Title extraction via Jsoup failed for {}, trying Jina Reader", url);
|
||||
// Jina Reader 응답에서 첫 줄을 제목으로 사용
|
||||
String content = crawlWithJinaReader(url);
|
||||
String firstLine = content.strip().split("\\r?\\n", 2)[0].strip();
|
||||
// Jina Reader는 "Title: ..." 형태로 제목을 포함하는 경우가 있음
|
||||
if (firstLine.startsWith("Title:")) {
|
||||
return firstLine.substring(6).strip();
|
||||
}
|
||||
return firstLine.length() > 80 ? firstLine.substring(0, 77) + "..." : firstLine;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,6 +29,10 @@ oci:
|
||||
model: ${OCI_GENAI_MODEL:google.gemini-2.5-flash}
|
||||
base-url: ${OCI_GENAI_BASE_URL:https://inference.generativeai.us-chicago-1.oci.oraclecloud.com/20231130/actions}
|
||||
|
||||
jina:
|
||||
reader:
|
||||
api-key: ${JINA_READER_API_KEY:}
|
||||
|
||||
logging:
|
||||
level:
|
||||
com.sundol: DEBUG
|
||||
|
||||
Reference in New Issue
Block a user