Use youtube-transcript-api library with Playwright fallback for YouTube transcripts
Replace the Jsoup-based approach with io.github.thoroldvix:youtube-transcript-api as the primary method (supports manual/generated captions and language priority). Playwright headed mode is kept as a fallback when the API fails. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -104,6 +104,13 @@
|
||||
<version>1.18.3</version>
|
||||
</dependency>
|
||||
|
||||
<!-- YouTube Transcript API -->
|
||||
<dependency>
|
||||
<groupId>io.github.thoroldvix</groupId>
|
||||
<artifactId>youtube-transcript-api</artifactId>
|
||||
<version>0.4.0</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Playwright (headless browser, driver-bundle includes node runtime) -->
|
||||
<dependency>
|
||||
<groupId>com.microsoft.playwright</groupId>
|
||||
|
||||
@@ -2,24 +2,30 @@ package com.sundol.service;
|
||||
|
||||
import com.microsoft.playwright.*;
|
||||
import com.microsoft.playwright.options.WaitUntilState;
|
||||
import io.github.thoroldvix.api.TranscriptApiFactory;
|
||||
import io.github.thoroldvix.api.TranscriptContent;
|
||||
import io.github.thoroldvix.api.TranscriptList;
|
||||
import io.github.thoroldvix.api.Transcript;
|
||||
import io.github.thoroldvix.api.YoutubeTranscriptApi;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.reactive.function.client.WebClient;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URLDecoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Duration;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Service
|
||||
public class YouTubeTranscriptService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(YouTubeTranscriptService.class);
|
||||
|
||||
private static final String[] PREFERRED_LANGS = {"ko", "en"};
|
||||
|
||||
private static final Pattern CAPTION_TRACK_PATTERN =
|
||||
Pattern.compile("\"captionTracks\":\\s*\\[(.*?)]", Pattern.DOTALL);
|
||||
private static final Pattern BASE_URL_PATTERN =
|
||||
@@ -29,13 +35,7 @@ public class YouTubeTranscriptService {
|
||||
private static final Pattern XML_TEXT_PATTERN =
|
||||
Pattern.compile("<text[^>]*>(.*?)</text>", Pattern.DOTALL);
|
||||
|
||||
private final WebClient webClient;
|
||||
|
||||
public YouTubeTranscriptService() {
|
||||
this.webClient = WebClient.builder()
|
||||
.codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024))
|
||||
.build();
|
||||
}
|
||||
private final YoutubeTranscriptApi transcriptApi = TranscriptApiFactory.createDefault();
|
||||
|
||||
public String fetchTranscript(String youtubeUrl) throws IOException {
|
||||
String videoId = extractVideoId(youtubeUrl);
|
||||
@@ -43,10 +43,67 @@ public class YouTubeTranscriptService {
|
||||
throw new IOException("유효하지 않은 YouTube URL입니다: " + youtubeUrl);
|
||||
}
|
||||
|
||||
String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
|
||||
log.info("Fetching YouTube transcript for: {}", watchUrl);
|
||||
log.info("Fetching YouTube transcript for videoId: {}", videoId);
|
||||
|
||||
// 1차: youtube-transcript-api 라이브러리
|
||||
try {
|
||||
String transcript = fetchWithApi(videoId);
|
||||
if (transcript != null && !transcript.isBlank()) {
|
||||
log.info("Successfully fetched transcript via API: {} chars", transcript.length());
|
||||
return transcript;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.warn("youtube-transcript-api failed for {}: {}", videoId, e.getMessage());
|
||||
}
|
||||
|
||||
// 2차 fallback: Playwright head 모드
|
||||
log.info("Falling back to Playwright for videoId: {}", videoId);
|
||||
return fetchWithPlaywright(videoId);
|
||||
}
|
||||
|
||||
private String fetchWithApi(String videoId) {
|
||||
TranscriptList transcriptList;
|
||||
try {
|
||||
transcriptList = transcriptApi.listTranscripts(videoId);
|
||||
} catch (Exception e) {
|
||||
log.warn("Cannot list transcripts for {}: {}", videoId, e.getMessage());
|
||||
return null;
|
||||
}
|
||||
|
||||
// manual(수동 자막) 먼저 시도, 없으면 generated(자동 생성)
|
||||
String result = fetchTranscriptByType(transcriptList, true);
|
||||
if (result != null) return result;
|
||||
return fetchTranscriptByType(transcriptList, false);
|
||||
}
|
||||
|
||||
private String fetchTranscriptByType(TranscriptList list, boolean manual) {
|
||||
Transcript picked;
|
||||
try {
|
||||
picked = manual ? list.findManualTranscript(PREFERRED_LANGS)
|
||||
: list.findGeneratedTranscript(PREFERRED_LANGS);
|
||||
} catch (Exception e) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
TranscriptContent content = picked.fetch();
|
||||
String text = content.getContent().stream()
|
||||
.map(TranscriptContent.Fragment::getText)
|
||||
.collect(Collectors.joining(" "));
|
||||
if (text.isBlank()) return null;
|
||||
String label = manual ? "manual" : "generated";
|
||||
log.info("Transcript source: {} ({})", label, picked.getLanguageCode());
|
||||
return text;
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to fetch transcript for language {}: {}",
|
||||
picked.getLanguageCode(), e.getMessage());
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private String fetchWithPlaywright(String videoId) throws IOException {
|
||||
String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
|
||||
|
||||
// Playwright head 모드로 YouTube 페이지 HTML 가져오기
|
||||
String html;
|
||||
try (Playwright playwright = Playwright.create()) {
|
||||
BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
|
||||
@@ -83,48 +140,37 @@ public class YouTubeTranscriptService {
|
||||
}
|
||||
|
||||
String captionTracksJson = captionMatcher.group(1);
|
||||
|
||||
// 자막 트랙 URL 선택 (한국어 > 영어 > 첫 번째)
|
||||
String captionUrl = selectCaptionUrl(captionTracksJson);
|
||||
if (captionUrl == null) {
|
||||
throw new IOException("자막 트랙 URL을 추출할 수 없습니다.");
|
||||
}
|
||||
|
||||
// Unicode escape 처리
|
||||
captionUrl = captionUrl.replace("\\u0026", "&");
|
||||
|
||||
log.info("Fetching caption XML from: {}", captionUrl);
|
||||
|
||||
// 자막 XML 가져오기
|
||||
String xml;
|
||||
try {
|
||||
xml = webClient.get()
|
||||
.uri(captionUrl)
|
||||
.retrieve()
|
||||
.bodyToMono(String.class)
|
||||
.timeout(Duration.ofSeconds(15))
|
||||
.block();
|
||||
xml = new java.net.URI(captionUrl).toURL().openConnection()
|
||||
.getInputStream().readAllBytes().toString();
|
||||
// Use simple HTTP fetch
|
||||
var conn = new java.net.URI(captionUrl).toURL().openConnection();
|
||||
conn.setRequestProperty("User-Agent", "Mozilla/5.0");
|
||||
xml = new String(conn.getInputStream().readAllBytes(), StandardCharsets.UTF_8);
|
||||
} catch (Exception e) {
|
||||
throw new IOException("자막 XML을 가져올 수 없습니다: " + e.getMessage(), e);
|
||||
}
|
||||
|
||||
if (xml == null || xml.isBlank()) {
|
||||
throw new IOException("자막 XML이 비어있습니다.");
|
||||
}
|
||||
|
||||
// XML에서 텍스트 추출
|
||||
String transcript = parseTranscriptXml(xml);
|
||||
if (transcript.isBlank()) {
|
||||
throw new IOException("자막 텍스트를 파싱할 수 없습니다.");
|
||||
}
|
||||
|
||||
log.info("Successfully fetched transcript: {} chars", transcript.length());
|
||||
log.info("Successfully fetched transcript via Playwright: {} chars", transcript.length());
|
||||
return transcript;
|
||||
}
|
||||
|
||||
private String selectCaptionUrl(String captionTracksJson) {
|
||||
// 각 트랙에서 baseUrl과 languageCode 추출
|
||||
// 여러 트랙이 있을 수 있으므로 개별 트랙을 분리
|
||||
String[] tracks = captionTracksJson.split("\\},\\s*\\{");
|
||||
|
||||
String koUrl = null;
|
||||
|
||||
Reference in New Issue
Block a user