Use youtube-transcript-api library with Playwright fallback for YouTube transcripts

Replace the Jsoup-based approach with io.github.thoroldvix:youtube-transcript-api
as the primary method (supports manual and generated captions, with language priority).
Playwright headed mode is kept as a fallback when the API fails.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-01 07:26:52 +00:00
parent 1bfe55d5a8
commit 677a79978f
2 changed files with 83 additions and 30 deletions

View File

@@ -104,6 +104,13 @@
<version>1.18.3</version> <version>1.18.3</version>
</dependency> </dependency>
<!-- YouTube Transcript API -->
<dependency>
<groupId>io.github.thoroldvix</groupId>
<artifactId>youtube-transcript-api</artifactId>
<version>0.4.0</version>
</dependency>
<!-- Playwright (headless browser, driver-bundle includes node runtime) --> <!-- Playwright (headless browser, driver-bundle includes node runtime) -->
<dependency> <dependency>
<groupId>com.microsoft.playwright</groupId> <groupId>com.microsoft.playwright</groupId>

View File

@@ -2,24 +2,30 @@ package com.sundol.service;
import com.microsoft.playwright.*; import com.microsoft.playwright.*;
import com.microsoft.playwright.options.WaitUntilState; import com.microsoft.playwright.options.WaitUntilState;
import io.github.thoroldvix.api.TranscriptApiFactory;
import io.github.thoroldvix.api.TranscriptContent;
import io.github.thoroldvix.api.TranscriptList;
import io.github.thoroldvix.api.Transcript;
import io.github.thoroldvix.api.YoutubeTranscriptApi;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.springframework.web.reactive.function.client.WebClient;
import java.io.IOException; import java.io.IOException;
import java.net.URLDecoder; import java.net.URLDecoder;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.List; import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Service @Service
public class YouTubeTranscriptService { public class YouTubeTranscriptService {
private static final Logger log = LoggerFactory.getLogger(YouTubeTranscriptService.class); private static final Logger log = LoggerFactory.getLogger(YouTubeTranscriptService.class);
private static final String[] PREFERRED_LANGS = {"ko", "en"};
private static final Pattern CAPTION_TRACK_PATTERN = private static final Pattern CAPTION_TRACK_PATTERN =
Pattern.compile("\"captionTracks\":\\s*\\[(.*?)]", Pattern.DOTALL); Pattern.compile("\"captionTracks\":\\s*\\[(.*?)]", Pattern.DOTALL);
private static final Pattern BASE_URL_PATTERN = private static final Pattern BASE_URL_PATTERN =
@@ -29,13 +35,7 @@ public class YouTubeTranscriptService {
private static final Pattern XML_TEXT_PATTERN = private static final Pattern XML_TEXT_PATTERN =
Pattern.compile("<text[^>]*>(.*?)</text>", Pattern.DOTALL); Pattern.compile("<text[^>]*>(.*?)</text>", Pattern.DOTALL);
private final WebClient webClient; private final YoutubeTranscriptApi transcriptApi = TranscriptApiFactory.createDefault();
public YouTubeTranscriptService() {
this.webClient = WebClient.builder()
.codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024))
.build();
}
public String fetchTranscript(String youtubeUrl) throws IOException { public String fetchTranscript(String youtubeUrl) throws IOException {
String videoId = extractVideoId(youtubeUrl); String videoId = extractVideoId(youtubeUrl);
@@ -43,10 +43,67 @@ public class YouTubeTranscriptService {
throw new IOException("유효하지 않은 YouTube URL입니다: " + youtubeUrl); throw new IOException("유효하지 않은 YouTube URL입니다: " + youtubeUrl);
} }
String watchUrl = "https://www.youtube.com/watch?v=" + videoId; log.info("Fetching YouTube transcript for videoId: {}", videoId);
log.info("Fetching YouTube transcript for: {}", watchUrl);
// 1차: youtube-transcript-api 라이브러리
try {
String transcript = fetchWithApi(videoId);
if (transcript != null && !transcript.isBlank()) {
log.info("Successfully fetched transcript via API: {} chars", transcript.length());
return transcript;
}
} catch (Exception e) {
log.warn("youtube-transcript-api failed for {}: {}", videoId, e.getMessage());
}
// 2차 fallback: Playwright head 모드
log.info("Falling back to Playwright for videoId: {}", videoId);
return fetchWithPlaywright(videoId);
}
/**
 * Tries to obtain the transcript text through the youtube-transcript-api client.
 *
 * <p>Manually authored captions are attempted first; auto-generated captions are
 * only consulted when the manual attempt yields nothing.
 *
 * @param videoId the YouTube video id to look up
 * @return the transcript text, or {@code null} when the transcript list cannot be
 *         retrieved or neither caption kind produces text
 */
private String fetchWithApi(String videoId) {
    final TranscriptList available;
    try {
        available = transcriptApi.listTranscripts(videoId);
    } catch (Exception e) {
        log.warn("Cannot list transcripts for {}: {}", videoId, e.getMessage());
        return null;
    }

    // Manual captions take precedence; generated captions are the second choice.
    final String manualText = fetchTranscriptByType(available, true);
    return manualText != null ? manualText : fetchTranscriptByType(available, false);
}
/**
 * Selects one transcript of the requested kind in {@code PREFERRED_LANGS} order and
 * returns its fragments joined with single spaces.
 *
 * <p>This is a best-effort lookup: every failure is reported through the logger and
 * turned into a {@code null} result so the caller can move on to the next source.
 *
 * @param list   the transcripts available for the video
 * @param manual {@code true} to look for a manually created transcript,
 *               {@code false} to look for an auto-generated one
 * @return the joined transcript text, or {@code null} when no transcript of that kind
 *         can be selected, fetching it fails, or the resulting text is blank
 */
private String fetchTranscriptByType(TranscriptList list, boolean manual) {
    String label = manual ? "manual" : "generated";

    Transcript picked;
    try {
        picked = manual ? list.findManualTranscript(PREFERRED_LANGS)
                : list.findGeneratedTranscript(PREFERRED_LANGS);
    } catch (Exception e) {
        // Still best-effort, but no longer silent: keep a trace of why this caption
        // kind was skipped so a later fallback can be diagnosed from the logs.
        log.debug("No {} transcript selectable for languages [{}]: {}",
                label, String.join(", ", PREFERRED_LANGS), e.getMessage());
        return null;
    }

    try {
        TranscriptContent content = picked.fetch();
        String text = content.getContent().stream()
                .map(TranscriptContent.Fragment::getText)
                .collect(Collectors.joining(" "));
        if (text.isBlank()) {
            return null;
        }
        log.info("Transcript source: {} ({})", label, picked.getLanguageCode());
        return text;
    } catch (Exception e) {
        log.warn("Failed to fetch transcript for language {}: {}",
                picked.getLanguageCode(), e.getMessage());
        return null;
    }
}
private String fetchWithPlaywright(String videoId) throws IOException {
String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
// Playwright head 모드로 YouTube 페이지 HTML 가져오기
String html; String html;
try (Playwright playwright = Playwright.create()) { try (Playwright playwright = Playwright.create()) {
BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions() BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
@@ -83,48 +140,37 @@ public class YouTubeTranscriptService {
} }
String captionTracksJson = captionMatcher.group(1); String captionTracksJson = captionMatcher.group(1);
// 자막 트랙 URL 선택 (한국어 > 영어 > 첫 번째)
String captionUrl = selectCaptionUrl(captionTracksJson); String captionUrl = selectCaptionUrl(captionTracksJson);
if (captionUrl == null) { if (captionUrl == null) {
throw new IOException("자막 트랙 URL을 추출할 수 없습니다."); throw new IOException("자막 트랙 URL을 추출할 수 없습니다.");
} }
// Unicode escape 처리
captionUrl = captionUrl.replace("\\u0026", "&"); captionUrl = captionUrl.replace("\\u0026", "&");
log.info("Fetching caption XML from: {}", captionUrl); log.info("Fetching caption XML from: {}", captionUrl);
// 자막 XML 가져오기 // 자막 XML 가져오기
String xml; String xml;
try { try {
xml = webClient.get() xml = new java.net.URI(captionUrl).toURL().openConnection()
.uri(captionUrl) .getInputStream().readAllBytes().toString();
.retrieve() // Use simple HTTP fetch
.bodyToMono(String.class) var conn = new java.net.URI(captionUrl).toURL().openConnection();
.timeout(Duration.ofSeconds(15)) conn.setRequestProperty("User-Agent", "Mozilla/5.0");
.block(); xml = new String(conn.getInputStream().readAllBytes(), StandardCharsets.UTF_8);
} catch (Exception e) { } catch (Exception e) {
throw new IOException("자막 XML을 가져올 수 없습니다: " + e.getMessage(), e); throw new IOException("자막 XML을 가져올 수 없습니다: " + e.getMessage(), e);
} }
if (xml == null || xml.isBlank()) {
throw new IOException("자막 XML이 비어있습니다.");
}
// XML에서 텍스트 추출
String transcript = parseTranscriptXml(xml); String transcript = parseTranscriptXml(xml);
if (transcript.isBlank()) { if (transcript.isBlank()) {
throw new IOException("자막 텍스트를 파싱할 수 없습니다."); throw new IOException("자막 텍스트를 파싱할 수 없습니다.");
} }
log.info("Successfully fetched transcript: {} chars", transcript.length()); log.info("Successfully fetched transcript via Playwright: {} chars", transcript.length());
return transcript; return transcript;
} }
private String selectCaptionUrl(String captionTracksJson) { private String selectCaptionUrl(String captionTracksJson) {
// 각 트랙에서 baseUrl과 languageCode 추출
// 여러 트랙이 있을 수 있으므로 개별 트랙을 분리
String[] tracks = captionTracksJson.split("\\},\\s*\\{"); String[] tracks = captionTracksJson.split("\\},\\s*\\{");
String koUrl = null; String koUrl = null;