diff --git a/sundol-backend/pom.xml b/sundol-backend/pom.xml
index eb4f3cc..55a59e1 100644
--- a/sundol-backend/pom.xml
+++ b/sundol-backend/pom.xml
@@ -104,6 +104,13 @@
1.18.3
+
+ <dependency>
+ <groupId>io.github.thoroldvix</groupId>
+ <artifactId>youtube-transcript-api</artifactId>
+ <version>0.4.0</version>
+ </dependency>
+
com.microsoft.playwright
diff --git a/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java b/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java
index f691226..8bff2d1 100644
--- a/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java
+++ b/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java
@@ -2,24 +2,30 @@ package com.sundol.service;
import com.microsoft.playwright.*;
import com.microsoft.playwright.options.WaitUntilState;
+import io.github.thoroldvix.api.TranscriptApiFactory;
+import io.github.thoroldvix.api.TranscriptContent;
+import io.github.thoroldvix.api.TranscriptList;
+import io.github.thoroldvix.api.Transcript;
+import io.github.thoroldvix.api.YoutubeTranscriptApi;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
-import org.springframework.web.reactive.function.client.WebClient;
import java.io.IOException;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
-import java.time.Duration;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import java.util.stream.Collectors;
@Service
public class YouTubeTranscriptService {
private static final Logger log = LoggerFactory.getLogger(YouTubeTranscriptService.class);
+ private static final String[] PREFERRED_LANGS = {"ko", "en"};
+
private static final Pattern CAPTION_TRACK_PATTERN =
Pattern.compile("\"captionTracks\":\\s*\\[(.*?)]", Pattern.DOTALL);
private static final Pattern BASE_URL_PATTERN =
@@ -29,13 +35,7 @@ public class YouTubeTranscriptService {
private static final Pattern XML_TEXT_PATTERN =
Pattern.compile("]*>(.*?)", Pattern.DOTALL);
- private final WebClient webClient;
-
- public YouTubeTranscriptService() {
- this.webClient = WebClient.builder()
- .codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024))
- .build();
- }
+ private final YoutubeTranscriptApi transcriptApi = TranscriptApiFactory.createDefault();
public String fetchTranscript(String youtubeUrl) throws IOException {
String videoId = extractVideoId(youtubeUrl);
@@ -43,10 +43,67 @@ public class YouTubeTranscriptService {
throw new IOException("유효하지 않은 YouTube URL입니다: " + youtubeUrl);
}
- String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
- log.info("Fetching YouTube transcript for: {}", watchUrl);
+ log.info("Fetching YouTube transcript for videoId: {}", videoId);
+
+ // 1차: youtube-transcript-api 라이브러리
+ try {
+ String transcript = fetchWithApi(videoId);
+ if (transcript != null && !transcript.isBlank()) {
+ log.info("Successfully fetched transcript via API: {} chars", transcript.length());
+ return transcript;
+ }
+ } catch (Exception e) {
+ log.warn("youtube-transcript-api failed for {}: {}", videoId, e.getMessage());
+ }
+
+ // 2차 fallback: Playwright head 모드
+ log.info("Falling back to Playwright for videoId: {}", videoId);
+ return fetchWithPlaywright(videoId);
+ }
+
+ /**
+  * Retrieves transcript text for a video through the youtube-transcript-api client.
+  *
+  * @param videoId the YouTube video id to look up
+  * @return the transcript text, or {@code null} when the transcript list cannot be
+  *         obtained or neither lookup produces text
+  */
+ private String fetchWithApi(String videoId) {
+     TranscriptList available;
+     try {
+         available = transcriptApi.listTranscripts(videoId);
+     } catch (Exception listingFailure) {
+         log.warn("Cannot list transcripts for {}: {}", videoId, listingFailure.getMessage());
+         return null;
+     }
+
+     // Try manually created captions first; use auto-generated ones only if that yields nothing.
+     String manualText = fetchTranscriptByType(available, true);
+     return manualText != null ? manualText : fetchTranscriptByType(available, false);
+ }
+
+ /**
+  * Looks up one transcript of the requested kind for {@code PREFERRED_LANGS} and returns its text.
+  *
+  * @param list   the transcripts listed for the video
+  * @param manual {@code true} to look up a manually created transcript,
+  *               {@code false} to look up an auto-generated one
+  * @return the fragment texts joined by single spaces, or {@code null} when the lookup
+  *         throws, the download throws, or the joined text is blank
+  */
+ private String fetchTranscriptByType(TranscriptList list, boolean manual) {
+     String label = manual ? "manual" : "generated";
+
+     Transcript picked;
+     try {
+         picked = manual ? list.findManualTranscript(PREFERRED_LANGS)
+                 : list.findGeneratedTranscript(PREFERRED_LANGS);
+     } catch (Exception e) {
+         // Presumably thrown when no transcript of this kind exists for the preferred
+         // languages (confirm against the library). Log it instead of swallowing it so
+         // an unexpected failure is not mistaken for "not available".
+         log.debug("No {} transcript selected for preferred languages: {}", label, e.getMessage());
+         return null;
+     }
+
+     try {
+         TranscriptContent content = picked.fetch();
+         String text = content.getContent().stream()
+                 .map(TranscriptContent.Fragment::getText)
+                 .collect(Collectors.joining(" "));
+         if (text.isBlank()) {
+             // A blank result is reported as "nothing found" so the caller moves on.
+             log.debug("The {} transcript ({}) contained no text", label, picked.getLanguageCode());
+             return null;
+         }
+         log.info("Transcript source: {} ({})", label, picked.getLanguageCode());
+         return text;
+     } catch (Exception e) {
+         log.warn("Failed to fetch transcript for language {}: {}",
+                 picked.getLanguageCode(), e.getMessage());
+         return null;
+     }
+ }
+
+ private String fetchWithPlaywright(String videoId) throws IOException {
+ String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
- // Playwright head 모드로 YouTube 페이지 HTML 가져오기
String html;
try (Playwright playwright = Playwright.create()) {
BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
@@ -83,48 +140,37 @@ public class YouTubeTranscriptService {
}
String captionTracksJson = captionMatcher.group(1);
-
- // 자막 트랙 URL 선택 (한국어 > 영어 > 첫 번째)
String captionUrl = selectCaptionUrl(captionTracksJson);
if (captionUrl == null) {
throw new IOException("자막 트랙 URL을 추출할 수 없습니다.");
}
- // Unicode escape 처리
captionUrl = captionUrl.replace("\\u0026", "&");
-
log.info("Fetching caption XML from: {}", captionUrl);
// 자막 XML 가져오기
String xml;
try {
- xml = webClient.get()
- .uri(captionUrl)
- .retrieve()
- .bodyToMono(String.class)
- .timeout(Duration.ofSeconds(15))
- .block();
+ // Fetch the caption XML with a single plain HTTP request. The earlier extra request
+ // assigned byte[].toString() (an identity string, not the body) and left its stream open.
+ var conn = new java.net.URI(captionUrl).toURL().openConnection();
+ conn.setRequestProperty("User-Agent", "Mozilla/5.0");
+ // Restore the 15-second bound the removed WebClient call had, so a stalled
+ // connection cannot block the caller indefinitely.
+ conn.setConnectTimeout(15_000);
+ conn.setReadTimeout(15_000);
+ try (var in = conn.getInputStream()) {
+     xml = new String(in.readAllBytes(), StandardCharsets.UTF_8);
+ }
} catch (Exception e) {
throw new IOException("자막 XML을 가져올 수 없습니다: " + e.getMessage(), e);
}
- if (xml == null || xml.isBlank()) {
- throw new IOException("자막 XML이 비어있습니다.");
- }
-
- // XML에서 텍스트 추출
String transcript = parseTranscriptXml(xml);
if (transcript.isBlank()) {
throw new IOException("자막 텍스트를 파싱할 수 없습니다.");
}
- log.info("Successfully fetched transcript: {} chars", transcript.length());
+ log.info("Successfully fetched transcript via Playwright: {} chars", transcript.length());
return transcript;
}
private String selectCaptionUrl(String captionTracksJson) {
- // 각 트랙에서 baseUrl과 languageCode 추출
- // 여러 트랙이 있을 수 있으므로 개별 트랙을 분리
String[] tracks = captionTracksJson.split("\\},\\s*\\{");
String koUrl = null;