Use youtube-transcript-api library with Playwright fallback for YouTube transcripts

Replace the Jsoup-based approach with io.github.thoroldvix:youtube-transcript-api as the primary method (supports manual/generated captions and language priority). Playwright head mode is kept as a fallback when the API fails.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-01 07:26:52 +00:00
parent 1bfe55d5a8
commit 677a79978f
2 changed files with 83 additions and 30 deletions
--- a/sundol-backend/pom.xml
+++ b/sundol-backend/pom.xml
@@ -104,6 +104,13 @@
            <version>1.18.3</version>
        </dependency>

+        <!-- YouTube Transcript API: primary transcript source (Playwright below is the fallback) -->
+        <dependency>
+            <groupId>io.github.thoroldvix</groupId>
+            <artifactId>youtube-transcript-api</artifactId>
+            <version>0.4.0</version>
+        </dependency>
+
        <!-- Playwright (headless browser, driver-bundle includes node runtime) -->
        <dependency>
            <groupId>com.microsoft.playwright</groupId>
--- a/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java
+++ b/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java
@@ -2,24 +2,30 @@ package com.sundol.service;

 import com.microsoft.playwright.*;
 import com.microsoft.playwright.options.WaitUntilState;
+import io.github.thoroldvix.api.TranscriptApiFactory;
+import io.github.thoroldvix.api.TranscriptContent;
+import io.github.thoroldvix.api.TranscriptList;
+import io.github.thoroldvix.api.Transcript;
+import io.github.thoroldvix.api.YoutubeTranscriptApi;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.stereotype.Service;
-import org.springframework.web.reactive.function.client.WebClient;

 import java.io.IOException;
 import java.net.URLDecoder;
 import java.nio.charset.StandardCharsets;
-import java.time.Duration;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;

@Service
 public class YouTubeTranscriptService {

    private static final Logger log = LoggerFactory.getLogger(YouTubeTranscriptService.class);

+    private static final String[] PREFERRED_LANGS = {"ko", "en"};
+
    private static final Pattern CAPTION_TRACK_PATTERN =
            Pattern.compile("\"captionTracks\":\\s*\\[(.*?)]", Pattern.DOTALL);
    private static final Pattern BASE_URL_PATTERN =
@@ -29,13 +35,7 @@ public class YouTubeTranscriptService {
    private static final Pattern XML_TEXT_PATTERN =
            Pattern.compile("<text[^>]*>(.*?)</text>", Pattern.DOTALL);

-    private final WebClient webClient;
-
-    public YouTubeTranscriptService() {
-        this.webClient = WebClient.builder()
-                .codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024))
-                .build();
-    }
+    private final YoutubeTranscriptApi transcriptApi = TranscriptApiFactory.createDefault();

    public String fetchTranscript(String youtubeUrl) throws IOException {
        String videoId = extractVideoId(youtubeUrl);
@@ -43,10 +43,67 @@ public class YouTubeTranscriptService {
            throw new IOException("유효하지 않은 YouTube URL입니다: " + youtubeUrl);
        }

-        String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
-        log.info("Fetching YouTube transcript for: {}", watchUrl);
+        log.info("Fetching YouTube transcript for videoId: {}", videoId);
+
+        // Step 1: youtube-transcript-api library
+        try {
+            String transcript = fetchWithApi(videoId);
+            if (transcript != null && !transcript.isBlank()) {
+                log.info("Successfully fetched transcript via API: {} chars", transcript.length());
+                return transcript;
+            }
+        } catch (Exception e) {
+            log.warn("youtube-transcript-api failed for {}: {}", videoId, e.getMessage());
+        }
+
+        // Step 2 (fallback): Playwright head mode
+        log.info("Falling back to Playwright for videoId: {}", videoId);
+        return fetchWithPlaywright(videoId);
+    }
+
+    private String fetchWithApi(String videoId) { // returns the transcript text, or null when none could be fetched
+        TranscriptList transcriptList;
+        try {
+            transcriptList = transcriptApi.listTranscripts(videoId);
+        } catch (Exception e) {
+            log.warn("Cannot list transcripts for {}: {}", videoId, e.getMessage());
+            return null;
+        }
+
+        // Try manually created captions first; if none, use auto-generated ones.
+        String result = fetchTranscriptByType(transcriptList, true);
+        if (result != null) return result;
+        return fetchTranscriptByType(transcriptList, false);
+    }
+
+    private String fetchTranscriptByType(TranscriptList list, boolean manual) { // returns space-joined caption text, or null when unavailable
+        Transcript picked;
+        try {
+            picked = manual ? list.findManualTranscript(PREFERRED_LANGS)
+                            : list.findGeneratedTranscript(PREFERRED_LANGS);
+        } catch (Exception e) {
+            return null; // lookup threw: report "nothing of this type" so the caller can move on
+        }
+
+        try {
+            TranscriptContent content = picked.fetch();
+            String text = content.getContent().stream()
+                    .map(TranscriptContent.Fragment::getText)
+                    .collect(Collectors.joining(" "));
+            if (text.isBlank()) return null; // blank captions are treated the same as missing ones
+            String label = manual ? "manual" : "generated";
+            log.info("Transcript source: {} ({})", label, picked.getLanguageCode());
+            return text;
+        } catch (Exception e) {
+            log.warn("Failed to fetch transcript for language {}: {}",
+                    picked.getLanguageCode(), e.getMessage());
+            return null;
+        }
+    }
+
+    private String fetchWithPlaywright(String videoId) throws IOException {
+        String watchUrl = "https://www.youtube.com/watch?v=" + videoId;

-        // Playwright head 모드로 YouTube 페이지 HTML 가져오기
        String html;
        try (Playwright playwright = Playwright.create()) {
            BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
@@ -83,48 +140,37 @@ public class YouTubeTranscriptService {
        }

        String captionTracksJson = captionMatcher.group(1);
-
-        // 자막 트랙 URL 선택 (한국어 > 영어 > 첫 번째)
        String captionUrl = selectCaptionUrl(captionTracksJson);
        if (captionUrl == null) {
            throw new IOException("자막 트랙 URL을 추출할 수 없습니다.");
        }

-        // Unicode escape 처리
        captionUrl = captionUrl.replace("\\u0026", "&");
-
        log.info("Fetching caption XML from: {}", captionUrl);

        // 자막 XML 가져오기
        String xml;
        try {
-            xml = webClient.get()
-                    .uri(captionUrl)
-                    .retrieve()
-                    .bodyToMono(String.class)
-                    .timeout(Duration.ofSeconds(15))
-                    .block();
+            // Fetch the caption XML with one request; the stream is closed by try-with-resources.
+            var conn = new java.net.URI(captionUrl).toURL().openConnection();
+            conn.setRequestProperty("User-Agent", "Mozilla/5.0");
+            try (var in = conn.getInputStream()) {
+                xml = new String(in.readAllBytes(), StandardCharsets.UTF_8);
+            }
        } catch (Exception e) {
            throw new IOException("자막 XML을 가져올 수 없습니다: " + e.getMessage(), e);
        }

-        if (xml == null || xml.isBlank()) {
-            throw new IOException("자막 XML이 비어있습니다.");
-        }
-
-        // XML에서 텍스트 추출
        String transcript = parseTranscriptXml(xml);
        if (transcript.isBlank()) {
            throw new IOException("자막 텍스트를 파싱할 수 없습니다.");
        }

-        log.info("Successfully fetched transcript: {} chars", transcript.length());
+        log.info("Successfully fetched transcript via Playwright: {} chars", transcript.length());
        return transcript;
    }

    private String selectCaptionUrl(String captionTracksJson) {
-        // 각 트랙에서 baseUrl과 languageCode 추출
-        // 여러 트랙이 있을 수 있으므로 개별 트랙을 분리
        String[] tracks = captionTracksJson.split("\\},\\s*\\{");

        String koUrl = null;