Use youtube-transcript-api library with Playwright fallback for YouTube transcripts

Replace the Jsoup-based approach with io.github.thoroldvix:youtube-transcript-api as the primary method (supports manual/generated captions and language priority). Playwright head mode is kept as a fallback when the API fails.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-01 07:26:52 +00:00
parent 1bfe55d5a8
commit 677a79978f
2 changed files with 83 additions and 30 deletions
--- a/sundol-backend/pom.xml
+++ b/sundol-backend/pom.xml
@@ -104,6 +104,13 @@
            <version>1.18.3</version>
        </dependency>
        <!-- YouTube Transcript API -->
        <dependency>
            <groupId>io.github.thoroldvix</groupId>
            <artifactId>youtube-transcript-api</artifactId>
            <version>0.4.0</version>
        </dependency>
        <!-- Playwright (headless browser, driver-bundle includes node runtime) -->
        <dependency>
            <groupId>com.microsoft.playwright</groupId>
--- a/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java
+++ b/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java
@@ -2,24 +2,30 @@ package com.sundol.service;
 import com.microsoft.playwright.*;
 import com.microsoft.playwright.options.WaitUntilState;
 import io.github.thoroldvix.api.TranscriptApiFactory;
 import io.github.thoroldvix.api.TranscriptContent;
 import io.github.thoroldvix.api.TranscriptList;
 import io.github.thoroldvix.api.Transcript;
 import io.github.thoroldvix.api.YoutubeTranscriptApi;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.stereotype.Service;
 import org.springframework.web.reactive.function.client.WebClient;
 import java.io.IOException;
 import java.net.URLDecoder;
 import java.nio.charset.StandardCharsets;
 import java.time.Duration;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
@Service
 public class YouTubeTranscriptService {
    // Class-wide SLF4J logger.
    private static final Logger log = LoggerFactory.getLogger(YouTubeTranscriptService.class);
    // Caption language codes passed to the manual/generated transcript lookups
    // (presumably in priority order: Korean first, then English - confirm against the library).
    private static final String[] PREFERRED_LANGS = {"ko", "en"};
    // Captures the contents of the "captionTracks" JSON array from the page HTML;
    // DOTALL lets the lazy match span line breaks.
    private static final Pattern CAPTION_TRACK_PATTERN =
            Pattern.compile("\"captionTracks\":\\s*\\[(.*?)]", Pattern.DOTALL);
    private static final Pattern BASE_URL_PATTERN =
@@ -29,13 +35,7 @@ public class YouTubeTranscriptService {
    private static final Pattern XML_TEXT_PATTERN =
            Pattern.compile("<text[^>]*>(.*?)</text>", Pattern.DOTALL);
-    private final WebClient webClient;
+    private final YoutubeTranscriptApi transcriptApi = TranscriptApiFactory.createDefault();
    public YouTubeTranscriptService() {
        this.webClient = WebClient.builder()
                .codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024))
                .build();
    }
    public String fetchTranscript(String youtubeUrl) throws IOException {
        String videoId = extractVideoId(youtubeUrl);
@@ -43,10 +43,67 @@ public class YouTubeTranscriptService {
            throw new IOException("유효하지 않은 YouTube URL입니다: " + youtubeUrl);
        }
-        String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
+        log.info("Fetching YouTube transcript for videoId: {}", videoId);
-        log.info("Fetching YouTube transcript for: {}", watchUrl);
+
        // 1차: youtube-transcript-api 라이브러리
        try {
            String transcript = fetchWithApi(videoId);
            if (transcript != null && !transcript.isBlank()) {
                log.info("Successfully fetched transcript via API: {} chars", transcript.length());
                return transcript;
            }
        } catch (Exception e) {
            log.warn("youtube-transcript-api failed for {}: {}", videoId, e.getMessage());
        }
        // 2차 fallback: Playwright head 모드
        log.info("Falling back to Playwright for videoId: {}", videoId);
        return fetchWithPlaywright(videoId);
    }
    /**
     * Retrieves the transcript text through the youtube-transcript-api client.
     *
     * <p>Manually created captions are tried first; auto-generated captions are used
     * only when no manual transcript text could be obtained.
     *
     * @param videoId the YouTube video id
     * @return the transcript text, or {@code null} when the transcripts cannot be listed
     *         or neither caption kind yields text
     */
    private String fetchWithApi(String videoId) {
        TranscriptList available;
        try {
            available = transcriptApi.listTranscripts(videoId);
        } catch (Exception e) {
            log.warn("Cannot list transcripts for {}: {}", videoId, e.getMessage());
            return null;
        }

        // Order matters: manual (true) is attempted before generated (false).
        for (boolean manual : new boolean[] {true, false}) {
            String text = fetchTranscriptByType(available, manual);
            if (text != null) {
                return text;
            }
        }
        return null;
    }
    /**
     * Looks up a transcript of the requested kind for the preferred languages and joins
     * its fragments into one space-separated string.
     *
     * @param list   the transcripts available for the video
     * @param manual {@code true} to look for a manually created transcript,
     *               {@code false} for an auto-generated one
     * @return the joined transcript text, or {@code null} when no matching transcript is
     *         found, fetching it fails, or the resulting text is blank
     */
    private String fetchTranscriptByType(TranscriptList list, boolean manual) {
        String label = manual ? "manual" : "generated";

        Transcript picked;
        try {
            picked = manual ? list.findManualTranscript(PREFERRED_LANGS)
                            : list.findGeneratedTranscript(PREFERRED_LANGS);
        } catch (Exception e) {
            // A missing caption kind is an expected outcome and the caller simply tries the
            // next source, but record it at debug level so unexpected lookup failures are
            // not lost entirely.
            log.debug("No {} transcript found for languages [{}]: {}",
                    label, String.join(",", PREFERRED_LANGS), e.getMessage());
            return null;
        }

        try {
            TranscriptContent content = picked.fetch();
            String text = content.getContent().stream()
                    .map(TranscriptContent.Fragment::getText)
                    .collect(Collectors.joining(" "));
            if (text.isBlank()) {
                return null;
            }
            log.info("Transcript source: {} ({})", label, picked.getLanguageCode());
            return text;
        } catch (Exception e) {
            log.warn("Failed to fetch transcript for language {}: {}",
                    picked.getLanguageCode(), e.getMessage());
            return null;
        }
    }
    private String fetchWithPlaywright(String videoId) throws IOException {
        String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
        // Playwright head 모드로 YouTube 페이지 HTML 가져오기
        String html;
        try (Playwright playwright = Playwright.create()) {
            BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions()
@@ -83,48 +140,37 @@ public class YouTubeTranscriptService {
        }
        String captionTracksJson = captionMatcher.group(1);
        // 자막 트랙 URL 선택 (한국어 > 영어 > 첫 번째)
        String captionUrl = selectCaptionUrl(captionTracksJson);
        if (captionUrl == null) {
            throw new IOException("자막 트랙 URL을 추출할 수 없습니다.");
        }
        // Unicode escape 처리
        captionUrl = captionUrl.replace("\\u0026", "&");
        log.info("Fetching caption XML from: {}", captionUrl);
        // 자막 XML 가져오기
        String xml;
        try {
            // Issue exactly one request. A browser-like User-Agent is sent, and the
            // connect/read timeouts restore the 15-second bound the previous
            // WebClient-based call had, so a stalled connection cannot hang the caller.
            var conn = new java.net.URI(captionUrl).toURL().openConnection();
            conn.setRequestProperty("User-Agent", "Mozilla/5.0");
            conn.setConnectTimeout(15_000);
            conn.setReadTimeout(15_000);
            // Close the response stream once the body has been read.
            try (var in = conn.getInputStream()) {
                xml = new String(in.readAllBytes(), StandardCharsets.UTF_8);
            }
        } catch (Exception e) {
            throw new IOException("자막 XML을 가져올 수 없습니다: " + e.getMessage(), e);
        }
        if (xml == null || xml.isBlank()) {
            throw new IOException("자막 XML이 비어있습니다.");
        }
        // XML에서 텍스트 추출
        String transcript = parseTranscriptXml(xml);
        if (transcript.isBlank()) {
            throw new IOException("자막 텍스트를 파싱할 수 없습니다.");
        }
-        log.info("Successfully fetched transcript: {} chars", transcript.length());
+        log.info("Successfully fetched transcript via Playwright: {} chars", transcript.length());
        return transcript;
    }
    private String selectCaptionUrl(String captionTracksJson) {
        // 각 트랙에서 baseUrl과 languageCode 추출
        // 여러 트랙이 있을 수 있으므로 개별 트랙을 분리
        String[] tracks = captionTracksJson.split("\\},\\s*\\{");
        String koUrl = null;