diff --git a/sundol-backend/pom.xml b/sundol-backend/pom.xml index eb4f3cc..55a59e1 100644 --- a/sundol-backend/pom.xml +++ b/sundol-backend/pom.xml @@ -104,6 +104,13 @@ 1.18.3 + + + io.github.thoroldvix + youtube-transcript-api + 0.4.0 + + com.microsoft.playwright diff --git a/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java b/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java index f691226..8bff2d1 100644 --- a/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java +++ b/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java @@ -2,24 +2,30 @@ package com.sundol.service; import com.microsoft.playwright.*; import com.microsoft.playwright.options.WaitUntilState; +import io.github.thoroldvix.api.TranscriptApiFactory; +import io.github.thoroldvix.api.TranscriptContent; +import io.github.thoroldvix.api.TranscriptList; +import io.github.thoroldvix.api.Transcript; +import io.github.thoroldvix.api.YoutubeTranscriptApi; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.stereotype.Service; -import org.springframework.web.reactive.function.client.WebClient; import java.io.IOException; import java.net.URLDecoder; import java.nio.charset.StandardCharsets; -import java.time.Duration; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; @Service public class YouTubeTranscriptService { private static final Logger log = LoggerFactory.getLogger(YouTubeTranscriptService.class); + private static final String[] PREFERRED_LANGS = {"ko", "en"}; + private static final Pattern CAPTION_TRACK_PATTERN = Pattern.compile("\"captionTracks\":\\s*\\[(.*?)]", Pattern.DOTALL); private static final Pattern BASE_URL_PATTERN = @@ -29,13 +35,7 @@ public class YouTubeTranscriptService { private static final Pattern XML_TEXT_PATTERN = Pattern.compile("]*>(.*?)", Pattern.DOTALL); - private final WebClient webClient; - - public YouTubeTranscriptService() { - this.webClient = WebClient.builder() - .codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024)) - .build(); - } + private final YoutubeTranscriptApi transcriptApi = TranscriptApiFactory.createDefault(); public String fetchTranscript(String youtubeUrl) throws IOException { String videoId = extractVideoId(youtubeUrl); @@ -43,10 +43,67 @@ public class YouTubeTranscriptService { throw new IOException("유효하지 않은 YouTube URL입니다: " + youtubeUrl); } - String watchUrl = "https://www.youtube.com/watch?v=" + videoId; - log.info("Fetching YouTube transcript for: {}", watchUrl); + log.info("Fetching YouTube transcript for videoId: {}", videoId); + + // 1차: youtube-transcript-api 라이브러리 + try { + String transcript = fetchWithApi(videoId); + if (transcript != null && !transcript.isBlank()) { + log.info("Successfully fetched transcript via API: {} chars", transcript.length()); + return transcript; + } + } catch (Exception e) { + log.warn("youtube-transcript-api failed for {}: {}", videoId, e.getMessage()); + } + + // 2차 fallback: Playwright head 모드 + log.info("Falling back to Playwright for videoId: {}", videoId); + return fetchWithPlaywright(videoId); + } + + private String fetchWithApi(String videoId) { + TranscriptList transcriptList; + try { + transcriptList = transcriptApi.listTranscripts(videoId); + } catch (Exception e) { + log.warn("Cannot list transcripts for {}: {}", videoId, e.getMessage()); + return null; + } + + // manual(수동 자막) 먼저 시도, 없으면 generated(자동 생성) + String result = fetchTranscriptByType(transcriptList, true); + if (result != null) return result; + return fetchTranscriptByType(transcriptList, false); + } + + private String fetchTranscriptByType(TranscriptList list, boolean manual) { + Transcript picked; + try { + picked = manual ? list.findManualTranscript(PREFERRED_LANGS) + : list.findGeneratedTranscript(PREFERRED_LANGS); + } catch (Exception e) { + return null; + } + + try { + TranscriptContent content = picked.fetch(); + String text = content.getContent().stream() + .map(TranscriptContent.Fragment::getText) + .collect(Collectors.joining(" ")); + if (text.isBlank()) return null; + String label = manual ? "manual" : "generated"; + log.info("Transcript source: {} ({})", label, picked.getLanguageCode()); + return text; + } catch (Exception e) { + log.warn("Failed to fetch transcript for language {}: {}", + picked.getLanguageCode(), e.getMessage()); + return null; + } + } + + private String fetchWithPlaywright(String videoId) throws IOException { + String watchUrl = "https://www.youtube.com/watch?v=" + videoId; - // Playwright head 모드로 YouTube 페이지 HTML 가져오기 String html; try (Playwright playwright = Playwright.create()) { BrowserType.LaunchOptions launchOptions = new BrowserType.LaunchOptions() @@ -83,48 +140,37 @@ public class YouTubeTranscriptService { } String captionTracksJson = captionMatcher.group(1); - - // 자막 트랙 URL 선택 (한국어 > 영어 > 첫 번째) String captionUrl = selectCaptionUrl(captionTracksJson); if (captionUrl == null) { throw new IOException("자막 트랙 URL을 추출할 수 없습니다."); } - // Unicode escape 처리 captionUrl = captionUrl.replace("\\u0026", "&"); - log.info("Fetching caption XML from: {}", captionUrl); // 자막 XML 가져오기 String xml; try { - xml = webClient.get() - .uri(captionUrl) - .retrieve() - .bodyToMono(String.class) - .timeout(Duration.ofSeconds(15)) - .block(); + xml = new java.net.URI(captionUrl).toURL().openConnection() + .getInputStream().readAllBytes().toString(); + // Use simple HTTP fetch + var conn = new java.net.URI(captionUrl).toURL().openConnection(); + conn.setRequestProperty("User-Agent", "Mozilla/5.0"); + xml = new String(conn.getInputStream().readAllBytes(), StandardCharsets.UTF_8); } catch (Exception e) { throw new IOException("자막 XML을 가져올 수 없습니다: " + e.getMessage(), e); } - if (xml == null || xml.isBlank()) { - throw new IOException("자막 XML이 비어있습니다."); - } - - // XML에서 텍스트 추출 String transcript = parseTranscriptXml(xml); if (transcript.isBlank()) { throw new IOException("자막 텍스트를 파싱할 수 없습니다."); } - log.info("Successfully fetched transcript: {} chars", transcript.length()); + log.info("Successfully fetched transcript via Playwright: {} chars", transcript.length()); return transcript; } private String selectCaptionUrl(String captionTracksJson) { - // 각 트랙에서 baseUrl과 languageCode 추출 - // 여러 트랙이 있을 수 있으므로 개별 트랙을 분리 String[] tracks = captionTracksJson.split("\\},\\s*\\{"); String koUrl = null;