From bb5a601433d0a73ddf8589857a5ec55e49a769c3 Mon Sep 17 00:00:00 2001 From: joungmin Date: Wed, 1 Apr 2026 04:20:13 +0000 Subject: [PATCH] Add YouTube transcript auto-fetch button on Knowledge add page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - YouTubeTranscriptService: fetches captions from YouTube page (ko > en > first available) - GET /api/knowledge/youtube-transcript endpoint - Frontend: "트랜스크립트 자동 가져오기" button appears when valid YouTube URL entered Co-Authored-By: Claude Opus 4.6 (1M context) --- .../controller/KnowledgeController.java | 17 +- .../service/YouTubeTranscriptService.java | 183 ++++++++++++++++++ .../src/app/knowledge/add/page.tsx | 36 ++++ 3 files changed, 235 insertions(+), 1 deletion(-) create mode 100644 sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java diff --git a/sundol-backend/src/main/java/com/sundol/controller/KnowledgeController.java b/sundol-backend/src/main/java/com/sundol/controller/KnowledgeController.java index 4b535bc..1926d6a 100644 --- a/sundol-backend/src/main/java/com/sundol/controller/KnowledgeController.java +++ b/sundol-backend/src/main/java/com/sundol/controller/KnowledgeController.java @@ -2,6 +2,7 @@ package com.sundol.controller; import com.sundol.dto.IngestRequest; import com.sundol.service.KnowledgeService; +import com.sundol.service.YouTubeTranscriptService; import org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; import org.springframework.security.core.annotation.AuthenticationPrincipal; @@ -16,9 +17,12 @@ import java.util.Map; public class KnowledgeController { private final KnowledgeService knowledgeService; + private final YouTubeTranscriptService youTubeTranscriptService; - public KnowledgeController(KnowledgeService knowledgeService) { + public KnowledgeController(KnowledgeService knowledgeService, + YouTubeTranscriptService youTubeTranscriptService) { this.knowledgeService = knowledgeService; + this.youTubeTranscriptService = youTubeTranscriptService; } @GetMapping @@ -40,6 +44,17 @@ public class KnowledgeController { .map(result -> ResponseEntity.status(HttpStatus.ACCEPTED).body(result)); } + @GetMapping("/youtube-transcript") + public Mono>> fetchYouTubeTranscript( + @AuthenticationPrincipal String userId, + @RequestParam String url) { + return Mono.fromCallable(() -> youTubeTranscriptService.fetchTranscript(url)) + .map(transcript -> ResponseEntity.ok(Map.of("transcript", transcript))) + .onErrorResume(e -> Mono.just( + ResponseEntity.status(HttpStatus.BAD_REQUEST) + .body(Map.of("error", e.getMessage() != null ? e.getMessage() : "트랜스크립트를 가져올 수 없습니다")))); + } + @GetMapping("/{id}") public Mono>> getById( @AuthenticationPrincipal String userId, diff --git a/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java b/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java new file mode 100644 index 0000000..db09521 --- /dev/null +++ b/sundol-backend/src/main/java/com/sundol/service/YouTubeTranscriptService.java @@ -0,0 +1,183 @@ +package com.sundol.service; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.stereotype.Service; +import org.springframework.web.reactive.function.client.WebClient; + +import java.io.IOException; +import java.net.URLDecoder; +import java.nio.charset.StandardCharsets; +import java.time.Duration; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +@Service +public class YouTubeTranscriptService { + + private static final Logger log = LoggerFactory.getLogger(YouTubeTranscriptService.class); + + private static final Pattern CAPTION_TRACK_PATTERN = + Pattern.compile("\"captionTracks\":\\s*\\[(.*?)]", Pattern.DOTALL); + private static final Pattern BASE_URL_PATTERN = + Pattern.compile("\"baseUrl\":\\s*\"(.*?)\""); + private static final Pattern LANG_PATTERN = + Pattern.compile("\"languageCode\":\\s*\"(.*?)\""); + private static final Pattern XML_TEXT_PATTERN = + Pattern.compile("]*>(.*?)", Pattern.DOTALL); + + private final WebClient webClient; + + public YouTubeTranscriptService() { + this.webClient = WebClient.builder() + .codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024)) + .build(); + } + + public String fetchTranscript(String youtubeUrl) throws IOException { + String videoId = extractVideoId(youtubeUrl); + if (videoId == null) { + throw new IOException("유효하지 않은 YouTube URL입니다: " + youtubeUrl); + } + + String watchUrl = "https://www.youtube.com/watch?v=" + videoId; + log.info("Fetching YouTube transcript for: {}", watchUrl); + + // YouTube 페이지 HTML 가져오기 + String html; + try { + Document doc = Jsoup.connect(watchUrl) + .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36") + .header("Accept-Language", "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7") + .timeout(15_000) + .maxBodySize(0) + .get(); + html = doc.html(); + } catch (Exception e) { + throw new IOException("YouTube 페이지를 가져올 수 없습니다: " + e.getMessage(), e); + } + + // captionTracks JSON 추출 + Matcher captionMatcher = CAPTION_TRACK_PATTERN.matcher(html); + if (!captionMatcher.find()) { + throw new IOException("이 영상에는 자막(caption)이 없습니다."); + } + + String captionTracksJson = captionMatcher.group(1); + + // 자막 트랙 URL 선택 (한국어 > 영어 > 첫 번째) + String captionUrl = selectCaptionUrl(captionTracksJson); + if (captionUrl == null) { + throw new IOException("자막 트랙 URL을 추출할 수 없습니다."); + } + + // Unicode escape 처리 + captionUrl = captionUrl.replace("\\u0026", "&"); + + log.info("Fetching caption XML from: {}", captionUrl); + + // 자막 XML 가져오기 + String xml; + try { + xml = webClient.get() + .uri(captionUrl) + .retrieve() + .bodyToMono(String.class) + .timeout(Duration.ofSeconds(15)) + .block(); + } catch (Exception e) { + throw new IOException("자막 XML을 가져올 수 없습니다: " + e.getMessage(), e); + } + + if (xml == null || xml.isBlank()) { + throw new IOException("자막 XML이 비어있습니다."); + } + + // XML에서 텍스트 추출 + String transcript = parseTranscriptXml(xml); + if (transcript.isBlank()) { + throw new IOException("자막 텍스트를 파싱할 수 없습니다."); + } + + log.info("Successfully fetched transcript: {} chars", transcript.length()); + return transcript; + } + + private String selectCaptionUrl(String captionTracksJson) { + // 각 트랙에서 baseUrl과 languageCode 추출 + // 여러 트랙이 있을 수 있으므로 개별 트랙을 분리 + String[] tracks = captionTracksJson.split("\\},\\s*\\{"); + + String koUrl = null; + String enUrl = null; + String firstUrl = null; + + for (String track : tracks) { + Matcher urlMatcher = BASE_URL_PATTERN.matcher(track); + Matcher langMatcher = LANG_PATTERN.matcher(track); + + if (urlMatcher.find()) { + String url = urlMatcher.group(1); + if (firstUrl == null) firstUrl = url; + + if (langMatcher.find()) { + String lang = langMatcher.group(1); + if (lang.startsWith("ko") && koUrl == null) koUrl = url; + if (lang.startsWith("en") && enUrl == null) enUrl = url; + } + } + } + + if (koUrl != null) return koUrl; + if (enUrl != null) return enUrl; + return firstUrl; + } + + private String parseTranscriptXml(String xml) { + StringBuilder sb = new StringBuilder(); + Matcher matcher = XML_TEXT_PATTERN.matcher(xml); + while (matcher.find()) { + String text = matcher.group(1) + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace(""", "\"") + .replace("'", "'") + .replace("\n", " ") + .trim(); + if (!text.isEmpty()) { + if (sb.length() > 0) sb.append(" "); + sb.append(text); + } + } + return sb.toString(); + } + + private String extractVideoId(String url) { + if (url == null || url.isBlank()) return null; + try { + java.net.URI uri = new java.net.URI(url); + String host = uri.getHost(); + if (host == null) return null; + if (host.equals("youtu.be")) { + String path = uri.getPath(); + return path != null && path.length() > 1 ? path.substring(1) : null; + } + if (host.contains("youtube.com")) { + String query = uri.getQuery(); + if (query == null) return null; + for (String param : query.split("&")) { + String[] kv = param.split("=", 2); + if (kv.length == 2 && kv[0].equals("v")) { + return URLDecoder.decode(kv[1], StandardCharsets.UTF_8); + } + } + } + } catch (Exception e) { + log.warn("Failed to parse YouTube URL: {}", url); + } + return null; + } +} diff --git a/sundol-frontend/src/app/knowledge/add/page.tsx b/sundol-frontend/src/app/knowledge/add/page.tsx index 812a996..3abbab1 100644 --- a/sundol-frontend/src/app/knowledge/add/page.tsx +++ b/sundol-frontend/src/app/knowledge/add/page.tsx @@ -36,6 +36,7 @@ export default function KnowledgeAddPage() { const [modelId, setModelId] = useState(""); const [models, setModels] = useState([]); const [submitting, setSubmitting] = useState(false); + const [fetchingTranscript, setFetchingTranscript] = useState(false); const [error, setError] = useState(null); useEffect(() => { @@ -52,6 +53,30 @@ export default function KnowledgeAddPage() { const videoId = useMemo(() => (type === "YOUTUBE" ? extractYouTubeVideoId(url) : null), [type, url]); + const canFetchTranscript = type === "YOUTUBE" && videoId !== null && !fetchingTranscript; + + const handleFetchTranscript = async () => { + if (!canFetchTranscript) return; + setError(null); + setFetchingTranscript(true); + try { + const data = await request<{ transcript?: string; error?: string }>({ + method: "GET", + url: `/api/knowledge/youtube-transcript?url=${encodeURIComponent(url.trim())}`, + }); + if (data.error) { + setError(data.error); + } else if (data.transcript) { + setRawText(data.transcript); + } + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : "트랜스크립트를 가져올 수 없습니다"; + setError(msg); + } finally { + setFetchingTranscript(false); + } + }; + const canSubmit = !submitting && ((type === "TEXT" && rawText.trim().length > 0) || @@ -168,6 +193,17 @@ export default function KnowledgeAddPage() { )} + {/* Fetch Transcript Button (YOUTUBE) */} + {type === "YOUTUBE" && videoId && ( + + )} + {/* Text Input (TEXT / YOUTUBE) */} {type !== "WEB" && (