Add YouTube transcript auto-fetch button on Knowledge add page

- YouTubeTranscriptService: fetches captions from YouTube page (ko > en > first available)
- GET /api/knowledge/youtube-transcript endpoint
- Frontend: "트랜스크립트 자동 가져오기" button appears when valid YouTube URL entered

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-01 04:20:13 +00:00
parent f0f7b62e3d
commit bb5a601433
3 changed files with 235 additions and 1 deletions

View File

@@ -2,6 +2,7 @@ package com.sundol.controller;
import com.sundol.dto.IngestRequest; import com.sundol.dto.IngestRequest;
import com.sundol.service.KnowledgeService; import com.sundol.service.KnowledgeService;
import com.sundol.service.YouTubeTranscriptService;
import org.springframework.http.HttpStatus; import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.security.core.annotation.AuthenticationPrincipal; import org.springframework.security.core.annotation.AuthenticationPrincipal;
@@ -16,9 +17,12 @@ import java.util.Map;
public class KnowledgeController { public class KnowledgeController {
private final KnowledgeService knowledgeService; private final KnowledgeService knowledgeService;
private final YouTubeTranscriptService youTubeTranscriptService;
public KnowledgeController(KnowledgeService knowledgeService) { public KnowledgeController(KnowledgeService knowledgeService,
YouTubeTranscriptService youTubeTranscriptService) {
this.knowledgeService = knowledgeService; this.knowledgeService = knowledgeService;
this.youTubeTranscriptService = youTubeTranscriptService;
} }
@GetMapping @GetMapping
@@ -40,6 +44,17 @@ public class KnowledgeController {
.map(result -> ResponseEntity.status(HttpStatus.ACCEPTED).body(result)); .map(result -> ResponseEntity.status(HttpStatus.ACCEPTED).body(result));
} }
/**
 * Returns the transcript of the YouTube video referenced by {@code url}.
 *
 * <p>{@code YouTubeTranscriptService.fetchTranscript} performs blocking HTTP calls, so the
 * callable is moved onto Reactor's bounded-elastic scheduler instead of running on the
 * thread that subscribes to the returned {@code Mono}.
 *
 * @param userId authenticated principal; not used by this handler
 * @param url    YouTube URL supplied by the client
 * @return 200 with {@code {"transcript": ...}} on success, or 400 with {@code {"error": ...}}
 *         carrying the failure message (a fixed fallback text when the exception has none)
 */
@GetMapping("/youtube-transcript")
public Mono<ResponseEntity<Map<String, Object>>> fetchYouTubeTranscript(
        @AuthenticationPrincipal String userId,
        @RequestParam String url) {
    return Mono.fromCallable(() -> youTubeTranscriptService.fetchTranscript(url))
            // Blocking work must not run on an event-loop thread.
            .subscribeOn(reactor.core.scheduler.Schedulers.boundedElastic())
            .map(transcript -> ResponseEntity.ok(Map.<String, Object>of("transcript", transcript)))
            .onErrorResume(e -> Mono.just(
                    ResponseEntity.status(HttpStatus.BAD_REQUEST)
                            .body(Map.of("error", e.getMessage() != null ? e.getMessage() : "트랜스크립트를 가져올 수 없습니다"))));
}
@GetMapping("/{id}") @GetMapping("/{id}")
public Mono<ResponseEntity<Map<String, Object>>> getById( public Mono<ResponseEntity<Map<String, Object>>> getById(
@AuthenticationPrincipal String userId, @AuthenticationPrincipal String userId,

View File

@@ -0,0 +1,183 @@
package com.sundol.service;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import org.springframework.web.reactive.function.client.WebClient;
import java.io.IOException;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@Service
public class YouTubeTranscriptService {
private static final Logger log = LoggerFactory.getLogger(YouTubeTranscriptService.class);
// Captures the body of the "captionTracks" array found in the watch page source.
// NOTE(review): the lazy (.*?) stops at the first ']' it meets, so a track entry that itself
// contains an array would cut the capture short - confirm against real page payloads.
private static final Pattern CAPTION_TRACK_PATTERN =
Pattern.compile("\"captionTracks\":\\s*\\[(.*?)]", Pattern.DOTALL);
// Captures the value of a track's "baseUrl" property (still JSON-escaped).
private static final Pattern BASE_URL_PATTERN =
Pattern.compile("\"baseUrl\":\\s*\"(.*?)\"");
// Captures the value of a track's "languageCode" property.
private static final Pattern LANG_PATTERN =
Pattern.compile("\"languageCode\":\\s*\"(.*?)\"");
// Captures the inner content of each <text ...>...</text> element of the caption XML.
private static final Pattern XML_TEXT_PATTERN =
Pattern.compile("<text[^>]*>(.*?)</text>", Pattern.DOTALL);
// HTTP client used to download the caption XML; configured in the constructor.
private final WebClient webClient;
public YouTubeTranscriptService() {
this.webClient = WebClient.builder()
.codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024))
.build();
}
/**
 * Downloads the caption text of a YouTube video.
 *
 * <p>Steps: resolve the video ID from {@code youtubeUrl}, fetch the watch page with Jsoup,
 * locate the {@code captionTracks} fragment in the page source, choose a track with
 * {@code selectCaptionUrl}, download that track and flatten it with
 * {@code parseTranscriptXml}.
 *
 * <p>This method blocks the calling thread (a Jsoup request followed by
 * {@code Mono.block()}), so reactive callers must invoke it off the event loop.
 *
 * @param youtubeUrl URL entered by the user
 * @return the transcript text; never blank
 * @throws IOException if the URL yields no video ID, the page or caption download fails,
 *                     no caption track is found, or the caption document has no text
 */
public String fetchTranscript(String youtubeUrl) throws IOException {
    String videoId = extractVideoId(youtubeUrl);
    if (videoId == null) {
        throw new IOException("유효하지 않은 YouTube URL입니다: " + youtubeUrl);
    }
    String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
    log.info("Fetching YouTube transcript for: {}", watchUrl);

    // Fetch the watch page HTML.
    String html;
    try {
        Document doc = Jsoup.connect(watchUrl)
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
                .header("Accept-Language", "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7")
                .timeout(15_000)
                .maxBodySize(0)
                .get();
        html = doc.html();
    } catch (Exception e) {
        throw new IOException("YouTube 페이지를 가져올 수 없습니다: " + e.getMessage(), e);
    }

    // Extract the captionTracks JSON fragment.
    Matcher captionMatcher = CAPTION_TRACK_PATTERN.matcher(html);
    if (!captionMatcher.find()) {
        throw new IOException("이 영상에는 자막(caption)이 없습니다.");
    }
    String captionTracksJson = captionMatcher.group(1);

    // Choose the caption track URL (Korean > English > first).
    String captionUrl = selectCaptionUrl(captionTracksJson);
    if (captionUrl == null) {
        throw new IOException("자막 트랙 URL을 추출할 수 없습니다.");
    }
    // The URL was lifted from a JSON string literal where '&' is written as a unicode escape.
    captionUrl = captionUrl.replace("\\u0026", "&");
    log.info("Fetching caption XML from: {}", captionUrl);

    // Download the caption XML.
    String xml;
    try {
        // Pass a ready-made URI: the String overload treats its argument as a URI template
        // and would re-encode the percent-escapes already present in the caption URL.
        xml = webClient.get()
                .uri(java.net.URI.create(captionUrl))
                .retrieve()
                .bodyToMono(String.class)
                .timeout(Duration.ofSeconds(15))
                .block();
    } catch (Exception e) {
        throw new IOException("자막 XML을 가져올 수 없습니다: " + e.getMessage(), e);
    }
    if (xml == null || xml.isBlank()) {
        throw new IOException("자막 XML이 비어있습니다.");
    }

    // Pull the plain text out of the XML.
    String transcript = parseTranscriptXml(xml);
    if (transcript.isBlank()) {
        throw new IOException("자막 텍스트를 파싱할 수 없습니다.");
    }
    log.info("Successfully fetched transcript: {} chars", transcript.length());
    return transcript;
}
/**
 * Picks one caption download URL out of the raw {@code captionTracks} array body.
 *
 * <p>Preference order: the first track whose language code starts with {@code "ko"}, then
 * the first whose code starts with {@code "en"}, then the first track that has a
 * {@code baseUrl} at all.
 *
 * @param captionTracksJson text captured between the brackets of the captionTracks array
 * @return the chosen URL exactly as it appears in the JSON (still escaped), or
 *         {@code null} when no fragment contains a {@code baseUrl}
 */
private String selectCaptionUrl(String captionTracksJson) {
    String korean = null;
    String english = null;
    String fallback = null;

    // Approximate per-track split: cut wherever one object closes and the next opens.
    for (String fragment : captionTracksJson.split("\\},\\s*\\{")) {
        Matcher baseUrl = BASE_URL_PATTERN.matcher(fragment);
        if (!baseUrl.find()) {
            continue;
        }
        String candidate = baseUrl.group(1);
        if (fallback == null) {
            fallback = candidate;
        }

        Matcher language = LANG_PATTERN.matcher(fragment);
        if (!language.find()) {
            continue;
        }
        String code = language.group(1);
        if (korean == null && code.startsWith("ko")) {
            korean = candidate;
        }
        if (english == null && code.startsWith("en")) {
            english = candidate;
        }
    }

    if (korean != null) {
        return korean;
    }
    return english != null ? english : fallback;
}
/**
 * Flattens a caption XML document into a single line of text.
 *
 * <p>Every {@code <text>} element's content is entity-decoded, has its newlines turned
 * into spaces and is trimmed; non-empty results are joined with single spaces. The
 * {@code &amp;} replacement deliberately runs first, so doubly escaped sequences such as
 * {@code &amp;#39;} are collapsed as well.
 *
 * @param xml caption document
 * @return the joined text, or an empty string when no element yields text
 */
private String parseTranscriptXml(String xml) {
    return XML_TEXT_PATTERN.matcher(xml)
            .results()
            .map(cue -> cue.group(1)
                    .replace("&amp;", "&")
                    .replace("&lt;", "<")
                    .replace("&gt;", ">")
                    .replace("&quot;", "\"")
                    .replace("&#39;", "'")
                    .replace("\n", " ")
                    .trim())
            .filter(line -> !line.isEmpty())
            .collect(java.util.stream.Collectors.joining(" "));
}
/**
 * Characters accepted in an extracted video ID. The ID is concatenated into the watch URL,
 * so anything outside this set (for example {@code &}, {@code /} or {@code #}) is rejected.
 */
private static final Pattern VIDEO_ID_PATTERN = Pattern.compile("[A-Za-z0-9_-]+");

/**
 * Extracts the video ID from a YouTube URL.
 *
 * <p>Recognised forms: {@code youtu.be/<id>}, and on {@code youtube.com} or any of its
 * subdomains either the {@code v} query parameter or an {@code /embed/<id>},
 * {@code /shorts/<id>} or {@code /live/<id>} path.
 *
 * @param url URL as entered by the user; may be {@code null}
 * @return the video ID, or {@code null} when the URL is blank, unparsable, points at
 *         another host, or carries no well-formed ID
 */
private String extractVideoId(String url) {
    if (url == null || url.isBlank()) return null;
    try {
        java.net.URI uri = new java.net.URI(url.trim());
        String host = uri.getHost();
        if (host == null) return null;
        host = host.toLowerCase(java.util.Locale.ROOT);

        String path = uri.getPath() == null ? "" : uri.getPath();
        String[] segments = path.split("/"); // "/a/b" -> ["", "a", "b"]

        String candidate = null;
        if (host.equals("youtu.be")) {
            if (segments.length > 1) candidate = segments[1];
        } else if (host.equals("youtube.com") || host.endsWith(".youtube.com")) {
            // Exact/suffix match so that look-alike hosts such as "notyoutube.com" are refused.
            // Split the raw query and decode once; the decoded form would already have
            // turned an encoded '&' into a separator.
            String rawQuery = uri.getRawQuery();
            if (rawQuery != null) {
                for (String param : rawQuery.split("&")) {
                    String[] kv = param.split("=", 2);
                    if (kv.length == 2 && kv[0].equals("v")) {
                        candidate = URLDecoder.decode(kv[1], StandardCharsets.UTF_8);
                        break;
                    }
                }
            }
            if (candidate == null && segments.length > 2
                    && (segments[1].equals("embed")
                        || segments[1].equals("shorts")
                        || segments[1].equals("live"))) {
                candidate = segments[2];
            }
        }

        if (candidate != null && VIDEO_ID_PATTERN.matcher(candidate).matches()) {
            return candidate;
        }
    } catch (java.net.URISyntaxException | IllegalArgumentException e) {
        // Malformed URL or percent-escape: treated as "no video ID" rather than an error.
        log.warn("Failed to parse YouTube URL: {}", url);
    }
    return null;
}
}

View File

@@ -36,6 +36,7 @@ export default function KnowledgeAddPage() {
const [modelId, setModelId] = useState(""); const [modelId, setModelId] = useState("");
const [models, setModels] = useState<ModelInfo[]>([]); const [models, setModels] = useState<ModelInfo[]>([]);
const [submitting, setSubmitting] = useState(false); const [submitting, setSubmitting] = useState(false);
const [fetchingTranscript, setFetchingTranscript] = useState(false);
const [error, setError] = useState<string | null>(null); const [error, setError] = useState<string | null>(null);
useEffect(() => { useEffect(() => {
@@ -52,6 +53,30 @@ export default function KnowledgeAddPage() {
const videoId = useMemo(() => (type === "YOUTUBE" ? extractYouTubeVideoId(url) : null), [type, url]); const videoId = useMemo(() => (type === "YOUTUBE" ? extractYouTubeVideoId(url) : null), [type, url]);
// The transcript can be fetched only for a YOUTUBE entry with a recognised video id,
// and only while no fetch is already running.
const canFetchTranscript = !fetchingTranscript && videoId !== null && type === "YOUTUBE";

/**
 * Requests the transcript for the current URL from the backend and, on success,
 * puts it into the raw-text field. A server-reported or thrown error is shown
 * through the shared error state instead.
 */
const handleFetchTranscript = async () => {
  if (!canFetchTranscript) {
    return;
  }

  setError(null);
  setFetchingTranscript(true);

  const endpoint = `/api/knowledge/youtube-transcript?url=${encodeURIComponent(url.trim())}`;
  try {
    const payload = await request<{ transcript?: string; error?: string }>({
      method: "GET",
      url: endpoint,
    });
    if (payload.error) {
      setError(payload.error);
      return;
    }
    if (payload.transcript) {
      setRawText(payload.transcript);
    }
  } catch (err: unknown) {
    setError(err instanceof Error ? err.message : "트랜스크립트를 가져올 수 없습니다");
  } finally {
    // Runs on every path, including the early return above.
    setFetchingTranscript(false);
  }
};
const canSubmit = const canSubmit =
!submitting && !submitting &&
((type === "TEXT" && rawText.trim().length > 0) || ((type === "TEXT" && rawText.trim().length > 0) ||
@@ -168,6 +193,17 @@ export default function KnowledgeAddPage() {
</div> </div>
)} )}
{/* Fetch Transcript Button (YOUTUBE): shown once a video id was parsed from the URL.
    type="button" is explicit because a <button> defaults to "submit" and would
    otherwise submit an enclosing form, if there is one. */}
{type === "YOUTUBE" && videoId && (
  <button
    type="button"
    onClick={handleFetchTranscript}
    disabled={!canFetchTranscript}
    className="w-full px-4 py-2 bg-[var(--color-bg-card)] border border-[var(--color-border)] hover:border-[var(--color-primary)] disabled:opacity-40 disabled:cursor-not-allowed rounded-lg transition-colors text-sm font-medium"
  >
    {fetchingTranscript ? "트랜스크립트 가져오는 중..." : "트랜스크립트 자동 가져오기"}
  </button>
)}
{/* Text Input (TEXT / YOUTUBE) */} {/* Text Input (TEXT / YOUTUBE) */}
{type !== "WEB" && ( {type !== "WEB" && (
<div> <div>