Add YouTube transcript auto-fetch button on Knowledge add page
- YouTubeTranscriptService: fetches captions from the YouTube watch page (ko > en > first available)
- GET /api/knowledge/youtube-transcript endpoint
- Frontend: "트랜스크립트 자동 가져오기" button appears when a valid YouTube URL is entered

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,6 +2,7 @@ package com.sundol.controller;
|
||||
|
||||
import com.sundol.dto.IngestRequest;
|
||||
import com.sundol.service.KnowledgeService;
|
||||
import com.sundol.service.YouTubeTranscriptService;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.security.core.annotation.AuthenticationPrincipal;
|
||||
@@ -16,9 +17,12 @@ import java.util.Map;
|
||||
public class KnowledgeController {
|
||||
|
||||
private final KnowledgeService knowledgeService;
|
||||
private final YouTubeTranscriptService youTubeTranscriptService;
|
||||
|
||||
public KnowledgeController(KnowledgeService knowledgeService) {
|
||||
public KnowledgeController(KnowledgeService knowledgeService,
|
||||
YouTubeTranscriptService youTubeTranscriptService) {
|
||||
this.knowledgeService = knowledgeService;
|
||||
this.youTubeTranscriptService = youTubeTranscriptService;
|
||||
}
|
||||
|
||||
@GetMapping
|
||||
@@ -40,6 +44,17 @@ public class KnowledgeController {
|
||||
.map(result -> ResponseEntity.status(HttpStatus.ACCEPTED).body(result));
|
||||
}
|
||||
|
||||
@GetMapping("/youtube-transcript")
|
||||
public Mono<ResponseEntity<Map<String, Object>>> fetchYouTubeTranscript(
|
||||
@AuthenticationPrincipal String userId,
|
||||
@RequestParam String url) {
|
||||
return Mono.fromCallable(() -> youTubeTranscriptService.fetchTranscript(url))
|
||||
.map(transcript -> ResponseEntity.ok(Map.<String, Object>of("transcript", transcript)))
|
||||
.onErrorResume(e -> Mono.just(
|
||||
ResponseEntity.status(HttpStatus.BAD_REQUEST)
|
||||
.body(Map.of("error", e.getMessage() != null ? e.getMessage() : "트랜스크립트를 가져올 수 없습니다"))));
|
||||
}
|
||||
|
||||
@GetMapping("/{id}")
|
||||
public Mono<ResponseEntity<Map<String, Object>>> getById(
|
||||
@AuthenticationPrincipal String userId,
|
||||
|
||||
@@ -0,0 +1,183 @@
|
||||
package com.sundol.service;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.web.reactive.function.client.WebClient;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URLDecoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.Duration;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@Service
|
||||
public class YouTubeTranscriptService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(YouTubeTranscriptService.class);
|
||||
|
||||
private static final Pattern CAPTION_TRACK_PATTERN =
|
||||
Pattern.compile("\"captionTracks\":\\s*\\[(.*?)]", Pattern.DOTALL);
|
||||
private static final Pattern BASE_URL_PATTERN =
|
||||
Pattern.compile("\"baseUrl\":\\s*\"(.*?)\"");
|
||||
private static final Pattern LANG_PATTERN =
|
||||
Pattern.compile("\"languageCode\":\\s*\"(.*?)\"");
|
||||
private static final Pattern XML_TEXT_PATTERN =
|
||||
Pattern.compile("<text[^>]*>(.*?)</text>", Pattern.DOTALL);
|
||||
|
||||
private final WebClient webClient;
|
||||
|
||||
public YouTubeTranscriptService() {
|
||||
this.webClient = WebClient.builder()
|
||||
.codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024))
|
||||
.build();
|
||||
}
|
||||
|
||||
public String fetchTranscript(String youtubeUrl) throws IOException {
|
||||
String videoId = extractVideoId(youtubeUrl);
|
||||
if (videoId == null) {
|
||||
throw new IOException("유효하지 않은 YouTube URL입니다: " + youtubeUrl);
|
||||
}
|
||||
|
||||
String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
|
||||
log.info("Fetching YouTube transcript for: {}", watchUrl);
|
||||
|
||||
// YouTube 페이지 HTML 가져오기
|
||||
String html;
|
||||
try {
|
||||
Document doc = Jsoup.connect(watchUrl)
|
||||
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
|
||||
.header("Accept-Language", "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7")
|
||||
.timeout(15_000)
|
||||
.maxBodySize(0)
|
||||
.get();
|
||||
html = doc.html();
|
||||
} catch (Exception e) {
|
||||
throw new IOException("YouTube 페이지를 가져올 수 없습니다: " + e.getMessage(), e);
|
||||
}
|
||||
|
||||
// captionTracks JSON 추출
|
||||
Matcher captionMatcher = CAPTION_TRACK_PATTERN.matcher(html);
|
||||
if (!captionMatcher.find()) {
|
||||
throw new IOException("이 영상에는 자막(caption)이 없습니다.");
|
||||
}
|
||||
|
||||
String captionTracksJson = captionMatcher.group(1);
|
||||
|
||||
// 자막 트랙 URL 선택 (한국어 > 영어 > 첫 번째)
|
||||
String captionUrl = selectCaptionUrl(captionTracksJson);
|
||||
if (captionUrl == null) {
|
||||
throw new IOException("자막 트랙 URL을 추출할 수 없습니다.");
|
||||
}
|
||||
|
||||
// Unicode escape 처리
|
||||
captionUrl = captionUrl.replace("\\u0026", "&");
|
||||
|
||||
log.info("Fetching caption XML from: {}", captionUrl);
|
||||
|
||||
// 자막 XML 가져오기
|
||||
String xml;
|
||||
try {
|
||||
xml = webClient.get()
|
||||
.uri(captionUrl)
|
||||
.retrieve()
|
||||
.bodyToMono(String.class)
|
||||
.timeout(Duration.ofSeconds(15))
|
||||
.block();
|
||||
} catch (Exception e) {
|
||||
throw new IOException("자막 XML을 가져올 수 없습니다: " + e.getMessage(), e);
|
||||
}
|
||||
|
||||
if (xml == null || xml.isBlank()) {
|
||||
throw new IOException("자막 XML이 비어있습니다.");
|
||||
}
|
||||
|
||||
// XML에서 텍스트 추출
|
||||
String transcript = parseTranscriptXml(xml);
|
||||
if (transcript.isBlank()) {
|
||||
throw new IOException("자막 텍스트를 파싱할 수 없습니다.");
|
||||
}
|
||||
|
||||
log.info("Successfully fetched transcript: {} chars", transcript.length());
|
||||
return transcript;
|
||||
}
|
||||
|
||||
private String selectCaptionUrl(String captionTracksJson) {
|
||||
// 각 트랙에서 baseUrl과 languageCode 추출
|
||||
// 여러 트랙이 있을 수 있으므로 개별 트랙을 분리
|
||||
String[] tracks = captionTracksJson.split("\\},\\s*\\{");
|
||||
|
||||
String koUrl = null;
|
||||
String enUrl = null;
|
||||
String firstUrl = null;
|
||||
|
||||
for (String track : tracks) {
|
||||
Matcher urlMatcher = BASE_URL_PATTERN.matcher(track);
|
||||
Matcher langMatcher = LANG_PATTERN.matcher(track);
|
||||
|
||||
if (urlMatcher.find()) {
|
||||
String url = urlMatcher.group(1);
|
||||
if (firstUrl == null) firstUrl = url;
|
||||
|
||||
if (langMatcher.find()) {
|
||||
String lang = langMatcher.group(1);
|
||||
if (lang.startsWith("ko") && koUrl == null) koUrl = url;
|
||||
if (lang.startsWith("en") && enUrl == null) enUrl = url;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (koUrl != null) return koUrl;
|
||||
if (enUrl != null) return enUrl;
|
||||
return firstUrl;
|
||||
}
|
||||
|
||||
private String parseTranscriptXml(String xml) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
Matcher matcher = XML_TEXT_PATTERN.matcher(xml);
|
||||
while (matcher.find()) {
|
||||
String text = matcher.group(1)
|
||||
.replace("&", "&")
|
||||
.replace("<", "<")
|
||||
.replace(">", ">")
|
||||
.replace(""", "\"")
|
||||
.replace("'", "'")
|
||||
.replace("\n", " ")
|
||||
.trim();
|
||||
if (!text.isEmpty()) {
|
||||
if (sb.length() > 0) sb.append(" ");
|
||||
sb.append(text);
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private String extractVideoId(String url) {
|
||||
if (url == null || url.isBlank()) return null;
|
||||
try {
|
||||
java.net.URI uri = new java.net.URI(url);
|
||||
String host = uri.getHost();
|
||||
if (host == null) return null;
|
||||
if (host.equals("youtu.be")) {
|
||||
String path = uri.getPath();
|
||||
return path != null && path.length() > 1 ? path.substring(1) : null;
|
||||
}
|
||||
if (host.contains("youtube.com")) {
|
||||
String query = uri.getQuery();
|
||||
if (query == null) return null;
|
||||
for (String param : query.split("&")) {
|
||||
String[] kv = param.split("=", 2);
|
||||
if (kv.length == 2 && kv[0].equals("v")) {
|
||||
return URLDecoder.decode(kv[1], StandardCharsets.UTF_8);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to parse YouTube URL: {}", url);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -36,6 +36,7 @@ export default function KnowledgeAddPage() {
|
||||
const [modelId, setModelId] = useState("");
|
||||
const [models, setModels] = useState<ModelInfo[]>([]);
|
||||
const [submitting, setSubmitting] = useState(false);
|
||||
const [fetchingTranscript, setFetchingTranscript] = useState(false);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
useEffect(() => {
|
||||
@@ -52,6 +53,30 @@ export default function KnowledgeAddPage() {
|
||||
|
||||
const videoId = useMemo(() => (type === "YOUTUBE" ? extractYouTubeVideoId(url) : null), [type, url]);
|
||||
|
||||
const canFetchTranscript = type === "YOUTUBE" && videoId !== null && !fetchingTranscript;
|
||||
|
||||
// Requests the transcript for the currently entered YouTube URL and, on success, places it
// in the raw-text field. Any failure is shown through the shared error state. The
// fetchingTranscript flag is set for the duration of the request and always cleared afterwards.
const handleFetchTranscript = async () => {
  if (!canFetchTranscript) return;

  setError(null);
  setFetchingTranscript(true);
  try {
    const data = await request<{ transcript?: string; error?: string }>({
      method: "GET",
      url: `/api/knowledge/youtube-transcript?url=${encodeURIComponent(url.trim())}`,
    });

    if (data.error) {
      setError(data.error);
    } else if (data.transcript) {
      setRawText(data.transcript);
    } else {
      // Neither an error nor a non-empty transcript came back: report a failure instead of
      // leaving the user with no feedback at all.
      setError("트랜스크립트를 가져올 수 없습니다");
    }
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : "트랜스크립트를 가져올 수 없습니다";
    setError(msg);
  } finally {
    setFetchingTranscript(false);
  }
};
|
||||
|
||||
const canSubmit =
|
||||
!submitting &&
|
||||
((type === "TEXT" && rawText.trim().length > 0) ||
|
||||
@@ -168,6 +193,17 @@ export default function KnowledgeAddPage() {
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Fetch Transcript Button (YOUTUBE) */}
|
||||
{type === "YOUTUBE" && videoId && (
|
||||
<button
|
||||
onClick={handleFetchTranscript}
|
||||
disabled={!canFetchTranscript}
|
||||
className="w-full px-4 py-2 bg-[var(--color-bg-card)] border border-[var(--color-border)] hover:border-[var(--color-primary)] disabled:opacity-40 disabled:cursor-not-allowed rounded-lg transition-colors text-sm font-medium"
|
||||
>
|
||||
{fetchingTranscript ? "트랜스크립트 가져오는 중..." : "트랜스크립트 자동 가져오기"}
|
||||
</button>
|
||||
)}
|
||||
|
||||
{/* Text Input (TEXT / YOUTUBE) */}
|
||||
{type !== "WEB" && (
|
||||
<div>
|
||||
|
||||
Reference in New Issue
Block a user