Add YouTube transcript auto-fetch button on Knowledge add page
- YouTubeTranscriptService: fetches captions from the YouTube watch page (ko > en > first available)
- GET /api/knowledge/youtube-transcript endpoint
- Frontend: "트랜스크립트 자동 가져오기" button appears when a valid YouTube URL is entered

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,6 +2,7 @@ package com.sundol.controller;
|
|||||||
|
|
||||||
import com.sundol.dto.IngestRequest;
|
import com.sundol.dto.IngestRequest;
|
||||||
import com.sundol.service.KnowledgeService;
|
import com.sundol.service.KnowledgeService;
|
||||||
|
import com.sundol.service.YouTubeTranscriptService;
|
||||||
import org.springframework.http.HttpStatus;
|
import org.springframework.http.HttpStatus;
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
import org.springframework.security.core.annotation.AuthenticationPrincipal;
|
import org.springframework.security.core.annotation.AuthenticationPrincipal;
|
||||||
@@ -16,9 +17,12 @@ import java.util.Map;
|
|||||||
public class KnowledgeController {
|
public class KnowledgeController {
|
||||||
|
|
||||||
private final KnowledgeService knowledgeService;
|
private final KnowledgeService knowledgeService;
|
||||||
|
private final YouTubeTranscriptService youTubeTranscriptService;
|
||||||
|
|
||||||
/**
 * Creates the controller with the two services it delegates to.
 *
 * @param knowledgeService         service stored for the knowledge endpoints
 * @param youTubeTranscriptService service stored for the YouTube transcript endpoint
 */
public KnowledgeController(
        KnowledgeService knowledgeService,
        YouTubeTranscriptService youTubeTranscriptService) {
    this.youTubeTranscriptService = youTubeTranscriptService;
    this.knowledgeService = knowledgeService;
}
|
||||||
|
|
||||||
@GetMapping
|
@GetMapping
|
||||||
@@ -40,6 +44,17 @@ public class KnowledgeController {
|
|||||||
.map(result -> ResponseEntity.status(HttpStatus.ACCEPTED).body(result));
|
.map(result -> ResponseEntity.status(HttpStatus.ACCEPTED).body(result));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@GetMapping("/youtube-transcript")
|
||||||
|
public Mono<ResponseEntity<Map<String, Object>>> fetchYouTubeTranscript(
|
||||||
|
@AuthenticationPrincipal String userId,
|
||||||
|
@RequestParam String url) {
|
||||||
|
return Mono.fromCallable(() -> youTubeTranscriptService.fetchTranscript(url))
|
||||||
|
.map(transcript -> ResponseEntity.ok(Map.<String, Object>of("transcript", transcript)))
|
||||||
|
.onErrorResume(e -> Mono.just(
|
||||||
|
ResponseEntity.status(HttpStatus.BAD_REQUEST)
|
||||||
|
.body(Map.of("error", e.getMessage() != null ? e.getMessage() : "트랜스크립트를 가져올 수 없습니다"))));
|
||||||
|
}
|
||||||
|
|
||||||
@GetMapping("/{id}")
|
@GetMapping("/{id}")
|
||||||
public Mono<ResponseEntity<Map<String, Object>>> getById(
|
public Mono<ResponseEntity<Map<String, Object>>> getById(
|
||||||
@AuthenticationPrincipal String userId,
|
@AuthenticationPrincipal String userId,
|
||||||
|
|||||||
@@ -0,0 +1,183 @@
|
|||||||
|
package com.sundol.service;
|
||||||
|
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.springframework.web.reactive.function.client.WebClient;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.URLDecoder;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.time.Duration;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class YouTubeTranscriptService {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(YouTubeTranscriptService.class);
|
||||||
|
|
||||||
|
private static final Pattern CAPTION_TRACK_PATTERN =
|
||||||
|
Pattern.compile("\"captionTracks\":\\s*\\[(.*?)]", Pattern.DOTALL);
|
||||||
|
private static final Pattern BASE_URL_PATTERN =
|
||||||
|
Pattern.compile("\"baseUrl\":\\s*\"(.*?)\"");
|
||||||
|
private static final Pattern LANG_PATTERN =
|
||||||
|
Pattern.compile("\"languageCode\":\\s*\"(.*?)\"");
|
||||||
|
private static final Pattern XML_TEXT_PATTERN =
|
||||||
|
Pattern.compile("<text[^>]*>(.*?)</text>", Pattern.DOTALL);
|
||||||
|
|
||||||
|
private final WebClient webClient;
|
||||||
|
|
||||||
|
public YouTubeTranscriptService() {
|
||||||
|
this.webClient = WebClient.builder()
|
||||||
|
.codecs(configurer -> configurer.defaultCodecs().maxInMemorySize(5 * 1024 * 1024))
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String fetchTranscript(String youtubeUrl) throws IOException {
|
||||||
|
String videoId = extractVideoId(youtubeUrl);
|
||||||
|
if (videoId == null) {
|
||||||
|
throw new IOException("유효하지 않은 YouTube URL입니다: " + youtubeUrl);
|
||||||
|
}
|
||||||
|
|
||||||
|
String watchUrl = "https://www.youtube.com/watch?v=" + videoId;
|
||||||
|
log.info("Fetching YouTube transcript for: {}", watchUrl);
|
||||||
|
|
||||||
|
// YouTube 페이지 HTML 가져오기
|
||||||
|
String html;
|
||||||
|
try {
|
||||||
|
Document doc = Jsoup.connect(watchUrl)
|
||||||
|
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
|
||||||
|
.header("Accept-Language", "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7")
|
||||||
|
.timeout(15_000)
|
||||||
|
.maxBodySize(0)
|
||||||
|
.get();
|
||||||
|
html = doc.html();
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new IOException("YouTube 페이지를 가져올 수 없습니다: " + e.getMessage(), e);
|
||||||
|
}
|
||||||
|
|
||||||
|
// captionTracks JSON 추출
|
||||||
|
Matcher captionMatcher = CAPTION_TRACK_PATTERN.matcher(html);
|
||||||
|
if (!captionMatcher.find()) {
|
||||||
|
throw new IOException("이 영상에는 자막(caption)이 없습니다.");
|
||||||
|
}
|
||||||
|
|
||||||
|
String captionTracksJson = captionMatcher.group(1);
|
||||||
|
|
||||||
|
// 자막 트랙 URL 선택 (한국어 > 영어 > 첫 번째)
|
||||||
|
String captionUrl = selectCaptionUrl(captionTracksJson);
|
||||||
|
if (captionUrl == null) {
|
||||||
|
throw new IOException("자막 트랙 URL을 추출할 수 없습니다.");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unicode escape 처리
|
||||||
|
captionUrl = captionUrl.replace("\\u0026", "&");
|
||||||
|
|
||||||
|
log.info("Fetching caption XML from: {}", captionUrl);
|
||||||
|
|
||||||
|
// 자막 XML 가져오기
|
||||||
|
String xml;
|
||||||
|
try {
|
||||||
|
xml = webClient.get()
|
||||||
|
.uri(captionUrl)
|
||||||
|
.retrieve()
|
||||||
|
.bodyToMono(String.class)
|
||||||
|
.timeout(Duration.ofSeconds(15))
|
||||||
|
.block();
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new IOException("자막 XML을 가져올 수 없습니다: " + e.getMessage(), e);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (xml == null || xml.isBlank()) {
|
||||||
|
throw new IOException("자막 XML이 비어있습니다.");
|
||||||
|
}
|
||||||
|
|
||||||
|
// XML에서 텍스트 추출
|
||||||
|
String transcript = parseTranscriptXml(xml);
|
||||||
|
if (transcript.isBlank()) {
|
||||||
|
throw new IOException("자막 텍스트를 파싱할 수 없습니다.");
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info("Successfully fetched transcript: {} chars", transcript.length());
|
||||||
|
return transcript;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String selectCaptionUrl(String captionTracksJson) {
|
||||||
|
// 각 트랙에서 baseUrl과 languageCode 추출
|
||||||
|
// 여러 트랙이 있을 수 있으므로 개별 트랙을 분리
|
||||||
|
String[] tracks = captionTracksJson.split("\\},\\s*\\{");
|
||||||
|
|
||||||
|
String koUrl = null;
|
||||||
|
String enUrl = null;
|
||||||
|
String firstUrl = null;
|
||||||
|
|
||||||
|
for (String track : tracks) {
|
||||||
|
Matcher urlMatcher = BASE_URL_PATTERN.matcher(track);
|
||||||
|
Matcher langMatcher = LANG_PATTERN.matcher(track);
|
||||||
|
|
||||||
|
if (urlMatcher.find()) {
|
||||||
|
String url = urlMatcher.group(1);
|
||||||
|
if (firstUrl == null) firstUrl = url;
|
||||||
|
|
||||||
|
if (langMatcher.find()) {
|
||||||
|
String lang = langMatcher.group(1);
|
||||||
|
if (lang.startsWith("ko") && koUrl == null) koUrl = url;
|
||||||
|
if (lang.startsWith("en") && enUrl == null) enUrl = url;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (koUrl != null) return koUrl;
|
||||||
|
if (enUrl != null) return enUrl;
|
||||||
|
return firstUrl;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String parseTranscriptXml(String xml) {
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
Matcher matcher = XML_TEXT_PATTERN.matcher(xml);
|
||||||
|
while (matcher.find()) {
|
||||||
|
String text = matcher.group(1)
|
||||||
|
.replace("&", "&")
|
||||||
|
.replace("<", "<")
|
||||||
|
.replace(">", ">")
|
||||||
|
.replace(""", "\"")
|
||||||
|
.replace("'", "'")
|
||||||
|
.replace("\n", " ")
|
||||||
|
.trim();
|
||||||
|
if (!text.isEmpty()) {
|
||||||
|
if (sb.length() > 0) sb.append(" ");
|
||||||
|
sb.append(text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private String extractVideoId(String url) {
|
||||||
|
if (url == null || url.isBlank()) return null;
|
||||||
|
try {
|
||||||
|
java.net.URI uri = new java.net.URI(url);
|
||||||
|
String host = uri.getHost();
|
||||||
|
if (host == null) return null;
|
||||||
|
if (host.equals("youtu.be")) {
|
||||||
|
String path = uri.getPath();
|
||||||
|
return path != null && path.length() > 1 ? path.substring(1) : null;
|
||||||
|
}
|
||||||
|
if (host.contains("youtube.com")) {
|
||||||
|
String query = uri.getQuery();
|
||||||
|
if (query == null) return null;
|
||||||
|
for (String param : query.split("&")) {
|
||||||
|
String[] kv = param.split("=", 2);
|
||||||
|
if (kv.length == 2 && kv[0].equals("v")) {
|
||||||
|
return URLDecoder.decode(kv[1], StandardCharsets.UTF_8);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.warn("Failed to parse YouTube URL: {}", url);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -36,6 +36,7 @@ export default function KnowledgeAddPage() {
|
|||||||
const [modelId, setModelId] = useState("");
|
const [modelId, setModelId] = useState("");
|
||||||
const [models, setModels] = useState<ModelInfo[]>([]);
|
const [models, setModels] = useState<ModelInfo[]>([]);
|
||||||
const [submitting, setSubmitting] = useState(false);
|
const [submitting, setSubmitting] = useState(false);
|
||||||
|
const [fetchingTranscript, setFetchingTranscript] = useState(false);
|
||||||
const [error, setError] = useState<string | null>(null);
|
const [error, setError] = useState<string | null>(null);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
@@ -52,6 +53,30 @@ export default function KnowledgeAddPage() {
|
|||||||
|
|
||||||
const videoId = useMemo(() => (type === "YOUTUBE" ? extractYouTubeVideoId(url) : null), [type, url]);
|
const videoId = useMemo(() => (type === "YOUTUBE" ? extractYouTubeVideoId(url) : null), [type, url]);
|
||||||
|
|
||||||
|
const canFetchTranscript = type === "YOUTUBE" && videoId !== null && !fetchingTranscript;
|
||||||
|
|
||||||
|
// Requests the transcript for the URL currently in the form and, on success, stores it as the
// raw text. A server-reported error or a thrown error is surfaced through the page's error state.
const handleFetchTranscript = async () => {
  if (!canFetchTranscript) return;

  setError(null);
  setFetchingTranscript(true);

  try {
    const endpoint = `/api/knowledge/youtube-transcript?url=${encodeURIComponent(url.trim())}`;
    const data = await request<{ transcript?: string; error?: string }>({
      method: "GET",
      url: endpoint,
    });

    // An error reported in the payload takes precedence over any transcript.
    if (data.error) {
      setError(data.error);
      return;
    }
    if (data.transcript) {
      setRawText(data.transcript);
    }
  } catch (err: unknown) {
    setError(err instanceof Error ? err.message : "트랜스크립트를 가져올 수 없습니다");
  } finally {
    // Runs on every path, including the early return above.
    setFetchingTranscript(false);
  }
};
|
||||||
|
|
||||||
const canSubmit =
|
const canSubmit =
|
||||||
!submitting &&
|
!submitting &&
|
||||||
((type === "TEXT" && rawText.trim().length > 0) ||
|
((type === "TEXT" && rawText.trim().length > 0) ||
|
||||||
@@ -168,6 +193,17 @@ export default function KnowledgeAddPage() {
|
|||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
{/* Fetch Transcript Button (YOUTUBE) */}
|
||||||
|
{type === "YOUTUBE" && videoId && (
|
||||||
|
<button
|
||||||
|
onClick={handleFetchTranscript}
|
||||||
|
disabled={!canFetchTranscript}
|
||||||
|
className="w-full px-4 py-2 bg-[var(--color-bg-card)] border border-[var(--color-border)] hover:border-[var(--color-primary)] disabled:opacity-40 disabled:cursor-not-allowed rounded-lg transition-colors text-sm font-medium"
|
||||||
|
>
|
||||||
|
{fetchingTranscript ? "트랜스크립트 가져오는 중..." : "트랜스크립트 자동 가져오기"}
|
||||||
|
</button>
|
||||||
|
)}
|
||||||
|
|
||||||
{/* Text Input (TEXT / YOUTUBE) */}
|
{/* Text Input (TEXT / YOUTUBE) */}
|
||||||
{type !== "WEB" && (
|
{type !== "WEB" && (
|
||||||
<div>
|
<div>
|
||||||
|
|||||||
Reference in New Issue
Block a user