Files
tasteby/backend-java/src/main/java/com/tasteby/service/YouTubeService.java
joungmin 0f985d52a9 벌크 자막/추출 개선, 검색 필터 무시, geocoding 필드 수정, 네이버맵 링크
- 벌크 자막: 브라우저 우선 + API fallback, 광고 즉시 skip, 대기 시간 단축
- 벌크 자막/추출: 선택한 영상만 처리 가능 (체크박스 선택 후 실행)
- 자막 실패 시 no_transcript 상태 마킹하여 재시도 방지
- 검색 시 필터 조건 무시 (채널/장르/가격/지역/영역 초기화)
- 리셋 버튼 클릭 시 검색어 입력란 초기화
- RestaurantMapper updateFields에 google_place_id, rating 등 geocoding 필드 추가
- SearchMapper에 tabling_url, catchtable_url, phone, website 필드 추가
- 식당 상세에 네이버 지도 링크 추가
- YouTubeService.getTranscriptApi public 전환

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 09:00:40 +09:00

596 lines
26 KiB
Java

package com.tasteby.service;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.github.thoroldvix.api.Transcript;
import io.github.thoroldvix.api.TranscriptContent;
import io.github.thoroldvix.api.TranscriptList;
import io.github.thoroldvix.api.TranscriptApiFactory;
import io.github.thoroldvix.api.YoutubeTranscriptApi;
import com.microsoft.playwright.*;
import com.microsoft.playwright.options.Cookie;
import com.microsoft.playwright.options.WaitUntilState;
import com.tasteby.domain.Channel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.web.reactive.function.client.WebClient;
import java.nio.file.Path;
import java.time.Duration;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Service
public class YouTubeService {
// Class-wide SLF4J logger.
private static final Logger log = LoggerFactory.getLogger(YouTubeService.class);
// Matches time-only ISO-8601 style durations such as "PT1H2M3S"; the hour, minute and
// second parts are each optional and are captured as groups 1, 2 and 3.
private static final Pattern DURATION_PATTERN = Pattern.compile("PT(?:(\\d+)H)?(?:(\\d+)M)?(?:(\\d+)S)?");
// HTTP client used for all YouTube Data API calls (built in the constructor).
private final WebClient webClient;
// JSON mapper used to parse the API response bodies.
private final ObjectMapper mapper;
// Used to look up channel records.
private final ChannelService channelService;
// Used to read already-stored video information and to save new videos.
private final VideoService videoService;
// YouTube Data API key; sent as the "key" query parameter on every request.
private final String apiKey;
public YouTubeService(ObjectMapper mapper,
ChannelService channelService,
VideoService videoService,
@Value("${app.google.youtube-api-key}") String apiKey) {
this.webClient = WebClient.builder()
.baseUrl("https://www.googleapis.com/youtube/v3")
.build();
this.mapper = mapper;
this.channelService = channelService;
this.videoService = videoService;
this.apiKey = apiKey;
}
/**
 * Fetch videos from a YouTube channel using the uploads playlist (UC→UU).
 * This returns ALL videos unlike the Search API which caps results.
 * Falls back to Search API if the playlist approach throws.
 *
 * <p>Paging stops as soon as an item whose {@code publishedAt} is not later than
 * {@code publishedAfter} is seen, so already-scanned pages are not requested.
 *
 * @param channelId      channel ID; its first two characters are replaced by "UU" to form the
 *                       uploads playlist ID (a {@code null} or shorter-than-2 value throws
 *                       before any request is made)
 * @param publishedAfter timestamp string compared lexicographically against each item's
 *                       {@code publishedAt}; {@code null} disables the cut-off
 * @param excludeShorts  when true each page is passed through {@code filterShorts}
 * @return one map per video with keys {@code video_id}, {@code title},
 *         {@code published_at} and {@code url}
 */
public List<Map<String, Object>> fetchChannelVideos(String channelId, String publishedAfter, boolean excludeShorts) {
    // Convert channel ID UC... → uploads playlist UU...
    String uploadsPlaylistId = "UU" + channelId.substring(2);
    List<Map<String, Object>> allVideos = new ArrayList<>();
    String nextPage = null;
    try {
        do {
            String pageToken = nextPage;
            String response = webClient.get()
                    .uri(uriBuilder -> {
                        var b = uriBuilder.path("/playlistItems")
                                .queryParam("key", apiKey)
                                .queryParam("playlistId", uploadsPlaylistId)
                                .queryParam("part", "snippet")
                                .queryParam("maxResults", 50);
                        if (pageToken != null) b.queryParam("pageToken", pageToken);
                        return b.build();
                    })
                    .retrieve()
                    .bodyToMono(String.class)
                    .block(Duration.ofSeconds(30));
            JsonNode data = mapper.readTree(response);
            List<Map<String, Object>> pageVideos = new ArrayList<>();
            // Set once we hit a video at or before the cut-off; must survive until the
            // next-page decision below, otherwise paging would continue needlessly.
            boolean reachedCutoff = false;
            for (JsonNode item : data.path("items")) {
                JsonNode snippet = item.path("snippet");
                String vid = snippet.path("resourceId").path("videoId").asText();
                String publishedAt = snippet.path("publishedAt").asText();
                // publishedAfter filter: only keep videos newer than the last scan
                if (publishedAfter != null && publishedAt.compareTo(publishedAfter) <= 0) {
                    // The uploads playlist is newest-first, so stop at the first older item
                    reachedCutoff = true;
                    break;
                }
                pageVideos.add(Map.of(
                        "video_id", vid,
                        "title", snippet.path("title").asText(),
                        "published_at", publishedAt,
                        "url", "https://www.youtube.com/watch?v=" + vid
                ));
            }
            if (excludeShorts && !pageVideos.isEmpty()) {
                pageVideos = filterShorts(pageVideos);
            }
            allVideos.addAll(pageVideos);
            // Continue only while there is another page AND the cut-off has not been reached.
            nextPage = (!reachedCutoff && data.has("nextPageToken"))
                    ? data.path("nextPageToken").asText()
                    : null;
        } while (nextPage != null);
    } catch (Exception e) {
        log.warn("PlaylistItems API failed for {}, falling back to Search API", channelId, e);
        return fetchChannelVideosViaSearch(channelId, publishedAfter, excludeShorts);
    }
    return allVideos;
}
/**
 * Fallback path: pages through the Search API for the channel, newest first.
 * The Search API may not return every video of the channel.
 *
 * @param channelId      channel to search in
 * @param publishedAfter optional lower bound passed to the API; {@code null} to omit it
 * @param excludeShorts  when true each page is passed through {@code filterShorts}
 * @return the videos collected until the last page or the first unparsable response
 */
private List<Map<String, Object>> fetchChannelVideosViaSearch(String channelId, String publishedAfter, boolean excludeShorts) {
    List<Map<String, Object>> collected = new ArrayList<>();
    String cursor = null;
    while (true) {
        final String token = cursor;
        String body = webClient.get()
                .uri(uriBuilder -> {
                    var query = uriBuilder.path("/search")
                            .queryParam("key", apiKey)
                            .queryParam("channelId", channelId)
                            .queryParam("part", "snippet")
                            .queryParam("order", "date")
                            .queryParam("maxResults", 50)
                            .queryParam("type", "video");
                    if (publishedAfter != null) {
                        query.queryParam("publishedAfter", publishedAfter);
                    }
                    if (token != null) {
                        query.queryParam("pageToken", token);
                    }
                    return query.build();
                })
                .retrieve()
                .bodyToMono(String.class)
                .block(Duration.ofSeconds(30));
        try {
            JsonNode root = mapper.readTree(body);
            List<Map<String, Object>> batch = new ArrayList<>();
            for (JsonNode entry : root.path("items")) {
                String videoId = entry.path("id").path("videoId").asText();
                JsonNode meta = entry.path("snippet");
                batch.add(Map.of(
                        "video_id", videoId,
                        "title", meta.path("title").asText(),
                        "published_at", meta.path("publishedAt").asText(),
                        "url", "https://www.youtube.com/watch?v=" + videoId
                ));
            }
            if (excludeShorts && !batch.isEmpty()) {
                batch = filterShorts(batch);
            }
            collected.addAll(batch);
            if (root.has("nextPageToken")) {
                cursor = root.path("nextPageToken").asText();
            } else {
                cursor = null;
            }
        } catch (Exception e) {
            // Parsing failed: keep what was gathered so far and stop paging.
            log.error("Failed to parse YouTube Search API response", e);
            break;
        }
        if (cursor == null) {
            break;
        }
    }
    return collected;
}
/**
 * Drops videos whose duration is 60 seconds or less (YouTube Shorts).
 * Durations are looked up through the /videos endpoint in batches of at most 50 IDs.
 * A video whose duration could not be obtained is kept.
 *
 * @param videos video maps, each carrying a String under {@code video_id}
 * @return the videos longer than 60 seconds, in their original order
 */
private List<Map<String, Object>> filterShorts(List<Map<String, Object>> videos) {
    final int batchSize = 50;
    List<String> videoIds = new ArrayList<>();
    for (Map<String, Object> video : videos) {
        videoIds.add((String) video.get("video_id"));
    }

    Map<String, Integer> secondsById = new HashMap<>();
    for (int offset = 0; offset < videoIds.size(); offset += batchSize) {
        int end = Math.min(offset + batchSize, videoIds.size());
        String joinedIds = String.join(",", videoIds.subList(offset, end));
        try {
            String body = webClient.get()
                    .uri(uriBuilder -> uriBuilder.path("/videos")
                            .queryParam("key", apiKey)
                            .queryParam("id", joinedIds)
                            .queryParam("part", "contentDetails")
                            .build())
                    .retrieve()
                    .bodyToMono(String.class)
                    .block(Duration.ofSeconds(30));
            JsonNode root = mapper.readTree(body);
            for (JsonNode entry : root.path("items")) {
                String isoDuration = entry.path("contentDetails").path("duration").asText();
                secondsById.put(entry.path("id").asText(), parseDuration(isoDuration));
            }
        } catch (Exception e) {
            // A failed batch leaves its videos without a duration; they are kept below.
            log.warn("Failed to fetch video durations for batch starting at {}", offset, e);
        }
    }

    // Unknown durations default to 61 so that they pass the "> 60" test.
    return videos.stream()
            .filter(video -> {
                int seconds = secondsById.getOrDefault(video.get("video_id"), 61);
                return seconds > 60;
            })
            .toList();
}
/**
 * Converts an ISO-8601 duration string into whole seconds.
 *
 * <p>Time-only values ("PT#H#M#S") are handled by {@code DURATION_PATTERN}. Anything that
 * pattern rejects — notably values with a day component such as "P1DT2H3M4S" — is given to
 * {@link Duration#parse(CharSequence)} so that very long videos are not reported as 0 seconds
 * (and consequently mistaken for Shorts).
 *
 * @param dur duration text; may be {@code null}
 * @return the duration in seconds, clamped to the int range; 0 when {@code dur} is null,
 *         blank or not a parsable duration
 */
private int parseDuration(String dur) {
    if (dur == null || dur.isBlank()) return 0;
    Matcher m = DURATION_PATTERN.matcher(dur);
    if (m.matches()) {
        int h = m.group(1) != null ? Integer.parseInt(m.group(1)) : 0;
        int min = m.group(2) != null ? Integer.parseInt(m.group(2)) : 0;
        int s = m.group(3) != null ? Integer.parseInt(m.group(3)) : 0;
        return h * 3600 + min * 60 + s;
    }
    try {
        // Fallback for forms the regex does not cover (e.g. a leading day part).
        long seconds = Duration.parse(dur).getSeconds();
        return (int) Math.min(Integer.MAX_VALUE, Math.max(0L, seconds));
    } catch (java.time.format.DateTimeParseException e) {
        return 0;
    }
}
/**
 * Scans one channel for videos that are not stored yet and saves them.
 *
 * @param channelId YouTube channel ID used to look up the channel record
 * @param full      when true all videos are fetched; otherwise only those after the
 *                  latest stored video date
 * @return a map with {@code total_fetched}, {@code new_videos} and {@code filtered},
 *         or {@code null} when no channel record exists for {@code channelId}
 */
public Map<String, Object> scanChannel(String channelId, boolean full) {
    Channel channel = channelService.findByChannelId(channelId);
    if (channel == null) {
        return null;
    }

    String channelDbId = channel.getId();
    String requiredTitlePart = channel.getTitleFilter();
    String since = null;
    if (!full) {
        since = videoService.getLatestVideoDate(channelDbId);
    }
    Set<String> knownIds = videoService.getExistingVideoIds(channelDbId);

    List<Map<String, Object>> fetched = fetchChannelVideos(channelId, since, true);
    int fetchedCount = fetched.size();

    // Keep videos that satisfy the optional title filter and are not stored yet.
    List<Map<String, Object>> fresh = fetched.stream()
            .filter(video -> requiredTitlePart == null
                    || ((String) video.get("title")).contains(requiredTitlePart))
            .filter(video -> !knownIds.contains(video.get("video_id")))
            .collect(Collectors.toCollection(ArrayList::new));

    int savedCount = videoService.saveVideosBatch(channelDbId, fresh);
    int filteredCount = 0;
    if (requiredTitlePart != null) {
        filteredCount = fetchedCount - fresh.size();
    }
    return Map.of(
            "total_fetched", fetchedCount,
            "new_videos", savedCount,
            "filtered", filteredCount
    );
}
/**
 * Runs an incremental scan over every active channel.
 * A failure on one channel is logged and does not stop the remaining scans.
 *
 * @return the sum of {@code new_videos} over all channels that were scanned
 */
public int scanAllChannels() {
    int newVideoTotal = 0;
    for (Channel channel : channelService.findAllActive()) {
        try {
            Map<String, Object> outcome = scanChannel(channel.getChannelId(), false);
            if (outcome == null) {
                continue;
            }
            Number added = (Number) outcome.get("new_videos");
            newVideoTotal += added.intValue();
        } catch (Exception e) {
            log.error("Failed to scan channel {}: {}", channel.getChannelName(), e.getMessage());
        }
    }
    return newVideoTotal;
}
/**
 * Transcript text plus a label saying where it came from. Within this class the label is
 * {@code "manual (xx)"}, {@code "generated (xx)"} (xx = language code) or {@code "browser"}.
 */
public record TranscriptResult(String text, String source) {}
// Language codes handed to the transcript lookup (Korean and English);
// presumably tried in this order — confirm against the transcript library.
private static final List<String> PREFERRED_LANGS = List.of("ko", "en");
// Client for the API-based transcript path, created once with the factory's defaults.
private final YoutubeTranscriptApi transcriptApi = TranscriptApiFactory.createDefault();
/**
 * Fetch transcript for a YouTube video.
 * Tries the Playwright browser extraction first and, only when that yields nothing,
 * falls back to the youtube-transcript-api client.
 *
 * @param videoId YouTube video ID
 * @param mode "auto" = manual first then generated, "manual" = manual only, "generated" = generated only;
 *             {@code null} is treated as "auto". Only the API fallback uses this value.
 * @return the transcript, or {@code null} when neither path produced one
 */
public TranscriptResult getTranscript(String videoId, String mode) {
    String effectiveMode = (mode == null) ? "auto" : mode;
    // 1) Playwright headed browser (avoids being flagged as a bot)
    TranscriptResult browserResult = getTranscriptBrowser(videoId);
    if (browserResult != null) return browserResult;
    // 2) Fallback: youtube-transcript-api
    log.warn("Browser failed for {}, trying API", videoId);
    return getTranscriptApi(videoId, effectiveMode);
}
/**
 * Fetches a transcript through the youtube-transcript-api client only (no browser).
 *
 * @param videoId YouTube video ID
 * @param mode "manual" = manually created transcripts only, "generated" = auto-generated only,
 *             any other value (including {@code null}) = manual first, then generated
 * @return the transcript, or {@code null} when the transcript list cannot be obtained or no
 *         matching, non-blank transcript is found
 */
public TranscriptResult getTranscriptApi(String videoId, String mode) {
    // This method is public, so guard against null here as well: switching on a null
    // String would throw NullPointerException.
    String effectiveMode = (mode == null) ? "auto" : mode;
    TranscriptList transcriptList;
    try {
        transcriptList = transcriptApi.listTranscripts(videoId);
    } catch (Exception e) {
        log.warn("Cannot list transcripts for {}: {}", videoId, e.getMessage());
        return null;
    }
    String[] langs = PREFERRED_LANGS.toArray(String[]::new);
    return switch (effectiveMode) {
        case "manual" -> fetchTranscript(transcriptList, langs, true);
        case "generated" -> fetchTranscript(transcriptList, langs, false);
        default -> {
            // auto: try manual first, then generated
            var result = fetchTranscript(transcriptList, langs, true);
            if (result != null) yield result;
            yield fetchTranscript(transcriptList, langs, false);
        }
    };
}
/**
 * Picks a manual or generated transcript for the given languages and downloads its text.
 *
 * @param list   transcripts available for the video
 * @param langs  language codes passed to the lookup
 * @param manual true to look for a manually created transcript, false for a generated one
 * @return the fragments joined by single spaces and labelled "manual (lang)" or
 *         "generated (lang)"; {@code null} when no transcript matches, the download fails,
 *         or the text is blank
 */
private TranscriptResult fetchTranscript(TranscriptList list, String[] langs, boolean manual) {
    Transcript chosen;
    try {
        if (manual) {
            chosen = list.findManualTranscript(langs);
        } else {
            chosen = list.findGeneratedTranscript(langs);
        }
    } catch (Exception e) {
        // No transcript of the requested kind/language: not worth logging.
        return null;
    }

    try {
        TranscriptContent fetched = chosen.fetch();
        List<String> pieces = new ArrayList<>();
        for (TranscriptContent.Fragment fragment : fetched.getContent()) {
            pieces.add(fragment.getText());
        }
        String joined = String.join(" ", pieces);
        if (joined.isBlank()) {
            return null;
        }
        String kind;
        if (manual) {
            kind = "manual";
        } else {
            kind = "generated";
        }
        return new TranscriptResult(joined, kind + " (" + chosen.getLanguageCode() + ")");
    } catch (Exception e) {
        log.warn("Failed to fetch transcript for language {}: {}", chosen.getLanguageCode(), e.getMessage());
        return null;
    }
}
// ─── Playwright browser ───────────────────────────────────────────────────
/**
 * Extracts a transcript through a caller-supplied Playwright page, so that one page
 * can be reused across many videos (bulk processing).
 *
 * @param page    an open Playwright page owned by the caller
 * @param videoId YouTube video ID
 * @return the transcript, or {@code null} when the page extraction found nothing
 */
@SuppressWarnings("unchecked")
public TranscriptResult getTranscriptWithPage(Page page, String videoId) {
    TranscriptResult extracted = fetchTranscriptFromPage(page, videoId);
    return extracted;
}
/**
 * Holds the Playwright instance, the browser and the page used for transcript fetching.
 * Whoever obtains a session is responsible for closing it; closing releases the browser
 * and the Playwright instance.
 */
public record BrowserSession(Playwright playwright, Browser browser, Page page) implements AutoCloseable {
    /** Closes the browser first, then Playwright; an exception from either is ignored. */
    @Override
    public void close() {
        quietly(() -> browser.close());
        quietly(() -> playwright.close());
    }

    /** Runs the action and discards any exception it throws. */
    private static void quietly(Runnable action) {
        try {
            action.run();
        } catch (Exception ignored) {
            // best-effort cleanup
        }
    }
}
/**
 * Creates a Playwright instance, launches a headed Chromium browser, opens a context
 * (ko-KR locale, 1280x900 viewport, cookies from {@code loadCookies}) and one page.
 *
 * <p>The caller must close the returned session. If any step after
 * {@code Playwright.create()} fails, everything started so far is closed before the
 * exception is rethrown, so a failed call does not leave a browser process behind.
 *
 * @return a session wrapping the Playwright instance, the browser and the page
 */
public BrowserSession createBrowserSession() {
    Playwright pw = Playwright.create();
    Browser browser = null;
    try {
        browser = pw.chromium().launch(new BrowserType.LaunchOptions()
                .setHeadless(false)
                .setArgs(List.of("--disable-blink-features=AutomationControlled")));
        BrowserContext ctx = browser.newContext(new Browser.NewContextOptions()
                .setLocale("ko-KR")
                .setViewportSize(1280, 900));
        loadCookies(ctx);
        Page page = ctx.newPage();
        // Make navigator.webdriver report false to page scripts.
        page.addInitScript("Object.defineProperty(navigator, 'webdriver', {get: () => false})");
        return new BrowserSession(pw, browser, page);
    } catch (RuntimeException | Error e) {
        // Setup failed part-way: release what was already started, then propagate.
        if (browser != null) {
            try { browser.close(); } catch (Exception ignored) { /* best-effort cleanup */ }
        }
        try { pw.close(); } catch (Exception ignored) { /* best-effort cleanup */ }
        throw e;
    }
}
/**
 * Opens a fresh browser session for a single video, extracts the transcript from the
 * page and closes the session again.
 *
 * @param videoId YouTube video ID
 * @return the transcript, or {@code null} when extraction found nothing or anything threw
 */
@SuppressWarnings("unchecked")
private TranscriptResult getTranscriptBrowser(String videoId) {
    TranscriptResult outcome;
    try (BrowserSession session = createBrowserSession()) {
        Page page = session.page();
        outcome = fetchTranscriptFromPage(page, videoId);
    } catch (Exception e) {
        log.error("[TRANSCRIPT] Playwright failed for {}: {}", videoId, e.getMessage());
        outcome = null;
    }
    return outcome;
}
/**
 * Drives the given Playwright page through YouTube's transcript panel for one video and
 * returns the collected text.
 *
 * <p>Steps: open the watch page, get past ads, expand the description, click the
 * "show transcript" button, wait for transcript segments, try to switch the transcript
 * language to Korean, scroll the panel to load all segments, then join the segment texts
 * with single spaces.
 *
 * @param page    an open Playwright page; it is navigated away from whatever it showed before
 * @param videoId YouTube video ID
 * @return a result with source "browser", or {@code null} when the transcript button or the
 *         segments are not found, or when any step throws
 */
@SuppressWarnings("unchecked")
private TranscriptResult fetchTranscriptFromPage(Page page, String videoId) {
    try {
        log.info("[TRANSCRIPT] Opening YouTube page for {}", videoId);
        // Only wait for DOMContentLoaded (30 s limit), then give the page a fixed 3 s to settle.
        page.navigate("https://www.youtube.com/watch?v=" + videoId,
                new Page.NavigateOptions().setWaitUntil(WaitUntilState.DOMCONTENTLOADED).setTimeout(30000));
        page.waitForTimeout(3000);
        skipAds(page);
        page.waitForTimeout(1000);
        log.info("[TRANSCRIPT] Page loaded, looking for transcript button");
        // Click the description's expand button ("더보기" / "Show more") if it is present
        page.evaluate("""
            () => {
            const moreBtn = document.querySelector('tp-yt-paper-button#expand');
            if (moreBtn) moreBtn.click();
            }
            """);
        page.waitForTimeout(2000);
        // Click the transcript button. The script tries three lookups in turn and returns a
        // string describing which one matched, or false when none did.
        Object clicked = page.evaluate("""
            () => {
            // Method 1: aria-label
            for (const label of ['스크립트 표시', 'Show transcript']) {
            const btns = document.querySelectorAll(`button[aria-label="${label}"]`);
            for (const b of btns) { b.click(); return 'aria-label: ' + label; }
            }
            // Method 2: text content
            const allBtns = document.querySelectorAll('button');
            for (const b of allBtns) {
            const text = b.textContent.trim();
            if (text === '스크립트 표시' || text === 'Show transcript') {
            b.click();
            return 'text: ' + text;
            }
            }
            // Method 3: engagement panel buttons
            const engBtns = document.querySelectorAll('ytd-button-renderer button, ytd-button-renderer a');
            for (const b of engBtns) {
            const text = b.textContent.trim().toLowerCase();
            if (text.includes('transcript') || text.includes('스크립트')) {
            b.click();
            return 'engagement: ' + text;
            }
            }
            return false;
            }
            """);
        log.info("[TRANSCRIPT] Clicked transcript button: {}", clicked);
        if (Boolean.FALSE.equals(clicked)) {
            // Nothing was clicked: log up to 30 aria-labels from the page to help diagnose why.
            Object btnLabels = page.evaluate("""
                () => {
                const btns = document.querySelectorAll('button[aria-label]');
                return Array.from(btns).map(b => b.getAttribute('aria-label')).slice(0, 30);
                }
                """);
            log.warn("[TRANSCRIPT] Transcript button not found. Available buttons: {}", btnLabels);
            return null;
        }
        // Wait for transcript segments to appear: an initial 2 s pause, then up to 10 polls
        // spaced 1.5 s apart (about 17 s in total), stopping at the first non-zero count.
        page.waitForTimeout(2000);
        for (int attempt = 0; attempt < 10; attempt++) {
            page.waitForTimeout(1500);
            Object count = page.evaluate(
                    "() => document.querySelectorAll('ytd-transcript-segment-renderer').length");
            int segCount = count instanceof Number n ? n.intValue() : 0;
            log.info("[TRANSCRIPT] Wait {}s: {} segments", (attempt + 1) * 1.5 + 2, segCount);
            if (segCount > 0) break;
        }
        // Runs even when no segments showed up; the empty case is handled after collection.
        selectKorean(page);
        // Scroll the transcript panel to the bottom repeatedly (at most 50 rounds, 300 ms apart,
        // stopping once the segment count no longer grows after the 4th round) and collect the
        // non-empty segment texts. Without a scroll container the visible segments are read as-is.
        Object segmentsObj = page.evaluate("""
            async () => {
            const container = document.querySelector(
            'ytd-transcript-segment-list-renderer #segments-container, ' +
            'ytd-transcript-renderer #body'
            );
            if (!container) {
            const segs = document.querySelectorAll('ytd-transcript-segment-renderer');
            return Array.from(segs).map(s => {
            const txt = s.querySelector('.segment-text, yt-formatted-string.segment-text');
            return txt ? txt.textContent.trim() : '';
            }).filter(t => t);
            }
            let prevCount = 0;
            for (let i = 0; i < 50; i++) {
            container.scrollTop = container.scrollHeight;
            await new Promise(r => setTimeout(r, 300));
            const segs = document.querySelectorAll('ytd-transcript-segment-renderer');
            if (segs.length === prevCount && i > 3) break;
            prevCount = segs.length;
            }
            const segs = document.querySelectorAll('ytd-transcript-segment-renderer');
            return Array.from(segs).map(s => {
            const txt = s.querySelector('.segment-text, yt-formatted-string.segment-text');
            return txt ? txt.textContent.trim() : '';
            }).filter(t => t);
            }
            """);
        // Only a non-empty list counts as success.
        if (segmentsObj instanceof List<?> segments && !segments.isEmpty()) {
            String text = segments.stream()
                    .map(Object::toString)
                    .collect(Collectors.joining(" "));
            log.info("[TRANSCRIPT] Browser success: {} chars from {} segments", text.length(), segments.size());
            return new TranscriptResult(text, "browser");
        }
        log.warn("[TRANSCRIPT] No segments found via browser for {}", videoId);
        return null;
    } catch (Exception e) {
        // Any failure (navigation, script evaluation, ...) is reported as "no transcript".
        log.error("[TRANSCRIPT] Page fetch failed for {}: {}", videoId, e.getMessage());
        return null;
    }
}
/**
 * Polls the player up to 30 times, about one second apart, until no ad is detected.
 * Each poll clicks a skip button when one exists; otherwise, while an ad overlay is shown,
 * it mutes the video and seeks to its end. Polling ends right after a skip click
 * (plus a one second pause) or as soon as no ad indicator is found.
 *
 * @param page the page showing the YouTube player
 */
private void skipAds(Page page) {
    final String adProbeScript = """
        () => {
        const skipBtn = document.querySelector('.ytp-skip-ad-button, .ytp-ad-skip-button, .ytp-ad-skip-button-modern, button.ytp-ad-skip-button-modern');
        if (skipBtn) { skipBtn.click(); return 'skipped'; }
        const adOverlay = document.querySelector('.ytp-ad-player-overlay, .ad-showing');
        if (adOverlay) {
        // 광고 중: 뮤트 + 끝으로 이동 시도
        const video = document.querySelector('video');
        if (video) {
        video.muted = true;
        if (video.duration && isFinite(video.duration)) {
        video.currentTime = video.duration;
        }
        }
        return 'playing';
        }
        const adBadge = document.querySelector('.ytp-ad-text');
        if (adBadge && adBadge.textContent) return 'badge';
        return 'none';
        }
        """;
    int checksLeft = 30;
    while (checksLeft-- > 0) {
        String state = String.valueOf(page.evaluate(adProbeScript));
        if ("none".equals(state)) {
            return;
        }
        log.info("[TRANSCRIPT] Ad detected: {}, waiting...", state);
        // Both the "skipped" and the "still playing" cases pause for one second.
        page.waitForTimeout(1000);
        if ("skipped".equals(state)) {
            return;
        }
    }
}
/**
 * Tries to switch the transcript panel to Korean: opens the panel's language dropdown,
 * waits one second, clicks the first entry containing "한국어" or "Korean", then waits
 * two seconds. Nothing happens on the page when the dropdown or the entry is missing;
 * the waits still take place.
 *
 * @param page the page with the transcript panel open
 */
private void selectKorean(Page page) {
    final String openDropdownScript = """
        () => {
        const menu = document.querySelector('ytd-transcript-renderer ytd-menu-renderer yt-dropdown-menu');
        if (!menu) return;
        const trigger = menu.querySelector('button, tp-yt-paper-button');
        if (trigger) trigger.click();
        }
        """;
    final String pickKoreanScript = """
        () => {
        const items = document.querySelectorAll('tp-yt-paper-listbox a, tp-yt-paper-listbox tp-yt-paper-item');
        for (const item of items) {
        const text = item.textContent.trim();
        if (text.includes('한국어') || text.includes('Korean')) {
        item.click();
        return;
        }
        }
        }
        """;

    page.evaluate(openDropdownScript);
    page.waitForTimeout(1000);

    page.evaluate(pickKoreanScript);
    page.waitForTimeout(2000);
}
/**
 * Loads YouTube/Google cookies from a Netscape-format {@code cookies.txt} in the working
 * directory into the browser context. Best-effort: a missing file is skipped silently and
 * any failure is only logged at debug level.
 *
 * <p>Each data line has seven tab-separated fields: domain, subdomain flag, path, secure,
 * expiry, name, value. Lines starting with {@code #HttpOnly_} are NOT comments: that prefix
 * marks an HttpOnly cookie and the remainder is a normal data line, so the prefix is
 * stripped and the cookie is added with the HttpOnly flag set. The expiry field is not used.
 *
 * @param ctx browser context that receives the cookies
 */
private void loadCookies(BrowserContext ctx) {
    final String httpOnlyPrefix = "#HttpOnly_";
    try {
        Path cookieFile = Path.of(System.getProperty("user.dir"), "cookies.txt");
        if (!cookieFile.toFile().exists()) return;
        List<String> lines = java.nio.file.Files.readAllLines(cookieFile);
        List<Cookie> cookies = new ArrayList<>();
        for (String rawLine : lines) {
            String line = rawLine;
            boolean httpOnly = false;
            if (line.startsWith(httpOnlyPrefix)) {
                httpOnly = true;
                line = line.substring(httpOnlyPrefix.length());
            }
            // Remaining '#' lines are real comments.
            if (line.startsWith("#") || line.isBlank()) continue;
            String[] parts = line.split("\t");
            if (parts.length < 7) continue;
            String domain = parts[0];
            if (!domain.contains("youtube") && !domain.contains("google")) continue;
            cookies.add(new Cookie(parts[5], parts[6])
                    .setDomain(domain)
                    .setPath(parts[2])
                    .setSecure("TRUE".equalsIgnoreCase(parts[3]))
                    .setHttpOnly(httpOnly));
        }
        if (!cookies.isEmpty()) {
            ctx.addCookies(cookies);
            log.info("[TRANSCRIPT] Loaded {} cookies", cookies.size());
        }
    } catch (Exception e) {
        log.debug("Failed to load cookies: {}", e.getMessage());
    }
}
}