Fix TTS: switch to 1.7B with ref_audio, speakable text on all lines

- Use 1.7B model (0.6B had tensor mismatch with cached prompts)
- Speak endpoint uses ref_audio directly (not cached pkl) as fallback
- Cache voice clone prompts in memory on startup
- Add SpeakableText component: 🔊 icon on each p and li element
- Remove old TTSReader sequential approach
- Add global exception handler to TTS server
- Fix profile localStorage caching
- inference_mode + bf16 optimization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-13 12:14:06 +00:00
parent 1088b23790
commit 20210830cf
6 changed files with 440 additions and 158 deletions

View File

@@ -6,6 +6,7 @@ import AuthGuard from "@/components/auth-guard";
import NavBar from "@/components/nav-bar";
import { useApi } from "@/lib/use-api";
import ReactMarkdown from "react-markdown";
import SpeakableText from "@/components/speakable-text";
interface Category {
ID: string;
@@ -314,10 +315,18 @@ export default function KnowledgeDetailPage() {
h1: ({children}) => <h1 className="text-xl font-bold mt-6 mb-3">{children}</h1>,
h2: ({children}) => <h2 className="text-lg font-bold mt-5 mb-2">{children}</h2>,
h3: ({children}) => <h3 className="text-base font-bold mt-4 mb-2">{children}</h3>,
p: ({children}) => <p className="mb-3">{children}</p>,
p: ({children, node}) => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const txt = node?.children?.map((c: any) => c.type === 'text' ? c.value : '').join('') || '';
return <p className="mb-3"><SpeakableText text={txt}>{children}</SpeakableText></p>;
},
ul: ({children}) => <ul className="list-disc ml-5 mb-3 space-y-1">{children}</ul>,
ol: ({children}) => <ol className="list-decimal ml-5 mb-3 space-y-1">{children}</ol>,
li: ({children}) => <li className="leading-relaxed">{children}</li>,
li: ({children, node}) => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const txt = node?.children?.map((c: any) => c.type === 'text' ? c.value : '').join('') || '';
return <li className="leading-relaxed"><SpeakableText text={txt}>{children}</SpeakableText></li>;
},
strong: ({children}) => <strong className="font-bold">{children}</strong>,
blockquote: ({children}) => <blockquote className="border-l-2 border-[var(--color-primary)] pl-4 my-3 italic text-[var(--color-text-muted)]">{children}</blockquote>,
code: ({children}) => <code className="bg-[var(--color-bg-hover)] px-1.5 py-0.5 rounded text-xs">{children}</code>,

View File

@@ -6,6 +6,7 @@ import AuthGuard from "@/components/auth-guard";
import NavBar from "@/components/nav-bar";
import { useApi } from "@/lib/use-api";
import ReactMarkdown from "react-markdown";
import SpeakableText from "@/components/speakable-text";
interface NoteDetail {
ID: string;
@@ -183,10 +184,18 @@ export default function NoteDetailPage() {
h1: ({children}) => <h1 className="text-xl font-bold mt-6 mb-3">{children}</h1>,
h2: ({children}) => <h2 className="text-lg font-bold mt-5 mb-2">{children}</h2>,
h3: ({children}) => <h3 className="text-base font-bold mt-4 mb-2">{children}</h3>,
p: ({children}) => <p className="mb-3">{children}</p>,
p: ({children, node}) => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const txt = node?.children?.map((c: any) => c.type === 'text' ? c.value : '').join('') || '';
return <p className="mb-3"><SpeakableText text={txt}>{children}</SpeakableText></p>;
},
ul: ({children}) => <ul className="list-disc ml-5 mb-3 space-y-1">{children}</ul>,
ol: ({children}) => <ol className="list-decimal ml-5 mb-3 space-y-1">{children}</ol>,
li: ({children}) => <li className="leading-relaxed">{children}</li>,
li: ({children, node}) => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const txt = node?.children?.map((c: any) => c.type === 'text' ? c.value : '').join('') || '';
return <li className="leading-relaxed"><SpeakableText text={txt}>{children}</SpeakableText></li>;
},
strong: ({children}) => <strong className="font-bold">{children}</strong>,
blockquote: ({children}) => <blockquote className="border-l-2 border-[var(--color-primary)] pl-4 my-3 italic text-[var(--color-text-muted)]">{children}</blockquote>,
}}

View File

@@ -45,8 +45,21 @@ export default function TTSPage() {
}, []);
/**
 * Loads the voice profile list in two steps: the copy cached in localStorage
 * under "tts_profiles" is shown immediately, then the list is refreshed from
 * `/api/tts/profiles` and the cache is rewritten.
 *
 * The current selection is kept whenever it still exists in the loaded list;
 * otherwise the first profile is selected. The decision is made inside a state
 * updater so it sees the latest selection rather than the value captured when
 * this function was called (the request resolves asynchronously).
 */
const fetchProfiles = () => {
  // Applies a payload to component state. Returns false for anything that is
  // not an array (e.g. an error object) so it is neither rendered nor cached.
  const applyProfiles = (data: unknown): boolean => {
    if (!Array.isArray(data)) return false;
    setProfiles(data);
    if (data.length > 0) {
      const firstId = data[0].id;
      setSelectedProfile(current =>
        current && data.some(p => p?.id === current) ? current : firstId
      );
    }
    return true;
  };

  // Cache first, so the UI has something to show before the network answers.
  const cached = localStorage.getItem("tts_profiles");
  if (cached) {
    try {
      applyProfiles(JSON.parse(cached));
    } catch {
      // Corrupt cache entry: drop it, the request below will rebuild it.
      localStorage.removeItem("tts_profiles");
    }
  }

  fetch("/api/tts/profiles")
    .then(r => r.json())
    .then(data => {
      if (applyProfiles(data)) {
        localStorage.setItem("tts_profiles", JSON.stringify(data));
      }
    })
    // Best effort: on failure the cached list (if any) stays on screen.
    .catch(() => {});
};
const startRecording = async () => {
@@ -97,6 +110,7 @@ export default function TTSPage() {
setRecordedUrl(null);
setUploadedFile(null);
fetchProfiles();
localStorage.removeItem("tts_profiles"); // 캐시 강제 갱신
setSelectedProfile(result.id);
setTab("generate");
} catch (err) {
@@ -125,9 +139,10 @@ export default function TTSPage() {
fd.append("text", text);
fd.append("profile_id", selectedProfile);
fd.append("language", language);
const res = await fetch("/api/tts/generate", { method: "POST", body: fd });
const res = await fetch("/api/tts/speak", { method: "POST", body: fd });
if (!res.ok) throw new Error(`HTTP ${res.status}`);
const blob = await res.blob();
if (blob.size < 100) throw new Error("Empty audio");
setOutputUrl(URL.createObjectURL(blob));
} catch (err) {
setError("생성 실패: " + (err instanceof Error ? err.message : ""));

View File

@@ -0,0 +1,85 @@
"use client";
import { useState, useRef, useEffect } from "react";
/** Props accepted by the SpeakableText component. */
interface SpeakableProps {
/** Content to render; it is always output, with or without the speaker button. */
children: React.ReactNode;
/** Plain text posted to `/api/tts/speak` on click; no button is shown when it is shorter than 5 characters. */
text: string;
}
/**
 * Id of the voice profile shared by every SpeakableText instance, taken from
 * the first entry of the "tts_profiles" localStorage cache. Only a successful
 * lookup is remembered, so a profile created later in the session is picked up
 * by the next instance that mounts instead of being hidden until a reload.
 */
let cachedProfileId: string | null = null;

/** Returns the id of the first cached profile, or null when none is usable. */
function readCachedProfileId(): string | null {
  try {
    const profiles: unknown = JSON.parse(localStorage.getItem("tts_profiles") || "[]");
    if (!Array.isArray(profiles) || profiles.length === 0) return null;
    const id: unknown = profiles[0]?.id;
    if (typeof id === "number") return String(id);
    return typeof id === "string" && id.length > 0 ? id : null;
  } catch {
    // Missing localStorage or corrupt JSON: behave as if no profile exists.
    return null;
  }
}

/** Stops the current clip (if any) and releases the object URL backing it. */
function stopPlayback(
  audioRef: { current: HTMLAudioElement | null },
  urlRef: { current: string | null }
): void {
  const audio = audioRef.current;
  if (audio) {
    // Detach handlers first so a stopped clip can never report back.
    audio.onended = null;
    audio.onerror = null;
    audio.pause();
    audioRef.current = null;
  }
  if (urlRef.current) {
    URL.revokeObjectURL(urlRef.current);
    urlRef.current = null;
  }
}

/**
 * Renders `children` followed by a small speaker button that reads `text`
 * aloud through `/api/tts/speak` using the cached voice profile.
 *
 * The button is omitted when no profile is cached or `text` is shorter than
 * 5 characters. Clicking while a clip is playing stops it; the next click
 * requests the audio again.
 */
export default function SpeakableText({ children, text }: SpeakableProps) {
  const [playing, setPlaying] = useState(false);
  const [loading, setLoading] = useState(false);
  const [hasProfile, setHasProfile] = useState(false);
  const audioRef = useRef<HTMLAudioElement | null>(null);
  // Object URL of the clip held by audioRef; revoked as soon as the clip is done.
  const audioUrlRef = useRef<string | null>(null);
  // Lets an in-flight request notice that the component is gone.
  const mountedRef = useRef(true);

  useEffect(() => {
    mountedRef.current = true;
    if (cachedProfileId === null) cachedProfileId = readCachedProfileId();
    setHasProfile(cachedProfileId !== null);
    return () => {
      mountedRef.current = false;
      // Do not keep talking (or holding the blob) after the text left the page.
      stopPlayback(audioRef, audioUrlRef);
    };
  }, []);

  const handleSpeak = async (e: React.MouseEvent) => {
    // The button may sit inside links or other clickable containers.
    e.preventDefault();
    e.stopPropagation();

    if (playing) {
      stopPlayback(audioRef, audioUrlRef);
      setPlaying(false);
      return;
    }

    const profileId = cachedProfileId;
    if (!profileId || text.length < 5) return;

    setLoading(true);
    try {
      const fd = new FormData();
      fd.append("text", text);
      fd.append("profile_id", profileId);
      fd.append("language", "Korean");

      const res = await fetch("/api/tts/speak", { method: "POST", body: fd });
      if (!res.ok) return;
      const blob = await res.blob();
      // Very small responses are treated as empty audio.
      if (blob.size < 200) return;
      // Unmounted while waiting: never start audio for content that is gone.
      if (!mountedRef.current) return;

      stopPlayback(audioRef, audioUrlRef);
      const url = URL.createObjectURL(blob);
      const audio = new Audio(url);
      audioRef.current = audio;
      audioUrlRef.current = url;

      const finish = () => {
        stopPlayback(audioRef, audioUrlRef);
        setPlaying(false);
      };
      audio.onended = finish;
      audio.onerror = finish;

      setPlaying(true);
      setLoading(false);
      try {
        // play() rejects when playback is blocked or interrupted.
        await audio.play();
      } catch {
        // Only clean up if this clip is still the current one.
        if (audioRef.current === audio) finish();
      }
    } catch {
      // Network failure: leave the button idle so the user can retry.
    } finally {
      setLoading(false);
    }
  };

  if (!hasProfile || text.length < 5) return <>{children}</>;

  const label = playing ? "중지" : "읽어주기";
  return (
    <>
      {children}
      <button
        type="button"
        onClick={handleSpeak}
        disabled={loading}
        className="inline-flex items-center ml-1 text-[var(--color-text-muted)] hover:text-[var(--color-primary)] disabled:opacity-30 align-middle"
        title={label}
        aria-label={label}
        style={{ fontSize: "0.85em", verticalAlign: "middle", cursor: "pointer" }}
      >
        {loading ? "⏳" : playing ? "⏹" : "🔊"}
      </button>
    </>
  );
}

View File

@@ -0,0 +1,159 @@
"use client";
import { useState, useEffect, useRef } from "react";
/** Props accepted by the TTSReader component. */
interface TTSReaderProps {
/** Source text to read aloud; Markdown markers are stripped and it is split line by line before synthesis. */
text: string;
}
/** A voice profile entry as used by the profile selector. */
interface VoiceProfile {
/** Identifier sent as `profile_id` to `/api/tts/speak`. */
id: string;
/** Label shown for the profile in the dropdown. */
name: string;
}
/**
 * Turns Markdown into the list of lines worth speaking: heading lines are
 * dropped, emphasis/list/quote/rule markers are removed, links are reduced to
 * their label, and lines shorter than 10 characters are discarded.
 */
function toSentences(md: string): string[] {
  return md
    .replace(/^#+\s+.*$/gm, "")
    .replace(/\*\*/g, "")
    .replace(/^[-*]\s+/gm, "")
    .replace(/^>\s+/gm, "")
    .replace(/---+/g, "")
    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
    .split("\n")
    .map(s => s.trim())
    .filter(s => s.length >= 10);
}

/**
 * Reads `text` aloud line by line with a selectable voice profile.
 *
 * Each line is synthesized through `/api/tts/speak`; clips are queued and
 * played in order while later lines are still being generated. The finished
 * clips can be replayed without contacting the server again. Renders nothing
 * when no voice profile is available.
 */
export default function TTSReader({ text }: TTSReaderProps) {
  const [profiles, setProfiles] = useState<VoiceProfile[]>([]);
  const [selectedProfile, setSelectedProfile] = useState("");
  const [generating, setGenerating] = useState(false);
  const [playing, setPlaying] = useState(false);
  const [progress, setProgress] = useState("");
  const audioRef = useRef<HTMLAudioElement | null>(null);
  const audioUrlsRef = useRef<string[]>([]);
  // Bumped by every generate / replay / stop / unmount. Async work compares its
  // own id against this to find out that it has been superseded.
  const runIdRef = useRef(0);

  useEffect(() => {
    let cancelled = false;

    // Applies a payload to state; rejects anything that is not an array so an
    // error object can never reach `profiles.map` or the cache.
    const applyProfiles = (data: unknown): boolean => {
      if (!Array.isArray(data)) return false;
      const list = data as VoiceProfile[];
      setProfiles(list);
      const first = list[0];
      if (first) {
        // Decide inside the updater so the latest selection is seen, not the
        // value captured when the effect started.
        setSelectedProfile(current =>
          current && list.some(p => p.id === current) ? current : first.id
        );
      }
      return true;
    };

    // Cached list first, so the control appears without waiting for the network.
    const cached = localStorage.getItem("tts_profiles");
    if (cached) {
      try {
        applyProfiles(JSON.parse(cached));
      } catch {
        // Corrupt cache entry: ignore it, the refresh below replaces it.
      }
    }

    // Refresh in the background (does not block rendering).
    fetch("/api/tts/profiles")
      .then(r => r.json())
      .then(data => {
        if (cancelled) return;
        if (applyProfiles(data)) {
          localStorage.setItem("tts_profiles", JSON.stringify(data));
        }
      })
      .catch(() => {});

    return () => {
      cancelled = true;
      // Invalidate any running loop, silence playback and free the blobs.
      runIdRef.current++;
      audioRef.current?.pause();
      audioUrlsRef.current.forEach(u => URL.revokeObjectURL(u));
      audioUrlsRef.current = [];
    };
  }, []);

  // Direct synchronous call: the endpoint answers with the audio itself.
  // Resolves to an object URL, or null when the server returned no usable audio.
  const speak = async (chunk: string): Promise<string | null> => {
    const fd = new FormData();
    fd.append("text", chunk);
    fd.append("profile_id", selectedProfile);
    fd.append("language", "Korean");
    const res = await fetch("/api/tts/speak", { method: "POST", body: fd });
    if (!res.ok) return null;
    const blob = await res.blob();
    return blob.size > 100 ? URL.createObjectURL(blob) : null;
  };

  // Plays one clip and calls `onDone` exactly once when it ends, fails to
  // decode, or cannot be started (play() rejects when blocked or interrupted).
  const playClip = (url: string, onDone: () => void) => {
    const audio = new Audio(url);
    audioRef.current = audio;
    let settled = false;
    const settle = () => {
      if (settled) return;
      settled = true;
      onDone();
    };
    audio.onended = settle;
    audio.onerror = settle;
    audio.play().catch(settle);
  };

  const handleGenerate = async () => {
    if (!selectedProfile || !text.trim()) return;

    const runId = ++runIdRef.current;
    const isStale = () => runIdRef.current !== runId;

    // Clips from the previous run are no longer reachable: release them.
    audioRef.current?.pause();
    audioUrlsRef.current.forEach(u => URL.revokeObjectURL(u));
    audioUrlsRef.current = [];

    setGenerating(true);
    setPlaying(true);

    const sentences = toSentences(text);
    let isAudioPlaying = false;
    let generationDone = false;
    let playIdx = 0;

    const playNext = () => {
      if (isStale()) return;
      const url = audioUrlsRef.current[playIdx];
      if (url === undefined) {
        // Queue drained. Playback is over only if no further clips are coming;
        // otherwise the generation loop restarts it with the next clip.
        isAudioPlaying = false;
        if (generationDone) setPlaying(false);
        return;
      }
      playIdx++;
      isAudioPlaying = true;
      playClip(url, playNext);
    };

    for (const [i, sentence] of sentences.entries()) {
      if (isStale()) break;
      setProgress(`${i + 1}/${sentences.length}`);

      let url: string | null;
      try {
        url = await speak(sentence);
      } catch {
        // Network failure for this line: skip it like a non-OK response.
        url = null;
      }
      if (!url) continue;
      if (isStale()) {
        // Stopped while the request was in flight: the clip will never be used.
        URL.revokeObjectURL(url);
        break;
      }
      audioUrlsRef.current.push(url);
      if (!isAudioPlaying) playNext();
    }

    // Stop, unmount, or a newer run already owns the UI state.
    if (isStale()) return;
    generationDone = true;
    setGenerating(false);
    setProgress("");
    if (!isAudioPlaying) setPlaying(false);
  };

  const handleStop = () => {
    runIdRef.current++;
    audioRef.current?.pause();
    setPlaying(false);
    setGenerating(false);
    setProgress("");
  };

  const handleReplay = () => {
    const urls = audioUrlsRef.current;
    if (urls.length === 0) return;

    const runId = ++runIdRef.current;
    setPlaying(true);
    let idx = 0;
    const play = () => {
      // Superseded by stop or a new run, which already updated the state.
      if (runIdRef.current !== runId) return;
      const url = urls[idx];
      if (url === undefined) {
        setPlaying(false);
        return;
      }
      idx++;
      playClip(url, play);
    };
    play();
  };

  if (profiles.length === 0) return null;

  return (
    <div className="flex items-center gap-2 flex-wrap">
      <select
        value={selectedProfile}
        onChange={e => setSelectedProfile(e.target.value)}
        className="text-xs px-2 py-1 rounded bg-[var(--color-bg-hover)] border border-[var(--color-border)]"
      >
        {profiles.map(p => <option key={p.id} value={p.id}>{p.name}</option>)}
      </select>
      {playing || generating ? (
        <button
          type="button"
          onClick={handleStop}
          className="text-xs px-3 py-1 bg-red-500/20 text-red-400 rounded hover:bg-red-500/30"
        >
          {progress || "중지"}
        </button>
      ) : (
        <button
          type="button"
          onClick={handleGenerate}
          disabled={!selectedProfile}
          className="text-xs px-3 py-1 bg-[var(--color-primary)]/20 text-[var(--color-primary)] rounded hover:bg-[var(--color-primary)]/30 disabled:opacity-40"
        >
          {/* NOTE(review): this button had no content; label chosen to match the SpeakableText tooltip — confirm wording. */}
          🔊 읽어주기
        </button>
      )}
      {audioUrlsRef.current.length > 0 && !playing && !generating && (
        <button
          type="button"
          onClick={handleReplay}
          className="text-xs px-3 py-1 bg-[var(--color-bg-hover)] border border-[var(--color-border)] rounded"
        >
          {/* NOTE(review): this button had no content either — confirm wording. */}
          🔁 다시 듣기
        </button>
      )}
    </div>
  );
}