Fix TTS: switch to 1.7B with ref_audio, speakable text on all lines

- Use 1.7B model (0.6B had tensor mismatch with cached prompts)
- Speak endpoint uses ref_audio directly (not cached pkl) as fallback
- Cache voice clone prompts in memory on startup
- Add SpeakableText component: 🔊 icon on each p and li element
- Remove old TTSReader sequential approach
- Add global exception handler to TTS server
- Fix profile localStorage caching
- inference_mode + bf16 optimization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 12:14:06 +00:00
parent 1088b23790
commit 20210830cf
6 changed files with 440 additions and 158 deletions
--- a/sundol-frontend/src/app/knowledge/[id]/page.tsx
+++ b/sundol-frontend/src/app/knowledge/[id]/page.tsx
@@ -6,6 +6,7 @@ import AuthGuard from "@/components/auth-guard";
 import NavBar from "@/components/nav-bar";
 import { useApi } from "@/lib/use-api";
 import ReactMarkdown from "react-markdown";
+import SpeakableText from "@/components/speakable-text";

 interface Category {
  ID: string;
@@ -314,10 +315,18 @@ export default function KnowledgeDetailPage() {
                      h1: ({children}) => <h1 className="text-xl font-bold mt-6 mb-3">{children}</h1>,
                      h2: ({children}) => <h2 className="text-lg font-bold mt-5 mb-2">{children}</h2>,
                      h3: ({children}) => <h3 className="text-base font-bold mt-4 mb-2">{children}</h3>,
-                      p: ({children}) => <p className="mb-3">{children}</p>,
+                      p: ({children, node}) => {
+                        // Walk the hast subtree so text nested inside inline elements
+                        // (strong, em, a, code) is spoken too, not only direct text children.
+                        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+                        const collectText = (n: any): string => {
+                          if (n?.type === 'text') return n.value ?? '';
+                          return (n?.children ?? []).map(collectText).join('');
+                        };
+                        const txt = collectText(node);
+                        return <p className="mb-3"><SpeakableText text={txt}>{children}</SpeakableText></p>;
+                      },
                      ul: ({children}) => <ul className="list-disc ml-5 mb-3 space-y-1">{children}</ul>,
                      ol: ({children}) => <ol className="list-decimal ml-5 mb-3 space-y-1">{children}</ol>,
-                      li: ({children}) => <li className="leading-relaxed">{children}</li>,
+                      li: ({children, node}) => {
+                        // Gather text from direct text nodes and from inline elements (strong, em, a, code).
+                        // Block-level children are skipped so a parent item does not also read
+                        // the content of nested paragraphs or sub-lists.
+                        const blockTags = ['p', 'ul', 'ol', 'pre', 'blockquote'];
+                        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+                        const collectText = (n: any): string => {
+                          if (n?.type === 'text') return n.value ?? '';
+                          if (blockTags.includes(n?.tagName)) return '';
+                          return (n?.children ?? []).map(collectText).join('');
+                        };
+                        const txt = collectText(node);
+                        return <li className="leading-relaxed"><SpeakableText text={txt}>{children}</SpeakableText></li>;
+                      },
                      strong: ({children}) => <strong className="font-bold">{children}</strong>,
                      blockquote: ({children}) => <blockquote className="border-l-2 border-[var(--color-primary)] pl-4 my-3 italic text-[var(--color-text-muted)]">{children}</blockquote>,
                      code: ({children}) => <code className="bg-[var(--color-bg-hover)] px-1.5 py-0.5 rounded text-xs">{children}</code>,
--- a/sundol-frontend/src/app/notes/[id]/page.tsx
+++ b/sundol-frontend/src/app/notes/[id]/page.tsx
@@ -6,6 +6,7 @@ import AuthGuard from "@/components/auth-guard";
 import NavBar from "@/components/nav-bar";
 import { useApi } from "@/lib/use-api";
 import ReactMarkdown from "react-markdown";
+import SpeakableText from "@/components/speakable-text";

 interface NoteDetail {
  ID: string;
@@ -183,10 +184,18 @@ export default function NoteDetailPage() {
                  h1: ({children}) => <h1 className="text-xl font-bold mt-6 mb-3">{children}</h1>,
                  h2: ({children}) => <h2 className="text-lg font-bold mt-5 mb-2">{children}</h2>,
                  h3: ({children}) => <h3 className="text-base font-bold mt-4 mb-2">{children}</h3>,
-                  p: ({children}) => <p className="mb-3">{children}</p>,
+                  p: ({children, node}) => {
+                    // Walk the hast subtree so text nested inside inline elements
+                    // (strong, em, a, code) is spoken too, not only direct text children.
+                    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+                    const collectText = (n: any): string => {
+                      if (n?.type === 'text') return n.value ?? '';
+                      return (n?.children ?? []).map(collectText).join('');
+                    };
+                    const txt = collectText(node);
+                    return <p className="mb-3"><SpeakableText text={txt}>{children}</SpeakableText></p>;
+                  },
                  ul: ({children}) => <ul className="list-disc ml-5 mb-3 space-y-1">{children}</ul>,
                  ol: ({children}) => <ol className="list-decimal ml-5 mb-3 space-y-1">{children}</ol>,
-                  li: ({children}) => <li className="leading-relaxed">{children}</li>,
+                  li: ({children, node}) => {
+                    // Gather text from direct text nodes and from inline elements (strong, em, a, code).
+                    // Block-level children are skipped so a parent item does not also read
+                    // the content of nested paragraphs or sub-lists.
+                    const blockTags = ['p', 'ul', 'ol', 'pre', 'blockquote'];
+                    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+                    const collectText = (n: any): string => {
+                      if (n?.type === 'text') return n.value ?? '';
+                      if (blockTags.includes(n?.tagName)) return '';
+                      return (n?.children ?? []).map(collectText).join('');
+                    };
+                    const txt = collectText(node);
+                    return <li className="leading-relaxed"><SpeakableText text={txt}>{children}</SpeakableText></li>;
+                  },
                  strong: ({children}) => <strong className="font-bold">{children}</strong>,
                  blockquote: ({children}) => <blockquote className="border-l-2 border-[var(--color-primary)] pl-4 my-3 italic text-[var(--color-text-muted)]">{children}</blockquote>,
                }}
--- a/sundol-frontend/src/app/tts/page.tsx
+++ b/sundol-frontend/src/app/tts/page.tsx
@@ -45,8 +45,21 @@ export default function TTSPage() {
  }, []);

  const fetchProfiles = () => {
+    // Cache first: show locally cached profiles immediately, then refresh them from the API below.
+    const cached = localStorage.getItem("tts_profiles");
+    if (cached) {
+      try {
+        const data = JSON.parse(cached);
+        setProfiles(data);
+        if (data.length > 0 && !selectedProfile) setSelectedProfile(data[0].id);
+      } catch {} // a corrupt cache entry is ignored; the fetch below still runs
+    }
    fetch("/api/tts/profiles").then(r => r.json())
-      .then(setProfiles).catch(() => {});
+      .then(data => {
+        setProfiles(data);
+        localStorage.setItem("tts_profiles", JSON.stringify(data)); // refresh the cache with the server response
+        if (data.length > 0 && !selectedProfile) setSelectedProfile(data[0].id); // NOTE(review): selectedProfile is the value captured when this closure was created — confirm it cannot be stale here
+      }).catch(() => {}); // request and parse failures are swallowed silently
  };

  const startRecording = async () => {
@@ -97,6 +110,7 @@ export default function TTSPage() {
      setRecordedUrl(null);
      setUploadedFile(null);
      fetchProfiles();
+      localStorage.removeItem("tts_profiles"); // 캐시 강제 갱신
      setSelectedProfile(result.id);
      setTab("generate");
    } catch (err) {
@@ -125,9 +139,10 @@ export default function TTSPage() {
      fd.append("text", text);
      fd.append("profile_id", selectedProfile);
      fd.append("language", language);
-      const res = await fetch("/api/tts/generate", { method: "POST", body: fd });
+      const res = await fetch("/api/tts/speak", { method: "POST", body: fd });
      if (!res.ok) throw new Error(`HTTP ${res.status}`);
      const blob = await res.blob();
+      if (blob.size < 100) throw new Error("Empty audio");
      setOutputUrl(URL.createObjectURL(blob));
    } catch (err) {
      setError("생성 실패: " + (err instanceof Error ? err.message : ""));
--- a/sundol-frontend/src/components/speakable-text.tsx
+++ b/sundol-frontend/src/components/speakable-text.tsx
@@ -0,0 +1,85 @@
+"use client";
+
+import { useState, useRef, useEffect } from "react";
+
+interface SpeakableProps { // Props accepted by SpeakableText.
+  children: React.ReactNode; // content rendered as-is; the speak button is appended after it
+  text: string; // plain text posted to /api/tts/speak when the button is clicked
+}
+
+/**
+ * Voice profile id shared by every SpeakableText instance on the page.
+ * Only a successful lookup is memoised, so a profile that is created after the
+ * first render is still picked up by components that mount later.
+ */
+let cachedProfileId: string | null = null;
+
+/** Returns the id of the first profile in the "tts_profiles" localStorage cache, or null. */
+function readCachedProfileId(): string | null {
+  try {
+    const parsed: unknown = JSON.parse(localStorage.getItem("tts_profiles") || "[]");
+    if (!Array.isArray(parsed) || parsed.length === 0) return null;
+    const id: unknown = parsed[0]?.id;
+    return id ? String(id) : null;
+  } catch {
+    // Missing, blocked or corrupt storage simply means "no profile available".
+    return null;
+  }
+}
+
+/**
+ * Renders `children` followed by a small speak button that synthesises `text`
+ * through POST /api/tts/speak and plays the returned audio.
+ *
+ * The button is only rendered when a voice profile is cached in localStorage
+ * and `text` has at least 5 characters. Clicking while a clip is playing stops it.
+ */
+export default function SpeakableText({ children, text }: SpeakableProps) {
+  const [playing, setPlaying] = useState(false);
+  const [loading, setLoading] = useState(false);
+  const [hasProfile, setHasProfile] = useState(false);
+  const audioRef = useRef<HTMLAudioElement | null>(null);
+  // Object URL backing the current clip; kept so it can be revoked when the clip is released.
+  const urlRef = useRef<string | null>(null);
+  const mountedRef = useRef(true);
+
+  useEffect(() => {
+    mountedRef.current = true;
+    if (!cachedProfileId) cachedProfileId = readCachedProfileId();
+    setHasProfile(!!cachedProfileId);
+    return () => {
+      // Stop audio and free the blob when the component leaves the page.
+      mountedRef.current = false;
+      audioRef.current?.pause();
+      audioRef.current = null;
+      if (urlRef.current) {
+        URL.revokeObjectURL(urlRef.current);
+        urlRef.current = null;
+      }
+    };
+  }, []);
+
+  /** Pauses the current clip (if any) and releases its object URL. */
+  const stopPlayback = () => {
+    audioRef.current?.pause();
+    audioRef.current = null;
+    if (urlRef.current) {
+      URL.revokeObjectURL(urlRef.current);
+      urlRef.current = null;
+    }
+  };
+
+  const handleSpeak = async (e: React.MouseEvent) => {
+    e.preventDefault();
+    e.stopPropagation();
+
+    if (playing) {
+      stopPlayback();
+      setPlaying(false);
+      return;
+    }
+
+    if (!cachedProfileId || text.length < 5) return;
+
+    setLoading(true);
+    try {
+      const fd = new FormData();
+      fd.append("text", text);
+      fd.append("profile_id", cachedProfileId);
+      fd.append("language", "Korean");
+      const res = await fetch("/api/tts/speak", { method: "POST", body: fd });
+      if (!res.ok) return;
+      const blob = await res.blob();
+      // Very small bodies are treated as an empty or failed synthesis.
+      if (blob.size < 200) return;
+      // The component may have unmounted while the request was in flight.
+      if (!mountedRef.current) return;
+
+      stopPlayback();
+      const url = URL.createObjectURL(blob);
+      urlRef.current = url;
+      const audio = new Audio(url);
+      audioRef.current = audio;
+
+      const finish = () => {
+        // Ignore events from a clip that was already stopped or replaced.
+        if (audioRef.current !== audio) return;
+        stopPlayback();
+        if (mountedRef.current) setPlaying(false);
+      };
+      audio.onended = finish;
+      audio.onerror = finish;
+      setPlaying(true);
+      // play() returns a promise that rejects when playback cannot start
+      // (autoplay policy, decode failure); reset the button in that case.
+      audio.play().catch(finish);
+    } catch {
+      // Best effort: a failed request just leaves the button in its idle state.
+    } finally {
+      if (mountedRef.current) setLoading(false);
+    }
+  };
+
+  if (!hasProfile || text.length < 5) return <>{children}</>;
+
+  return (
+    <>
+      {children}
+      <button
+        type="button"
+        onClick={handleSpeak}
+        disabled={loading}
+        className="inline-flex items-center ml-1 text-[var(--color-text-muted)] hover:text-[var(--color-primary)] disabled:opacity-30 align-middle"
+        title={playing ? "중지" : "읽어주기"}
+        style={{ fontSize: "0.85em", verticalAlign: "middle", cursor: "pointer" }}
+      >
+        {loading ? "⏳" : playing ? "⏹" : "🔊"}
+      </button>
+    </>
+  );
+}
--- a/sundol-frontend/src/components/tts-reader.tsx
+++ b/sundol-frontend/src/components/tts-reader.tsx
@@ -0,0 +1,159 @@
+"use client";
+
+import { useState, useEffect, useRef } from "react";
+
+interface TTSReaderProps { // Props accepted by TTSReader.
+  text: string; // markdown source; split into lines by toSentences before synthesis
+}
+
+interface VoiceProfile { // One entry of the list fetched from /api/tts/profiles.
+  id: string; // sent as "profile_id" to /api/tts/speak
+  name: string; // label shown in the profile <select>
+}
+
+/**
+ * Reads a markdown document aloud, one line at a time.
+ *
+ * Each speakable line is posted to /api/tts/speak with the selected voice
+ * profile; returned clips are queued and played in order while later lines are
+ * still being synthesised. Renders nothing until at least one profile is known.
+ */
+export default function TTSReader({ text }: TTSReaderProps) {
+  const [profiles, setProfiles] = useState<VoiceProfile[]>([]);
+  const [selectedProfile, setSelectedProfile] = useState("");
+  const [generating, setGenerating] = useState(false);
+  const [playing, setPlaying] = useState(false);
+  const [progress, setProgress] = useState("");
+  const audioRef = useRef<HTMLAudioElement | null>(null);
+  const stoppedRef = useRef(false);
+  const audioUrlsRef = useRef<string[]>([]);
+  // Incremented for every generate/replay run so callbacks of an older run can detect they are obsolete.
+  const runIdRef = useRef(0);
+
+  useEffect(() => {
+    let cancelled = false;
+
+    // Accepts a profile list from the cache or the API; anything that is not an array is ignored.
+    const applyProfiles = (data: unknown, persist: boolean) => {
+      if (cancelled || !Array.isArray(data)) return;
+      const list = data as VoiceProfile[];
+      setProfiles(list);
+      if (list.length > 0) {
+        // Functional update: keep the current selection when it still exists,
+        // instead of relying on the value captured when the effect was created.
+        setSelectedProfile(prev => (list.some(p => p.id === prev) ? prev : list[0].id));
+      }
+      if (persist) localStorage.setItem("tts_profiles", JSON.stringify(list));
+    };
+
+    // Show the localStorage cache first.
+    try {
+      const cached = localStorage.getItem("tts_profiles");
+      if (cached) applyProfiles(JSON.parse(cached), false);
+    } catch {}
+
+    // Refresh in the background (non-blocking).
+    fetch("/api/tts/profiles").then(r => r.json())
+      .then(data => applyProfiles(data, true))
+      .catch(() => {});
+
+    return () => {
+      // Stop audio, abort any running loop and free queued blobs on unmount.
+      cancelled = true;
+      stoppedRef.current = true;
+      audioRef.current?.pause();
+      audioUrlsRef.current.forEach(u => URL.revokeObjectURL(u));
+      audioUrlsRef.current = [];
+    };
+  }, []);
+
+  /** Strips common markdown syntax and returns the lines that are at least 10 characters long. */
+  const toSentences = (md: string): string[] => {
+    return md
+      .replace(/^#+\s+.*$/gm, "")
+      .replace(/\*\*/g, "")
+      .replace(/^[-*]\s+/gm, "")
+      .replace(/^>\s+/gm, "")
+      .replace(/---+/g, "")
+      .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
+      .split("\n")
+      .map(s => s.trim())
+      .filter(s => s.length >= 10);
+  };
+
+  /** Synthesises one chunk synchronously; resolves to an object URL for the audio, or null on failure. */
+  const speak = async (chunk: string): Promise<string | null> => {
+    const fd = new FormData();
+    fd.append("text", chunk);
+    fd.append("profile_id", selectedProfile);
+    fd.append("language", "Korean");
+    const res = await fetch("/api/tts/speak", { method: "POST", body: fd });
+    if (!res.ok) return null;
+    const blob = await res.blob();
+    return blob.size > 100 ? URL.createObjectURL(blob) : null;
+  };
+
+  const handleGenerate = async () => {
+    if (!selectedProfile || !text.trim()) return;
+    setGenerating(true);
+    setPlaying(true);
+    stoppedRef.current = false;
+    const runId = ++runIdRef.current;
+    // True while this run has neither been stopped nor superseded by a newer run.
+    const active = () => !stoppedRef.current && runIdRef.current === runId;
+
+    // Free the clips of the previous run before starting a new queue.
+    audioUrlsRef.current.forEach(u => URL.revokeObjectURL(u));
+    audioUrlsRef.current = [];
+
+    const sentences = toSentences(text);
+    let isAudioPlaying = false;
+    let generationDone = false;
+    let playIdx = 0;
+
+    const playNext = () => {
+      if (!active()) return;
+      if (playIdx >= audioUrlsRef.current.length) {
+        isAudioPlaying = false;
+        // Queue drained after synthesis finished: playback is over.
+        if (generationDone) setPlaying(false);
+        return;
+      }
+      isAudioPlaying = true;
+      const a = new Audio(audioUrlsRef.current[playIdx++]);
+      audioRef.current = a;
+      // A clip may report both an error event and a rejected play(); advance only once.
+      let advanced = false;
+      const advance = () => {
+        if (advanced) return;
+        advanced = true;
+        playNext();
+      };
+      a.onended = advance;
+      a.onerror = advance;
+      a.play().catch(advance);
+    };
+
+    try {
+      for (let i = 0; i < sentences.length; i++) {
+        if (!active()) break;
+        setProgress(`${i + 1}/${sentences.length}`);
+        let url: string | null = null;
+        try {
+          url = await speak(sentences[i]);
+        } catch {
+          // A failed request skips this line instead of aborting the whole run.
+          url = null;
+        }
+        if (!url) continue;
+        if (!active()) {
+          // Stopped or superseded while the request was in flight: discard the clip.
+          URL.revokeObjectURL(url);
+          break;
+        }
+        audioUrlsRef.current.push(url);
+        if (!isAudioPlaying) playNext();
+      }
+    } finally {
+      generationDone = true;
+      // When stopped, handleStop already reset the state; when superseded, the newer run owns it.
+      if (active()) {
+        setGenerating(false);
+        setProgress("");
+        if (!isAudioPlaying) setPlaying(false);
+      }
+    }
+  };
+
+  const handleStop = () => {
+    stoppedRef.current = true;
+    audioRef.current?.pause();
+    setPlaying(false);
+    setGenerating(false);
+    setProgress("");
+  };
+
+  /** Plays the already synthesised clips again from the beginning. */
+  const handleReplay = () => {
+    if (audioUrlsRef.current.length === 0) return;
+    stoppedRef.current = false;
+    // Invalidate any earlier run that might still be waiting on a request.
+    const runId = ++runIdRef.current;
+    setPlaying(true);
+    let idx = 0;
+    const play = () => {
+      if (runIdRef.current !== runId) return;
+      if (idx >= audioUrlsRef.current.length || stoppedRef.current) { setPlaying(false); return; }
+      const audio = new Audio(audioUrlsRef.current[idx]);
+      audioRef.current = audio;
+      idx++;
+      let advanced = false;
+      const advance = () => {
+        if (advanced) return;
+        advanced = true;
+        play();
+      };
+      audio.onended = advance;
+      audio.onerror = advance;
+      audio.play().catch(advance);
+    };
+    play();
+  };
+
+  if (profiles.length === 0) return null;
+
+  return (
+    <div className="flex items-center gap-2 flex-wrap">
+      <select value={selectedProfile} onChange={e => setSelectedProfile(e.target.value)}
+        className="text-xs px-2 py-1 rounded bg-[var(--color-bg-hover)] border border-[var(--color-border)]">
+        {profiles.map(p => <option key={p.id} value={p.id}>{p.name}</option>)}
+      </select>
+
+      {playing || generating ? (
+        <button type="button" onClick={handleStop}
+          className="text-xs px-3 py-1 bg-red-500/20 text-red-400 rounded hover:bg-red-500/30">
+          {progress || "중지"}
+        </button>
+      ) : (
+        <button type="button" onClick={handleGenerate} disabled={!selectedProfile}
+          className="text-xs px-3 py-1 bg-[var(--color-primary)]/20 text-[var(--color-primary)] rounded hover:bg-[var(--color-primary)]/30 disabled:opacity-40">
+          읽어주기
+        </button>
+      )}
+
+      {audioUrlsRef.current.length > 0 && !playing && !generating && (
+        <button type="button" onClick={handleReplay}
+          className="text-xs px-3 py-1 bg-[var(--color-bg-hover)] border border-[var(--color-border)] rounded">
+          다시 재생
+        </button>
+      )}
+    </div>
+  );
+}