Files
sundol/tts-server.py
joungmin 1088b23790 Add Notes, Voice Clone TTS, fix auth persistence and maxTokens
Notes:
- notes table with TEXT/AUDIO types, category support
- Audio upload → OpenRouter Gemini STT → OCI GenAI polish/summary
- Raw STT saved separately in raw_content column
- Polish/summary button for manual re-processing
- Async processing with real-time polling

Voice Clone TTS:
- Qwen3-TTS 1.7B model on A10 GPU via FastAPI server
- Voice profile registration (record/upload → save embedding)
- Profile-based TTS generation API
- TTS web page with recording, profile management, generation

Auth fixes:
- Store both access + refresh tokens in localStorage
- Initialize state from localStorage synchronously (no flash)
- Request interceptor reads token from localStorage every request
- Refresh via body (not just cookie)

Other fixes:
- maxTokens 4096 → 65536 (OCI GenAI Gemini supports up to 65536)
- Fix broken Korean chars in source files
- OpenRouter config for STT
- ffmpeg installed for audio conversion
- Ollama + Gemma 4 E4B installed (STT fallback)
- nginx proxy for TTS server (/api/tts/)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 07:34:18 +00:00

212 lines
6.8 KiB
Python

"""
Qwen3-TTS Voice Clone API Server
별도 프로세스로 실행 (GPU 메모리 관리를 위해)
"""
import base64
import io
import json
import os
import pickle
import subprocess
import tempfile

import numpy as np
import soundfile as sf
import torch
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
# ASGI application object; the route decorators below register handlers on it.
app = FastAPI()
# NOTE(review): CORS is fully open (any origin, method and header) — confirm
# this is intended outside of development.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
# Process-wide model handle; stays None until get_model() loads it on first use.
model = None
# Voice profiles (.pkl prompt + .json metadata) are stored next to this script.
PROFILES_DIR = os.path.join(os.path.dirname(__file__), "voice-profiles")
# Create the directory at import time so the handlers can assume it exists.
os.makedirs(PROFILES_DIR, exist_ok=True)
def get_model():
    """Return the shared Qwen3-TTS model, loading it on the first call.

    The loaded instance is cached in the module-level ``model`` variable, so
    later calls return the same object without reloading.
    """
    global model

    # Fast path: the model has already been loaded by an earlier call.
    if model is not None:
        return model

    # Imported lazily so that importing this module does not pull in the
    # TTS package until a model is actually required.
    from qwen_tts import Qwen3TTSModel

    print("Loading Qwen3-TTS model...")
    load_options = {
        "device_map": "cuda:0",
        "dtype": torch.bfloat16,
    }
    model = Qwen3TTSModel.from_pretrained(
        "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
        **load_options,
    )
    print("Model loaded!")
    return model
@app.get("/health")
@app.get("/api/tts/health")
def health():
    """Liveness probe; also reports whether the TTS model is already loaded."""
    is_loaded = model is not None
    payload = {"status": "ok", "model_loaded": is_loaded}
    return payload
@app.post("/api/tts/clone")
async def voice_clone(
    text: str = Form(...),
    language: str = Form("korean"),
    ref_audio: UploadFile = File(...),
    ref_text: str = Form(""),
):
    """Clone a voice from an uploaded reference recording and speak ``text``.

    Args:
        text: Text to synthesize.
        language: Language name forwarded to the model.
        ref_audio: Reference recording. Anything whose filename does not end
            in ``.wav`` is converted to 16 kHz mono WAV with ffmpeg first.
        ref_text: Optional transcript of the reference recording. When blank,
            the model is called with ``x_vector_only_mode=True`` instead.

    Returns:
        A WAV ``StreamingResponse`` on success, or a ``JSONResponse`` carrying
        an ``error`` message with status 400 (reference audio could not be
        converted) or 500 (ffmpeg unavailable / model produced no audio).
    """
    m = get_model()

    # Persist the upload: the model API takes a file path, not bytes.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(await ref_audio.read())
        upload_path = tmp.name

    # Every temporary file created for this request; removed in `finally`.
    cleanup_paths = [upload_path]
    try:
        audio_path = upload_path
        # The filename is client-supplied and may be missing entirely.
        filename = ref_audio.filename or ""
        if not filename.lower().endswith(".wav"):
            audio_path = upload_path + "_converted.wav"
            cleanup_paths.append(audio_path)
            try:
                # Argument list (no shell) so paths are never shell-parsed.
                conversion = subprocess.run(
                    ["ffmpeg", "-i", upload_path, "-ar", "16000", "-ac", "1", "-y", audio_path],
                    stdin=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    check=False,
                )
            except OSError as exc:
                return JSONResponse(
                    status_code=500,
                    content={"error": f"ffmpeg could not be started: {exc}"},
                )
            if conversion.returncode != 0 or not os.path.exists(audio_path):
                return JSONResponse(
                    status_code=400,
                    content={"error": "Could not convert reference audio to WAV"},
                )

        kwargs = {
            "text": text,
            "language": language,
            "ref_audio": audio_path,
        }
        if ref_text and ref_text.strip():
            kwargs["ref_text"] = ref_text
        else:
            kwargs["x_vector_only_mode"] = True

        wavs, sr = m.generate_voice_clone(**kwargs)
        print(f"Clone generated: wavs={len(wavs)}, samples={len(wavs[0]) if len(wavs) > 0 else 0}, sr={sr}")
        # Explicit len() checks: the outputs may be arrays, whose truthiness
        # is ambiguous.
        if len(wavs) == 0 or len(wavs[0]) == 0:
            return JSONResponse(status_code=500, content={"error": "Empty audio generated"})

        audio_data = np.array(wavs[0], dtype=np.float32)
        buf = io.BytesIO()
        sf.write(buf, audio_data, sr, format="WAV")
        buf.seek(0)
        return StreamingResponse(
            buf,
            media_type="audio/wav",
            headers={"Content-Disposition": "attachment; filename=tts_output.wav"},
        )
    finally:
        for path in cleanup_paths:
            try:
                os.unlink(path)
            except FileNotFoundError:
                pass
@app.post("/api/tts/design")
async def voice_design(
    text: str = Form(...),
    language: str = Form("korean"),
    instruct: str = Form("A calm, professional Korean male voice"),
):
    """Synthesize ``text`` with a voice described in words (no reference audio).

    Args:
        text: Text to synthesize.
        language: Language name forwarded to the model.
        instruct: Natural-language description of the desired voice.

    Returns:
        A WAV ``StreamingResponse`` on success, or a ``JSONResponse`` with
        status 500 and an ``error`` message when the model returns no audio.
    """
    m = get_model()
    # NOTE(review): get_model() loads the "...-Base" checkpoint; confirm that
    # checkpoint actually exposes generate_voice_design.
    wavs, sr = m.generate_voice_design(text=text, instruct=instruct, language=language)

    # Explicit len() checks: the outputs may be arrays, whose truthiness is
    # ambiguous. Without this guard an empty result raised IndexError.
    if len(wavs) == 0 or len(wavs[0]) == 0:
        return JSONResponse(status_code=500, content={"error": "Empty audio generated"})

    # Normalise to float32 like the other synthesis endpoints in this file.
    audio_data = np.array(wavs[0], dtype=np.float32)
    buf = io.BytesIO()
    sf.write(buf, audio_data, sr, format="WAV")
    buf.seek(0)
    return StreamingResponse(
        buf,
        media_type="audio/wav",
        headers={"Content-Disposition": "attachment; filename=tts_output.wav"},
    )
@app.post("/api/tts/profiles")
async def create_profile(
    name: str = Form(...),
    ref_audio: UploadFile = File(...),
    ref_text: str = Form(""),
):
    """Register a voice profile built from an uploaded reference recording.

    The profile id is ``name`` lower-cased with spaces replaced by
    underscores. Two files are written to ``PROFILES_DIR``: ``<id>.pkl`` (the
    pickled voice-clone prompt) and ``<id>.json`` (metadata).

    Args:
        name: Display name of the profile; also the source of the profile id.
        ref_audio: Reference recording. Anything whose filename does not end
            in ``.wav`` is converted to 16 kHz mono WAV with ffmpeg first.
        ref_text: Optional transcript of the reference recording. When blank,
            the prompt is created with ``x_vector_only_mode=True``.

    Returns:
        ``{"id", "name", "status": "created"}`` on success, or a
        ``JSONResponse`` carrying an ``error`` message with status 400
        (invalid name / undecodable audio) or 500 (ffmpeg unavailable).
    """
    profile_id = name.replace(" ", "_").lower()
    # The id becomes a file name, so refuse anything that could escape
    # PROFILES_DIR. Checked before any expensive work is done.
    if (
        not profile_id
        or profile_id in {".", ".."}
        or any(ch in profile_id for ch in ("/", "\\", "\0"))
    ):
        return JSONResponse(status_code=400, content={"error": "Invalid profile name"})

    m = get_model()

    # Persist the upload: the model API takes a file path, not bytes.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(await ref_audio.read())
        upload_path = tmp.name

    # Every temporary file created for this request; removed in `finally`.
    cleanup_paths = [upload_path]
    try:
        audio_path = upload_path
        # The filename is client-supplied and may be missing entirely.
        filename = ref_audio.filename or ""
        if not filename.lower().endswith(".wav"):
            audio_path = upload_path + "_converted.wav"
            cleanup_paths.append(audio_path)
            try:
                # Argument list (no shell) so paths are never shell-parsed.
                conversion = subprocess.run(
                    ["ffmpeg", "-i", upload_path, "-ar", "16000", "-ac", "1", "-y", audio_path],
                    stdin=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    check=False,
                )
            except OSError as exc:
                return JSONResponse(
                    status_code=500,
                    content={"error": f"ffmpeg could not be started: {exc}"},
                )
            if conversion.returncode != 0 or not os.path.exists(audio_path):
                return JSONResponse(
                    status_code=400,
                    content={"error": "Could not convert reference audio to WAV"},
                )

        # Build the reusable prompt.
        kwargs = {"ref_audio": audio_path}
        if ref_text and ref_text.strip():
            kwargs["ref_text"] = ref_text
        else:
            kwargs["x_vector_only_mode"] = True
        prompt = m.create_voice_clone_prompt(**kwargs)

        # Persist the prompt and its metadata.
        profile_path = os.path.join(PROFILES_DIR, f"{profile_id}.pkl")
        meta_path = os.path.join(PROFILES_DIR, f"{profile_id}.json")
        # NOTE(security): pickle files execute code when loaded; PROFILES_DIR
        # must only ever contain files written by this server.
        with open(profile_path, "wb") as f:
            pickle.dump(prompt, f)
        # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII characters.
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump({"id": profile_id, "name": name, "ref_text": ref_text}, f, ensure_ascii=False)
        return {"id": profile_id, "name": name, "status": "created"}
    finally:
        for path in cleanup_paths:
            try:
                os.unlink(path)
            except FileNotFoundError:
                pass
@app.get("/api/tts/profiles")
def list_profiles():
    """Return the metadata of every stored voice profile.

    Reads each ``*.json`` file in ``PROFILES_DIR`` (in sorted file-name order)
    and returns the decoded objects as a list. A metadata file that cannot be
    read or parsed is logged and skipped so that it cannot break the listing.
    """
    profiles = []
    for entry in sorted(os.listdir(PROFILES_DIR)):
        if not entry.endswith(".json"):
            continue
        meta_path = os.path.join(PROFILES_DIR, entry)
        try:
            with open(meta_path, encoding="utf-8") as fh:
                profiles.append(json.load(fh))
        except (OSError, ValueError) as exc:
            # OSError: file vanished or is unreadable (e.g. a concurrent
            # delete); ValueError: invalid JSON or invalid UTF-8.
            print(f"Skipping unreadable profile metadata '{entry}': {exc}")
    return profiles
@app.delete("/api/tts/profiles/{profile_id}")
def delete_profile(profile_id: str):
    """Delete a stored voice profile (its ``.pkl`` prompt and ``.json`` metadata).

    Deleting a profile that does not exist still reports ``deleted``, so the
    operation can be repeated safely.

    Returns:
        ``{"status": "deleted"}``, or a ``JSONResponse`` with status 400 and
        an ``error`` message when ``profile_id`` is not a plain file name.
    """
    # The id becomes a file name, so refuse anything that could address a
    # file outside PROFILES_DIR.
    if (
        not profile_id
        or profile_id in {".", ".."}
        or any(ch in profile_id for ch in ("/", "\\", "\0"))
    ):
        return JSONResponse(status_code=400, content={"error": "Invalid profile id"})

    for suffix in (".pkl", ".json"):
        target = os.path.join(PROFILES_DIR, f"{profile_id}{suffix}")
        try:
            os.unlink(target)
        except FileNotFoundError:
            # Already gone (never existed, or removed concurrently).
            pass
    return {"status": "deleted"}
@app.post("/api/tts/generate")
async def generate_from_profile(
    text: str = Form(...),
    profile_id: str = Form(...),
    language: str = Form("korean"),
):
    """Synthesize ``text`` using a previously stored voice profile.

    Args:
        text: Text to synthesize.
        profile_id: Id of a profile stored as ``<id>.pkl`` in ``PROFILES_DIR``.
        language: Language name forwarded to the model.

    Returns:
        A WAV ``StreamingResponse`` on success. Failures are reported as a
        ``JSONResponse`` with an ``error`` message and a real HTTP status:
        400 (invalid id), 404 (unknown profile) or 500 (no audio produced).
    """
    # The id is turned into a path that is then unpickled, so refuse anything
    # that could address a file outside PROFILES_DIR.
    if (
        not profile_id
        or profile_id in {".", ".."}
        or any(ch in profile_id for ch in ("/", "\\", "\0"))
    ):
        return JSONResponse(status_code=400, content={"error": "Invalid profile id"})

    profile_path = os.path.join(PROFILES_DIR, f"{profile_id}.pkl")
    # Cheap existence check before the (potentially slow) model load.
    # Returning `body, 404` as a tuple would be sent as HTTP 200 with a JSON
    # array, so the status is set explicitly on the response.
    if not os.path.exists(profile_path):
        return JSONResponse(
            status_code=404,
            content={"error": f"Profile '{profile_id}' not found"},
        )

    m = get_model()

    # NOTE(security): pickle.load executes code embedded in the file;
    # PROFILES_DIR must only ever contain files written by this server.
    with open(profile_path, "rb") as f:
        prompt = pickle.load(f)

    print(f"Generating with profile '{profile_id}', text='{text[:50]}...', language={language}")
    wavs, sr = m.generate_voice_clone(
        text=text,
        language=language,
        voice_clone_prompt=prompt,
    )
    print(f"Generated: wavs={len(wavs)}, samples={len(wavs[0]) if len(wavs) > 0 else 0}, sr={sr}")

    # Explicit len() checks: the outputs may be arrays, whose truthiness is
    # ambiguous.
    if len(wavs) == 0 or len(wavs[0]) == 0:
        return JSONResponse(status_code=500, content={"error": "Empty audio generated"})

    audio_data = np.array(wavs[0], dtype=np.float32)
    buf = io.BytesIO()
    sf.write(buf, audio_data, sr, format="WAV")
    buf.seek(0)
    return StreamingResponse(
        buf,
        media_type="audio/wav",
        headers={"Content-Disposition": "attachment; filename=tts_output.wav"},
    )
if __name__ == "__main__":
    import uvicorn

    # Load the model up front so it is resident before the server starts
    # accepting requests.
    get_model()

    # Listen on all interfaces, port 8090.
    bind = {"host": "0.0.0.0", "port": 8090}
    uvicorn.run(app, **bind)