Notes: - notes table with TEXT/AUDIO types, category support - Audio upload → OpenRouter Gemini STT → OCI GenAI polish/summary - Raw STT saved separately in raw_content column - Polish/summary button for manual re-processing - Async processing with real-time polling Voice Clone TTS: - Qwen3-TTS 1.7B model on A10 GPU via FastAPI server - Voice profile registration (record/upload → save embedding) - Profile-based TTS generation API - TTS web page with recording, profile management, generation Auth fixes: - Store both access + refresh tokens in localStorage - Initialize state from localStorage synchronously (no flash) - Request interceptor reads token from localStorage every request - Refresh via body (not just cookie) Other fixes: - maxTokens 4096 → 65536 (OCI GenAI Gemini supports up to 65536) - Fix broken Korean chars in source files - OpenRouter config for STT - ffmpeg installed for audio conversion - Ollama + Gemma 4 E4B installed (STT fallback) - nginx proxy for TTS server (/api/tts/) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
212 lines
6.8 KiB
Python
212 lines
6.8 KiB
Python
"""
|
|
Qwen3-TTS Voice Clone API Server
|
|
별도 프로세스로 실행 (GPU 메모리 관리를 위해)
|
|
"""
|
|
import os
import io
import base64
import json
import pickle
import subprocess
import tempfile

import torch
import soundfile as sf
import numpy as np
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import StreamingResponse
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
# Application object. CORS is fully open: any origin, method and header.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Populated by get_model() on first use; None until then.
model = None

# Voice profiles live in a folder next to this file; it is created on import.
PROFILES_DIR = os.path.join(os.path.dirname(__file__), "voice-profiles")
os.makedirs(PROFILES_DIR, exist_ok=True)
|
|
|
|
def get_model():
    """Return the shared Qwen3-TTS model, loading it on the first call.

    The loaded instance is cached in the module-level ``model`` variable, so
    later calls return the same object without reloading.
    """
    global model
    if model is not None:
        return model

    # The import happens here, on first load, not at module import time.
    from qwen_tts import Qwen3TTSModel

    load_options = {"device_map": "cuda:0", "dtype": torch.bfloat16}
    print("Loading Qwen3-TTS model...")
    model = Qwen3TTSModel.from_pretrained(
        "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
        **load_options,
    )
    print("Model loaded!")
    return model
|
|
|
|
@app.get("/health")
@app.get("/api/tts/health")
def health():
    """Report that the server is up and whether the model is already loaded."""
    is_loaded = model is not None
    return {"status": "ok", "model_loaded": is_loaded}
|
|
|
|
@app.post("/api/tts/clone")
async def voice_clone(
    text: str = Form(...),
    language: str = Form("korean"),
    ref_audio: UploadFile = File(...),
    ref_text: str = Form(""),
):
    """Synthesize ``text`` in the voice of an uploaded reference recording.

    Args:
        text: Text to synthesize.
        language: Language name forwarded to the model.
        ref_audio: Reference recording. Uploads whose filename does not end in
            ``.wav`` (or have no filename) are converted with ffmpeg to
            16 kHz mono WAV first.
        ref_text: Optional transcript of the reference recording. When blank,
            the model is called with ``x_vector_only_mode=True`` instead.

    Returns:
        A WAV ``StreamingResponse`` on success. On failure, a JSON
        ``{"error": ...}`` response: 400 when the upload cannot be decoded,
        500 when ffmpeg cannot be started or the model produced no audio.
    """
    m = get_model()

    # Persist the upload so ffmpeg and the model can read it by path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(await ref_audio.read())
        upload_path = tmp.name

    # Track every temporary path so the finally block removes all of them,
    # including a partially written conversion output.
    temp_paths = [upload_path]
    try:
        audio_path = upload_path

        # The client may omit the filename; in that case convert to be safe.
        if not (ref_audio.filename or "").lower().endswith(".wav"):
            audio_path = upload_path + "_converted.wav"
            temp_paths.append(audio_path)
            try:
                # Argument list (no shell) and an explicit exit-status check:
                # a failed conversion must not reach the model as a missing file.
                converted = subprocess.run(
                    [
                        "ffmpeg", "-i", upload_path,
                        "-ar", "16000", "-ac", "1",
                        "-y", audio_path,
                    ],
                    stdin=subprocess.DEVNULL,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    check=False,
                )
            except OSError as exc:
                print(f"ffmpeg could not be started: {exc}")
                return JSONResponse(
                    status_code=500,
                    content={"error": "Audio conversion is unavailable"},
                )
            if converted.returncode != 0:
                return JSONResponse(
                    status_code=400,
                    content={"error": "Could not decode reference audio"},
                )

        kwargs = {
            "text": text,
            "language": language,
            "ref_audio": audio_path,
        }
        if ref_text and ref_text.strip():
            kwargs["ref_text"] = ref_text
        else:
            # No transcript supplied: fall back to x-vector-only mode.
            kwargs["x_vector_only_mode"] = True

        wavs, sr = m.generate_voice_clone(**kwargs)
        print(f"Clone generated: wavs={len(wavs)}, samples={len(wavs[0]) if len(wavs) > 0 else 0}, sr={sr}")

        # Same guard as generate_from_profile: never index into an empty result.
        if len(wavs) == 0 or len(wavs[0]) == 0:
            return JSONResponse(
                status_code=500,
                content={"error": "Empty audio generated"},
            )

        audio_data = np.array(wavs[0], dtype=np.float32)
        buf = io.BytesIO()
        sf.write(buf, audio_data, sr, format="WAV")
        buf.seek(0)

        return StreamingResponse(
            buf,
            media_type="audio/wav",
            headers={"Content-Disposition": "attachment; filename=tts_output.wav"},
        )
    finally:
        for path in temp_paths:
            if os.path.exists(path):
                os.unlink(path)
|
|
|
|
@app.post("/api/tts/design")
async def voice_design(
    text: str = Form(...),
    language: str = Form("korean"),
    instruct: str = Form("A calm, professional Korean male voice"),
):
    """Synthesize ``text`` with a voice described in words (no reference audio).

    Args:
        text: Text to synthesize.
        language: Language name forwarded to the model.
        instruct: Description of the desired voice, forwarded to the model.

    Returns:
        A WAV ``StreamingResponse`` on success, or a 500 JSON
        ``{"error": ...}`` response when the model produced no audio.
    """
    # NOTE(review): get_model() loads the "...-1.7B-Base" checkpoint; confirm
    # that this checkpoint supports generate_voice_design.
    m = get_model()
    wavs, sr = m.generate_voice_design(text=text, instruct=instruct, language=language)

    # Same guard as generate_from_profile: never index into an empty result.
    if len(wavs) == 0 or len(wavs[0]) == 0:
        return JSONResponse(
            status_code=500,
            content={"error": "Empty audio generated"},
        )

    # Convert to float32 like the other synthesis endpoints do before writing.
    audio_data = np.array(wavs[0], dtype=np.float32)
    buf = io.BytesIO()
    sf.write(buf, audio_data, sr, format="WAV")
    buf.seek(0)

    return StreamingResponse(
        buf,
        media_type="audio/wav",
        headers={"Content-Disposition": "attachment; filename=tts_output.wav"},
    )
|
|
|
|
@app.post("/api/tts/profiles")
async def create_profile(
    name: str = Form(...),
    ref_audio: UploadFile = File(...),
    ref_text: str = Form(""),
):
    """Register a voice profile built from a reference recording.

    The profile id is ``name`` lower-cased with spaces replaced by
    underscores. The voice-clone prompt is pickled to ``<id>.pkl`` and the
    metadata is written to ``<id>.json`` inside ``PROFILES_DIR``.

    Args:
        name: Display name of the profile; also determines the profile id.
        ref_audio: Reference recording. Uploads whose filename does not end in
            ``.wav`` (or have no filename) are converted with ffmpeg to
            16 kHz mono WAV first.
        ref_text: Optional transcript of the reference recording. When blank,
            the prompt is created with ``x_vector_only_mode=True``.

    Returns:
        ``{"id", "name", "status": "created"}`` on success. On failure, a JSON
        ``{"error": ...}`` response: 400 for an invalid name or undecodable
        audio, 500 when ffmpeg cannot be started.
    """
    # Validate before any expensive work. The id becomes part of a file path,
    # so it must not be able to leave PROFILES_DIR.
    profile_id = name.replace(" ", "_").lower()
    if (
        not profile_id
        or profile_id in {".", ".."}
        or any(ch in profile_id for ch in ("/", "\\", "\0"))
    ):
        return JSONResponse(
            status_code=400,
            content={"error": "Invalid profile name"},
        )

    m = get_model()

    # Persist the upload so ffmpeg and the model can read it by path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(await ref_audio.read())
        upload_path = tmp.name

    # Track every temporary path so the finally block removes all of them.
    temp_paths = [upload_path]
    try:
        audio_path = upload_path

        # The client may omit the filename; in that case convert to be safe.
        if not (ref_audio.filename or "").lower().endswith(".wav"):
            audio_path = upload_path + "_converted.wav"
            temp_paths.append(audio_path)
            try:
                # Argument list (no shell) and an explicit exit-status check.
                converted = subprocess.run(
                    [
                        "ffmpeg", "-i", upload_path,
                        "-ar", "16000", "-ac", "1",
                        "-y", audio_path,
                    ],
                    stdin=subprocess.DEVNULL,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    check=False,
                )
            except OSError as exc:
                print(f"ffmpeg could not be started: {exc}")
                return JSONResponse(
                    status_code=500,
                    content={"error": "Audio conversion is unavailable"},
                )
            if converted.returncode != 0:
                return JSONResponse(
                    status_code=400,
                    content={"error": "Could not decode reference audio"},
                )

        # Build the prompt arguments once, then make a single model call.
        kwargs = {"ref_audio": audio_path}
        if ref_text and ref_text.strip():
            kwargs["ref_text"] = ref_text
        else:
            kwargs["x_vector_only_mode"] = True
        prompt = m.create_voice_clone_prompt(**kwargs)

        profile_path = os.path.join(PROFILES_DIR, f"{profile_id}.pkl")
        meta_path = os.path.join(PROFILES_DIR, f"{profile_id}.json")

        with open(profile_path, "wb") as f:
            pickle.dump(prompt, f)
        # ensure_ascii=False writes non-ASCII names verbatim, so the encoding
        # is fixed explicitly, not left to the locale.
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(
                {"id": profile_id, "name": name, "ref_text": ref_text},
                f,
                ensure_ascii=False,
            )

        return {"id": profile_id, "name": name, "status": "created"}
    finally:
        for path in temp_paths:
            if os.path.exists(path):
                os.unlink(path)
|
|
|
|
@app.get("/api/tts/profiles")
def list_profiles():
    """Return the metadata of every stored voice profile.

    Reads each ``*.json`` file in ``PROFILES_DIR`` in sorted filename order.

    Returns:
        A list with the parsed content of each readable metadata file.
    """
    profiles = []
    # sorted() makes the order deterministic; os.listdir order is arbitrary.
    for entry in sorted(os.listdir(PROFILES_DIR)):
        if not entry.endswith(".json"):
            continue
        meta_path = os.path.join(PROFILES_DIR, entry)
        try:
            # UTF-8 is explicit so non-ASCII profile names are read correctly
            # whatever the locale is.
            with open(meta_path, encoding="utf-8") as fh:
                profiles.append(json.load(fh))
        except (OSError, ValueError) as exc:
            # One unreadable or malformed file must not break the whole list.
            print(f"Skipping unreadable profile metadata '{entry}': {exc}")
    return profiles
|
|
|
|
@app.delete("/api/tts/profiles/{profile_id}")
def delete_profile(profile_id: str):
    """Delete a stored voice profile.

    Removes ``<profile_id>.pkl`` and ``<profile_id>.json`` from
    ``PROFILES_DIR``. Files that do not exist are ignored, so deleting an
    unknown profile still reports success.

    Args:
        profile_id: Identifier of the profile to remove.

    Returns:
        ``{"status": "deleted"}``, or a 400 JSON ``{"error": ...}`` response
        when ``profile_id`` is not a plain file-name component.
    """
    # The id becomes part of a file path; refuse anything that could leave
    # PROFILES_DIR.
    if (
        not profile_id
        or profile_id in {".", ".."}
        or any(ch in profile_id for ch in ("/", "\\", "\0"))
    ):
        return JSONResponse(
            status_code=400,
            content={"error": "Invalid profile id"},
        )

    for suffix in (".pkl", ".json"):
        try:
            os.unlink(os.path.join(PROFILES_DIR, f"{profile_id}{suffix}"))
        except FileNotFoundError:
            # Already absent: deletion is deliberately idempotent.
            pass
    return {"status": "deleted"}
|
|
|
|
@app.post("/api/tts/generate")
async def generate_from_profile(
    text: str = Form(...),
    profile_id: str = Form(...),
    language: str = Form("korean"),
):
    """Synthesize ``text`` using a previously stored voice profile.

    Args:
        text: Text to synthesize.
        profile_id: Identifier of a profile stored in ``PROFILES_DIR``.
        language: Language name forwarded to the model.

    Returns:
        A WAV ``StreamingResponse`` on success. On failure, a JSON
        ``{"error": ...}`` response: 400 for an invalid id, 404 when the
        profile does not exist, 500 when the model produced no audio.
    """
    # profile_id is client-supplied and becomes part of a path passed to
    # pickle.load, so it must not be able to leave PROFILES_DIR.
    if (
        not profile_id
        or profile_id in {".", ".."}
        or any(ch in profile_id for ch in ("/", "\\", "\0"))
    ):
        return JSONResponse(
            status_code=400,
            content={"error": "Invalid profile id"},
        )

    profile_path = os.path.join(PROFILES_DIR, f"{profile_id}.pkl")
    try:
        with open(profile_path, "rb") as f:
            # NOTE(security): pickle.load can execute arbitrary code. Only
            # files written by create_profile should ever be in PROFILES_DIR.
            prompt = pickle.load(f)
    except FileNotFoundError:
        # FastAPI has no Flask-style "(body, status)" tuple return; a tuple
        # is serialized as a JSON array with status 200. Return a response.
        return JSONResponse(
            status_code=404,
            content={"error": f"Profile '{profile_id}' not found"},
        )

    # Load the model only once the profile is known to exist.
    m = get_model()

    print(f"Generating with profile '{profile_id}', text='{text[:50]}...', language={language}")
    wavs, sr = m.generate_voice_clone(
        text=text,
        language=language,
        voice_clone_prompt=prompt,
    )
    print(f"Generated: wavs={len(wavs)}, samples={len(wavs[0]) if len(wavs) > 0 else 0}, sr={sr}")

    if len(wavs) == 0 or len(wavs[0]) == 0:
        return JSONResponse(
            status_code=500,
            content={"error": "Empty audio generated"},
        )

    audio_data = np.array(wavs[0], dtype=np.float32)
    buf = io.BytesIO()
    sf.write(buf, audio_data, sr, format="WAV")
    buf.seek(0)

    return StreamingResponse(
        buf,
        media_type="audio/wav",
        headers={"Content-Disposition": "attachment; filename=tts_output.wav"},
    )
|
|
|
|
if __name__ == "__main__":
    import uvicorn

    # Load the model before the server starts, not on the first request.
    get_model()

    server_options = {"host": "0.0.0.0", "port": 8090}
    uvicorn.run(app, **server_options)
|