from pathlib import Path
import soundfile as sf
import torch
import qwen_tts
import numpy as np
from transformers import AutoConfig, AutoModel
from qwen_tts.core.models.configuration_qwen3_tts import Qwen3TTSConfig
from qwen_tts.core.models.modeling_qwen3_tts import Qwen3TTSForConditionalGeneration
# Filesystem layout of the demo checkout; every path below is derived from ROOT.
ROOT = Path('/opt/ai-avatar-demo')
MODEL_ROOT = ROOT / 'models/qwen3-tts-12hz-1b7-base'
# Reference recording and its transcript, passed to the model as
# ref_audio / ref_text.
REF_AUDIO = ROOT / 'work/MARSsc.wav'
REF_TEXT_FILE = ROOT / 'work/MARSsc_ref_text.txt'
# Destination of the generated waveform.
OUT_WAV = ROOT / 'work/gate602_m2_marssc_clone_validation.wav'
# Text to synthesize and the language label handed to generate_voice_clone.
TARGET_TEXT = '在這一條路上有順走的路我我先提我的'
LANGUAGE = 'Chinese'


def as_numpy_audio(chunk):
    """Return *chunk* as a NumPy array.

    Objects exposing ``detach`` are treated as torch tensors: they are
    detached, moved to the CPU and converted with ``.numpy()``. Anything
    else goes through ``np.asarray`` (a no-op for existing arrays).
    """
    if not hasattr(chunk, 'detach'):
        return np.asarray(chunk)
    tensor = chunk.detach().cpu()
    # torch cannot export bfloat16 to NumPy, and float16 is presumably not a
    # dtype soundfile can write (confirm against its docs), so half-precision
    # audio is upcast to float32 first.
    if tensor.dtype in (torch.bfloat16, torch.float16):
        tensor = tensor.float()
    return tensor.numpy()


def merge_waveforms(wavs):
    """Collapse the model output into a single NumPy array.

    Args:
        wavs: Either one waveform (tensor or array) or a list/tuple of
            waveforms, which are concatenated along the first axis.

    Returns:
        The waveform data as one NumPy array.

    Raises:
        ValueError: If *wavs* is an empty list or tuple.
    """
    if not isinstance(wavs, (list, tuple)):
        return as_numpy_audio(wavs)
    if not wavs:
        raise ValueError('voice-clone generation returned no waveforms')
    return np.concatenate([as_numpy_audio(w) for w in wavs])


def main():
    """Clone the reference voice onto TARGET_TEXT and write OUT_WAV.

    Registers the Qwen3-TTS config/model classes with the transformers
    Auto* factories, loads the model from MODEL_ROOT, generates speech
    with REF_AUDIO / REF_TEXT_FILE as the voice reference, writes the
    result to OUT_WAV and prints that path.

    Raises:
        ImportError: If the installed ``qwen_tts`` package does not expose
            ``Qwen3TTSModel``.
    """
    AutoConfig.register('qwen3_tts', Qwen3TTSConfig)
    AutoModel.register(Qwen3TTSConfig, Qwen3TTSForConditionalGeneration)

    ref_text = REF_TEXT_FILE.read_text(encoding='utf-8').strip()

    # Looked up dynamically; fail with a clear message instead of an
    # AttributeError on None when the attribute is absent.
    model_cls = getattr(qwen_tts, 'Qwen3TTSModel', None)
    if model_cls is None:
        raise ImportError("qwen_tts does not provide 'Qwen3TTSModel'")

    # Use the GPU with bfloat16 when available, otherwise CPU with float32.
    use_cuda = torch.cuda.is_available()
    model = model_cls.from_pretrained(
        str(MODEL_ROOT),
        device_map='cuda:0' if use_cuda else 'cpu',
        dtype=torch.bfloat16 if use_cuda else torch.float32,
    )

    wavs, sample_rate = model.generate_voice_clone(
        text=TARGET_TEXT,
        language=LANGUAGE,
        ref_audio=str(REF_AUDIO),
        ref_text=ref_text,
        non_streaming_mode=True,
    )

    sf.write(str(OUT_WAV), merge_waveforms(wavs), sample_rate)
    print(OUT_WAV)


if __name__ == '__main__':
    main()
