from pathlib import Path
import sys
import torch
import soundfile as sf
import numpy as np

# Pre-register custom HuggingFace architectures
from transformers import AutoConfig, AutoModel
from qwen_tts.core.models.configuration_qwen3_tts import Qwen3TTSConfig
from qwen_tts.core.models.modeling_qwen3_tts import Qwen3TTSForConditionalGeneration
# Teach the HuggingFace Auto* factories about the Qwen3-TTS classes: the
# "qwen3_tts" model type maps to its config class, and that config class maps
# to the conditional-generation model class.
# Registration is best-effort: any failure (presumably e.g. the type already
# being registered -- TODO confirm) is reported as a warning and the script
# carries on.
try:
    AutoConfig.register("qwen3_tts", Qwen3TTSConfig)
    AutoModel.register(Qwen3TTSConfig, Qwen3TTSForConditionalGeneration)
except Exception as exc:
    print("Pre-registration warning:", exc)
else:
    print("Pre-registered architecture successfully!")

import qwen_tts
# Path handed to `from_pretrained` below.
MODEL_ROOT = '/opt/ai-avatar-demo/models/qwen3-tts'

# Look the wrapper class up by name instead of importing it directly, so that
# a qwen_tts build lacking it yields a short message and exit status 1 rather
# than an ImportError traceback.
Model = getattr(qwen_tts, 'Qwen3TTSModel', None)
if Model is None:
    print("Qwen3TTSModel not found!")
    raise SystemExit(1)

# Bracket the load with progress messages so a slow or failing load is easy
# to locate in the script output.
print("Loading model...")
model = Model.from_pretrained(MODEL_ROOT)
print("Model loaded!")

# Inspect the wrapped model and report its current TTS variant before
# overriding it.
core = model.model
print("model.model.tts_model_type BEFORE:", getattr(core, 'tts_model_type', None))

# Force the variant to "base" on the model object itself. The attribute is
# only overwritten when it already exists, never created.
# NOTE(review): presumably generate_voice_clone requires the "base" variant --
# confirm against the qwen_tts implementation.
if hasattr(core, 'tts_model_type'):
    setattr(core, 'tts_model_type', "base")
    print("Forced model.model.tts_model_type to 'base'!")

# Apply the same override to the model's config, again only if the attribute
# is already present there.
core_config = core.config
if hasattr(core_config, 'tts_model_type'):
    setattr(core_config, 'tts_model_type', "base")
    print("Forced model.model.config.tts_model_type to 'base'!")

# Inputs for the voice-clone smoke test: a reference recording, the text passed
# alongside it as `ref_text`, the new text to synthesize, and the output path.
REF_AUDIO = '/opt/ai-avatar-demo/data/voice_refs/raw/mandy0526.wav'
REF_TEXT = '早安一早要去吃早餐啦，早餐據說很好吃，好期待'
TARGET_TEXT = '我是Yuka，很高興認識大家'
OUT_WAV = '/opt/ai-avatar-demo/data/tts_outputs/test_force_clone.wav'

print("Attempting generate_voice_clone...")
try:
    wavs, sample_rate = model.generate_voice_clone(
        text=TARGET_TEXT,
        ref_audio=REF_AUDIO,
        ref_text=REF_TEXT,
        non_streaming_mode=True,
    )
    print("generate_voice_clone success!")
    # `wavs` is treated as a sequence of arrays and joined end to end into a
    # single array before writing.
    audio_data = np.concatenate(wavs)
    # soundfile does not create missing parent directories, so make sure the
    # output folder exists; otherwise a successful generation would be lost
    # to a file-open error on a fresh checkout.
    Path(OUT_WAV).parent.mkdir(parents=True, exist_ok=True)
    sf.write(OUT_WAV, audio_data, sample_rate)
    print("Saved to:", OUT_WAV)
except Exception as e:
    import traceback
    print("Failed with exception:", repr(e))
    traceback.print_exc()
    # Exit non-zero so shells and CI see the failure, matching the
    # "model class not found" path earlier in the script.
    sys.exit(1)