diff options
| -rw-r--r-- | Dockerfile | 4 | ||||
| -rw-r--r-- | makima/makima-vllm/Dockerfile | 31 | ||||
| -rw-r--r-- | makima/makima-vllm/requirements.txt | 23 | ||||
| -rwxr-xr-x | makima/makima-vllm/run.sh | 14 | ||||
| -rw-r--r-- | makima/makima-vllm/server.py | 390 | ||||
| -rwxr-xr-x | makima/sh/download-models.sh | 69 | ||||
| -rw-r--r-- | makima/src/bin/makima.rs | 2 | ||||
| -rw-r--r-- | makima/src/daemon/cli/server.rs | 6 | ||||
| -rw-r--r-- | makima/src/daemon/task/manager.rs | 4 | ||||
| -rw-r--r-- | makima/src/server/handlers/speak.rs | 5 | ||||
| -rw-r--r-- | makima/src/server/state.rs | 16 |
11 files changed, 52 insertions, 512 deletions
@@ -19,7 +19,7 @@ RUN chmod +x /app/download-models.sh ARG MODEL_BASE_URL ENV MODEL_BASE_URL=${MODEL_BASE_URL} ENV MODELS_DIR=/app/models -ENV QWEN3_TTS_DIR=/app/models/qwen3-tts +ENV CHATTERBOX_MODEL_DIR=/app/models/chatterbox-turbo RUN /app/download-models.sh echo "Models downloaded" # Copy workspace files @@ -42,7 +42,7 @@ ENV RUST_LOG=makima=info,tower_http=info ENV PARAKEET_MODEL_DIR=/app/models/parakeet-tdt-0.6b-v3 ENV PARAKEET_EOU_DIR=/app/models/realtime_eou_120m-v1-onnx ENV SORTFORMER_MODEL_PATH=/app/models/diarization/diar_streaming_sortformer_4spk-v2.1.onnx -ENV QWEN3_TTS_DIR=/app/models/qwen3-tts +ENV CHATTERBOX_MODEL_DIR=/app/models/chatterbox-turbo EXPOSE 8080 diff --git a/makima/makima-vllm/Dockerfile b/makima/makima-vllm/Dockerfile deleted file mode 100644 index 3ffb557..0000000 --- a/makima/makima-vllm/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM python:3.12-slim-bookworm - -WORKDIR /app - -# Install system dependencies including sox for audio processing -RUN apt-get update && apt-get install -y \ - sox \ - libsox-dev \ - libsndfile1 \ - ffmpeg \ - curl \ - && rm -rf /var/lib/apt/lists/* - -# Install Python dependencies -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# Copy server code -COPY server.py . - -# Set environment variables -ENV PORT=8100 -ENV TTS_DEVICE=auto -ENV QWEN3_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-0.6B-Base - -EXPOSE 8100 - -HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \ - CMD curl -f http://localhost:${PORT}/health || exit 1 - -CMD ["python", "server.py"] diff --git a/makima/makima-vllm/requirements.txt b/makima/makima-vllm/requirements.txt deleted file mode 100644 index cd3ac2e..0000000 --- a/makima/makima-vllm/requirements.txt +++ /dev/null @@ -1,23 +0,0 @@ -# Qwen3-TTS Server Dependencies - -# PyTorch - use CPU wheel for smaller image or cuda for GPU ---index-url https://download.pytorch.org/whl/cpu -torch>=2.0.0 -torchaudio>=2.0.0 - -# TTS Model -qwen-tts>=0.0.4 -transformers>=4.40.0 -accelerate>=0.30.0 - -# Web framework -fastapi>=0.100.0 -uvicorn[standard]>=0.20.0 -websockets>=11.0 - -# Audio processing -numpy>=1.24.0 -soundfile>=0.12.0 - -# Other -pydantic>=2.0.0 diff --git a/makima/makima-vllm/run.sh b/makima/makima-vllm/run.sh deleted file mode 100755 index 246fcbf..0000000 --- a/makima/makima-vllm/run.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -# Run the Qwen3-TTS server - -set -e - -cd "$(dirname "$0")" - -# Activate virtual environment if it exists -if [ -d ".venv" ]; then - source .venv/bin/activate -fi - -# Use exec to replace shell with python so Ctrl+C works properly -exec python server.py diff --git a/makima/makima-vllm/server.py b/makima/makima-vllm/server.py deleted file mode 100644 index 2d9ea40..0000000 --- a/makima/makima-vllm/server.py +++ /dev/null @@ -1,390 +0,0 @@ -#!/usr/bin/env python3 -""" -Qwen3-TTS FastAPI Server - -Simple HTTP wrapper around Qwen3-TTS for use by makima. -Supports streaming audio output for real-time playback. -""" - -import io -import os -import base64 -import time -import asyncio -from typing import Optional, AsyncGenerator -from contextlib import asynccontextmanager - -import numpy as np -import torch -import soundfile as sf -from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect -from fastapi.responses import Response, StreamingResponse -from pydantic import BaseModel - -# Global model instance -model = None - - -class TTSRequest(BaseModel): - text: str - # Supported: auto, chinese, english, french, german, italian, japanese, korean, portuguese, russian, spanish - language: str = "english" - # Reference audio for voice cloning (base64 encoded WAV) - reference_audio: Optional[str] = None - reference_text: Optional[str] = None - - -class TTSResponse(BaseModel): - # Base64 encoded WAV audio - audio: str - sample_rate: int - duration_seconds: float - - -def get_model_name(): - """Get model name from environment or use default.""" - return os.environ.get("QWEN3_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-0.6B-Base") - - -def get_device(): - """Get device to use for inference.""" - device = os.environ.get("TTS_DEVICE", "auto") - if device == "auto": - # MPS has limitations with large output channels, prefer CPU on macOS - import platform - if platform.system() == "Darwin": - return "cpu" - elif torch.cuda.is_available(): - return "cuda" - else: - return "cpu" - return device - - -@asynccontextmanager -async def lifespan(app: FastAPI): - """Load model on startup.""" - global model - model_name = get_model_name() - print(f"Loading Qwen3-TTS model: {model_name}") - start = time.time() - - from qwen_tts import Qwen3TTSModel - - # Check if flash attention is available - try: - import flash_attn - attn_impl = "flash_attention_2" - print("Using Flash Attention 2") - except ImportError: - attn_impl = "eager" - print("Flash Attention not available, using eager attention") - - device = get_device() - print(f"Using device: {device}") - - # Use float32 for CPU (bfloat16 can be slow on CPU) - dtype = torch.float32 if device == "cpu" else torch.bfloat16 - - model = Qwen3TTSModel.from_pretrained( - model_name, - torch_dtype=dtype, - attn_implementation=attn_impl, - device_map=device, - ) - - print(f"Model loaded in {time.time() - start:.2f}s") - yield - # Cleanup - model = None - - -app = FastAPI( - title="Qwen3-TTS Server", - description="HTTP API for Qwen3-TTS text-to-speech", - lifespan=lifespan, -) - - -@app.get("/health") -async def health(): - """Health check endpoint.""" - return {"status": "ok", "model_loaded": model is not None} - - -@app.post("/tts", response_model=TTSResponse) -async def generate_tts(request: TTSRequest): - """Generate speech from text.""" - if model is None: - raise HTTPException(status_code=503, detail="Model not loaded") - - try: - start = time.time() - - # Decode reference audio if provided - ref_audio = None - if request.reference_audio: - audio_bytes = base64.b64decode(request.reference_audio) - audio_data, audio_sr = sf.read(io.BytesIO(audio_bytes)) - # qwen-tts expects tuple of (audio, sample_rate) for numpy input - ref_audio = (audio_data, audio_sr) - - # Voice cloning requires reference audio - if ref_audio is None: - raise HTTPException( - status_code=400, - detail="reference_audio is required for the Base model. Please provide a base64-encoded WAV file." - ) - - # Use x_vector_only_mode if no reference text provided (simpler voice extraction) - use_x_vector_only = request.reference_text is None or request.reference_text.strip() == "" - - wavs, sample_rate = model.generate_voice_clone( - text=request.text, - language=request.language, - ref_audio=ref_audio, - ref_text=request.reference_text if not use_x_vector_only else None, - x_vector_only_mode=use_x_vector_only, - max_new_tokens=2048, - temperature=0.9, - top_k=50, - repetition_penalty=1.05, - ) - - # Get first waveform - waveform = wavs[0] if isinstance(wavs, list) else wavs - - # Convert to numpy if tensor - if torch.is_tensor(waveform): - waveform = waveform.cpu().numpy() - - # Ensure 1D array - if waveform.ndim > 1: - waveform = waveform.squeeze() - - # Encode as WAV - buffer = io.BytesIO() - sf.write(buffer, waveform, sample_rate, format="WAV") - audio_bytes = buffer.getvalue() - - duration = len(waveform) / sample_rate - elapsed = time.time() - start - print(f"Generated {duration:.2f}s audio in {elapsed:.2f}s (RTF: {elapsed/duration:.2f})") - - return TTSResponse( - audio=base64.b64encode(audio_bytes).decode("utf-8"), - sample_rate=sample_rate, - duration_seconds=duration, - ) - - except Exception as e: - import traceback - traceback.print_exc() - raise HTTPException(status_code=500, detail=str(e)) - - -@app.post("/tts/raw") -async def generate_tts_raw(request: TTSRequest): - """Generate speech and return raw WAV bytes.""" - if model is None: - raise HTTPException(status_code=503, detail="Model not loaded") - - try: - # Decode reference audio if provided - ref_audio = None - if request.reference_audio: - audio_bytes = base64.b64decode(request.reference_audio) - audio_data, audio_sr = sf.read(io.BytesIO(audio_bytes)) - ref_audio = (audio_data, audio_sr) - - # Voice cloning requires reference audio - if ref_audio is None: - raise HTTPException( - status_code=400, - detail="reference_audio is required for the Base model." - ) - - use_x_vector_only = request.reference_text is None or request.reference_text.strip() == "" - - wavs, sample_rate = model.generate_voice_clone( - text=request.text, - language=request.language, - ref_audio=ref_audio, - ref_text=request.reference_text if not use_x_vector_only else None, - x_vector_only_mode=use_x_vector_only, - max_new_tokens=2048, - temperature=0.9, - top_k=50, - repetition_penalty=1.05, - ) - - waveform = wavs[0] if isinstance(wavs, list) else wavs - - if torch.is_tensor(waveform): - waveform = waveform.cpu().numpy() - - if waveform.ndim > 1: - waveform = waveform.squeeze() - - # Return raw WAV - buffer = io.BytesIO() - sf.write(buffer, waveform, sample_rate, format="WAV") - - return Response( - content=buffer.getvalue(), - media_type="audio/wav", - ) - - except Exception as e: - import traceback - traceback.print_exc() - raise HTTPException(status_code=500, detail=str(e)) - - -@app.websocket("/tts/stream") -async def stream_tts(websocket: WebSocket): - """ - WebSocket endpoint for streaming TTS. - - Protocol: - - Client sends JSON: {"text": "...", "language": "english", "reference_audio": "base64...", "reference_text": "..."} - - Server sends binary PCM16 chunks as they're generated - - Server sends JSON {"type": "audio_end", "sample_rate": 24000} when done - - Server sends JSON {"type": "error", "message": "..."} on error - """ - await websocket.accept() - - if model is None: - await websocket.send_json({"type": "error", "message": "Model not loaded"}) - await websocket.close() - return - - try: - # Wait for request - data = await websocket.receive_json() - text = data.get("text", "") - language = data.get("language", "english") - ref_audio_b64 = data.get("reference_audio") - ref_text = data.get("reference_text") - - if not text.strip(): - await websocket.send_json({"type": "error", "message": "No text provided"}) - await websocket.close() - return - - # Decode reference audio - ref_audio = None - if ref_audio_b64: - audio_bytes = base64.b64decode(ref_audio_b64) - audio_data, audio_sr = sf.read(io.BytesIO(audio_bytes)) - ref_audio = (audio_data, audio_sr) - - if ref_audio is None: - await websocket.send_json({"type": "error", "message": "reference_audio is required"}) - await websocket.close() - return - - use_x_vector_only = ref_text is None or ref_text.strip() == "" - - print(f"Streaming TTS for {len(text)} chars...") - start = time.time() - - # Use streaming mode (non_streaming_mode=False is default) - # This returns a generator that yields audio chunks - generator = model.generate_voice_clone( - text=text, - language=language, - ref_audio=ref_audio, - ref_text=ref_text if not use_x_vector_only else None, - x_vector_only_mode=use_x_vector_only, - max_new_tokens=2048, - temperature=0.9, - top_k=50, - repetition_penalty=1.05, - non_streaming_mode=False, # Enable streaming - ) - - total_samples = 0 - sample_rate = 24000 - - # Check if generator is actually a generator or just the result - if hasattr(generator, '__iter__') and not isinstance(generator, tuple): - for chunk_data in generator: - # chunk_data might be (wav_chunk, sr) or just wav_chunk - if isinstance(chunk_data, tuple): - wav_chunk, sample_rate = chunk_data - else: - wav_chunk = chunk_data - - if torch.is_tensor(wav_chunk): - wav_chunk = wav_chunk.cpu().numpy() - - if wav_chunk.ndim > 1: - wav_chunk = wav_chunk.squeeze() - - # Convert to PCM16 - pcm16 = (wav_chunk * 32767).astype(np.int16) - total_samples += len(pcm16) - - # Send binary audio chunk - await websocket.send_bytes(pcm16.tobytes()) - - # Yield to allow other tasks - await asyncio.sleep(0) - else: - # Non-streaming fallback - model returned full result - wavs, sample_rate = generator if isinstance(generator, tuple) else (generator, 24000) - waveform = wavs[0] if isinstance(wavs, list) else wavs - - if torch.is_tensor(waveform): - waveform = waveform.cpu().numpy() - - if waveform.ndim > 1: - waveform = waveform.squeeze() - - # Send in chunks for better streaming behavior - chunk_size = sample_rate // 4 # 250ms chunks - for i in range(0, len(waveform), chunk_size): - chunk = waveform[i:i + chunk_size] - pcm16 = (chunk * 32767).astype(np.int16) - total_samples += len(pcm16) - await websocket.send_bytes(pcm16.tobytes()) - await asyncio.sleep(0) - - duration = total_samples / sample_rate - elapsed = time.time() - start - print(f"Streamed {duration:.2f}s audio in {elapsed:.2f}s (RTF: {elapsed/duration:.2f})") - - # Send completion message - await websocket.send_json({ - "type": "audio_end", - "sample_rate": sample_rate, - "duration_seconds": duration, - }) - - except WebSocketDisconnect: - print("Client disconnected during streaming") - except Exception as e: - import traceback - traceback.print_exc() - try: - await websocket.send_json({"type": "error", "message": str(e)}) - except: - pass - finally: - try: - await websocket.close() - except: - pass - - -if __name__ == "__main__": - import uvicorn - port = int(os.environ.get("PORT", "8100")) - uvicorn.run( - app, - host="0.0.0.0", - port=port, - # Increase keep-alive timeout to avoid connection resets - timeout_keep_alive=120, - ) diff --git a/makima/sh/download-models.sh b/makima/sh/download-models.sh index 4f188f3..14fc46e 100755 --- a/makima/sh/download-models.sh +++ b/makima/sh/download-models.sh @@ -114,47 +114,46 @@ else echo "All models downloaded successfully" fi -# Download Qwen3-TTS models (for TTS functionality) -QWEN3_TTS_DIR="${QWEN3_TTS_DIR:-/app/models/qwen3-tts}" - -download_qwen3_tts() { - if [ -d "$QWEN3_TTS_DIR" ] && \ - [ -f "$QWEN3_TTS_DIR/model.safetensors" ] && \ - [ -f "$QWEN3_TTS_DIR/speech_tokenizer.safetensors" ] && \ - [ -f "$QWEN3_TTS_DIR/vocab.json" ] && \ - [ -f "$QWEN3_TTS_DIR/merges.txt" ] && \ - [ -f "$QWEN3_TTS_DIR/config.json" ]; then - echo "Qwen3-TTS models already exist, skipping..." +# Download Chatterbox TTS models (for TTS functionality) +CHATTERBOX_MODEL_DIR="${CHATTERBOX_MODEL_DIR:-/app/models/chatterbox-turbo}" + +download_chatterbox_tts() { + if [ -d "$CHATTERBOX_MODEL_DIR" ] && \ + [ -f "$CHATTERBOX_MODEL_DIR/speech_encoder.onnx" ] && \ + [ -f "$CHATTERBOX_MODEL_DIR/language_model.onnx" ] && \ + [ -f "$CHATTERBOX_MODEL_DIR/conditional_decoder.onnx" ] && \ + [ -f "$CHATTERBOX_MODEL_DIR/tokenizer.json" ]; then + echo "Chatterbox TTS models already exist, skipping..." return 0 fi - echo "Downloading Qwen3-TTS models..." - mkdir -p "$QWEN3_TTS_DIR" - - # Download base TTS model files from Qwen/Qwen3-TTS-12Hz-0.6B-Base - # Note: This repo uses vocab.json + merges.txt (not tokenizer.json) - echo "Downloading Qwen3-TTS-12Hz-0.6B-Base..." - hf download Qwen/Qwen3-TTS-12Hz-0.6B-Base \ - model.safetensors \ - config.json \ - vocab.json \ - merges.txt \ - tokenizer_config.json \ - --local-dir "$QWEN3_TTS_DIR" - - # Download speech tokenizer from Qwen/Qwen3-TTS-Tokenizer-12Hz - echo "Downloading Qwen3-TTS-Tokenizer-12Hz..." - local tmpdir=$(mktemp -d) - hf download Qwen/Qwen3-TTS-Tokenizer-12Hz \ - model.safetensors \ - --local-dir "$tmpdir" - mv "$tmpdir/model.safetensors" "$QWEN3_TTS_DIR/speech_tokenizer.safetensors" - rm -rf "$tmpdir" + echo "Downloading Chatterbox TTS models..." + mkdir -p "$CHATTERBOX_MODEL_DIR" + + # Download ONNX models from ResembleAI/chatterbox-turbo-ONNX + echo "Downloading ResembleAI/chatterbox-turbo-ONNX..." + hf download ResembleAI/chatterbox-turbo-ONNX \ + onnx/speech_encoder.onnx \ + onnx/speech_encoder.onnx_data \ + onnx/embed_tokens.onnx \ + onnx/embed_tokens.onnx_data \ + onnx/language_model.onnx \ + onnx/language_model.onnx_data \ + onnx/conditional_decoder.onnx \ + onnx/conditional_decoder.onnx_data \ + tokenizer.json \ + --local-dir "$CHATTERBOX_MODEL_DIR" + + # Move ONNX files from onnx/ subdirectory to root + if [ -d "$CHATTERBOX_MODEL_DIR/onnx" ]; then + mv "$CHATTERBOX_MODEL_DIR/onnx"/* "$CHATTERBOX_MODEL_DIR/" + rmdir "$CHATTERBOX_MODEL_DIR/onnx" + fi - echo "Qwen3-TTS models downloaded successfully" + echo "Chatterbox TTS models downloaded successfully" } -download_qwen3_tts +download_chatterbox_tts # Execute the main command exec "$@" diff --git a/makima/src/bin/makima.rs b/makima/src/bin/makima.rs index ac577b8..753f60e 100644 --- a/makima/src/bin/makima.rs +++ b/makima/src/bin/makima.rs @@ -49,7 +49,7 @@ async fn run_server( &args.parakeet_model_dir, &args.parakeet_eou_dir, &args.sortformer_model_path, - &args.qwen3_tts_dir, + &args.chatterbox_model_dir, ); // Connect to database if URL provided diff --git a/makima/src/daemon/cli/server.rs b/makima/src/daemon/cli/server.rs index 81dafc9..adb765d 100644 --- a/makima/src/daemon/cli/server.rs +++ b/makima/src/daemon/cli/server.rs @@ -33,9 +33,9 @@ pub struct ServerArgs { )] pub sortformer_model_path: String, - /// Path to Qwen3-TTS model directory - #[arg(long, env = "QWEN3_TTS_DIR", default_value = "models/qwen3-tts")] - pub qwen3_tts_dir: String, + /// Path to Chatterbox TTS model directory + #[arg(long, env = "CHATTERBOX_MODEL_DIR", default_value = "models/chatterbox-turbo")] + pub chatterbox_model_dir: String, /// PostgreSQL connection URI #[arg(long, env = "POSTGRES_CONNECTION_URI")] diff --git a/makima/src/daemon/task/manager.rs b/makima/src/daemon/task/manager.rs index e0437ce..bf495d9 100644 --- a/makima/src/daemon/task/manager.rs +++ b/makima/src/daemon/task/manager.rs @@ -598,7 +598,7 @@ rsync -av --exclude='.git' --exclude='.makima' "$FINAL_TASK_PATH/" ./ /// System prompt for supervisor tasks (contract orchestrators). /// Supervisors monitor all tasks in a contract, create new tasks, and drive the contract to completion. -const SUPERVISOR_SYSTEM_PROMPT: &str = r#"You are the SUPERVISOR for this contract. Your ONLY job is to coordinate work by spawning tasks, waiting for them to complete, and managing git operations. +const SUPERVISOR_SYSTEM_PROMPT: &str = r###"You are the SUPERVISOR for this contract. Your ONLY job is to coordinate work by spawning tasks, waiting for them to complete, and managing git operations. ## CRITICAL RULES - READ CAREFULLY @@ -960,7 +960,7 @@ After all tasks are "done" and merged, you MUST take the following actions: --- -"#; +"###; /// System prompt for tasks that are part of a contract. /// This tells the task about contract.sh and how to use it to interact with the contract. diff --git a/makima/src/server/handlers/speak.rs b/makima/src/server/handlers/speak.rs index 3ed2620..b235c65 100644 --- a/makima/src/server/handlers/speak.rs +++ b/makima/src/server/handlers/speak.rs @@ -1,19 +1,18 @@ //! WebSocket handler for TTS streaming (direct in-process inference). //! //! This module implements the `/api/v1/speak` endpoint which performs -//! text-to-speech synthesis directly using the candle-based TTS engine. +//! text-to-speech synthesis directly using the Chatterbox ONNX TTS engine. //! No external Python service or proxy — the model runs in-process. //! //! ## Architecture //! //! The speak handler will: //! 1. Accept a WebSocket connection from the client -//! 2. Lazily load the TTS model (candle) on first request +//! 2. Lazily load the TTS model (Chatterbox ONNX) on first request //! 3. Parse JSON control messages (start, speak, stop, cancel) //! 4. Run inference directly and stream audio chunks back //! //! See `makima/src/tts/` for the TTS engine implementation. -//! See `docs/specs/qwen3-tts-spec.md` for the full protocol specification. use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; diff --git a/makima/src/server/state.rs b/makima/src/server/state.rs index f662e30..bd6864f 100644 --- a/makima/src/server/state.rs +++ b/makima/src/server/state.rs @@ -560,7 +560,7 @@ pub struct ModelConfig { pub parakeet_model_dir: String, pub parakeet_eou_dir: String, pub sortformer_model_path: String, - pub qwen3_tts_dir: String, + pub chatterbox_model_dir: String, } /// Lazily-loaded ML models. @@ -619,12 +619,12 @@ impl AppState { /// * `parakeet_model_dir` - Path to the Parakeet TDT model directory /// * `parakeet_eou_dir` - Path to the Parakeet EOU model directory /// * `sortformer_model_path` - Path to the Sortformer diarization model file - /// * `qwen3_tts_dir` - Path to the Qwen3-TTS model directory + /// * `chatterbox_model_dir` - Path to the Chatterbox TTS model directory pub fn new( parakeet_model_dir: &str, parakeet_eou_dir: &str, sortformer_model_path: &str, - qwen3_tts_dir: &str, + chatterbox_model_dir: &str, ) -> Self { // Create broadcast channels with buffer for 256 messages let (file_updates, _) = broadcast::channel(256); @@ -668,7 +668,7 @@ impl AppState { parakeet_model_dir: parakeet_model_dir.to_string(), parakeet_eou_dir: parakeet_eou_dir.to_string(), sortformer_model_path: sortformer_model_path.to_string(), - qwen3_tts_dir: qwen3_tts_dir.to_string(), + chatterbox_model_dir: chatterbox_model_dir.to_string(), }), ml_models: OnceCell::new(), db_pool: None, @@ -691,17 +691,17 @@ impl AppState { /// Get or initialize the TTS engine (lazy loading). /// - /// The TTS engine is loaded on first Speak connection using the Qwen3 backend. + /// The TTS engine is loaded on first Speak connection using the Chatterbox backend. /// Returns a reference to the engine, or an error if loading fails. pub async fn get_tts_engine(&self) -> Result<&dyn TtsEngine, Box<dyn std::error::Error + Send + Sync>> { - let tts_dir = self.model_config.as_ref().map(|c| c.qwen3_tts_dir.as_str()); + let tts_dir = self.model_config.as_ref().map(|c| c.chatterbox_model_dir.as_str()); self.tts_engine.get_or_try_init(|| async { tracing::info!( model_dir = ?tts_dir, - "Lazy-loading TTS engine (Qwen3) on first Speak connection..." + "Lazy-loading TTS engine (Chatterbox) on first Speak connection..." ); let engine = crate::tts::TtsEngineFactory::create( - crate::tts::TtsBackend::Qwen3, + crate::tts::TtsBackend::Chatterbox, tts_dir, ).map_err(|e| -> Box<dyn std::error::Error + Send + Sync> { Box::new(e) |
