Remove QWEN TTS
makima/multi-phase-plan-fix

author: soryu <soryu@soryu.co> 2026-02-01 02:39:19 +0000
committer: soryu <soryu@soryu.co> 2026-02-01 02:39:52 +0000
commit: ddd956118880d3416a5e8101dcee7f880cbdc444 (patch)
tree: 9406c510782f7f91c68b3d461ce46f6428a49072
parent: d0062efd34dfc22c2d8cfee0a47431ac0c8adfda (diff)
download: soryu-makima/multi-phase-plan-fix.tar.gz
soryu-makima/multi-phase-plan-fix.zip
11 files changed, 52 insertions, 512 deletions
diff --git a/Dockerfile b/Dockerfile
index 48b74b6..f33045c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,7 +19,7 @@ RUN chmod +x /app/download-models.sh
 ARG MODEL_BASE_URL
 ENV MODEL_BASE_URL=${MODEL_BASE_URL}
 ENV MODELS_DIR=/app/models
-ENV QWEN3_TTS_DIR=/app/models/qwen3-tts
+ENV CHATTERBOX_MODEL_DIR=/app/models/chatterbox-turbo
 RUN /app/download-models.sh echo "Models downloaded"
 
 # Copy workspace files
@@ -42,7 +42,7 @@ ENV RUST_LOG=makima=info,tower_http=info
 ENV PARAKEET_MODEL_DIR=/app/models/parakeet-tdt-0.6b-v3
 ENV PARAKEET_EOU_DIR=/app/models/realtime_eou_120m-v1-onnx
 ENV SORTFORMER_MODEL_PATH=/app/models/diarization/diar_streaming_sortformer_4spk-v2.1.onnx
-ENV QWEN3_TTS_DIR=/app/models/qwen3-tts
+ENV CHATTERBOX_MODEL_DIR=/app/models/chatterbox-turbo
 
 EXPOSE 8080
 
diff --git a/makima/makima-vllm/Dockerfile b/makima/makima-vllm/Dockerfile
deleted file mode 100644
index 3ffb557..0000000
--- a/makima/makima-vllm/Dockerfile
+++ /dev/null
@@ -1,31 +0,0 @@
-FROM python:3.12-slim-bookworm
-
-WORKDIR /app
-
-# Install system dependencies including sox for audio processing
-RUN apt-get update && apt-get install -y \
-    sox \
-    libsox-dev \
-    libsndfile1 \
-    ffmpeg \
-    curl \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install Python dependencies
-COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Copy server code
-COPY server.py .
-
-# Set environment variables
-ENV PORT=8100
-ENV TTS_DEVICE=auto
-ENV QWEN3_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-0.6B-Base
-
-EXPOSE 8100
-
-HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
-    CMD curl -f http://localhost:${PORT}/health || exit 1
-
-CMD ["python", "server.py"]
diff --git a/makima/makima-vllm/requirements.txt b/makima/makima-vllm/requirements.txt
deleted file mode 100644
index cd3ac2e..0000000
--- a/makima/makima-vllm/requirements.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-# Qwen3-TTS Server Dependencies
-
-# PyTorch - use CPU wheel for smaller image or cuda for GPU
---index-url https://download.pytorch.org/whl/cpu
-torch>=2.0.0
-torchaudio>=2.0.0
-
-# TTS Model
-qwen-tts>=0.0.4
-transformers>=4.40.0
-accelerate>=0.30.0
-
-# Web framework
-fastapi>=0.100.0
-uvicorn[standard]>=0.20.0
-websockets>=11.0
-
-# Audio processing
-numpy>=1.24.0
-soundfile>=0.12.0
-
-# Other
-pydantic>=2.0.0
diff --git a/makima/makima-vllm/run.sh b/makima/makima-vllm/run.sh
deleted file mode 100755
index 246fcbf..0000000
--- a/makima/makima-vllm/run.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-# Run the Qwen3-TTS server
-
-set -e
-
-cd "$(dirname "$0")"
-
-# Activate virtual environment if it exists
-if [ -d ".venv" ]; then
-    source .venv/bin/activate
-fi
-
-# Use exec to replace shell with python so Ctrl+C works properly
-exec python server.py
diff --git a/makima/makima-vllm/server.py b/makima/makima-vllm/server.py
deleted file mode 100644
index 2d9ea40..0000000
--- a/makima/makima-vllm/server.py
+++ /dev/null
@@ -1,390 +0,0 @@
-#!/usr/bin/env python3
-"""
-Qwen3-TTS FastAPI Server
-
-Simple HTTP wrapper around Qwen3-TTS for use by makima.
-Supports streaming audio output for real-time playback.
-"""
-
-import io
-import os
-import base64
-import time
-import asyncio
-from typing import Optional, AsyncGenerator
-from contextlib import asynccontextmanager
-
-import numpy as np
-import torch
-import soundfile as sf
-from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect
-from fastapi.responses import Response, StreamingResponse
-from pydantic import BaseModel
-
-# Global model instance
-model = None
-
-
-class TTSRequest(BaseModel):
-    text: str
-    # Supported: auto, chinese, english, french, german, italian, japanese, korean, portuguese, russian, spanish
-    language: str = "english"
-    # Reference audio for voice cloning (base64 encoded WAV)
-    reference_audio: Optional[str] = None
-    reference_text: Optional[str] = None
-
-
-class TTSResponse(BaseModel):
-    # Base64 encoded WAV audio
-    audio: str
-    sample_rate: int
-    duration_seconds: float
-
-
-def get_model_name():
-    """Get model name from environment or use default."""
-    return os.environ.get("QWEN3_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-0.6B-Base")
-
-
-def get_device():
-    """Get device to use for inference."""
-    device = os.environ.get("TTS_DEVICE", "auto")
-    if device == "auto":
-        # MPS has limitations with large output channels, prefer CPU on macOS
-        import platform
-        if platform.system() == "Darwin":
-            return "cpu"
-        elif torch.cuda.is_available():
-            return "cuda"
-        else:
-            return "cpu"
-    return device
-
-
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    """Load model on startup."""
-    global model
-    model_name = get_model_name()
-    print(f"Loading Qwen3-TTS model: {model_name}")
-    start = time.time()
-
-    from qwen_tts import Qwen3TTSModel
-
-    # Check if flash attention is available
-    try:
-        import flash_attn
-        attn_impl = "flash_attention_2"
-        print("Using Flash Attention 2")
-    except ImportError:
-        attn_impl = "eager"
-        print("Flash Attention not available, using eager attention")
-
-    device = get_device()
-    print(f"Using device: {device}")
-
-    # Use float32 for CPU (bfloat16 can be slow on CPU)
-    dtype = torch.float32 if device == "cpu" else torch.bfloat16
-
-    model = Qwen3TTSModel.from_pretrained(
-        model_name,
-        torch_dtype=dtype,
-        attn_implementation=attn_impl,
-        device_map=device,
-    )
-
-    print(f"Model loaded in {time.time() - start:.2f}s")
-    yield
-    # Cleanup
-    model = None
-
-
-app = FastAPI(
-    title="Qwen3-TTS Server",
-    description="HTTP API for Qwen3-TTS text-to-speech",
-    lifespan=lifespan,
-)
-
-
-@app.get("/health")
-async def health():
-    """Health check endpoint."""
-    return {"status": "ok", "model_loaded": model is not None}
-
-
-@app.post("/tts", response_model=TTSResponse)
-async def generate_tts(request: TTSRequest):
-    """Generate speech from text."""
-    if model is None:
-        raise HTTPException(status_code=503, detail="Model not loaded")
-
-    try:
-        start = time.time()
-
-        # Decode reference audio if provided
-        ref_audio = None
-        if request.reference_audio:
-            audio_bytes = base64.b64decode(request.reference_audio)
-            audio_data, audio_sr = sf.read(io.BytesIO(audio_bytes))
-            # qwen-tts expects tuple of (audio, sample_rate) for numpy input
-            ref_audio = (audio_data, audio_sr)
-
-        # Voice cloning requires reference audio
-        if ref_audio is None:
-            raise HTTPException(
-                status_code=400,
-                detail="reference_audio is required for the Base model. Please provide a base64-encoded WAV file."
-            )
-
-        # Use x_vector_only_mode if no reference text provided (simpler voice extraction)
-        use_x_vector_only = request.reference_text is None or request.reference_text.strip() == ""
-
-        wavs, sample_rate = model.generate_voice_clone(
-            text=request.text,
-            language=request.language,
-            ref_audio=ref_audio,
-            ref_text=request.reference_text if not use_x_vector_only else None,
-            x_vector_only_mode=use_x_vector_only,
-            max_new_tokens=2048,
-            temperature=0.9,
-            top_k=50,
-            repetition_penalty=1.05,
-        )
-
-        # Get first waveform
-        waveform = wavs[0] if isinstance(wavs, list) else wavs
-
-        # Convert to numpy if tensor
-        if torch.is_tensor(waveform):
-            waveform = waveform.cpu().numpy()
-
-        # Ensure 1D array
-        if waveform.ndim > 1:
-            waveform = waveform.squeeze()
-
-        # Encode as WAV
-        buffer = io.BytesIO()
-        sf.write(buffer, waveform, sample_rate, format="WAV")
-        audio_bytes = buffer.getvalue()
-
-        duration = len(waveform) / sample_rate
-        elapsed = time.time() - start
-        print(f"Generated {duration:.2f}s audio in {elapsed:.2f}s (RTF: {elapsed/duration:.2f})")
-
-        return TTSResponse(
-            audio=base64.b64encode(audio_bytes).decode("utf-8"),
-            sample_rate=sample_rate,
-            duration_seconds=duration,
-        )
-
-    except Exception as e:
-        import traceback
-        traceback.print_exc()
-        raise HTTPException(status_code=500, detail=str(e))
-
-
-@app.post("/tts/raw")
-async def generate_tts_raw(request: TTSRequest):
-    """Generate speech and return raw WAV bytes."""
-    if model is None:
-        raise HTTPException(status_code=503, detail="Model not loaded")
-
-    try:
-        # Decode reference audio if provided
-        ref_audio = None
-        if request.reference_audio:
-            audio_bytes = base64.b64decode(request.reference_audio)
-            audio_data, audio_sr = sf.read(io.BytesIO(audio_bytes))
-            ref_audio = (audio_data, audio_sr)
-
-        # Voice cloning requires reference audio
-        if ref_audio is None:
-            raise HTTPException(
-                status_code=400,
-                detail="reference_audio is required for the Base model."
-            )
-
-        use_x_vector_only = request.reference_text is None or request.reference_text.strip() == ""
-
-        wavs, sample_rate = model.generate_voice_clone(
-            text=request.text,
-            language=request.language,
-            ref_audio=ref_audio,
-            ref_text=request.reference_text if not use_x_vector_only else None,
-            x_vector_only_mode=use_x_vector_only,
-            max_new_tokens=2048,
-            temperature=0.9,
-            top_k=50,
-            repetition_penalty=1.05,
-        )
-
-        waveform = wavs[0] if isinstance(wavs, list) else wavs
-
-        if torch.is_tensor(waveform):
-            waveform = waveform.cpu().numpy()
-
-        if waveform.ndim > 1:
-            waveform = waveform.squeeze()
-
-        # Return raw WAV
-        buffer = io.BytesIO()
-        sf.write(buffer, waveform, sample_rate, format="WAV")
-
-        return Response(
-            content=buffer.getvalue(),
-            media_type="audio/wav",
-        )
-
-    except Exception as e:
-        import traceback
-        traceback.print_exc()
-        raise HTTPException(status_code=500, detail=str(e))
-
-
-@app.websocket("/tts/stream")
-async def stream_tts(websocket: WebSocket):
-    """
-    WebSocket endpoint for streaming TTS.
-
-    Protocol:
-    - Client sends JSON: {"text": "...", "language": "english", "reference_audio": "base64...", "reference_text": "..."}
-    - Server sends binary PCM16 chunks as they're generated
-    - Server sends JSON {"type": "audio_end", "sample_rate": 24000} when done
-    - Server sends JSON {"type": "error", "message": "..."} on error
-    """
-    await websocket.accept()
-
-    if model is None:
-        await websocket.send_json({"type": "error", "message": "Model not loaded"})
-        await websocket.close()
-        return
-
-    try:
-        # Wait for request
-        data = await websocket.receive_json()
-        text = data.get("text", "")
-        language = data.get("language", "english")
-        ref_audio_b64 = data.get("reference_audio")
-        ref_text = data.get("reference_text")
-
-        if not text.strip():
-            await websocket.send_json({"type": "error", "message": "No text provided"})
-            await websocket.close()
-            return
-
-        # Decode reference audio
-        ref_audio = None
-        if ref_audio_b64:
-            audio_bytes = base64.b64decode(ref_audio_b64)
-            audio_data, audio_sr = sf.read(io.BytesIO(audio_bytes))
-            ref_audio = (audio_data, audio_sr)
-
-        if ref_audio is None:
-            await websocket.send_json({"type": "error", "message": "reference_audio is required"})
-            await websocket.close()
-            return
-
-        use_x_vector_only = ref_text is None or ref_text.strip() == ""
-
-        print(f"Streaming TTS for {len(text)} chars...")
-        start = time.time()
-
-        # Use streaming mode (non_streaming_mode=False is default)
-        # This returns a generator that yields audio chunks
-        generator = model.generate_voice_clone(
-            text=text,
-            language=language,
-            ref_audio=ref_audio,
-            ref_text=ref_text if not use_x_vector_only else None,
-            x_vector_only_mode=use_x_vector_only,
-            max_new_tokens=2048,
-            temperature=0.9,
-            top_k=50,
-            repetition_penalty=1.05,
-            non_streaming_mode=False,  # Enable streaming
-        )
-
-        total_samples = 0
-        sample_rate = 24000
-
-        # Check if generator is actually a generator or just the result
-        if hasattr(generator, '__iter__') and not isinstance(generator, tuple):
-            for chunk_data in generator:
-                # chunk_data might be (wav_chunk, sr) or just wav_chunk
-                if isinstance(chunk_data, tuple):
-                    wav_chunk, sample_rate = chunk_data
-                else:
-                    wav_chunk = chunk_data
-
-                if torch.is_tensor(wav_chunk):
-                    wav_chunk = wav_chunk.cpu().numpy()
-
-                if wav_chunk.ndim > 1:
-                    wav_chunk = wav_chunk.squeeze()
-
-                # Convert to PCM16
-                pcm16 = (wav_chunk * 32767).astype(np.int16)
-                total_samples += len(pcm16)
-
-                # Send binary audio chunk
-                await websocket.send_bytes(pcm16.tobytes())
-
-                # Yield to allow other tasks
-                await asyncio.sleep(0)
-        else:
-            # Non-streaming fallback - model returned full result
-            wavs, sample_rate = generator if isinstance(generator, tuple) else (generator, 24000)
-            waveform = wavs[0] if isinstance(wavs, list) else wavs
-
-            if torch.is_tensor(waveform):
-                waveform = waveform.cpu().numpy()
-
-            if waveform.ndim > 1:
-                waveform = waveform.squeeze()
-
-            # Send in chunks for better streaming behavior
-            chunk_size = sample_rate // 4  # 250ms chunks
-            for i in range(0, len(waveform), chunk_size):
-                chunk = waveform[i:i + chunk_size]
-                pcm16 = (chunk * 32767).astype(np.int16)
-                total_samples += len(pcm16)
-                await websocket.send_bytes(pcm16.tobytes())
-                await asyncio.sleep(0)
-
-        duration = total_samples / sample_rate
-        elapsed = time.time() - start
-        print(f"Streamed {duration:.2f}s audio in {elapsed:.2f}s (RTF: {elapsed/duration:.2f})")
-
-        # Send completion message
-        await websocket.send_json({
-            "type": "audio_end",
-            "sample_rate": sample_rate,
-            "duration_seconds": duration,
-        })
-
-    except WebSocketDisconnect:
-        print("Client disconnected during streaming")
-    except Exception as e:
-        import traceback
-        traceback.print_exc()
-        try:
-            await websocket.send_json({"type": "error", "message": str(e)})
-        except:
-            pass
-    finally:
-        try:
-            await websocket.close()
-        except:
-            pass
-
-
-if __name__ == "__main__":
-    import uvicorn
-    port = int(os.environ.get("PORT", "8100"))
-    uvicorn.run(
-        app,
-        host="0.0.0.0",
-        port=port,
-        # Increase keep-alive timeout to avoid connection resets
-        timeout_keep_alive=120,
-    )
diff --git a/makima/sh/download-models.sh b/makima/sh/download-models.sh
index 4f188f3..14fc46e 100755
--- a/makima/sh/download-models.sh
+++ b/makima/sh/download-models.sh
@@ -114,47 +114,46 @@ else
     echo "All models downloaded successfully"
 fi
 
-# Download Qwen3-TTS models (for TTS functionality)
-QWEN3_TTS_DIR="${QWEN3_TTS_DIR:-/app/models/qwen3-tts}"
-
-download_qwen3_tts() {
-    if [ -d "$QWEN3_TTS_DIR" ] && \
-       [ -f "$QWEN3_TTS_DIR/model.safetensors" ] && \
-       [ -f "$QWEN3_TTS_DIR/speech_tokenizer.safetensors" ] && \
-       [ -f "$QWEN3_TTS_DIR/vocab.json" ] && \
-       [ -f "$QWEN3_TTS_DIR/merges.txt" ] && \
-       [ -f "$QWEN3_TTS_DIR/config.json" ]; then
-        echo "Qwen3-TTS models already exist, skipping..."
+# Download Chatterbox TTS models (for TTS functionality)
+CHATTERBOX_MODEL_DIR="${CHATTERBOX_MODEL_DIR:-/app/models/chatterbox-turbo}"
+
+download_chatterbox_tts() {
+    if [ -d "$CHATTERBOX_MODEL_DIR" ] && \
+       [ -f "$CHATTERBOX_MODEL_DIR/speech_encoder.onnx" ] && \
+       [ -f "$CHATTERBOX_MODEL_DIR/language_model.onnx" ] && \
+       [ -f "$CHATTERBOX_MODEL_DIR/conditional_decoder.onnx" ] && \
+       [ -f "$CHATTERBOX_MODEL_DIR/tokenizer.json" ]; then
+        echo "Chatterbox TTS models already exist, skipping..."
         return 0
     fi
 
-    echo "Downloading Qwen3-TTS models..."
-    mkdir -p "$QWEN3_TTS_DIR"
-
-    # Download base TTS model files from Qwen/Qwen3-TTS-12Hz-0.6B-Base
-    # Note: This repo uses vocab.json + merges.txt (not tokenizer.json)
-    echo "Downloading Qwen3-TTS-12Hz-0.6B-Base..."
-    hf download Qwen/Qwen3-TTS-12Hz-0.6B-Base \
-        model.safetensors \
-        config.json \
-        vocab.json \
-        merges.txt \
-        tokenizer_config.json \
-        --local-dir "$QWEN3_TTS_DIR"
-
-    # Download speech tokenizer from Qwen/Qwen3-TTS-Tokenizer-12Hz
-    echo "Downloading Qwen3-TTS-Tokenizer-12Hz..."
-    local tmpdir=$(mktemp -d)
-    hf download Qwen/Qwen3-TTS-Tokenizer-12Hz \
-        model.safetensors \
-        --local-dir "$tmpdir"
-    mv "$tmpdir/model.safetensors" "$QWEN3_TTS_DIR/speech_tokenizer.safetensors"
-    rm -rf "$tmpdir"
+    echo "Downloading Chatterbox TTS models..."
+    mkdir -p "$CHATTERBOX_MODEL_DIR"
+
+    # Download ONNX models from ResembleAI/chatterbox-turbo-ONNX
+    echo "Downloading ResembleAI/chatterbox-turbo-ONNX..."
+    hf download ResembleAI/chatterbox-turbo-ONNX \
+        onnx/speech_encoder.onnx \
+        onnx/speech_encoder.onnx_data \
+        onnx/embed_tokens.onnx \
+        onnx/embed_tokens.onnx_data \
+        onnx/language_model.onnx \
+        onnx/language_model.onnx_data \
+        onnx/conditional_decoder.onnx \
+        onnx/conditional_decoder.onnx_data \
+        tokenizer.json \
+        --local-dir "$CHATTERBOX_MODEL_DIR"
+
+    # Move ONNX files from onnx/ subdirectory to root
+    if [ -d "$CHATTERBOX_MODEL_DIR/onnx" ]; then
+        mv "$CHATTERBOX_MODEL_DIR/onnx"/* "$CHATTERBOX_MODEL_DIR/"
+        rmdir "$CHATTERBOX_MODEL_DIR/onnx"
+    fi
 
-    echo "Qwen3-TTS models downloaded successfully"
+    echo "Chatterbox TTS models downloaded successfully"
 }
 
-download_qwen3_tts
+download_chatterbox_tts
 
 # Execute the main command
 exec "$@"
diff --git a/makima/src/bin/makima.rs b/makima/src/bin/makima.rs
index ac577b8..753f60e 100644
--- a/makima/src/bin/makima.rs
+++ b/makima/src/bin/makima.rs
@@ -49,7 +49,7 @@ async fn run_server(
         &args.parakeet_model_dir,
         &args.parakeet_eou_dir,
         &args.sortformer_model_path,
-        &args.qwen3_tts_dir,
+        &args.chatterbox_model_dir,
     );
 
     // Connect to database if URL provided
diff --git a/makima/src/daemon/cli/server.rs b/makima/src/daemon/cli/server.rs
index 81dafc9..adb765d 100644
--- a/makima/src/daemon/cli/server.rs
+++ b/makima/src/daemon/cli/server.rs
@@ -33,9 +33,9 @@ pub struct ServerArgs {
     )]
     pub sortformer_model_path: String,
 
-    /// Path to Qwen3-TTS model directory
-    #[arg(long, env = "QWEN3_TTS_DIR", default_value = "models/qwen3-tts")]
-    pub qwen3_tts_dir: String,
+    /// Path to Chatterbox TTS model directory
+    #[arg(long, env = "CHATTERBOX_MODEL_DIR", default_value = "models/chatterbox-turbo")]
+    pub chatterbox_model_dir: String,
 
     /// PostgreSQL connection URI
     #[arg(long, env = "POSTGRES_CONNECTION_URI")]
diff --git a/makima/src/daemon/task/manager.rs b/makima/src/daemon/task/manager.rs
index e0437ce..bf495d9 100644
--- a/makima/src/daemon/task/manager.rs
+++ b/makima/src/daemon/task/manager.rs
@@ -598,7 +598,7 @@ rsync -av --exclude='.git' --exclude='.makima' "$FINAL_TASK_PATH/" ./
 
 /// System prompt for supervisor tasks (contract orchestrators).
 /// Supervisors monitor all tasks in a contract, create new tasks, and drive the contract to completion.
-const SUPERVISOR_SYSTEM_PROMPT: &str = r#"You are the SUPERVISOR for this contract. Your ONLY job is to coordinate work by spawning tasks, waiting for them to complete, and managing git operations.
+const SUPERVISOR_SYSTEM_PROMPT: &str = r###"You are the SUPERVISOR for this contract. Your ONLY job is to coordinate work by spawning tasks, waiting for them to complete, and managing git operations.
 
 ## CRITICAL RULES - READ CAREFULLY
 
@@ -960,7 +960,7 @@ After all tasks are "done" and merged, you MUST take the following actions:
 
 ---
 
-"#;
+"###;
 
 /// System prompt for tasks that are part of a contract.
 /// This tells the task about contract.sh and how to use it to interact with the contract.
diff --git a/makima/src/server/handlers/speak.rs b/makima/src/server/handlers/speak.rs
index 3ed2620..b235c65 100644
--- a/makima/src/server/handlers/speak.rs
+++ b/makima/src/server/handlers/speak.rs
@@ -1,19 +1,18 @@
 //! WebSocket handler for TTS streaming (direct in-process inference).
 //!
 //! This module implements the `/api/v1/speak` endpoint which performs
-//! text-to-speech synthesis directly using the candle-based TTS engine.
+//! text-to-speech synthesis directly using the Chatterbox ONNX TTS engine.
 //! No external Python service or proxy — the model runs in-process.
 //!
 //! ## Architecture
 //!
 //! The speak handler will:
 //! 1. Accept a WebSocket connection from the client
-//! 2. Lazily load the TTS model (candle) on first request
+//! 2. Lazily load the TTS model (Chatterbox ONNX) on first request
 //! 3. Parse JSON control messages (start, speak, stop, cancel)
 //! 4. Run inference directly and stream audio chunks back
 //!
 //! See `makima/src/tts/` for the TTS engine implementation.
-//! See `docs/specs/qwen3-tts-spec.md` for the full protocol specification.
 
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
diff --git a/makima/src/server/state.rs b/makima/src/server/state.rs
index f662e30..bd6864f 100644
--- a/makima/src/server/state.rs
+++ b/makima/src/server/state.rs
@@ -560,7 +560,7 @@ pub struct ModelConfig {
     pub parakeet_model_dir: String,
     pub parakeet_eou_dir: String,
     pub sortformer_model_path: String,
-    pub qwen3_tts_dir: String,
+    pub chatterbox_model_dir: String,
 }
 
 /// Lazily-loaded ML models.
@@ -619,12 +619,12 @@ impl AppState {
     /// * `parakeet_model_dir` - Path to the Parakeet TDT model directory
     /// * `parakeet_eou_dir` - Path to the Parakeet EOU model directory
     /// * `sortformer_model_path` - Path to the Sortformer diarization model file
-    /// * `qwen3_tts_dir` - Path to the Qwen3-TTS model directory
+    /// * `chatterbox_model_dir` - Path to the Chatterbox TTS model directory
     pub fn new(
         parakeet_model_dir: &str,
         parakeet_eou_dir: &str,
         sortformer_model_path: &str,
-        qwen3_tts_dir: &str,
+        chatterbox_model_dir: &str,
     ) -> Self {
         // Create broadcast channels with buffer for 256 messages
         let (file_updates, _) = broadcast::channel(256);
@@ -668,7 +668,7 @@ impl AppState {
                 parakeet_model_dir: parakeet_model_dir.to_string(),
                 parakeet_eou_dir: parakeet_eou_dir.to_string(),
                 sortformer_model_path: sortformer_model_path.to_string(),
-                qwen3_tts_dir: qwen3_tts_dir.to_string(),
+                chatterbox_model_dir: chatterbox_model_dir.to_string(),
             }),
             ml_models: OnceCell::new(),
             db_pool: None,
@@ -691,17 +691,17 @@ impl AppState {
 
     /// Get or initialize the TTS engine (lazy loading).
     ///
-    /// The TTS engine is loaded on first Speak connection using the Qwen3 backend.
+    /// The TTS engine is loaded on first Speak connection using the Chatterbox backend.
     /// Returns a reference to the engine, or an error if loading fails.
     pub async fn get_tts_engine(&self) -> Result<&dyn TtsEngine, Box<dyn std::error::Error + Send + Sync>> {
-        let tts_dir = self.model_config.as_ref().map(|c| c.qwen3_tts_dir.as_str());
+        let tts_dir = self.model_config.as_ref().map(|c| c.chatterbox_model_dir.as_str());
         self.tts_engine.get_or_try_init(|| async {
             tracing::info!(
                 model_dir = ?tts_dir,
-                "Lazy-loading TTS engine (Qwen3) on first Speak connection..."
+                "Lazy-loading TTS engine (Chatterbox) on first Speak connection..."
             );
             let engine = crate::tts::TtsEngineFactory::create(
-                crate::tts::TtsBackend::Qwen3,
+                crate::tts::TtsBackend::Chatterbox,
                 tts_dir,
             ).map_err(|e| -> Box<dyn std::error::Error + Send + Sync> {
                 Box::new(e)
author	soryu <soryu@soryu.co>	2026-02-01 02:39:19 +0000
committer	soryu <soryu@soryu.co>	2026-02-01 02:39:52 +0000
commit	ddd956118880d3416a5e8101dcee7f880cbdc444 (patch)
tree	9406c510782f7f91c68b3d461ce46f6428a49072
parent	d0062efd34dfc22c2d8cfee0a47431ac0c8adfda (diff)
download	soryu-makima/multi-phase-plan-fix.tar.gz soryu-makima/multi-phase-plan-fix.zip