diff options
| -rw-r--r-- | Dockerfile | 4 | ||||
| -rwxr-xr-x | makima/sh/download-models.sh | 54 | ||||
| -rw-r--r-- | makima/src/bin/makima.rs | 2 | ||||
| -rw-r--r-- | makima/src/daemon/cli/server.rs | 6 | ||||
| -rw-r--r-- | makima/src/daemon/task/manager.rs | 4 | ||||
| -rw-r--r-- | makima/src/server/handlers/speak.rs | 5 | ||||
| -rw-r--r-- | makima/src/server/state.rs | 16 |
7 files changed, 37 insertions, 54 deletions
@@ -19,7 +19,7 @@ RUN chmod +x /app/download-models.sh ARG MODEL_BASE_URL ENV MODEL_BASE_URL=${MODEL_BASE_URL} ENV MODELS_DIR=/app/models -ENV QWEN3_TTS_DIR=/app/models/qwen3-tts +ENV CHATTERBOX_MODEL_DIR=/app/models/chatterbox-turbo RUN /app/download-models.sh echo "Models downloaded" # Copy workspace files @@ -42,7 +42,7 @@ ENV RUST_LOG=makima=info,tower_http=info ENV PARAKEET_MODEL_DIR=/app/models/parakeet-tdt-0.6b-v3 ENV PARAKEET_EOU_DIR=/app/models/realtime_eou_120m-v1-onnx ENV SORTFORMER_MODEL_PATH=/app/models/diarization/diar_streaming_sortformer_4spk-v2.1.onnx -ENV QWEN3_TTS_DIR=/app/models/qwen3-tts +ENV CHATTERBOX_MODEL_DIR=/app/models/chatterbox-turbo EXPOSE 8080 diff --git a/makima/sh/download-models.sh b/makima/sh/download-models.sh index 4f188f3..b44e091 100755 --- a/makima/sh/download-models.sh +++ b/makima/sh/download-models.sh @@ -114,47 +114,31 @@ else echo "All models downloaded successfully" fi -# Download Qwen3-TTS models (for TTS functionality) -QWEN3_TTS_DIR="${QWEN3_TTS_DIR:-/app/models/qwen3-tts}" - -download_qwen3_tts() { - if [ -d "$QWEN3_TTS_DIR" ] && \ - [ -f "$QWEN3_TTS_DIR/model.safetensors" ] && \ - [ -f "$QWEN3_TTS_DIR/speech_tokenizer.safetensors" ] && \ - [ -f "$QWEN3_TTS_DIR/vocab.json" ] && \ - [ -f "$QWEN3_TTS_DIR/merges.txt" ] && \ - [ -f "$QWEN3_TTS_DIR/config.json" ]; then - echo "Qwen3-TTS models already exist, skipping..." +# Download Chatterbox TTS models (for TTS functionality) +CHATTERBOX_MODEL_DIR="${CHATTERBOX_MODEL_DIR:-/app/models/chatterbox-turbo}" + +download_chatterbox_tts() { + if [ -d "$CHATTERBOX_MODEL_DIR" ] && \ + [ -f "$CHATTERBOX_MODEL_DIR/speech_encoder.onnx" ] && \ + [ -f "$CHATTERBOX_MODEL_DIR/embed_tokens.onnx" ] && \ + [ -f "$CHATTERBOX_MODEL_DIR/language_model.onnx" ] && \ + [ -f "$CHATTERBOX_MODEL_DIR/conditional_decoder.onnx" ] && \ + [ -f "$CHATTERBOX_MODEL_DIR/tokenizer.json" ]; then + echo "Chatterbox TTS models already exist, skipping..." return 0 fi - echo "Downloading Qwen3-TTS models..." - mkdir -p "$QWEN3_TTS_DIR" - - # Download base TTS model files from Qwen/Qwen3-TTS-12Hz-0.6B-Base - # Note: This repo uses vocab.json + merges.txt (not tokenizer.json) - echo "Downloading Qwen3-TTS-12Hz-0.6B-Base..." - hf download Qwen/Qwen3-TTS-12Hz-0.6B-Base \ - model.safetensors \ - config.json \ - vocab.json \ - merges.txt \ - tokenizer_config.json \ - --local-dir "$QWEN3_TTS_DIR" - - # Download speech tokenizer from Qwen/Qwen3-TTS-Tokenizer-12Hz - echo "Downloading Qwen3-TTS-Tokenizer-12Hz..." - local tmpdir=$(mktemp -d) - hf download Qwen/Qwen3-TTS-Tokenizer-12Hz \ - model.safetensors \ - --local-dir "$tmpdir" - mv "$tmpdir/model.safetensors" "$QWEN3_TTS_DIR/speech_tokenizer.safetensors" - rm -rf "$tmpdir" + echo "Downloading Chatterbox TTS models..." + mkdir -p "$CHATTERBOX_MODEL_DIR" + + # Download from ResembleAI/chatterbox-turbo-ONNX + echo "Downloading ResembleAI/chatterbox-turbo-ONNX..." + hf download ResembleAI/chatterbox-turbo-ONNX --local-dir "$CHATTERBOX_MODEL_DIR" - echo "Qwen3-TTS models downloaded successfully" + echo "Chatterbox TTS models downloaded successfully" } -download_qwen3_tts +download_chatterbox_tts # Execute the main command exec "$@" diff --git a/makima/src/bin/makima.rs b/makima/src/bin/makima.rs index ac577b8..753f60e 100644 --- a/makima/src/bin/makima.rs +++ b/makima/src/bin/makima.rs @@ -49,7 +49,7 @@ async fn run_server( &args.parakeet_model_dir, &args.parakeet_eou_dir, &args.sortformer_model_path, - &args.qwen3_tts_dir, + &args.chatterbox_model_dir, ); // Connect to database if URL provided diff --git a/makima/src/daemon/cli/server.rs b/makima/src/daemon/cli/server.rs index 81dafc9..adb765d 100644 --- a/makima/src/daemon/cli/server.rs +++ b/makima/src/daemon/cli/server.rs @@ -33,9 +33,9 @@ pub struct ServerArgs { )] pub sortformer_model_path: String, - /// Path to Qwen3-TTS model directory - #[arg(long, env = "QWEN3_TTS_DIR", default_value = "models/qwen3-tts")] - pub qwen3_tts_dir: String, + /// Path to Chatterbox TTS model directory + #[arg(long, env = "CHATTERBOX_MODEL_DIR", default_value = "models/chatterbox-turbo")] + pub chatterbox_model_dir: String, /// PostgreSQL connection URI #[arg(long, env = "POSTGRES_CONNECTION_URI")] diff --git a/makima/src/daemon/task/manager.rs b/makima/src/daemon/task/manager.rs index e0437ce..bf495d9 100644 --- a/makima/src/daemon/task/manager.rs +++ b/makima/src/daemon/task/manager.rs @@ -598,7 +598,7 @@ rsync -av --exclude='.git' --exclude='.makima' "$FINAL_TASK_PATH/" ./ /// System prompt for supervisor tasks (contract orchestrators). /// Supervisors monitor all tasks in a contract, create new tasks, and drive the contract to completion. -const SUPERVISOR_SYSTEM_PROMPT: &str = r#"You are the SUPERVISOR for this contract. Your ONLY job is to coordinate work by spawning tasks, waiting for them to complete, and managing git operations. +const SUPERVISOR_SYSTEM_PROMPT: &str = r###"You are the SUPERVISOR for this contract. Your ONLY job is to coordinate work by spawning tasks, waiting for them to complete, and managing git operations. ## CRITICAL RULES - READ CAREFULLY @@ -960,7 +960,7 @@ After all tasks are "done" and merged, you MUST take the following actions: --- -"#; +"###; /// System prompt for tasks that are part of a contract. /// This tells the task about contract.sh and how to use it to interact with the contract. diff --git a/makima/src/server/handlers/speak.rs b/makima/src/server/handlers/speak.rs index 3ed2620..b235c65 100644 --- a/makima/src/server/handlers/speak.rs +++ b/makima/src/server/handlers/speak.rs @@ -1,19 +1,18 @@ //! WebSocket handler for TTS streaming (direct in-process inference). //! //! This module implements the `/api/v1/speak` endpoint which performs -//! text-to-speech synthesis directly using the candle-based TTS engine. +//! text-to-speech synthesis directly using the Chatterbox ONNX TTS engine. //! No external Python service or proxy — the model runs in-process. //! //! ## Architecture //! //! The speak handler will: //! 1. Accept a WebSocket connection from the client -//! 2. Lazily load the TTS model (candle) on first request +//! 2. Lazily load the TTS model (Chatterbox ONNX) on first request //! 3. Parse JSON control messages (start, speak, stop, cancel) //! 4. Run inference directly and stream audio chunks back //! //! See `makima/src/tts/` for the TTS engine implementation. -//! See `docs/specs/qwen3-tts-spec.md` for the full protocol specification. use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; diff --git a/makima/src/server/state.rs b/makima/src/server/state.rs index f662e30..bd6864f 100644 --- a/makima/src/server/state.rs +++ b/makima/src/server/state.rs @@ -560,7 +560,7 @@ pub struct ModelConfig { pub parakeet_model_dir: String, pub parakeet_eou_dir: String, pub sortformer_model_path: String, - pub qwen3_tts_dir: String, + pub chatterbox_model_dir: String, } /// Lazily-loaded ML models. @@ -619,12 +619,12 @@ impl AppState { /// * `parakeet_model_dir` - Path to the Parakeet TDT model directory /// * `parakeet_eou_dir` - Path to the Parakeet EOU model directory /// * `sortformer_model_path` - Path to the Sortformer diarization model file - /// * `qwen3_tts_dir` - Path to the Qwen3-TTS model directory + /// * `chatterbox_model_dir` - Path to the Chatterbox TTS model directory pub fn new( parakeet_model_dir: &str, parakeet_eou_dir: &str, sortformer_model_path: &str, - qwen3_tts_dir: &str, + chatterbox_model_dir: &str, ) -> Self { // Create broadcast channels with buffer for 256 messages let (file_updates, _) = broadcast::channel(256); @@ -668,7 +668,7 @@ impl AppState { parakeet_model_dir: parakeet_model_dir.to_string(), parakeet_eou_dir: parakeet_eou_dir.to_string(), sortformer_model_path: sortformer_model_path.to_string(), - qwen3_tts_dir: qwen3_tts_dir.to_string(), + chatterbox_model_dir: chatterbox_model_dir.to_string(), }), ml_models: OnceCell::new(), db_pool: None, @@ -691,17 +691,17 @@ impl AppState { /// Get or initialize the TTS engine (lazy loading). /// - /// The TTS engine is loaded on first Speak connection using the Qwen3 backend. + /// The TTS engine is loaded on first Speak connection using the Chatterbox backend. /// Returns a reference to the engine, or an error if loading fails. pub async fn get_tts_engine(&self) -> Result<&dyn TtsEngine, Box<dyn std::error::Error + Send + Sync>> { - let tts_dir = self.model_config.as_ref().map(|c| c.qwen3_tts_dir.as_str()); + let tts_dir = self.model_config.as_ref().map(|c| c.chatterbox_model_dir.as_str()); self.tts_engine.get_or_try_init(|| async { tracing::info!( model_dir = ?tts_dir, - "Lazy-loading TTS engine (Qwen3) on first Speak connection..." + "Lazy-loading TTS engine (Chatterbox) on first Speak connection..." ); let engine = crate::tts::TtsEngineFactory::create( - crate::tts::TtsBackend::Qwen3, + crate::tts::TtsBackend::Chatterbox, tts_dir, ).map_err(|e| -> Box<dyn std::error::Error + Send + Sync> { Box::new(e) |
