diff options
| author | soryu <soryu@soryu.co> | 2026-02-01 03:04:36 +0000 |
|---|---|---|
| committer | soryu <soryu@soryu.co> | 2026-02-01 03:04:36 +0000 |
| commit | a2c147ddd59f55a07b5be0c8970169726b55c876 (patch) | |
| tree | e41a80f2dfdd8fcaf6b6e91c899392f4e619ca3e /makima/src | |
| parent | 65eebd078af712d004a5a9e28863a16df30792a6 (diff) | |
| download | soryu-a2c147ddd59f55a07b5be0c8970169726b55c876.tar.gz soryu-a2c147ddd59f55a07b5be0c8970169726b55c876.zip | |
Use chatterbox TTS
Diffstat (limited to 'makima/src')
| -rw-r--r-- | makima/src/bin/makima.rs | 2 | ||||
| -rw-r--r-- | makima/src/daemon/cli/server.rs | 6 | ||||
| -rw-r--r-- | makima/src/daemon/task/manager.rs | 4 | ||||
| -rw-r--r-- | makima/src/server/handlers/speak.rs | 5 | ||||
| -rw-r--r-- | makima/src/server/state.rs | 16 |
5 files changed, 16 insertions, 17 deletions
diff --git a/makima/src/bin/makima.rs b/makima/src/bin/makima.rs index ac577b8..753f60e 100644 --- a/makima/src/bin/makima.rs +++ b/makima/src/bin/makima.rs @@ -49,7 +49,7 @@ async fn run_server( &args.parakeet_model_dir, &args.parakeet_eou_dir, &args.sortformer_model_path, - &args.qwen3_tts_dir, + &args.chatterbox_model_dir, ); // Connect to database if URL provided diff --git a/makima/src/daemon/cli/server.rs b/makima/src/daemon/cli/server.rs index 81dafc9..adb765d 100644 --- a/makima/src/daemon/cli/server.rs +++ b/makima/src/daemon/cli/server.rs @@ -33,9 +33,9 @@ pub struct ServerArgs { )] pub sortformer_model_path: String, - /// Path to Qwen3-TTS model directory - #[arg(long, env = "QWEN3_TTS_DIR", default_value = "models/qwen3-tts")] - pub qwen3_tts_dir: String, + /// Path to Chatterbox TTS model directory + #[arg(long, env = "CHATTERBOX_MODEL_DIR", default_value = "models/chatterbox-turbo")] + pub chatterbox_model_dir: String, /// PostgreSQL connection URI #[arg(long, env = "POSTGRES_CONNECTION_URI")] diff --git a/makima/src/daemon/task/manager.rs b/makima/src/daemon/task/manager.rs index e0437ce..bf495d9 100644 --- a/makima/src/daemon/task/manager.rs +++ b/makima/src/daemon/task/manager.rs @@ -598,7 +598,7 @@ rsync -av --exclude='.git' --exclude='.makima' "$FINAL_TASK_PATH/" ./ /// System prompt for supervisor tasks (contract orchestrators). /// Supervisors monitor all tasks in a contract, create new tasks, and drive the contract to completion. -const SUPERVISOR_SYSTEM_PROMPT: &str = r#"You are the SUPERVISOR for this contract. Your ONLY job is to coordinate work by spawning tasks, waiting for them to complete, and managing git operations. +const SUPERVISOR_SYSTEM_PROMPT: &str = r###"You are the SUPERVISOR for this contract. Your ONLY job is to coordinate work by spawning tasks, waiting for them to complete, and managing git operations. ## CRITICAL RULES - READ CAREFULLY @@ -960,7 +960,7 @@ After all tasks are "done" and merged, you MUST take the following actions: --- -"#; +"###; /// System prompt for tasks that are part of a contract. /// This tells the task about contract.sh and how to use it to interact with the contract. diff --git a/makima/src/server/handlers/speak.rs b/makima/src/server/handlers/speak.rs index 3ed2620..b235c65 100644 --- a/makima/src/server/handlers/speak.rs +++ b/makima/src/server/handlers/speak.rs @@ -1,19 +1,18 @@ //! WebSocket handler for TTS streaming (direct in-process inference). //! //! This module implements the `/api/v1/speak` endpoint which performs -//! text-to-speech synthesis directly using the candle-based TTS engine. +//! text-to-speech synthesis directly using the Chatterbox ONNX TTS engine. //! No external Python service or proxy — the model runs in-process. //! //! ## Architecture //! //! The speak handler will: //! 1. Accept a WebSocket connection from the client -//! 2. Lazily load the TTS model (candle) on first request +//! 2. Lazily load the TTS model (Chatterbox ONNX) on first request //! 3. Parse JSON control messages (start, speak, stop, cancel) //! 4. Run inference directly and stream audio chunks back //! //! See `makima/src/tts/` for the TTS engine implementation. -//! See `docs/specs/qwen3-tts-spec.md` for the full protocol specification. use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; diff --git a/makima/src/server/state.rs b/makima/src/server/state.rs index f662e30..bd6864f 100644 --- a/makima/src/server/state.rs +++ b/makima/src/server/state.rs @@ -560,7 +560,7 @@ pub struct ModelConfig { pub parakeet_model_dir: String, pub parakeet_eou_dir: String, pub sortformer_model_path: String, - pub qwen3_tts_dir: String, + pub chatterbox_model_dir: String, } /// Lazily-loaded ML models. @@ -619,12 +619,12 @@ impl AppState { /// * `parakeet_model_dir` - Path to the Parakeet TDT model directory /// * `parakeet_eou_dir` - Path to the Parakeet EOU model directory /// * `sortformer_model_path` - Path to the Sortformer diarization model file - /// * `qwen3_tts_dir` - Path to the Qwen3-TTS model directory + /// * `chatterbox_model_dir` - Path to the Chatterbox TTS model directory pub fn new( parakeet_model_dir: &str, parakeet_eou_dir: &str, sortformer_model_path: &str, - qwen3_tts_dir: &str, + chatterbox_model_dir: &str, ) -> Self { // Create broadcast channels with buffer for 256 messages let (file_updates, _) = broadcast::channel(256); @@ -668,7 +668,7 @@ impl AppState { parakeet_model_dir: parakeet_model_dir.to_string(), parakeet_eou_dir: parakeet_eou_dir.to_string(), sortformer_model_path: sortformer_model_path.to_string(), - qwen3_tts_dir: qwen3_tts_dir.to_string(), + chatterbox_model_dir: chatterbox_model_dir.to_string(), }), ml_models: OnceCell::new(), db_pool: None, @@ -691,17 +691,17 @@ impl AppState { /// Get or initialize the TTS engine (lazy loading). /// - /// The TTS engine is loaded on first Speak connection using the Qwen3 backend. + /// The TTS engine is loaded on first Speak connection using the Chatterbox backend. /// Returns a reference to the engine, or an error if loading fails. pub async fn get_tts_engine(&self) -> Result<&dyn TtsEngine, Box<dyn std::error::Error + Send + Sync>> { - let tts_dir = self.model_config.as_ref().map(|c| c.qwen3_tts_dir.as_str()); + let tts_dir = self.model_config.as_ref().map(|c| c.chatterbox_model_dir.as_str()); self.tts_engine.get_or_try_init(|| async { tracing::info!( model_dir = ?tts_dir, - "Lazy-loading TTS engine (Qwen3) on first Speak connection..." + "Lazy-loading TTS engine (Chatterbox) on first Speak connection..." ); let engine = crate::tts::TtsEngineFactory::create( - crate::tts::TtsBackend::Qwen3, + crate::tts::TtsBackend::Chatterbox, tts_dir, ).map_err(|e| -> Box<dyn std::error::Error + Send + Sync> { Box::new(e) |
