Use chatterbox TTS

author: soryu <soryu@soryu.co> 2026-02-01 03:04:36 +0000
committer: soryu <soryu@soryu.co> 2026-02-01 03:04:36 +0000
commit: a2c147ddd59f55a07b5be0c8970169726b55c876 (patch)
tree: e41a80f2dfdd8fcaf6b6e91c899392f4e619ca3e /makima/src/server
parent: 65eebd078af712d004a5a9e28863a16df30792a6 (diff)
download: soryu-a2c147ddd59f55a07b5be0c8970169726b55c876.tar.gz soryu-a2c147ddd59f55a07b5be0c8970169726b55c876.zip
2 files changed, 10 insertions, 11 deletions
diff --git a/makima/src/server/handlers/speak.rs b/makima/src/server/handlers/speak.rs
index 3ed2620..b235c65 100644
--- a/makima/src/server/handlers/speak.rs
+++ b/makima/src/server/handlers/speak.rs
@@ -1,19 +1,18 @@
 //! WebSocket handler for TTS streaming (direct in-process inference).
 //!
 //! This module implements the `/api/v1/speak` endpoint which performs
-//! text-to-speech synthesis directly using the candle-based TTS engine.
+//! text-to-speech synthesis directly using the Chatterbox ONNX TTS engine.
 //! No external Python service or proxy — the model runs in-process.
 //!
 //! ## Architecture
 //!
 //! The speak handler will:
 //! 1. Accept a WebSocket connection from the client
-//! 2. Lazily load the TTS model (candle) on first request
+//! 2. Lazily load the TTS model (Chatterbox ONNX) on first request
 //! 3. Parse JSON control messages (start, speak, stop, cancel)
 //! 4. Run inference directly and stream audio chunks back
 //!
 //! See `makima/src/tts/` for the TTS engine implementation.
-//! See `docs/specs/qwen3-tts-spec.md` for the full protocol specification.
 
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
diff --git a/makima/src/server/state.rs b/makima/src/server/state.rs
index f662e30..bd6864f 100644
--- a/makima/src/server/state.rs
+++ b/makima/src/server/state.rs
@@ -560,7 +560,7 @@ pub struct ModelConfig {
     pub parakeet_model_dir: String,
     pub parakeet_eou_dir: String,
     pub sortformer_model_path: String,
-    pub qwen3_tts_dir: String,
+    pub chatterbox_model_dir: String,
 }
 
 /// Lazily-loaded ML models.
@@ -619,12 +619,12 @@ impl AppState {
     /// * `parakeet_model_dir` - Path to the Parakeet TDT model directory
     /// * `parakeet_eou_dir` - Path to the Parakeet EOU model directory
     /// * `sortformer_model_path` - Path to the Sortformer diarization model file
-    /// * `qwen3_tts_dir` - Path to the Qwen3-TTS model directory
+    /// * `chatterbox_model_dir` - Path to the Chatterbox TTS model directory
     pub fn new(
         parakeet_model_dir: &str,
         parakeet_eou_dir: &str,
         sortformer_model_path: &str,
-        qwen3_tts_dir: &str,
+        chatterbox_model_dir: &str,
     ) -> Self {
         // Create broadcast channels with buffer for 256 messages
         let (file_updates, _) = broadcast::channel(256);
@@ -668,7 +668,7 @@ impl AppState {
                 parakeet_model_dir: parakeet_model_dir.to_string(),
                 parakeet_eou_dir: parakeet_eou_dir.to_string(),
                 sortformer_model_path: sortformer_model_path.to_string(),
-                qwen3_tts_dir: qwen3_tts_dir.to_string(),
+                chatterbox_model_dir: chatterbox_model_dir.to_string(),
             }),
             ml_models: OnceCell::new(),
             db_pool: None,
@@ -691,17 +691,17 @@ impl AppState {
 
     /// Get or initialize the TTS engine (lazy loading).
     ///
-    /// The TTS engine is loaded on first Speak connection using the Qwen3 backend.
+    /// The TTS engine is loaded on first Speak connection using the Chatterbox backend.
     /// Returns a reference to the engine, or an error if loading fails.
     pub async fn get_tts_engine(&self) -> Result<&dyn TtsEngine, Box<dyn std::error::Error + Send + Sync>> {
-        let tts_dir = self.model_config.as_ref().map(|c| c.qwen3_tts_dir.as_str());
+        let tts_dir = self.model_config.as_ref().map(|c| c.chatterbox_model_dir.as_str());
         self.tts_engine.get_or_try_init(|| async {
             tracing::info!(
                 model_dir = ?tts_dir,
-                "Lazy-loading TTS engine (Qwen3) on first Speak connection..."
+                "Lazy-loading TTS engine (Chatterbox) on first Speak connection..."
             );
             let engine = crate::tts::TtsEngineFactory::create(
-                crate::tts::TtsBackend::Qwen3,
+                crate::tts::TtsBackend::Chatterbox,
                 tts_dir,
             ).map_err(|e| -> Box<dyn std::error::Error + Send + Sync> {
                 Box::new(e)
author	soryu <soryu@soryu.co>	2026-02-01 03:04:36 +0000
committer	soryu <soryu@soryu.co>	2026-02-01 03:04:36 +0000
commit	a2c147ddd59f55a07b5be0c8970169726b55c876 (patch)
tree	e41a80f2dfdd8fcaf6b6e91c899392f4e619ca3e /makima/src/server
parent	65eebd078af712d004a5a9e28863a16df30792a6 (diff)
download	soryu-a2c147ddd59f55a07b5be0c8970169726b55c876.tar.gz soryu-a2c147ddd59f55a07b5be0c8970169726b55c876.zip