summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- Dockerfile 4
-rwxr-xr-x makima/sh/download-models.sh 54
-rw-r--r-- makima/src/bin/makima.rs 2
-rw-r--r-- makima/src/daemon/cli/server.rs 6
-rw-r--r-- makima/src/daemon/task/manager.rs 4
-rw-r--r-- makima/src/server/handlers/speak.rs 5
-rw-r--r-- makima/src/server/state.rs 16
7 files changed, 37 insertions, 54 deletions
diff --git a/Dockerfile b/Dockerfile
index 48b74b6..f33045c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,7 +19,7 @@ RUN chmod +x /app/download-models.sh
ARG MODEL_BASE_URL
ENV MODEL_BASE_URL=${MODEL_BASE_URL}
ENV MODELS_DIR=/app/models
-ENV QWEN3_TTS_DIR=/app/models/qwen3-tts
+ENV CHATTERBOX_MODEL_DIR=/app/models/chatterbox-turbo
RUN /app/download-models.sh echo "Models downloaded"
# Copy workspace files
@@ -42,7 +42,7 @@ ENV RUST_LOG=makima=info,tower_http=info
ENV PARAKEET_MODEL_DIR=/app/models/parakeet-tdt-0.6b-v3
ENV PARAKEET_EOU_DIR=/app/models/realtime_eou_120m-v1-onnx
ENV SORTFORMER_MODEL_PATH=/app/models/diarization/diar_streaming_sortformer_4spk-v2.1.onnx
-ENV QWEN3_TTS_DIR=/app/models/qwen3-tts
+ENV CHATTERBOX_MODEL_DIR=/app/models/chatterbox-turbo
EXPOSE 8080
diff --git a/makima/sh/download-models.sh b/makima/sh/download-models.sh
index 4f188f3..b44e091 100755
--- a/makima/sh/download-models.sh
+++ b/makima/sh/download-models.sh
@@ -114,47 +114,31 @@ else
echo "All models downloaded successfully"
fi
-# Download Qwen3-TTS models (for TTS functionality)
-QWEN3_TTS_DIR="${QWEN3_TTS_DIR:-/app/models/qwen3-tts}"
-
-download_qwen3_tts() {
- if [ -d "$QWEN3_TTS_DIR" ] && \
- [ -f "$QWEN3_TTS_DIR/model.safetensors" ] && \
- [ -f "$QWEN3_TTS_DIR/speech_tokenizer.safetensors" ] && \
- [ -f "$QWEN3_TTS_DIR/vocab.json" ] && \
- [ -f "$QWEN3_TTS_DIR/merges.txt" ] && \
- [ -f "$QWEN3_TTS_DIR/config.json" ]; then
- echo "Qwen3-TTS models already exist, skipping..."
+# Download Chatterbox TTS models (for TTS functionality)
+CHATTERBOX_MODEL_DIR="${CHATTERBOX_MODEL_DIR:-/app/models/chatterbox-turbo}"
+
+download_chatterbox_tts() {
+ if [ -d "$CHATTERBOX_MODEL_DIR" ] && \
+ [ -f "$CHATTERBOX_MODEL_DIR/speech_encoder.onnx" ] && \
+ [ -f "$CHATTERBOX_MODEL_DIR/embed_tokens.onnx" ] && \
+ [ -f "$CHATTERBOX_MODEL_DIR/language_model.onnx" ] && \
+ [ -f "$CHATTERBOX_MODEL_DIR/conditional_decoder.onnx" ] && \
+ [ -f "$CHATTERBOX_MODEL_DIR/tokenizer.json" ]; then
+ echo "Chatterbox TTS models already exist, skipping..."
return 0
fi
- echo "Downloading Qwen3-TTS models..."
- mkdir -p "$QWEN3_TTS_DIR"
-
- # Download base TTS model files from Qwen/Qwen3-TTS-12Hz-0.6B-Base
- # Note: This repo uses vocab.json + merges.txt (not tokenizer.json)
- echo "Downloading Qwen3-TTS-12Hz-0.6B-Base..."
- hf download Qwen/Qwen3-TTS-12Hz-0.6B-Base \
- model.safetensors \
- config.json \
- vocab.json \
- merges.txt \
- tokenizer_config.json \
- --local-dir "$QWEN3_TTS_DIR"
-
- # Download speech tokenizer from Qwen/Qwen3-TTS-Tokenizer-12Hz
- echo "Downloading Qwen3-TTS-Tokenizer-12Hz..."
- local tmpdir=$(mktemp -d)
- hf download Qwen/Qwen3-TTS-Tokenizer-12Hz \
- model.safetensors \
- --local-dir "$tmpdir"
- mv "$tmpdir/model.safetensors" "$QWEN3_TTS_DIR/speech_tokenizer.safetensors"
- rm -rf "$tmpdir"
+ echo "Downloading Chatterbox TTS models..."
+ mkdir -p "$CHATTERBOX_MODEL_DIR"
+
+ # Download from ResembleAI/chatterbox-turbo-ONNX
+ echo "Downloading ResembleAI/chatterbox-turbo-ONNX..."
+ hf download ResembleAI/chatterbox-turbo-ONNX --local-dir "$CHATTERBOX_MODEL_DIR"
- echo "Qwen3-TTS models downloaded successfully"
+ echo "Chatterbox TTS models downloaded successfully"
}
-download_qwen3_tts
+download_chatterbox_tts
# Execute the main command
exec "$@"
diff --git a/makima/src/bin/makima.rs b/makima/src/bin/makima.rs
index ac577b8..753f60e 100644
--- a/makima/src/bin/makima.rs
+++ b/makima/src/bin/makima.rs
@@ -49,7 +49,7 @@ async fn run_server(
&args.parakeet_model_dir,
&args.parakeet_eou_dir,
&args.sortformer_model_path,
- &args.qwen3_tts_dir,
+ &args.chatterbox_model_dir,
);
// Connect to database if URL provided
diff --git a/makima/src/daemon/cli/server.rs b/makima/src/daemon/cli/server.rs
index 81dafc9..adb765d 100644
--- a/makima/src/daemon/cli/server.rs
+++ b/makima/src/daemon/cli/server.rs
@@ -33,9 +33,9 @@ pub struct ServerArgs {
)]
pub sortformer_model_path: String,
- /// Path to Qwen3-TTS model directory
- #[arg(long, env = "QWEN3_TTS_DIR", default_value = "models/qwen3-tts")]
- pub qwen3_tts_dir: String,
+ /// Path to Chatterbox TTS model directory
+ #[arg(long, env = "CHATTERBOX_MODEL_DIR", default_value = "models/chatterbox-turbo")]
+ pub chatterbox_model_dir: String,
/// PostgreSQL connection URI
#[arg(long, env = "POSTGRES_CONNECTION_URI")]
diff --git a/makima/src/daemon/task/manager.rs b/makima/src/daemon/task/manager.rs
index e0437ce..bf495d9 100644
--- a/makima/src/daemon/task/manager.rs
+++ b/makima/src/daemon/task/manager.rs
@@ -598,7 +598,7 @@ rsync -av --exclude='.git' --exclude='.makima' "$FINAL_TASK_PATH/" ./
/// System prompt for supervisor tasks (contract orchestrators).
/// Supervisors monitor all tasks in a contract, create new tasks, and drive the contract to completion.
-const SUPERVISOR_SYSTEM_PROMPT: &str = r#"You are the SUPERVISOR for this contract. Your ONLY job is to coordinate work by spawning tasks, waiting for them to complete, and managing git operations.
+const SUPERVISOR_SYSTEM_PROMPT: &str = r###"You are the SUPERVISOR for this contract. Your ONLY job is to coordinate work by spawning tasks, waiting for them to complete, and managing git operations.
## CRITICAL RULES - READ CAREFULLY
@@ -960,7 +960,7 @@ After all tasks are "done" and merged, you MUST take the following actions:
---
-"#;
+"###;
/// System prompt for tasks that are part of a contract.
/// This tells the task about contract.sh and how to use it to interact with the contract.
diff --git a/makima/src/server/handlers/speak.rs b/makima/src/server/handlers/speak.rs
index 3ed2620..b235c65 100644
--- a/makima/src/server/handlers/speak.rs
+++ b/makima/src/server/handlers/speak.rs
@@ -1,19 +1,18 @@
//! WebSocket handler for TTS streaming (direct in-process inference).
//!
//! This module implements the `/api/v1/speak` endpoint which performs
-//! text-to-speech synthesis directly using the candle-based TTS engine.
+//! text-to-speech synthesis directly using the Chatterbox ONNX TTS engine.
//! No external Python service or proxy — the model runs in-process.
//!
//! ## Architecture
//!
//! The speak handler will:
//! 1. Accept a WebSocket connection from the client
-//! 2. Lazily load the TTS model (candle) on first request
+//! 2. Lazily load the TTS model (Chatterbox ONNX) on first request
//! 3. Parse JSON control messages (start, speak, stop, cancel)
//! 4. Run inference directly and stream audio chunks back
//!
//! See `makima/src/tts/` for the TTS engine implementation.
-//! See `docs/specs/qwen3-tts-spec.md` for the full protocol specification.
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
diff --git a/makima/src/server/state.rs b/makima/src/server/state.rs
index f662e30..bd6864f 100644
--- a/makima/src/server/state.rs
+++ b/makima/src/server/state.rs
@@ -560,7 +560,7 @@ pub struct ModelConfig {
pub parakeet_model_dir: String,
pub parakeet_eou_dir: String,
pub sortformer_model_path: String,
- pub qwen3_tts_dir: String,
+ pub chatterbox_model_dir: String,
}
/// Lazily-loaded ML models.
@@ -619,12 +619,12 @@ impl AppState {
/// * `parakeet_model_dir` - Path to the Parakeet TDT model directory
/// * `parakeet_eou_dir` - Path to the Parakeet EOU model directory
/// * `sortformer_model_path` - Path to the Sortformer diarization model file
- /// * `qwen3_tts_dir` - Path to the Qwen3-TTS model directory
+ /// * `chatterbox_model_dir` - Path to the Chatterbox TTS model directory
pub fn new(
parakeet_model_dir: &str,
parakeet_eou_dir: &str,
sortformer_model_path: &str,
- qwen3_tts_dir: &str,
+ chatterbox_model_dir: &str,
) -> Self {
// Create broadcast channels with buffer for 256 messages
let (file_updates, _) = broadcast::channel(256);
@@ -668,7 +668,7 @@ impl AppState {
parakeet_model_dir: parakeet_model_dir.to_string(),
parakeet_eou_dir: parakeet_eou_dir.to_string(),
sortformer_model_path: sortformer_model_path.to_string(),
- qwen3_tts_dir: qwen3_tts_dir.to_string(),
+ chatterbox_model_dir: chatterbox_model_dir.to_string(),
}),
ml_models: OnceCell::new(),
db_pool: None,
@@ -691,17 +691,17 @@ impl AppState {
/// Get or initialize the TTS engine (lazy loading).
///
- /// The TTS engine is loaded on first Speak connection using the Qwen3 backend.
+ /// The TTS engine is loaded on first Speak connection using the Chatterbox backend.
/// Returns a reference to the engine, or an error if loading fails.
pub async fn get_tts_engine(&self) -> Result<&dyn TtsEngine, Box<dyn std::error::Error + Send + Sync>> {
- let tts_dir = self.model_config.as_ref().map(|c| c.qwen3_tts_dir.as_str());
+ let tts_dir = self.model_config.as_ref().map(|c| c.chatterbox_model_dir.as_str());
self.tts_engine.get_or_try_init(|| async {
tracing::info!(
model_dir = ?tts_dir,
- "Lazy-loading TTS engine (Qwen3) on first Speak connection..."
+ "Lazy-loading TTS engine (Chatterbox) on first Speak connection..."
);
let engine = crate::tts::TtsEngineFactory::create(
- crate::tts::TtsBackend::Qwen3,
+ crate::tts::TtsBackend::Chatterbox,
tts_dir,
).map_err(|e| -> Box<dyn std::error::Error + Send + Sync> {
Box::new(e)