From d7b0b576fb43902535f0ae8d4f257b50387ec01a Mon Sep 17 00:00:00 2001
From: soryu <soryu@soryu.co>
Date: Thu, 29 Jan 2026 01:04:42 +0000
Subject: fix: Add Qwen3-TTS model download to Docker build (#44)

* chore: fix unused import warnings in qwen3-tts module

- Remove unused import 'IndexOp' in model.rs
- Remove unused import 'DType' in speech_tokenizer.rs
- Add #[allow(dead_code)] to codebook_dim field in RvqCodebook

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat: add voice loading and selection for TTS cloning

Add voice reference audio loading so the TTS speak handler can perform
voice cloning using reference WAV files from the voices/ directory.

- Add voice.rs module: loads manifest.json and reference.wav for a given
  voice_id, decodes via symphonia, resamples to 24kHz for the TTS engine
- Update speak.rs: resolve voice_id from the speak request (default
  "makima"), load reference audio, pass it to engine.generate()
- Add voices/makima/README.md with instructions for obtaining reference
  audio (extraction from YouTube, recording, ffmpeg conversion)
- Graceful fallback: if reference audio is missing, TTS proceeds without
  voice cloning using the model's default voice

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat: add inference cancellation support for TTS generation

Add cooperative cancellation via Arc<AtomicBool> cancel flag that
threads through TtsEngine::generate -> Qwen3Tts -> GenerationContext.
The autoregressive loop and streaming decoder check the flag each
iteration and break early when set. The speak WebSocket handler
creates a per-session flag, passes it to generate, and sets it on
Cancel/Stop/Close messages.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* Add Qwen3-TTS model download to build process

Fix TTS engine failure due to missing tokenizer by downloading
Qwen3-TTS models during Docker build:
- Download model.safetensors, config.json, tokenizer.json, and
  tokenizer_config.json from Qwen/Qwen3-TTS-12Hz-0.6B-Base
- Download speech tokenizer from Qwen/Qwen3-TTS-Tokenizer-12Hz
- Add QWEN3_TTS_DIR environment variable to Dockerfile
- Script supports both env var override and default path

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
---
 makima/sh/download-models.sh | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'makima')
diff --git a/makima/sh/download-models.sh b/makima/sh/download-models.sh
index 0381e15..1aefad8 100755
--- a/makima/sh/download-models.sh
+++ b/makima/sh/download-models.sh
@@ -114,5 +114,40 @@ else
     echo "All models downloaded successfully"
 fi
 
+# Download Qwen3-TTS models (for TTS functionality)
+QWEN3_TTS_DIR="${QWEN3_TTS_DIR:-/app/models/qwen3-tts}"
+
+download_qwen3_tts() {
+    if [ -d "$QWEN3_TTS_DIR" ] && [ -f "$QWEN3_TTS_DIR/model.safetensors" ] && [ -f "$QWEN3_TTS_DIR/speech_tokenizer.safetensors" ]; then
+        echo "Qwen3-TTS models already exist, skipping..."
+        return 0
+    fi
+
+    echo "Downloading Qwen3-TTS models..."
+    mkdir -p "$QWEN3_TTS_DIR"
+
+    # Download base TTS model files from Qwen/Qwen3-TTS-12Hz-0.6B-Base
+    echo "Downloading Qwen3-TTS-12Hz-0.6B-Base..."
+    huggingface-cli download Qwen/Qwen3-TTS-12Hz-0.6B-Base \
+        model.safetensors \
+        config.json \
+        tokenizer.json \
+        tokenizer_config.json \
+        --local-dir "$QWEN3_TTS_DIR"
+
+    # Download speech tokenizer from Qwen/Qwen3-TTS-Tokenizer-12Hz
+    echo "Downloading Qwen3-TTS-Tokenizer-12Hz..."
+    local tmpdir=$(mktemp -d)
+    huggingface-cli download Qwen/Qwen3-TTS-Tokenizer-12Hz \
+        model.safetensors \
+        --local-dir "$tmpdir"
+    mv "$tmpdir/model.safetensors" "$QWEN3_TTS_DIR/speech_tokenizer.safetensors"
+    rm -rf "$tmpdir"
+
+    echo "Qwen3-TTS models downloaded successfully"
+}
+
+download_qwen3_tts
+
 # Execute the main command
 exec "$@"
-- 
cgit v1.2.3