summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author soryu <soryu@soryu.co> 2026-01-29 01:04:42 +0000
committer GitHub <noreply@github.com> 2026-01-29 01:04:42 +0000
commit d7b0b576fb43902535f0ae8d4f257b50387ec01a (patch)
tree c67f35941380c6f54a7f61eaaae5f74e646b22eb
parent fc2aa0e9fc63365a78f983634efb25d4444e64c5 (diff)
download soryu-d7b0b576fb43902535f0ae8d4f257b50387ec01a.tar.gz
soryu-d7b0b576fb43902535f0ae8d4f257b50387ec01a.zip
fix: Add Qwen3-TTS model download to Docker build (#44)
* chore: fix unused import warnings in qwen3-tts module

  - Remove unused import 'IndexOp' in model.rs
  - Remove unused import 'DType' in speech_tokenizer.rs
  - Add #[allow(dead_code)] to codebook_dim field in RvqCodebook

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat: add voice loading and selection for TTS cloning

  Add voice reference audio loading so the TTS speak handler can perform voice cloning using reference WAV files from the voices/ directory.

  - Add voice.rs module: loads manifest.json and reference.wav for a given voice_id, decodes via symphonia, resamples to 24kHz for the TTS engine
  - Update speak.rs: resolve voice_id from the speak request (default "makima"), load reference audio, pass it to engine.generate()
  - Add voices/makima/README.md with instructions for obtaining reference audio (extraction from YouTube, recording, ffmpeg conversion)
  - Graceful fallback: if reference audio is missing, TTS proceeds without voice cloning using the model's default voice

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat: add inference cancellation support for TTS generation

  Add cooperative cancellation via Arc<AtomicBool> cancel flag that threads through TtsEngine::generate -> Qwen3Tts -> GenerationContext. The autoregressive loop and streaming decoder check the flag each iteration and break early when set. The speak WebSocket handler creates a per-session flag, passes it to generate, and sets it on Cancel/Stop/Close messages.

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* Add Qwen3-TTS model download to build process

  Fix TTS engine failure due to missing tokenizer by downloading Qwen3-TTS models during Docker build:

  - Download model.safetensors, config.json, tokenizer.json, and tokenizer_config.json from Qwen/Qwen3-TTS-12Hz-0.6B-Base
  - Download speech tokenizer from Qwen/Qwen3-TTS-Tokenizer-12Hz
  - Add QWEN3_TTS_DIR environment variable to Dockerfile
  - Script supports both env var override and default path

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
-rw-r--r-- Dockerfile 2
-rwxr-xr-x makima/sh/download-models.sh 35
2 files changed, 37 insertions, 0 deletions
diff --git a/Dockerfile b/Dockerfile
index e6c3d8b..a7d093c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,6 +19,7 @@ RUN chmod +x /app/download-models.sh
ARG MODEL_BASE_URL
ENV MODEL_BASE_URL=${MODEL_BASE_URL}
ENV MODELS_DIR=/models
+ENV QWEN3_TTS_DIR=/app/models/qwen3-tts
RUN /app/download-models.sh echo "Models downloaded"
# Copy workspace files
@@ -40,6 +41,7 @@ ENV RUST_LOG=makima=info,tower_http=info
ENV PARAKEET_MODEL_DIR=/app/models/parakeet-tdt-0.6b-v3
ENV PARAKEET_EOU_DIR=/app/models/realtime_eou_120m-v1-onnx
ENV SORTFORMER_MODEL_PATH=/app/models/diarization/diar_streaming_sortformer_4spk-v2.1.onnx
+ENV QWEN3_TTS_DIR=/app/models/qwen3-tts
EXPOSE 8080
diff --git a/makima/sh/download-models.sh b/makima/sh/download-models.sh
index 0381e15..1aefad8 100755
--- a/makima/sh/download-models.sh
+++ b/makima/sh/download-models.sh
@@ -114,5 +114,40 @@ else
echo "All models downloaded successfully"
fi
+# Download Qwen3-TTS models (for TTS functionality)
+QWEN3_TTS_DIR="${QWEN3_TTS_DIR:-/app/models/qwen3-tts}"
+
+download_qwen3_tts() {
+ if [ -d "$QWEN3_TTS_DIR" ] && [ -f "$QWEN3_TTS_DIR/model.safetensors" ] && [ -f "$QWEN3_TTS_DIR/speech_tokenizer.safetensors" ]; then
+ echo "Qwen3-TTS models already exist, skipping..."
+ return 0
+ fi
+
+ echo "Downloading Qwen3-TTS models..."
+ mkdir -p "$QWEN3_TTS_DIR"
+
+ # Download base TTS model files from Qwen/Qwen3-TTS-12Hz-0.6B-Base
+ echo "Downloading Qwen3-TTS-12Hz-0.6B-Base..."
+ huggingface-cli download Qwen/Qwen3-TTS-12Hz-0.6B-Base \
+ model.safetensors \
+ config.json \
+ tokenizer.json \
+ tokenizer_config.json \
+ --local-dir "$QWEN3_TTS_DIR"
+
+ # Download speech tokenizer from Qwen/Qwen3-TTS-Tokenizer-12Hz
+ echo "Downloading Qwen3-TTS-Tokenizer-12Hz..."
+ local tmpdir=$(mktemp -d)
+ huggingface-cli download Qwen/Qwen3-TTS-Tokenizer-12Hz \
+ model.safetensors \
+ --local-dir "$tmpdir"
+ mv "$tmpdir/model.safetensors" "$QWEN3_TTS_DIR/speech_tokenizer.safetensors"
+ rm -rf "$tmpdir"
+
+ echo "Qwen3-TTS models downloaded successfully"
+}
+
+download_qwen3_tts
+
# Execute the main command
exec "$@"