diff options
| author | soryu <soryu@soryu.co> | 2026-01-29 01:04:42 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2026-01-29 01:04:42 +0000 |
| commit | d7b0b576fb43902535f0ae8d4f257b50387ec01a (patch) | |
| tree | c67f35941380c6f54a7f61eaaae5f74e646b22eb | |
| parent | fc2aa0e9fc63365a78f983634efb25d4444e64c5 (diff) | |
| download | soryu-d7b0b576fb43902535f0ae8d4f257b50387ec01a.tar.gz soryu-d7b0b576fb43902535f0ae8d4f257b50387ec01a.zip | |
fix: Add Qwen3-TTS model download to Docker build (#44)
* chore: fix unused import warnings in qwen3-tts module
- Remove unused import 'IndexOp' in model.rs
- Remove unused import 'DType' in speech_tokenizer.rs
- Add #[allow(dead_code)] to codebook_dim field in RvqCodebook
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
* feat: add voice loading and selection for TTS cloning
Add voice reference audio loading so the TTS speak handler can perform
voice cloning using reference WAV files from the voices/ directory.
- Add voice.rs module: loads manifest.json and reference.wav for a given
voice_id, decodes via symphonia, resamples to 24kHz for the TTS engine
- Update speak.rs: resolve voice_id from the speak request (default
"makima"), load reference audio, pass it to engine.generate()
- Add voices/makima/README.md with instructions for obtaining reference
audio (extraction from YouTube, recording, ffmpeg conversion)
- Graceful fallback: if reference audio is missing, TTS proceeds without
voice cloning using the model's default voice
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
* feat: add inference cancellation support for TTS generation
Add cooperative cancellation via Arc<AtomicBool> cancel flag that
threads through TtsEngine::generate -> Qwen3Tts -> GenerationContext.
The autoregressive loop and streaming decoder check the flag each
iteration and break early when set. The speak WebSocket handler
creates a per-session flag, passes it to generate, and sets it on
Cancel/Stop/Close messages.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
* Add Qwen3-TTS model download to build process
Fix TTS engine failure due to missing tokenizer by downloading
Qwen3-TTS models during Docker build:
- Download model.safetensors, config.json, tokenizer.json, and
tokenizer_config.json from Qwen/Qwen3-TTS-12Hz-0.6B-Base
- Download speech tokenizer from Qwen/Qwen3-TTS-Tokenizer-12Hz
- Add QWEN3_TTS_DIR environment variable to Dockerfile
- Script supports both env var override and default path
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---------
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
| -rw-r--r-- | Dockerfile | 2 | ||||
| -rwxr-xr-x | makima/sh/download-models.sh | 67 |
2 files changed, 69 insertions, 0 deletions
diff --git a/Dockerfile b/Dockerfile
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,6 +19,7 @@ RUN chmod +x /app/download-models.sh
 ARG MODEL_BASE_URL
 ENV MODEL_BASE_URL=${MODEL_BASE_URL}
 ENV MODELS_DIR=/models
+ENV QWEN3_TTS_DIR=/app/models/qwen3-tts
 RUN /app/download-models.sh echo "Models downloaded"
 
 # Copy workspace files
@@ -40,6 +41,7 @@ ENV RUST_LOG=makima=info,tower_http=info
 ENV PARAKEET_MODEL_DIR=/app/models/parakeet-tdt-0.6b-v3
 ENV PARAKEET_EOU_DIR=/app/models/realtime_eou_120m-v1-onnx
 ENV SORTFORMER_MODEL_PATH=/app/models/diarization/diar_streaming_sortformer_4spk-v2.1.onnx
+ENV QWEN3_TTS_DIR=/app/models/qwen3-tts
 
 EXPOSE 8080
 
diff --git a/makima/sh/download-models.sh b/makima/sh/download-models.sh
index 0381e15..1aefad8 100755
--- a/makima/sh/download-models.sh
+++ b/makima/sh/download-models.sh
@@ -114,5 +114,72 @@ else
     echo "All models downloaded successfully"
 fi
 
+# Download Qwen3-TTS models (for TTS functionality)
+QWEN3_TTS_DIR="${QWEN3_TTS_DIR:-/app/models/qwen3-tts}"
+
+# Fetch the Qwen3-TTS base model files and the speech tokenizer into
+# $QWEN3_TTS_DIR with huggingface-cli, skipping the download when every
+# expected file is already present.
+# Globals: QWEN3_TTS_DIR (read)
+# Returns: 0 when the files are present or were downloaded; 1 when
+#          huggingface-cli is unavailable or any download step fails.
+download_qwen3_tts() {
+    local file missing=""
+
+    # Skip only when every file fetched below is present; checking just the
+    # two weight files would leave a missing tokenizer.json unrepaired.
+    for file in model.safetensors config.json tokenizer.json \
+        tokenizer_config.json speech_tokenizer.safetensors; do
+        [ -f "$QWEN3_TTS_DIR/$file" ] || missing="$missing $file"
+    done
+    if [ -z "$missing" ]; then
+        echo "Qwen3-TTS models already exist, skipping..."
+        return 0
+    fi
+
+    if ! command -v huggingface-cli >/dev/null 2>&1; then
+        echo "huggingface-cli not found; cannot download Qwen3-TTS models" >&2
+        return 1
+    fi
+
+    echo "Downloading Qwen3-TTS models..."
+    mkdir -p "$QWEN3_TTS_DIR" || return 1
+
+    # Download base TTS model files from Qwen/Qwen3-TTS-12Hz-0.6B-Base
+    echo "Downloading Qwen3-TTS-12Hz-0.6B-Base..."
+    if ! huggingface-cli download Qwen/Qwen3-TTS-12Hz-0.6B-Base \
+        model.safetensors \
+        config.json \
+        tokenizer.json \
+        tokenizer_config.json \
+        --local-dir "$QWEN3_TTS_DIR"; then
+        echo "Failed to download Qwen3-TTS-12Hz-0.6B-Base" >&2
+        return 1
+    fi
+
+    # Download speech tokenizer from Qwen/Qwen3-TTS-Tokenizer-12Hz. It is
+    # fetched into a scratch directory because the requested file is also
+    # named model.safetensors and would collide with the base model file.
+    echo "Downloading Qwen3-TTS-Tokenizer-12Hz..."
+    # Assign separately from `local` so a mktemp failure is not masked.
+    local tmpdir status=0
+    tmpdir=$(mktemp -d) || return 1
+    huggingface-cli download Qwen/Qwen3-TTS-Tokenizer-12Hz \
+        model.safetensors \
+        --local-dir "$tmpdir" &&
+        mv "$tmpdir/model.safetensors" "$QWEN3_TTS_DIR/speech_tokenizer.safetensors" ||
+        status=$?
+    # Remove the scratch directory on success and failure alike.
+    rm -rf -- "${tmpdir:?}"
+    if [ "$status" -ne 0 ]; then
+        echo "Failed to download Qwen3-TTS-Tokenizer-12Hz" >&2
+        return 1
+    fi
+
+    echo "Qwen3-TTS models downloaded successfully"
+}
+
+download_qwen3_tts
+
 # Execute the main command
 exec "$@"
