Replace TTS endpoint with Rust-native Qwen3-TTS (#41)

* chore: fix unused import warnings in qwen3-tts module

  - Remove unused import 'IndexOp' in model.rs
  - Remove unused import 'DType' in speech_tokenizer.rs
  - Add #[allow(dead_code)] to codebook_dim field in RvqCodebook

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat: add voice loading and selection for TTS cloning

  Add voice reference audio loading so the TTS speak handler can perform voice cloning using reference WAV files from the voices/ directory.

  - Add voice.rs module: loads manifest.json and reference.wav for a given voice_id, decodes via symphonia, resamples to 24kHz for the TTS engine
  - Update speak.rs: resolve voice_id from the speak request (default "makima"), load reference audio, pass it to engine.generate()
  - Add voices/makima/README.md with instructions for obtaining reference audio (extraction from YouTube, recording, ffmpeg conversion)
  - Graceful fallback: if reference audio is missing, TTS proceeds without voice cloning using the model's default voice

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* [WIP] Heartbeat checkpoint - 2026-01-28 03:49:13 UTC

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
author: soryu <soryu@soryu.co> 2026-01-28 03:50:45 +0000
committer: GitHub <noreply@github.com> 2026-01-28 03:50:45 +0000
commit: 9b53f6c6b01da85ef73bd5960b32ec319df0b947 (patch)
tree: 8c5e9983e1a5e75afab4a7d7a18ba22b75211628 /makima/src/tts/qwen3/mod.rs
parent: c14192cc8b0e82369c93c1aee615fcc9cfad5911 (diff)
download: soryu-9b53f6c6b01da85ef73bd5960b32ec319df0b947.tar.gz soryu-9b53f6c6b01da85ef73bd5960b32ec319df0b947.zip
1 file changed, 5 insertions, 1 deletion
diff --git a/makima/src/tts/qwen3/mod.rs b/makima/src/tts/qwen3/mod.rs
index c55c118..9bac794 100644
--- a/makima/src/tts/qwen3/mod.rs
+++ b/makima/src/tts/qwen3/mod.rs
@@ -30,6 +30,7 @@ pub mod speech_tokenizer;
 
 use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
 
 use candle_core::{DType, Device};
 use candle_nn::VarBuilder;
@@ -168,6 +169,7 @@ impl Qwen3Tts {
         text: &str,
         reference_audio: Option<&[f32]>,
         gen_config: Option<GenerationConfig>,
+        cancel_flag: Option<Arc<AtomicBool>>,
     ) -> Result<Vec<AudioChunk>, TtsError> {
         let config = gen_config.unwrap_or_default();
 
@@ -178,6 +180,7 @@ impl Qwen3Tts {
             &self.tokenizer,
             &self.device,
             config,
+            cancel_flag,
         );
 
         ctx.generate(text, reference_audio)
@@ -250,11 +253,12 @@ impl TtsEngine for Qwen3Tts {
         text: &str,
         reference_audio: Option<&[f32]>,
         _reference_sample_rate: Option<u32>,
+        cancel_flag: Option<Arc<AtomicBool>>,
     ) -> Result<Vec<AudioChunk>, TtsError> {
         // Note: reference audio should already be resampled to 24kHz
         // by the caller. If a different sample rate is provided,
         // the caller should resample using `resample_to_24k()`.
-        self.generate_speech(text, reference_audio, None)
+        self.generate_speech(text, reference_audio, None, cancel_flag)
     }
 
     fn is_ready(&self) -> bool {
author	soryu <soryu@soryu.co>	2026-01-28 03:50:45 +0000
committer	GitHub <noreply@github.com>	2026-01-28 03:50:45 +0000
commit	9b53f6c6b01da85ef73bd5960b32ec319df0b947 (patch)
tree	8c5e9983e1a5e75afab4a7d7a18ba22b75211628 /makima/src/tts/qwen3/mod.rs
parent	c14192cc8b0e82369c93c1aee615fcc9cfad5911 (diff)
download	soryu-9b53f6c6b01da85ef73bd5960b32ec319df0b947.tar.gz soryu-9b53f6c6b01da85ef73bd5960b32ec319df0b947.zip