summaryrefslogtreecommitdiff
path: root/makima/src/tts/mod.rs
diff options
context:
space:
mode:
author soryu <soryu@soryu.co> 2026-01-28 03:50:45 +0000
committer GitHub <noreply@github.com> 2026-01-28 03:50:45 +0000
commit 9b53f6c6b01da85ef73bd5960b32ec319df0b947 (patch)
tree 8c5e9983e1a5e75afab4a7d7a18ba22b75211628 /makima/src/tts/mod.rs
parent c14192cc8b0e82369c93c1aee615fcc9cfad5911 (diff)
download soryu-9b53f6c6b01da85ef73bd5960b32ec319df0b947.tar.gz
download soryu-9b53f6c6b01da85ef73bd5960b32ec319df0b947.zip
Replace TTS endpoint with Rust-native Qwen3-TTS (#41)
* chore: fix unused import warnings in qwen3-tts module
  - Remove unused import 'IndexOp' in model.rs
  - Remove unused import 'DType' in speech_tokenizer.rs
  - Add #[allow(dead_code)] to codebook_dim field in RvqCodebook
  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
* feat: add voice loading and selection for TTS cloning
  Add voice reference audio loading so the TTS speak handler can perform voice cloning using reference WAV files from the voices/ directory.
  - Add voice.rs module: loads manifest.json and reference.wav for a given voice_id, decodes via symphonia, resamples to 24kHz for the TTS engine
  - Update speak.rs: resolve voice_id from the speak request (default "makima"), load reference audio, pass it to engine.generate()
  - Add voices/makima/README.md with instructions for obtaining reference audio (extraction from YouTube, recording, ffmpeg conversion)
  - Graceful fallback: if reference audio is missing, TTS proceeds without voice cloning using the model's default voice
  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
* [WIP] Heartbeat checkpoint - 2026-01-28 03:49:13 UTC
---------
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Diffstat (limited to 'makima/src/tts/mod.rs')
-rw-r--r-- makima/src/tts/mod.rs 8
1 files changed, 8 insertions, 0 deletions
diff --git a/makima/src/tts/mod.rs b/makima/src/tts/mod.rs
index 2cd0412..b66f4a5 100644
--- a/makima/src/tts/mod.rs
+++ b/makima/src/tts/mod.rs
@@ -5,6 +5,8 @@
//! - **Qwen3**: Pure Rust candle-based Qwen3-TTS-12Hz-0.6B
use std::path::Path;
+use std::sync::atomic::AtomicBool;
+use std::sync::Arc;
pub mod chatterbox;
pub mod qwen3;
@@ -109,11 +111,17 @@ pub enum TtsBackend {
#[async_trait::async_trait]
pub trait TtsEngine: Send + Sync {
/// Generate complete audio from text with a voice reference.
+ ///
+ /// The optional `cancel_flag` can be set to `true` by another thread/task
+ /// to request early termination of the generation loop. Engines that
+ /// support cancellation will check this flag periodically and return
+ /// whatever audio has been produced so far.
async fn generate(
&self,
text: &str,
reference_audio: Option<&[f32]>,
reference_sample_rate: Option<u32>,
+ cancel_flag: Option<Arc<AtomicBool>>,
) -> Result<Vec<AudioChunk>, TtsError>;
/// Check if the engine is loaded and ready.