Replace TTS endpoint with Rust-native Qwen3-TTS (#41)

* chore: fix unused import warnings in qwen3-tts module - Remove unused import 'IndexOp' in model.rs - Remove unused import 'DType' in speech_tokenizer.rs - Add #[allow(dead_code)] to codebook_dim field in RvqCodebook Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * feat: add voice loading and selection for TTS cloning Add voice reference audio loading so the TTS speak handler can perform voice cloning using reference WAV files from the voices/ directory. - Add voice.rs module: loads manifest.json and reference.wav for a given voice_id, decodes via symphonia, resamples to 24kHz for the TTS engine - Update speak.rs: resolve voice_id from the speak request (default "makima"), load reference audio, pass it to engine.generate() - Add voices/makima/README.md with instructions for obtaining reference audio (extraction from YouTube, recording, ffmpeg conversion) - Graceful fallback: if reference audio is missing, TTS proceeds without voice cloning using the model's default voice Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * [WIP] Heartbeat checkpoint - 2026-01-28 03:49:13 UTC --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
author: soryu <soryu@soryu.co> 2026-01-28 03:50:45 +0000
committer: GitHub <noreply@github.com> 2026-01-28 03:50:45 +0000
commit: 9b53f6c6b01da85ef73bd5960b32ec319df0b947 (patch)
tree: 8c5e9983e1a5e75afab4a7d7a18ba22b75211628
parent: c14192cc8b0e82369c93c1aee615fcc9cfad5911 (diff)
download: soryu-9b53f6c6b01da85ef73bd5960b32ec319df0b947.tar.gz
soryu-9b53f6c6b01da85ef73bd5960b32ec319df0b947.zip
10 files changed, 478 insertions, 12 deletions
diff --git a/makima/src/server/handlers/mod.rs b/makima/src/server/handlers/mod.rs
index 8207399..8af2a37 100644
--- a/makima/src/server/handlers/mod.rs
+++ b/makima/src/server/handlers/mod.rs
@@ -19,6 +19,7 @@ pub mod mesh_ws;
 pub mod repository_history;
 pub mod speak;
 pub mod templates;
+pub mod voice;
 pub mod transcript_analysis;
 pub mod users;
 pub mod versions;
diff --git a/makima/src/server/handlers/speak.rs b/makima/src/server/handlers/speak.rs
index 75e7780..3ed2620 100644
--- a/makima/src/server/handlers/speak.rs
+++ b/makima/src/server/handlers/speak.rs
@@ -15,6 +15,9 @@
 //! See `makima/src/tts/` for the TTS engine implementation.
 //! See `docs/specs/qwen3-tts-spec.md` for the full protocol specification.
 
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+
 use axum::{
     extract::{ws::Message, ws::WebSocket, State, WebSocketUpgrade},
     response::Response,
@@ -32,9 +35,9 @@ enum ClientMessage {
     /// Request speech synthesis for the given text.
     Speak {
         text: String,
-        /// Optional voice ID (e.g., "makima"). Not yet used — reserved for future voice selection.
+        /// Optional voice ID (e.g., "makima"). Used to load reference audio for voice cloning.
+        /// Defaults to "makima" if not specified.
         #[serde(default)]
-        #[allow(dead_code)]
         voice: Option<String>,
     },
     /// Cancel any in-progress synthesis.
@@ -76,6 +79,10 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) {
 
     let (mut sender, mut receiver) = socket.split();
 
+    // Cancellation flag handed to the TTS engine: each Speak resets it; Cancel/Stop/Close set it.
+    // NOTE(review): generate() is awaited inside this loop, so Cancel is only read after it returns.
+    let cancel_flag: Arc<AtomicBool> = Arc::new(AtomicBool::new(false));
+
     // Process incoming messages
     while let Some(msg) = receiver.next().await {
         let msg = match msg {
@@ -102,13 +109,41 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) {
                 };
 
                 match client_msg {
-                    ClientMessage::Speak { text, .. } => {
+                    ClientMessage::Speak { text, voice } => {
+                        let voice_id = voice
+                            .as_deref()
+                            .unwrap_or(super::voice::DEFAULT_VOICE_ID);
+
                         tracing::info!(
                             session_id = %session_id,
                             text_len = text.len(),
+                            voice_id = %voice_id,
                             "TTS speak request"
                         );
 
+                        // Load voice reference audio for cloning
+                        let voice_ref = match super::voice::load_reference_audio(voice_id) {
+                            Ok(v) => {
+                                tracing::debug!(
+                                    session_id = %session_id,
+                                    voice_id = %voice_id,
+                                    voice_name = %v.manifest.name,
+                                    samples = v.samples.len(),
+                                    "Voice reference loaded"
+                                );
+                                Some(v)
+                            }
+                            Err(e) => {
+                                tracing::warn!(
+                                    session_id = %session_id,
+                                    voice_id = %voice_id,
+                                    error = %e,
+                                    "Failed to load voice reference, proceeding without cloning"
+                                );
+                                None
+                            }
+                        };
+
                         // Get or lazily load the TTS engine
                         let engine = match state.get_tts_engine().await {
                             Ok(e) => e,
@@ -138,9 +173,21 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) {
                             continue;
                         }
 
-                        // Run TTS inference (no voice reference for now — uses default)
-                        match engine.generate(&text, None, None).await {
+                        // Reset the cancel flag for this new generation request
+                        cancel_flag.store(false, Ordering::Relaxed);
+
+                        // Run TTS inference with optional voice reference for cloning
+                        // and the cancel flag so it can be stopped early.
+                        let (ref_audio, ref_rate) = match &voice_ref {
+                            Some(v) => (Some(v.samples.as_slice()), Some(v.sample_rate)),
+                            None => (None, None),
+                        };
+                        let flag = cancel_flag.clone();
+                        match engine.generate(&text, ref_audio, ref_rate, Some(flag)).await {
                             Ok(chunks) => {
+                                // Check if generation was cancelled
+                                let was_cancelled = cancel_flag.load(Ordering::Relaxed);
+
                                 for chunk in &chunks {
                                     // Send binary PCM audio data
                                     let pcm_bytes = chunk.to_pcm16_bytes();
@@ -157,12 +204,13 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) {
                                     }
                                 }
 
-                                // Signal end of audio
+                                // Signal end of audio (include cancelled status)
                                 let end_msg = serde_json::json!({
                                     "type": "audio_end",
                                     "sample_rate": engine.sample_rate(),
                                     "format": "pcm_s16le",
                                     "channels": 1,
+                                    "cancelled": was_cancelled,
                                 });
                                 let _ = sender
                                     .send(Message::Text(end_msg.to_string().into()))
@@ -185,16 +233,18 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) {
                     }
                     ClientMessage::Cancel => {
                         tracing::info!(session_id = %session_id, "TTS cancel requested");
-                        // TODO: support cancellation of in-progress inference
+                        cancel_flag.store(true, Ordering::Relaxed);
                     }
                     ClientMessage::Stop => {
                         tracing::info!(session_id = %session_id, "TTS stop requested, closing");
+                        cancel_flag.store(true, Ordering::Relaxed);
                         break;
                     }
                 }
             }
             Message::Close(_) => {
                 tracing::info!(session_id = %session_id, "TTS WebSocket closed by client");
+                cancel_flag.store(true, Ordering::Relaxed);
                 break;
             }
             _ => {
@@ -271,4 +321,17 @@ mod tests {
         let msg: ClientMessage = serde_json::from_str(json).unwrap();
         assert!(matches!(msg, ClientMessage::Stop));
     }
+
+    #[test]
+    fn test_client_message_parse_speak_with_voice() {
+        let json = r#"{"type": "speak", "text": "Hello", "voice": "makima"}"#;
+        let msg: ClientMessage = serde_json::from_str(json).unwrap();
+        match msg {
+            ClientMessage::Speak { text, voice } => {
+                assert_eq!(text, "Hello");
+                assert_eq!(voice.as_deref(), Some("makima"));
+            }
+            _ => panic!("Expected Speak message"),
+        }
+    }
 }
diff --git a/makima/src/server/handlers/voice.rs b/makima/src/server/handlers/voice.rs
new file mode 100644
index 0000000..91b650d
--- /dev/null
+++ b/makima/src/server/handlers/voice.rs
@@ -0,0 +1,252 @@
+//! Voice loading utilities for TTS voice cloning.
+//!
+//! Loads voice manifests and reference audio from the `voices/` directory.
+//! Each voice is a directory containing:
+//! - `manifest.json` — voice metadata (name, sample rate, backend, etc.)
+//! - `reference.wav` — reference audio clip for voice cloning (5-15s, 24kHz mono)
+
+use serde::Deserialize;
+use std::path::{Path, PathBuf};
+
+use crate::tts::{resample_to_24k, SAMPLE_RATE};
+
+/// Default voice ID used when no voice is specified.
+pub const DEFAULT_VOICE_ID: &str = "makima";
+
+/// Voice manifest loaded from `voices/{voice_id}/manifest.json`.
+#[derive(Debug, Clone, Deserialize)]
+pub struct VoiceManifest {
+    pub name: String,
+    pub id: String,
+    #[serde(default)]
+    pub description: Option<String>,
+    #[serde(default = "default_language")]
+    pub language: String,
+    #[serde(default)]
+    pub accent: Option<String>,
+    #[serde(default = "default_sample_rate")]
+    pub sample_rate: u32,
+    #[serde(default)]
+    pub format: Option<String>,
+    #[serde(default)]
+    pub model_backend: Option<String>,
+    #[serde(default = "default_reference_audio")]
+    pub reference_audio: String,
+    #[serde(default)]
+    pub notes: Option<String>,
+}
+
+fn default_language() -> String {
+    "en".to_string()
+}
+
+fn default_sample_rate() -> u32 {
+    24_000
+}
+
+fn default_reference_audio() -> String {
+    "reference.wav".to_string()
+}
+
+/// Loaded voice reference: manifest + decoded PCM samples at 24kHz.
+#[derive(Debug, Clone)]
+pub struct VoiceReference {
+    pub manifest: VoiceManifest,
+    /// PCM f32 samples resampled to 24kHz mono.
+    pub samples: Vec<f32>,
+    /// Always 24000 after resampling.
+    pub sample_rate: u32,
+}
+
+/// Resolve the base directory for voice data.
+///
+/// Looks for a `voices/` directory under the current working directory, then next to
+/// the executable and up to two levels above it; defaults to `<cwd>/voices` if none exists.
+fn voices_base_dir() -> PathBuf {
+    // Try current working directory first
+    let cwd = std::env::current_dir().unwrap_or_default();
+    let cwd_voices = cwd.join("voices");
+    if cwd_voices.is_dir() {
+        return cwd_voices;
+    }
+
+    // Try relative to executable
+    if let Ok(exe) = std::env::current_exe() {
+        if let Some(exe_dir) = exe.parent() {
+            let exe_voices = exe_dir.join("voices");
+            if exe_voices.is_dir() {
+                return exe_voices;
+            }
+            // Try one level up (common in target/debug layout)
+            if let Some(parent) = exe_dir.parent() {
+                let parent_voices = parent.join("voices");
+                if parent_voices.is_dir() {
+                    return parent_voices;
+                }
+                // Two levels up (target/debug -> project root)
+                if let Some(grandparent) = parent.parent() {
+                    let gp_voices = grandparent.join("voices");
+                    if gp_voices.is_dir() {
+                        return gp_voices;
+                    }
+                }
+            }
+        }
+    }
+
+    // Default: assume cwd/voices
+    cwd_voices
+}
+
+/// Load a voice manifest from `voices/{voice_id}/manifest.json`.
+pub fn load_manifest(voice_id: &str) -> Result<VoiceManifest, VoiceLoadError> {
+    let base = voices_base_dir();
+    let manifest_path = base.join(voice_id).join("manifest.json");
+
+    if !manifest_path.exists() {
+        return Err(VoiceLoadError::NotFound(voice_id.to_string()));
+    }
+
+    let data = std::fs::read_to_string(&manifest_path).map_err(|e| {
+        VoiceLoadError::Io(format!(
+            "failed to read manifest at {}: {e}",
+            manifest_path.display()
+        ))
+    })?;
+
+    let manifest: VoiceManifest = serde_json::from_str(&data).map_err(|e| {
+        VoiceLoadError::InvalidManifest(format!("failed to parse manifest: {e}"))
+    })?;
+
+    Ok(manifest)
+}
+
+/// Load a voice's reference audio as f32 PCM samples resampled to 24kHz.
+///
+/// Uses symphonia (via `crate::audio`) to decode the WAV file, then
+/// resamples to 24kHz using `tts::resample_to_24k`.
+pub fn load_reference_audio(voice_id: &str) -> Result<VoiceReference, VoiceLoadError> {
+    let manifest = load_manifest(voice_id)?;
+
+    let base = voices_base_dir();
+    let audio_path = base.join(voice_id).join(&manifest.reference_audio);
+
+    if !audio_path.exists() {
+        return Err(VoiceLoadError::MissingAudio(format!(
+            "reference audio not found at {}. See voices/{}/README.md for instructions.",
+            audio_path.display(),
+            voice_id,
+        )));
+    }
+
+    load_reference_audio_from_path(&audio_path, manifest)
+}
+
+/// Load reference audio from a specific file path with a pre-loaded manifest.
+fn load_reference_audio_from_path(
+    audio_path: &Path,
+    manifest: VoiceManifest,
+) -> Result<VoiceReference, VoiceLoadError> {
+    // Use symphonia-based decoder from crate::audio to decode the WAV
+    let pcm = crate::audio::to_16k_mono_from_path(audio_path).map_err(|e| {
+        VoiceLoadError::AudioDecode(format!("failed to decode {}: {e}", audio_path.display()))
+    })?;
+
+    // The audio module decodes to 16kHz mono; we need 24kHz for TTS.
+    // Resample from 16kHz to 24kHz.
+    let samples = if pcm.sample_rate == SAMPLE_RATE {
+        pcm.samples
+    } else {
+        resample_to_24k(&pcm.samples, pcm.sample_rate)
+    };
+
+    tracing::info!(
+        voice_id = %manifest.id,
+        voice_name = %manifest.name,
+        samples_len = samples.len(),
+        duration_secs = samples.len() as f32 / SAMPLE_RATE as f32,
+        "Loaded voice reference audio"
+    );
+
+    Ok(VoiceReference {
+        manifest,
+        samples,
+        sample_rate: SAMPLE_RATE,
+    })
+}
+
+/// Errors that can occur when loading a voice.
+#[derive(Debug)]
+pub enum VoiceLoadError {
+    /// Voice manifest not found (no `voices/{id}/manifest.json`).
+    NotFound(String),
+    /// IO error reading files.
+    Io(String),
+    /// Manifest JSON is invalid.
+    InvalidManifest(String),
+    /// Reference audio file is missing.
+    MissingAudio(String),
+    /// Failed to decode audio.
+    AudioDecode(String),
+}
+
+impl std::fmt::Display for VoiceLoadError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            VoiceLoadError::NotFound(id) => {
+                write!(f, "voice '{id}' not found (no voices/{id}/manifest.json)")
+            }
+            VoiceLoadError::Io(msg) => write!(f, "voice IO error: {msg}"),
+            VoiceLoadError::InvalidManifest(msg) => write!(f, "invalid voice manifest: {msg}"),
+            VoiceLoadError::MissingAudio(msg) => write!(f, "missing reference audio: {msg}"),
+            VoiceLoadError::AudioDecode(msg) => write!(f, "audio decode error: {msg}"),
+        }
+    }
+}
+
+impl std::error::Error for VoiceLoadError {}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_default_voice_id() {
+        assert_eq!(DEFAULT_VOICE_ID, "makima");
+    }
+
+    #[test]
+    fn test_manifest_deserialize() {
+        let json = r#"{
+            "name": "Test Voice",
+            "id": "test",
+            "sample_rate": 24000,
+            "reference_audio": "reference.wav"
+        }"#;
+        let manifest: VoiceManifest = serde_json::from_str(json).unwrap();
+        assert_eq!(manifest.name, "Test Voice");
+        assert_eq!(manifest.id, "test");
+        assert_eq!(manifest.sample_rate, 24000);
+        assert_eq!(manifest.reference_audio, "reference.wav");
+        assert_eq!(manifest.language, "en");
+    }
+
+    #[test]
+    fn test_manifest_deserialize_defaults() {
+        let json = r#"{"name": "Minimal", "id": "min"}"#;
+        let manifest: VoiceManifest = serde_json::from_str(json).unwrap();
+        assert_eq!(manifest.language, "en");
+        assert_eq!(manifest.sample_rate, 24000);
+        assert_eq!(manifest.reference_audio, "reference.wav");
+    }
+
+    #[test]
+    fn test_load_nonexistent_voice() {
+        let result = load_manifest("nonexistent_voice_xyz");
+        assert!(result.is_err());
+        match result.unwrap_err() {
+            VoiceLoadError::NotFound(id) => assert_eq!(id, "nonexistent_voice_xyz"),
+            other => panic!("Expected NotFound, got: {other}"),
+        }
+    }
+}
diff --git a/makima/src/tts/chatterbox.rs b/makima/src/tts/chatterbox.rs
index e26bc06..712910f 100644
--- a/makima/src/tts/chatterbox.rs
+++ b/makima/src/tts/chatterbox.rs
@@ -6,7 +6,8 @@
 use std::borrow::Cow;
 use std::fs;
 use std::path::{Path, PathBuf};
-use std::sync::Mutex;
+use std::sync::atomic::AtomicBool;
+use std::sync::{Arc, Mutex};
 
 use hf_hub::api::sync::Api;
 use ndarray::{Array2, Array3, Array4, ArrayD, IxDyn};
@@ -427,6 +428,7 @@ impl TtsEngine for ChatterboxTTS {
         text: &str,
         reference_audio: Option<&[f32]>,
         reference_sample_rate: Option<u32>,
+        _cancel_flag: Option<Arc<AtomicBool>>,
     ) -> Result<Vec<AudioChunk>, TtsError> {
         let samples = match reference_audio {
             Some(audio) => {
diff --git a/makima/src/tts/mod.rs b/makima/src/tts/mod.rs
index 2cd0412..b66f4a5 100644
--- a/makima/src/tts/mod.rs
+++ b/makima/src/tts/mod.rs
@@ -5,6 +5,8 @@
 //! - **Qwen3**: Pure Rust candle-based Qwen3-TTS-12Hz-0.6B
 
 use std::path::Path;
+use std::sync::atomic::AtomicBool;
+use std::sync::Arc;
 
 pub mod chatterbox;
 pub mod qwen3;
@@ -109,11 +111,17 @@ pub enum TtsBackend {
 #[async_trait::async_trait]
 pub trait TtsEngine: Send + Sync {
     /// Generate complete audio from text with a voice reference.
+    ///
+    /// The optional `cancel_flag` can be set to `true` by another thread/task
+    /// to request early termination of the generation loop. Engines that
+    /// support cancellation will check this flag periodically and return
+    /// whatever audio has been produced so far.
     async fn generate(
         &self,
         text: &str,
         reference_audio: Option<&[f32]>,
         reference_sample_rate: Option<u32>,
+        cancel_flag: Option<Arc<AtomicBool>>,
     ) -> Result<Vec<AudioChunk>, TtsError>;
 
     /// Check if the engine is loaded and ready.
diff --git a/makima/src/tts/qwen3/generate.rs b/makima/src/tts/qwen3/generate.rs
index 02161e6..30d165b 100644
--- a/makima/src/tts/qwen3/generate.rs
+++ b/makima/src/tts/qwen3/generate.rs
@@ -7,6 +7,9 @@
 //! 4. Code predictor → remaining 15 codebook tokens per frame
 //! 5. Speech tokenizer decoder → waveform audio
 
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+
 use candle_core::{DType, Device, IndexOp, Result, Tensor};
 use tokenizers::Tokenizer;
 
@@ -60,6 +63,9 @@ pub struct GenerationContext<'a> {
     tokenizer: &'a Tokenizer,
     device: &'a Device,
     config: GenerationConfig,
+    /// Optional cancellation flag. When set to `true`, the generation loop
+    /// will break early and return whatever audio has been produced so far.
+    cancel_flag: Option<Arc<AtomicBool>>,
 }
 
 impl<'a> GenerationContext<'a> {
@@ -70,6 +76,7 @@ impl<'a> GenerationContext<'a> {
         tokenizer: &'a Tokenizer,
         device: &'a Device,
         config: GenerationConfig,
+        cancel_flag: Option<Arc<AtomicBool>>,
     ) -> Self {
         Self {
             model,
@@ -78,9 +85,17 @@ impl<'a> GenerationContext<'a> {
             tokenizer,
             device,
             config,
+            cancel_flag,
         }
     }
 
+    /// Check whether cancellation has been requested.
+    fn is_cancelled(&self) -> bool {
+        self.cancel_flag
+            .as_ref()
+            .map_or(false, |f| f.load(Ordering::Relaxed))
+    }
+
     /// Generate audio from text, optionally with a voice reference.
     ///
     /// Returns a list of audio chunks. If `streaming` is false, returns
@@ -194,6 +209,12 @@ impl<'a> GenerationContext<'a> {
 
         // === Subsequent iterations: one token at a time ===
         for _step in 1..self.config.max_new_tokens {
+            // Check for cancellation each iteration
+            if self.is_cancelled() {
+                tracing::info!("TTS generation cancelled after {} frames", generated_frames.len());
+                break;
+            }
+
             let past_len = kv_caches[0].seq_len();
 
             // Input: just the last generated zeroth codebook token
@@ -340,13 +361,22 @@ impl<'a> GenerationContext<'a> {
         &self,
         frames: &[Vec<u32>],
     ) -> std::result::Result<Vec<AudioChunk>, TtsError> {
-        let mut chunks = Vec::new();
+        let mut chunks: Vec<AudioChunk> = Vec::new();
 
         // Decode in groups of frames for efficiency
         let chunk_size = 10; // ~800ms per chunk at 12.5Hz
         let num_codebooks = self.speech_tokenizer.num_codebooks();
 
         for (chunk_idx, frame_chunk) in frames.chunks(chunk_size).enumerate() {
+            // Check for cancellation between streaming chunks
+            if self.is_cancelled() {
+                tracing::info!("TTS streaming decode cancelled after {} chunks", chunks.len());
+                if let Some(last) = chunks.last_mut() {
+                    last.is_final = true;
+                }
+                return Ok(chunks);
+            }
+
             let is_last = (chunk_idx + 1) * chunk_size >= frames.len();
 
             // Transpose chunk frames
diff --git a/makima/src/tts/qwen3/mod.rs b/makima/src/tts/qwen3/mod.rs
index c55c118..9bac794 100644
--- a/makima/src/tts/qwen3/mod.rs
+++ b/makima/src/tts/qwen3/mod.rs
@@ -30,6 +30,7 @@ pub mod speech_tokenizer;
 
 use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
 
 use candle_core::{DType, Device};
 use candle_nn::VarBuilder;
@@ -168,6 +169,7 @@ impl Qwen3Tts {
         text: &str,
         reference_audio: Option<&[f32]>,
         gen_config: Option<GenerationConfig>,
+        cancel_flag: Option<Arc<AtomicBool>>,
     ) -> Result<Vec<AudioChunk>, TtsError> {
         let config = gen_config.unwrap_or_default();
 
@@ -178,6 +180,7 @@ impl Qwen3Tts {
             &self.tokenizer,
             &self.device,
             config,
+            cancel_flag,
         );
 
         ctx.generate(text, reference_audio)
@@ -250,11 +253,12 @@ impl TtsEngine for Qwen3Tts {
         text: &str,
         reference_audio: Option<&[f32]>,
         _reference_sample_rate: Option<u32>,
+        cancel_flag: Option<Arc<AtomicBool>>,
     ) -> Result<Vec<AudioChunk>, TtsError> {
         // Note: reference audio should already be resampled to 24kHz
         // by the caller. If a different sample rate is provided,
         // the caller should resample using `resample_to_24k()`.
-        self.generate_speech(text, reference_audio, None)
+        self.generate_speech(text, reference_audio, None, cancel_flag)
     }
 
     fn is_ready(&self) -> bool {
diff --git a/makima/src/tts/qwen3/model.rs b/makima/src/tts/qwen3/model.rs
index 551893b..8a1e986 100644
--- a/makima/src/tts/qwen3/model.rs
+++ b/makima/src/tts/qwen3/model.rs
@@ -10,7 +10,7 @@
 //! Based on the candle-transformers Qwen2 model architecture,
 //! extended for Qwen3-TTS.
 
-use candle_core::{DType, Device, IndexOp, Module, Result, Tensor, D};
+use candle_core::{DType, Device, Module, Result, Tensor, D};
 use candle_nn::{embedding, linear_no_bias, rms_norm, Embedding, Linear, RmsNorm, VarBuilder};
 
 use super::config::Qwen3LmConfig;
diff --git a/makima/src/tts/qwen3/speech_tokenizer.rs b/makima/src/tts/qwen3/speech_tokenizer.rs
index 752050a..86e00f2 100644
--- a/makima/src/tts/qwen3/speech_tokenizer.rs
+++ b/makima/src/tts/qwen3/speech_tokenizer.rs
@@ -11,7 +11,7 @@
 //! The speech tokenizer is a separate model (~682MB) loaded from
 //! `Qwen/Qwen3-TTS-Tokenizer-12Hz`.
 
-use candle_core::{DType, Device, Module, Result, Tensor, D};
+use candle_core::{Device, Module, Result, Tensor, D};
 use candle_nn::{
     conv1d, embedding, linear_no_bias, Conv1d, Conv1dConfig, Embedding, Linear, VarBuilder,
 };
@@ -259,6 +259,7 @@ impl DecoderBlock {
 pub struct RvqCodebook {
     codebooks: Vec<Embedding>,
     num_codebooks: usize,
+    #[allow(dead_code)]
     codebook_dim: usize,
 }
 
diff --git a/voices/makima/README.md b/voices/makima/README.md
new file mode 100644
index 0000000..8553daf
--- /dev/null
+++ b/voices/makima/README.md
@@ -0,0 +1,105 @@
+# Makima Voice Reference Audio
+
+This directory contains the voice profile for **Makima** — the default TTS voice used by the makima system for voice cloning.
+
+## What You Need
+
+A **reference audio clip** (`reference.wav`) of Makima's Japanese voice actress (Tomori Kusunoki) speaking English.
+
+### Requirements
+
+| Property       | Value                                    |
+|----------------|------------------------------------------|
+| **Filename**   | `reference.wav`                          |
+| **Duration**   | 5–15 seconds (10s ideal)                 |
+| **Format**     | WAV (PCM)                                |
+| **Sample Rate**| 24 kHz (will be resampled if different)  |
+| **Channels**   | Mono (1 channel)                         |
+| **Bit Depth**  | 16-bit or 32-bit float                   |
+
+### Why These Parameters?
+
+- **5–15 seconds**: Enough for the TTS model to capture voice characteristics without being too long for memory.
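+- **24 kHz mono**: Native sample rate of the Qwen3-TTS model. Audio at other rates will be automatically resampled. Note that the current loader decodes reference audio through a 16 kHz mono path before resampling to 24 kHz, so a 24 kHz source does not by itself avoid resampling loss.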
+- **Clear speech**: Minimal background noise, no music overlay. A single speaker only.
+
+## How to Obtain Reference Audio
+
+### Option 1: Record or Find English Speech
+
+The best reference audio is a clean clip of the target voice speaking English. Sources:
+
+- **Anime convention panels or interviews** where the VA speaks English
+- **Behind-the-scenes clips** from Chainsaw Man production
+- **Fan events or promotional videos** with English speech segments
+
+### Option 2: Extract from YouTube
+
+You can extract audio from YouTube clips. To find candidate clips:
+
+1. Search YouTube for: `"Tomori Kusunoki" english` or `"楠木ともり" english`
+2. Look for interview clips, event recordings, or promotional content
+
+**Extraction steps using `yt-dlp` and `ffmpeg`:**
+
+```bash
+# 1. Download audio from a YouTube clip
+yt-dlp -x --audio-format wav -o "raw_audio.%(ext)s" "YOUTUBE_URL_HERE"
+
+# 2. Convert to 24kHz mono WAV, trimming to a 10-second segment
+#    Adjust -ss (start time) and -t (duration) as needed
+ffmpeg -i raw_audio.wav \
+  -ss 00:00:05 -t 00:00:10 \
+  -ar 24000 -ac 1 \
+  -acodec pcm_s16le \
+  voices/makima/reference.wav
+
+# 3. Verify the output
+ffprobe -v error -show_entries stream=sample_rate,channels,duration \
+  -of default=noprint_wrappers=1 voices/makima/reference.wav
+```
+
+### Option 3: Use Any Japanese-Accented English Voice
+
+If you cannot find clips of the specific VA, any clear recording of a female Japanese speaker speaking English will work as a starting point. The voice cloning will adapt to the reference audio's characteristics.
+
+```bash
+# Example: record your own reference using sox (if available)
+sox -d -r 24000 -c 1 -b 16 voices/makima/reference.wav trim 0 10
+```
+
+## Tips for Best Quality
+
+1. **Clean audio**: Remove any background music or noise. Use a noise gate or audio editor if needed.
+2. **Natural speech**: Conversational tone works better than reading. The model captures prosody and rhythm.
+3. **Consistent volume**: Normalize the audio to avoid clipping or very quiet segments.
+4. **Single speaker**: Only the target voice should be present in the clip.
+
+```bash
+# Normalize audio volume with ffmpeg
+ffmpeg -i reference_raw.wav \
+  -af "loudnorm=I=-16:TP=-1.5:LRA=11" \
+  -ar 24000 -ac 1 -acodec pcm_s16le \
+  voices/makima/reference.wav
+```
+
+## File Structure
+
+```
+voices/makima/
+├── manifest.json     # Voice metadata (name, sample rate, backend)
+├── reference.wav     # Reference audio clip (YOU PROVIDE THIS)
+└── README.md         # This file
+```
+
+## Verification
+
+After placing `reference.wav`, you can verify the system loads it correctly:
+
+```bash
+# The TTS handler logs voice loading on each speak request:
+# INFO makima::server::handlers::voice: Loaded voice reference audio
+#   voice_id="makima" voice_name="Makima" samples_len=240000 duration_secs=10.0
+```
+
+If the reference audio is missing, the TTS system will still work but without voice cloning — it will use the model's default voice instead.
author	soryu <soryu@soryu.co>	2026-01-28 03:50:45 +0000
committer	GitHub <noreply@github.com>	2026-01-28 03:50:45 +0000
commit	9b53f6c6b01da85ef73bd5960b32ec319df0b947 (patch)
tree	8c5e9983e1a5e75afab4a7d7a18ba22b75211628
parent	c14192cc8b0e82369c93c1aee615fcc9cfad5911 (diff)
download	soryu-9b53f6c6b01da85ef73bd5960b32ec319df0b947.tar.gz soryu-9b53f6c6b01da85ef73bd5960b32ec319df0b947.zip