diff options
| author | soryu <soryu@soryu.co> | 2026-01-28 03:50:45 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2026-01-28 03:50:45 +0000 |
| commit | 9b53f6c6b01da85ef73bd5960b32ec319df0b947 (patch) | |
| tree | 8c5e9983e1a5e75afab4a7d7a18ba22b75211628 | |
| parent | c14192cc8b0e82369c93c1aee615fcc9cfad5911 (diff) | |
| download | soryu-9b53f6c6b01da85ef73bd5960b32ec319df0b947.tar.gz soryu-9b53f6c6b01da85ef73bd5960b32ec319df0b947.zip | |
Replace TTS endpoint with Rust-native Qwen3-TTS (#41)
* chore: fix unused import warnings in qwen3-tts module
- Remove unused import 'IndexOp' in model.rs
- Remove unused import 'DType' in speech_tokenizer.rs
- Add #[allow(dead_code)] to codebook_dim field in RvqCodebook
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
* feat: add voice loading and selection for TTS cloning
Add voice reference audio loading so the TTS speak handler can perform
voice cloning using reference WAV files from the voices/ directory.
- Add voice.rs module: loads manifest.json and reference.wav for a given
voice_id, decodes via symphonia, resamples to 24kHz for the TTS engine
- Update speak.rs: resolve voice_id from the speak request (default
"makima"), load reference audio, pass it to engine.generate()
- Add voices/makima/README.md with instructions for obtaining reference
audio (extraction from YouTube, recording, ffmpeg conversion)
- Graceful fallback: if reference audio is missing, TTS proceeds without
voice cloning, using the model's default voice
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
* [WIP] Heartbeat checkpoint - 2026-01-28 03:49:13 UTC
---------
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
| -rw-r--r-- | makima/src/server/handlers/mod.rs | 1 | ||||
| -rw-r--r-- | makima/src/server/handlers/speak.rs | 77 | ||||
| -rw-r--r-- | makima/src/server/handlers/voice.rs | 252 | ||||
| -rw-r--r-- | makima/src/tts/chatterbox.rs | 4 | ||||
| -rw-r--r-- | makima/src/tts/mod.rs | 8 | ||||
| -rw-r--r-- | makima/src/tts/qwen3/generate.rs | 32 | ||||
| -rw-r--r-- | makima/src/tts/qwen3/mod.rs | 6 | ||||
| -rw-r--r-- | makima/src/tts/qwen3/model.rs | 2 | ||||
| -rw-r--r-- | makima/src/tts/qwen3/speech_tokenizer.rs | 3 | ||||
| -rw-r--r-- | voices/makima/README.md | 105 |
10 files changed, 478 insertions, 12 deletions
diff --git a/makima/src/server/handlers/mod.rs b/makima/src/server/handlers/mod.rs index 8207399..8af2a37 100644 --- a/makima/src/server/handlers/mod.rs +++ b/makima/src/server/handlers/mod.rs @@ -19,6 +19,7 @@ pub mod mesh_ws; pub mod repository_history; pub mod speak; pub mod templates; +pub mod voice; pub mod transcript_analysis; pub mod users; pub mod versions; diff --git a/makima/src/server/handlers/speak.rs b/makima/src/server/handlers/speak.rs index 75e7780..3ed2620 100644 --- a/makima/src/server/handlers/speak.rs +++ b/makima/src/server/handlers/speak.rs @@ -15,6 +15,9 @@ //! See `makima/src/tts/` for the TTS engine implementation. //! See `docs/specs/qwen3-tts-spec.md` for the full protocol specification. +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + use axum::{ extract::{ws::Message, ws::WebSocket, State, WebSocketUpgrade}, response::Response, @@ -32,9 +35,9 @@ enum ClientMessage { /// Request speech synthesis for the given text. Speak { text: String, - /// Optional voice ID (e.g., "makima"). Not yet used — reserved for future voice selection. + /// Optional voice ID (e.g., "makima"). Used to load reference audio for voice cloning. + /// Defaults to "makima" if not specified. #[serde(default)] - #[allow(dead_code)] voice: Option<String>, }, /// Cancel any in-progress synthesis. @@ -76,6 +79,10 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) { let (mut sender, mut receiver) = socket.split(); + // Cancellation flag shared between the message loop and inference. + // Each new Speak request resets it to false; Cancel sets it to true. + let cancel_flag: Arc<AtomicBool> = Arc::new(AtomicBool::new(false)); + // Process incoming messages while let Some(msg) = receiver.next().await { let msg = match msg { @@ -102,13 +109,41 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) { }; match client_msg { - ClientMessage::Speak { text, .. 
} => { + ClientMessage::Speak { text, voice } => { + let voice_id = voice + .as_deref() + .unwrap_or(super::voice::DEFAULT_VOICE_ID); + tracing::info!( session_id = %session_id, text_len = text.len(), + voice_id = %voice_id, "TTS speak request" ); + // Load voice reference audio for cloning + let voice_ref = match super::voice::load_reference_audio(voice_id) { + Ok(v) => { + tracing::debug!( + session_id = %session_id, + voice_id = %voice_id, + voice_name = %v.manifest.name, + samples = v.samples.len(), + "Voice reference loaded" + ); + Some(v) + } + Err(e) => { + tracing::warn!( + session_id = %session_id, + voice_id = %voice_id, + error = %e, + "Failed to load voice reference, proceeding without cloning" + ); + None + } + }; + // Get or lazily load the TTS engine let engine = match state.get_tts_engine().await { Ok(e) => e, @@ -138,9 +173,21 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) { continue; } - // Run TTS inference (no voice reference for now — uses default) - match engine.generate(&text, None, None).await { + // Reset the cancel flag for this new generation request + cancel_flag.store(false, Ordering::Relaxed); + + // Run TTS inference with optional voice reference for cloning + // and the cancel flag so it can be stopped early. 
+ let (ref_audio, ref_rate) = match &voice_ref { + Some(v) => (Some(v.samples.as_slice()), Some(v.sample_rate)), + None => (None, None), + }; + let flag = cancel_flag.clone(); + match engine.generate(&text, ref_audio, ref_rate, Some(flag)).await { Ok(chunks) => { + // Check if generation was cancelled + let was_cancelled = cancel_flag.load(Ordering::Relaxed); + for chunk in &chunks { // Send binary PCM audio data let pcm_bytes = chunk.to_pcm16_bytes(); @@ -157,12 +204,13 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) { } } - // Signal end of audio + // Signal end of audio (include cancelled status) let end_msg = serde_json::json!({ "type": "audio_end", "sample_rate": engine.sample_rate(), "format": "pcm_s16le", "channels": 1, + "cancelled": was_cancelled, }); let _ = sender .send(Message::Text(end_msg.to_string().into())) @@ -185,16 +233,18 @@ async fn handle_speak_socket(socket: WebSocket, state: SharedState) { } ClientMessage::Cancel => { tracing::info!(session_id = %session_id, "TTS cancel requested"); - // TODO: support cancellation of in-progress inference + cancel_flag.store(true, Ordering::Relaxed); } ClientMessage::Stop => { tracing::info!(session_id = %session_id, "TTS stop requested, closing"); + cancel_flag.store(true, Ordering::Relaxed); break; } } } Message::Close(_) => { tracing::info!(session_id = %session_id, "TTS WebSocket closed by client"); + cancel_flag.store(true, Ordering::Relaxed); break; } _ => { @@ -271,4 +321,17 @@ mod tests { let msg: ClientMessage = serde_json::from_str(json).unwrap(); assert!(matches!(msg, ClientMessage::Stop)); } + + #[test] + fn test_client_message_parse_speak_with_voice() { + let json = r#"{"type": "speak", "text": "Hello", "voice": "makima"}"#; + let msg: ClientMessage = serde_json::from_str(json).unwrap(); + match msg { + ClientMessage::Speak { text, voice } => { + assert_eq!(text, "Hello"); + assert_eq!(voice.as_deref(), Some("makima")); + } + _ => panic!("Expected Speak message"), + } + } 
} diff --git a/makima/src/server/handlers/voice.rs b/makima/src/server/handlers/voice.rs new file mode 100644 index 0000000..91b650d --- /dev/null +++ b/makima/src/server/handlers/voice.rs @@ -0,0 +1,252 @@ +//! Voice loading utilities for TTS voice cloning. +//! +//! Loads voice manifests and reference audio from the `voices/` directory. +//! Each voice is a directory containing: +//! - `manifest.json` — voice metadata (name, sample rate, backend, etc.) +//! - `reference.wav` — reference audio clip for voice cloning (5-15s, 24kHz mono) + +use serde::Deserialize; +use std::path::{Path, PathBuf}; + +use crate::tts::{resample_to_24k, SAMPLE_RATE}; + +/// Default voice ID used when no voice is specified. +pub const DEFAULT_VOICE_ID: &str = "makima"; + +/// Voice manifest loaded from `voices/{voice_id}/manifest.json`. +#[derive(Debug, Clone, Deserialize)] +pub struct VoiceManifest { + pub name: String, + pub id: String, + #[serde(default)] + pub description: Option<String>, + #[serde(default = "default_language")] + pub language: String, + #[serde(default)] + pub accent: Option<String>, + #[serde(default = "default_sample_rate")] + pub sample_rate: u32, + #[serde(default)] + pub format: Option<String>, + #[serde(default)] + pub model_backend: Option<String>, + #[serde(default = "default_reference_audio")] + pub reference_audio: String, + #[serde(default)] + pub notes: Option<String>, +} + +fn default_language() -> String { + "en".to_string() +} + +fn default_sample_rate() -> u32 { + 24_000 +} + +fn default_reference_audio() -> String { + "reference.wav".to_string() +} + +/// Loaded voice reference: manifest + decoded PCM samples at 24kHz. +#[derive(Debug, Clone)] +pub struct VoiceReference { + pub manifest: VoiceManifest, + /// PCM f32 samples resampled to 24kHz mono. + pub samples: Vec<f32>, + /// Always 24000 after resampling. + pub sample_rate: u32, +} + +/// Resolve the base directory for voice data. 
+/// +/// Looks for the `voices/` directory relative to the current working directory, +/// or falls back to the executable's directory. +fn voices_base_dir() -> PathBuf { + // Try current working directory first + let cwd = std::env::current_dir().unwrap_or_default(); + let cwd_voices = cwd.join("voices"); + if cwd_voices.is_dir() { + return cwd_voices; + } + + // Try relative to executable + if let Ok(exe) = std::env::current_exe() { + if let Some(exe_dir) = exe.parent() { + let exe_voices = exe_dir.join("voices"); + if exe_voices.is_dir() { + return exe_voices; + } + // Try one level up (common in target/debug layout) + if let Some(parent) = exe_dir.parent() { + let parent_voices = parent.join("voices"); + if parent_voices.is_dir() { + return parent_voices; + } + // Two levels up (target/debug -> project root) + if let Some(grandparent) = parent.parent() { + let gp_voices = grandparent.join("voices"); + if gp_voices.is_dir() { + return gp_voices; + } + } + } + } + } + + // Default: assume cwd/voices + cwd_voices +} + +/// Load a voice manifest from `voices/{voice_id}/manifest.json`. +pub fn load_manifest(voice_id: &str) -> Result<VoiceManifest, VoiceLoadError> { + let base = voices_base_dir(); + let manifest_path = base.join(voice_id).join("manifest.json"); + + if !manifest_path.exists() { + return Err(VoiceLoadError::NotFound(voice_id.to_string())); + } + + let data = std::fs::read_to_string(&manifest_path).map_err(|e| { + VoiceLoadError::Io(format!( + "failed to read manifest at {}: {e}", + manifest_path.display() + )) + })?; + + let manifest: VoiceManifest = serde_json::from_str(&data).map_err(|e| { + VoiceLoadError::InvalidManifest(format!("failed to parse manifest: {e}")) + })?; + + Ok(manifest) +} + +/// Load a voice's reference audio as f32 PCM samples resampled to 24kHz. +/// +/// Uses symphonia (via `crate::audio`) to decode the WAV file, then +/// resamples to 24kHz using `tts::resample_to_24k`. 
+pub fn load_reference_audio(voice_id: &str) -> Result<VoiceReference, VoiceLoadError> { + let manifest = load_manifest(voice_id)?; + + let base = voices_base_dir(); + let audio_path = base.join(voice_id).join(&manifest.reference_audio); + + if !audio_path.exists() { + return Err(VoiceLoadError::MissingAudio(format!( + "reference audio not found at {}. See voices/{}/README.md for instructions.", + audio_path.display(), + voice_id, + ))); + } + + load_reference_audio_from_path(&audio_path, manifest) +} + +/// Load reference audio from a specific file path with a pre-loaded manifest. +fn load_reference_audio_from_path( + audio_path: &Path, + manifest: VoiceManifest, +) -> Result<VoiceReference, VoiceLoadError> { + // Use symphonia-based decoder from crate::audio to decode the WAV + let pcm = crate::audio::to_16k_mono_from_path(audio_path).map_err(|e| { + VoiceLoadError::AudioDecode(format!("failed to decode {}: {e}", audio_path.display())) + })?; + + // The audio module decodes to 16kHz mono; we need 24kHz for TTS. + // Resample from 16kHz to 24kHz. + let samples = if pcm.sample_rate == SAMPLE_RATE { + pcm.samples + } else { + resample_to_24k(&pcm.samples, pcm.sample_rate) + }; + + tracing::info!( + voice_id = %manifest.id, + voice_name = %manifest.name, + samples_len = samples.len(), + duration_secs = samples.len() as f32 / SAMPLE_RATE as f32, + "Loaded voice reference audio" + ); + + Ok(VoiceReference { + manifest, + samples, + sample_rate: SAMPLE_RATE, + }) +} + +/// Errors that can occur when loading a voice. +#[derive(Debug)] +pub enum VoiceLoadError { + /// Voice directory not found. + NotFound(String), + /// IO error reading files. + Io(String), + /// Manifest JSON is invalid. + InvalidManifest(String), + /// Reference audio file is missing. + MissingAudio(String), + /// Failed to decode audio. 
+ AudioDecode(String), +} + +impl std::fmt::Display for VoiceLoadError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + VoiceLoadError::NotFound(id) => { + write!(f, "voice '{id}' not found (no voices/{id}/manifest.json)") + } + VoiceLoadError::Io(msg) => write!(f, "voice IO error: {msg}"), + VoiceLoadError::InvalidManifest(msg) => write!(f, "invalid voice manifest: {msg}"), + VoiceLoadError::MissingAudio(msg) => write!(f, "missing reference audio: {msg}"), + VoiceLoadError::AudioDecode(msg) => write!(f, "audio decode error: {msg}"), + } + } +} + +impl std::error::Error for VoiceLoadError {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_voice_id() { + assert_eq!(DEFAULT_VOICE_ID, "makima"); + } + + #[test] + fn test_manifest_deserialize() { + let json = r#"{ + "name": "Test Voice", + "id": "test", + "sample_rate": 24000, + "reference_audio": "reference.wav" + }"#; + let manifest: VoiceManifest = serde_json::from_str(json).unwrap(); + assert_eq!(manifest.name, "Test Voice"); + assert_eq!(manifest.id, "test"); + assert_eq!(manifest.sample_rate, 24000); + assert_eq!(manifest.reference_audio, "reference.wav"); + assert_eq!(manifest.language, "en"); + } + + #[test] + fn test_manifest_deserialize_defaults() { + let json = r#"{"name": "Minimal", "id": "min"}"#; + let manifest: VoiceManifest = serde_json::from_str(json).unwrap(); + assert_eq!(manifest.language, "en"); + assert_eq!(manifest.sample_rate, 24000); + assert_eq!(manifest.reference_audio, "reference.wav"); + } + + #[test] + fn test_load_nonexistent_voice() { + let result = load_manifest("nonexistent_voice_xyz"); + assert!(result.is_err()); + match result.unwrap_err() { + VoiceLoadError::NotFound(id) => assert_eq!(id, "nonexistent_voice_xyz"), + other => panic!("Expected NotFound, got: {other}"), + } + } +} diff --git a/makima/src/tts/chatterbox.rs b/makima/src/tts/chatterbox.rs index e26bc06..712910f 100644 --- a/makima/src/tts/chatterbox.rs 
+++ b/makima/src/tts/chatterbox.rs @@ -6,7 +6,8 @@ use std::borrow::Cow; use std::fs; use std::path::{Path, PathBuf}; -use std::sync::Mutex; +use std::sync::atomic::AtomicBool; +use std::sync::{Arc, Mutex}; use hf_hub::api::sync::Api; use ndarray::{Array2, Array3, Array4, ArrayD, IxDyn}; @@ -427,6 +428,7 @@ impl TtsEngine for ChatterboxTTS { text: &str, reference_audio: Option<&[f32]>, reference_sample_rate: Option<u32>, + _cancel_flag: Option<Arc<AtomicBool>>, ) -> Result<Vec<AudioChunk>, TtsError> { let samples = match reference_audio { Some(audio) => { diff --git a/makima/src/tts/mod.rs b/makima/src/tts/mod.rs index 2cd0412..b66f4a5 100644 --- a/makima/src/tts/mod.rs +++ b/makima/src/tts/mod.rs @@ -5,6 +5,8 @@ //! - **Qwen3**: Pure Rust candle-based Qwen3-TTS-12Hz-0.6B use std::path::Path; +use std::sync::atomic::AtomicBool; +use std::sync::Arc; pub mod chatterbox; pub mod qwen3; @@ -109,11 +111,17 @@ pub enum TtsBackend { #[async_trait::async_trait] pub trait TtsEngine: Send + Sync { /// Generate complete audio from text with a voice reference. + /// + /// The optional `cancel_flag` can be set to `true` by another thread/task + /// to request early termination of the generation loop. Engines that + /// support cancellation will check this flag periodically and return + /// whatever audio has been produced so far. async fn generate( &self, text: &str, reference_audio: Option<&[f32]>, reference_sample_rate: Option<u32>, + cancel_flag: Option<Arc<AtomicBool>>, ) -> Result<Vec<AudioChunk>, TtsError>; /// Check if the engine is loaded and ready. diff --git a/makima/src/tts/qwen3/generate.rs b/makima/src/tts/qwen3/generate.rs index 02161e6..30d165b 100644 --- a/makima/src/tts/qwen3/generate.rs +++ b/makima/src/tts/qwen3/generate.rs @@ -7,6 +7,9 @@ //! 4. Code predictor → remaining 15 codebook tokens per frame //! 5. 
Speech tokenizer decoder → waveform audio +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + use candle_core::{DType, Device, IndexOp, Result, Tensor}; use tokenizers::Tokenizer; @@ -60,6 +63,9 @@ pub struct GenerationContext<'a> { tokenizer: &'a Tokenizer, device: &'a Device, config: GenerationConfig, + /// Optional cancellation flag. When set to `true`, the generation loop + /// will break early and return whatever audio has been produced so far. + cancel_flag: Option<Arc<AtomicBool>>, } impl<'a> GenerationContext<'a> { @@ -70,6 +76,7 @@ impl<'a> GenerationContext<'a> { tokenizer: &'a Tokenizer, device: &'a Device, config: GenerationConfig, + cancel_flag: Option<Arc<AtomicBool>>, ) -> Self { Self { model, @@ -78,9 +85,17 @@ impl<'a> GenerationContext<'a> { tokenizer, device, config, + cancel_flag, } } + /// Check whether cancellation has been requested. + fn is_cancelled(&self) -> bool { + self.cancel_flag + .as_ref() + .map_or(false, |f| f.load(Ordering::Relaxed)) + } + /// Generate audio from text, optionally with a voice reference. /// /// Returns a list of audio chunks. 
If `streaming` is false, returns @@ -194,6 +209,12 @@ impl<'a> GenerationContext<'a> { // === Subsequent iterations: one token at a time === for _step in 1..self.config.max_new_tokens { + // Check for cancellation each iteration + if self.is_cancelled() { + tracing::info!("TTS generation cancelled after {} frames", generated_frames.len()); + break; + } + let past_len = kv_caches[0].seq_len(); // Input: just the last generated zeroth codebook token @@ -340,13 +361,22 @@ impl<'a> GenerationContext<'a> { &self, frames: &[Vec<u32>], ) -> std::result::Result<Vec<AudioChunk>, TtsError> { - let mut chunks = Vec::new(); + let mut chunks: Vec<AudioChunk> = Vec::new(); // Decode in groups of frames for efficiency let chunk_size = 10; // ~800ms per chunk at 12.5Hz let num_codebooks = self.speech_tokenizer.num_codebooks(); for (chunk_idx, frame_chunk) in frames.chunks(chunk_size).enumerate() { + // Check for cancellation between streaming chunks + if self.is_cancelled() { + tracing::info!("TTS streaming decode cancelled after {} chunks", chunks.len()); + if let Some(last) = chunks.last_mut() { + last.is_final = true; + } + return Ok(chunks); + } + let is_last = (chunk_idx + 1) * chunk_size >= frames.len(); // Transpose chunk frames diff --git a/makima/src/tts/qwen3/mod.rs b/makima/src/tts/qwen3/mod.rs index c55c118..9bac794 100644 --- a/makima/src/tts/qwen3/mod.rs +++ b/makima/src/tts/qwen3/mod.rs @@ -30,6 +30,7 @@ pub mod speech_tokenizer; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; use candle_core::{DType, Device}; use candle_nn::VarBuilder; @@ -168,6 +169,7 @@ impl Qwen3Tts { text: &str, reference_audio: Option<&[f32]>, gen_config: Option<GenerationConfig>, + cancel_flag: Option<Arc<AtomicBool>>, ) -> Result<Vec<AudioChunk>, TtsError> { let config = gen_config.unwrap_or_default(); @@ -178,6 +180,7 @@ impl Qwen3Tts { &self.tokenizer, &self.device, config, + cancel_flag, ); ctx.generate(text, reference_audio) @@ -250,11 
+253,12 @@ impl TtsEngine for Qwen3Tts { text: &str, reference_audio: Option<&[f32]>, _reference_sample_rate: Option<u32>, + cancel_flag: Option<Arc<AtomicBool>>, ) -> Result<Vec<AudioChunk>, TtsError> { // Note: reference audio should already be resampled to 24kHz // by the caller. If a different sample rate is provided, // the caller should resample using `resample_to_24k()`. - self.generate_speech(text, reference_audio, None) + self.generate_speech(text, reference_audio, None, cancel_flag) } fn is_ready(&self) -> bool { diff --git a/makima/src/tts/qwen3/model.rs b/makima/src/tts/qwen3/model.rs index 551893b..8a1e986 100644 --- a/makima/src/tts/qwen3/model.rs +++ b/makima/src/tts/qwen3/model.rs @@ -10,7 +10,7 @@ //! Based on the candle-transformers Qwen2 model architecture, //! extended for Qwen3-TTS. -use candle_core::{DType, Device, IndexOp, Module, Result, Tensor, D}; +use candle_core::{DType, Device, Module, Result, Tensor, D}; use candle_nn::{embedding, linear_no_bias, rms_norm, Embedding, Linear, RmsNorm, VarBuilder}; use super::config::Qwen3LmConfig; diff --git a/makima/src/tts/qwen3/speech_tokenizer.rs b/makima/src/tts/qwen3/speech_tokenizer.rs index 752050a..86e00f2 100644 --- a/makima/src/tts/qwen3/speech_tokenizer.rs +++ b/makima/src/tts/qwen3/speech_tokenizer.rs @@ -11,7 +11,7 @@ //! The speech tokenizer is a separate model (~682MB) loaded from //! `Qwen/Qwen3-TTS-Tokenizer-12Hz`. 
-use candle_core::{DType, Device, Module, Result, Tensor, D}; +use candle_core::{Device, Module, Result, Tensor, D}; use candle_nn::{ conv1d, embedding, linear_no_bias, Conv1d, Conv1dConfig, Embedding, Linear, VarBuilder, }; @@ -259,6 +259,7 @@ impl DecoderBlock { pub struct RvqCodebook { codebooks: Vec<Embedding>, num_codebooks: usize, + #[allow(dead_code)] codebook_dim: usize, } diff --git a/voices/makima/README.md b/voices/makima/README.md new file mode 100644 index 0000000..8553daf --- /dev/null +++ b/voices/makima/README.md @@ -0,0 +1,105 @@ +# Makima Voice Reference Audio + +This directory contains the voice profile for **Makima** — the default TTS voice used by the makima system for voice cloning. + +## What You Need + +A **reference audio clip** (`reference.wav`) of Makima's Japanese voice actress (Tomori Kusunoki) speaking English. + +### Requirements + +| Property | Value | +|----------------|------------------------------------------| +| **Filename** | `reference.wav` | +| **Duration** | 5–15 seconds (10s ideal) | +| **Format** | WAV (PCM) | +| **Sample Rate**| 24 kHz (will be resampled if different) | +| **Channels** | Mono (1 channel) | +| **Bit Depth** | 16-bit or 32-bit float | + +### Why These Parameters? + +- **5–15 seconds**: Enough for the TTS model to capture voice characteristics without being too long for memory. +- **24 kHz mono**: Native sample rate of the Qwen3-TTS model. Audio at other rates will be automatically resampled, but starting at 24 kHz avoids quality loss. +- **Clear speech**: Minimal background noise, no music overlay. A single speaker only. + +## How to Obtain Reference Audio + +### Option 1: Record or Find English Speech + +The best reference audio is a clean clip of the target voice speaking English. 
Sources: + +- **Anime convention panels or interviews** where the VA speaks English +- **Behind-the-scenes clips** from Chainsaw Man production +- **Fan events or promotional videos** with English speech segments + +### Option 2: Extract from YouTube + +You can extract audio from YouTube clips. Here are some potential sources: + +1. Search YouTube for: `"Tomori Kusunoki" english` or `"楠木ともり" english` +2. Look for interview clips, event recordings, or promotional content + +**Extraction steps using `yt-dlp` and `ffmpeg`:** + +```bash +# 1. Download audio from a YouTube clip +yt-dlp -x --audio-format wav -o "raw_audio.%(ext)s" "YOUTUBE_URL_HERE" + +# 2. Convert to 24kHz mono WAV, trimming to a 10-second segment +# Adjust -ss (start time) and -t (duration) as needed +ffmpeg -i raw_audio.wav \ + -ss 00:00:05 -t 00:00:10 \ + -ar 24000 -ac 1 \ + -acodec pcm_s16le \ + voices/makima/reference.wav + +# 3. Verify the output +ffprobe -v error -show_entries stream=sample_rate,channels,duration \ + -of default=noprint_wrappers=1 voices/makima/reference.wav +``` + +### Option 3: Use Any Japanese-Accented English Voice + +If you cannot find clips of the specific VA, any clear recording of a female Japanese speaker speaking English will work as a starting point. The voice cloning will adapt to the reference audio's characteristics. + +```bash +# Example: record your own reference using sox (if available) +sox -d -r 24000 -c 1 -b 16 voices/makima/reference.wav trim 0 10 +``` + +## Tips for Best Quality + +1. **Clean audio**: Remove any background music or noise. Use a noise gate or audio editor if needed. +2. **Natural speech**: Conversational tone works better than reading. The model captures prosody and rhythm. +3. **Consistent volume**: Normalize the audio to avoid clipping or very quiet segments. +4. **Single speaker**: Only the target voice should be present in the clip. 
+ +```bash +# Normalize audio volume with ffmpeg +ffmpeg -i reference_raw.wav \ + -af "loudnorm=I=-16:TP=-1.5:LRA=11" \ + -ar 24000 -ac 1 -acodec pcm_s16le \ + voices/makima/reference.wav +``` + +## File Structure + +``` +voices/makima/ +├── manifest.json # Voice metadata (name, sample rate, backend) +├── reference.wav # Reference audio clip (YOU PROVIDE THIS) +└── README.md # This file +``` + +## Verification + +After placing `reference.wav`, you can verify the system loads it correctly: + +```bash +# The TTS handler will log voice loading on first speak request: +# INFO makima::server::handlers::voice: Loaded voice reference audio +# voice_id="makima" voice_name="Makima" samples_len=240000 duration_secs=10.0 +``` + +If the reference audio is missing, the TTS system will still work but without voice cloning — it will use the model's default voice instead. |
