// makima/src/server/handlers/voice.rs

//! Voice loading utilities for TTS voice cloning.
//!
//! Loads voice manifests and reference audio from the `voices/` directory.
//! Each voice is a directory containing:
//! - `manifest.json` — voice metadata (name, sample rate, backend, etc.)
//! - `reference.wav` — reference audio clip for voice cloning (5-15s, 24kHz mono)

use serde::Deserialize;
use std::path::{Path, PathBuf};

use crate::tts::{resample_to_24k, SAMPLE_RATE};

/// Default voice ID used when no voice is specified.
///
/// NOTE(review): presumably this is also the name of the voice's directory
/// under `voices/` (the loaders join the voice ID onto the base directory) —
/// confirm against callers.
pub const DEFAULT_VOICE_ID: &str = "makima";

/// Voice manifest loaded from `voices/{voice_id}/manifest.json`.
///
/// Only `name` and `id` are mandatory in the JSON; every other field carries a
/// serde default and may be omitted.
#[derive(Debug, Clone, Deserialize)]
pub struct VoiceManifest {
    /// Display name of the voice (required; included in the load log line).
    pub name: String,
    /// Voice identifier (required; included in the load log line).
    pub id: String,
    /// Optional free-form description; `None` when absent.
    #[serde(default)]
    pub description: Option<String>,
    /// Language tag; falls back to `"en"` when absent.
    #[serde(default = "default_language")]
    pub language: String,
    /// Optional accent descriptor; `None` when absent.
    #[serde(default)]
    pub accent: Option<String>,
    /// Declared sample rate in Hz; falls back to 24000 when absent.
    /// The loaders visible in this module do not read this value.
    #[serde(default = "default_sample_rate")]
    pub sample_rate: u32,
    /// Optional audio format label; `None` when absent.
    #[serde(default)]
    pub format: Option<String>,
    /// Optional name of the TTS backend this voice targets; `None` when absent.
    #[serde(default)]
    pub model_backend: Option<String>,
    /// File name of the reference clip, resolved relative to the voice's
    /// directory; falls back to `"reference.wav"` when absent.
    #[serde(default = "default_reference_audio")]
    pub reference_audio: String,
    /// Optional free-form notes; `None` when absent.
    #[serde(default)]
    pub notes: Option<String>,
}

/// Serde fallback for the manifest's `language` field: `"en"`.
fn default_language() -> String {
    String::from("en")
}

/// Serde fallback for the manifest's `sample_rate` field: 24 kHz.
fn default_sample_rate() -> u32 {
    const FALLBACK_HZ: u32 = 24_000;
    FALLBACK_HZ
}

/// Serde fallback for the manifest's `reference_audio` field: `"reference.wav"`.
fn default_reference_audio() -> String {
    String::from("reference.wav")
}

/// Loaded voice reference: manifest + decoded PCM samples at 24kHz.
///
/// Produced by `load_reference_audio`; owns both the parsed manifest and the
/// sample buffer.
#[derive(Debug, Clone)]
pub struct VoiceReference {
    /// The manifest the audio was loaded for.
    pub manifest: VoiceManifest,
    /// PCM f32 samples resampled to 24kHz mono.
    pub samples: Vec<f32>,
    /// Sample rate of `samples`; the loader always sets this to
    /// `crate::tts::SAMPLE_RATE` (24 kHz according to this module's docs).
    pub sample_rate: u32,
}

/// Locate the directory that holds voice data.
///
/// Candidates are probed in order and the first one that exists as a
/// directory is returned:
/// 1. `<current working directory>/voices`
/// 2. `<executable's directory>/voices`
/// 3. `voices` one level, then two levels, above the executable's directory
///    (covers a `target/debug` layout where the project root is two levels up)
///
/// If no candidate exists, `<current working directory>/voices` is returned
/// anyway. When the working directory cannot be determined, an empty path is
/// used in its place, yielding the relative path `voices`.
fn voices_base_dir() -> PathBuf {
    let cwd_candidate = std::env::current_dir()
        .unwrap_or_default()
        .join("voices");
    if cwd_candidate.is_dir() {
        return cwd_candidate;
    }

    std::env::current_exe()
        .ok()
        .and_then(|exe| {
            // `ancestors()` yields the executable path itself first, so skip it
            // and look at its directory, parent, and grandparent.
            exe.ancestors()
                .skip(1)
                .take(3)
                .map(|dir| dir.join("voices"))
                .find(|candidate| candidate.is_dir())
        })
        .unwrap_or(cwd_candidate)
}

/// Load a voice manifest from `voices/{voice_id}/manifest.json`.
///
/// # Errors
/// - [`VoiceLoadError::NotFound`] when the manifest file does not exist.
/// - [`VoiceLoadError::Io`] when the file exists but cannot be read
///   (permissions, not valid UTF-8, is a directory, ...).
/// - [`VoiceLoadError::InvalidManifest`] when the contents are not valid
///   manifest JSON.
///
/// NOTE(review): `voice_id` is joined onto the base directory without
/// validation; if it can originate from a request, confirm that callers reject
/// path separators / `..` before calling this.
pub fn load_manifest(voice_id: &str) -> Result<VoiceManifest, VoiceLoadError> {
    let manifest_path = voices_base_dir().join(voice_id).join("manifest.json");

    // Read once and classify the failure instead of probing with `exists()`
    // first: the probe races with the read and reports unreadable files
    // (e.g. permission denied) as "not found".
    let data = std::fs::read_to_string(&manifest_path).map_err(|e| match e.kind() {
        std::io::ErrorKind::NotFound => VoiceLoadError::NotFound(voice_id.to_string()),
        _ => VoiceLoadError::Io(format!(
            "failed to read manifest at {}: {e}",
            manifest_path.display()
        )),
    })?;

    serde_json::from_str(&data).map_err(|e| {
        VoiceLoadError::InvalidManifest(format!("failed to parse manifest: {e}"))
    })
}

/// Load the manifest and reference clip for `voice_id`.
///
/// The clip is looked up at `voices/{voice_id}/{manifest.reference_audio}`.
/// Decoding (through `crate::audio`) and resampling to the TTS sample rate are
/// delegated to `load_reference_audio_from_path`.
///
/// # Errors
/// Propagates every error from [`load_manifest`], returns
/// [`VoiceLoadError::MissingAudio`] when the clip does not exist, and
/// propagates decode failures from the helper.
pub fn load_reference_audio(voice_id: &str) -> Result<VoiceReference, VoiceLoadError> {
    let manifest = load_manifest(voice_id)?;
    let audio_path = voices_base_dir()
        .join(voice_id)
        .join(&manifest.reference_audio);

    if audio_path.exists() {
        return load_reference_audio_from_path(&audio_path, manifest);
    }

    Err(VoiceLoadError::MissingAudio(format!(
        "reference audio not found at {}. See voices/{}/README.md for instructions.",
        audio_path.display(),
        voice_id,
    )))
}

/// Decode the clip at `audio_path` and bundle it with an already-loaded
/// `manifest`.
///
/// The clip is decoded with `crate::audio::to_16k_mono_from_path` and, unless
/// the decoded rate already equals `SAMPLE_RATE`, passed through
/// `resample_to_24k`. A summary line is logged via `tracing::info!`.
///
/// # Errors
/// Returns [`VoiceLoadError::AudioDecode`] when decoding fails.
///
/// NOTE(review): going through a 16 kHz decode before resampling up looks
/// lossy for clips recorded at a higher rate — confirm whether `crate::audio`
/// offers a native-rate decoder.
fn load_reference_audio_from_path(
    audio_path: &Path,
    manifest: VoiceManifest,
) -> Result<VoiceReference, VoiceLoadError> {
    let decoded = match crate::audio::to_16k_mono_from_path(audio_path) {
        Ok(pcm) => pcm,
        Err(e) => {
            return Err(VoiceLoadError::AudioDecode(format!(
                "failed to decode {}: {e}",
                audio_path.display()
            )));
        }
    };

    // Only resample when the decoder's output rate differs from the TTS rate.
    let samples = if decoded.sample_rate != SAMPLE_RATE {
        resample_to_24k(&decoded.samples, decoded.sample_rate)
    } else {
        decoded.samples
    };

    let duration_secs = samples.len() as f32 / SAMPLE_RATE as f32;
    tracing::info!(
        voice_id = %manifest.id,
        voice_name = %manifest.name,
        samples_len = samples.len(),
        duration_secs = duration_secs,
        "Loaded voice reference audio"
    );

    let reference = VoiceReference {
        manifest,
        samples,
        sample_rate: SAMPLE_RATE,
    };
    Ok(reference)
}

/// Failure modes of voice loading.
///
/// Every variant carries a `String` payload that is interpolated into the
/// `Display` message.
#[derive(Debug)]
pub enum VoiceLoadError {
    /// No `manifest.json` was found for the voice; payload is the voice ID.
    NotFound(String),
    /// A file could not be read; payload describes the failure.
    Io(String),
    /// The manifest JSON could not be parsed; payload describes the failure.
    InvalidManifest(String),
    /// The reference audio file is absent; payload names the expected path.
    MissingAudio(String),
    /// The reference audio could not be decoded; payload describes the failure.
    AudioDecode(String),
}

impl std::fmt::Display for VoiceLoadError {
    /// Render a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotFound(id) => f.write_fmt(format_args!(
                "voice '{id}' not found (no voices/{id}/manifest.json)"
            )),
            Self::Io(msg) => f.write_fmt(format_args!("voice IO error: {msg}")),
            Self::InvalidManifest(msg) => {
                f.write_fmt(format_args!("invalid voice manifest: {msg}"))
            }
            Self::MissingAudio(msg) => {
                f.write_fmt(format_args!("missing reference audio: {msg}"))
            }
            Self::AudioDecode(msg) => f.write_fmt(format_args!("audio decode error: {msg}")),
        }
    }
}

impl std::error::Error for VoiceLoadError {}

// Unit tests for manifest deserialization defaults and the not-found path of
// `load_manifest`.
#[cfg(test)]
mod tests {
    use super::*;

    // Pins the default voice identifier so an accidental rename is caught.
    #[test]
    fn test_default_voice_id() {
        assert_eq!(DEFAULT_VOICE_ID, "makima");
    }

    // A manifest with explicit values keeps them, and omitted fields
    // (here `language`) still receive their defaults.
    #[test]
    fn test_manifest_deserialize() {
        let json = r#"{
            "name": "Test Voice",
            "id": "test",
            "sample_rate": 24000,
            "reference_audio": "reference.wav"
        }"#;
        let manifest: VoiceManifest = serde_json::from_str(json).unwrap();
        assert_eq!(manifest.name, "Test Voice");
        assert_eq!(manifest.id, "test");
        assert_eq!(manifest.sample_rate, 24000);
        assert_eq!(manifest.reference_audio, "reference.wav");
        assert_eq!(manifest.language, "en");
    }

    // Only the two required fields are supplied; everything else must come
    // from the serde default functions.
    #[test]
    fn test_manifest_deserialize_defaults() {
        let json = r#"{"name": "Minimal", "id": "min"}"#;
        let manifest: VoiceManifest = serde_json::from_str(json).unwrap();
        assert_eq!(manifest.language, "en");
        assert_eq!(manifest.sample_rate, 24000);
        assert_eq!(manifest.reference_audio, "reference.wav");
    }

    // Touches the real filesystem: relies on no voice directory with this
    // name existing under the resolved `voices/` base directory.
    #[test]
    fn test_load_nonexistent_voice() {
        let result = load_manifest("nonexistent_voice_xyz");
        assert!(result.is_err());
        match result.unwrap_err() {
            VoiceLoadError::NotFound(id) => assert_eq!(id, "nonexistent_voice_xyz"),
            other => panic!("Expected NotFound, got: {other}"),
        }
    }
}