diff options
| author | soryu <soryu@soryu.co> | 2026-01-30 02:16:45 +0000 |
|---|---|---|
| committer | soryu <soryu@soryu.co> | 2026-01-30 02:16:45 +0000 |
| commit | 9ddf9a9832924d591bdbd6a78057f55857a33e88 (patch) | |
| tree | 2520bed24dcb4ecfebdec521960ef977be5ce1e4 /makima | |
| parent | 55bf0714a20e651ab70b1eed01ec665cfefac6b4 (diff) | |
| download | soryu-9ddf9a9832924d591bdbd6a78057f55857a33e88.tar.gz soryu-9ddf9a9832924d591bdbd6a78057f55857a33e88.zip | |
Support both tokenizer.json and vocab.json+merges.txt formats
Diffstat (limited to 'makima')
| -rw-r--r-- | makima/src/tts/qwen3/mod.rs | 32 |
1 files changed, 28 insertions, 4 deletions
diff --git a/makima/src/tts/qwen3/mod.rs b/makima/src/tts/qwen3/mod.rs index 9bac794..1520be6 100644 --- a/makima/src/tts/qwen3/mod.rs +++ b/makima/src/tts/qwen3/mod.rs @@ -99,10 +99,34 @@ impl Qwen3Tts { Qwen3TtsConfig::default() }; - // Load text tokenizer - let tokenizer_path = model_dir.join("tokenizer.json"); - let tokenizer = Tokenizer::from_file(&tokenizer_path) - .map_err(|e| TtsError::Tokenizer(format!("failed to load tokenizer: {e}")))?; + // Load text tokenizer (supports both tokenizer.json and vocab.json+merges.txt formats) + let tokenizer_json_path = model_dir.join("tokenizer.json"); + let tokenizer = if tokenizer_json_path.exists() { + Tokenizer::from_file(&tokenizer_json_path) + .map_err(|e| TtsError::Tokenizer(format!("failed to load tokenizer.json: {e}")))? + } else { + // Fall back to vocab.json + merges.txt (HuggingFace Qwen3-TTS format) + let vocab_path = model_dir.join("vocab.json"); + let merges_path = model_dir.join("merges.txt"); + + if !vocab_path.exists() || !merges_path.exists() { + return Err(TtsError::Tokenizer(format!( + "tokenizer files not found: need either tokenizer.json or vocab.json+merges.txt in {}", + model_dir.display() + ))); + } + + tokenizers::Tokenizer::from_file(&vocab_path) + .or_else(|_| { + // Build BPE tokenizer from vocab and merges + use tokenizers::models::bpe::BPE; + let bpe = BPE::from_file(&vocab_path.to_string_lossy(), &merges_path.to_string_lossy()) + .build() + .map_err(|e| TtsError::Tokenizer(format!("failed to build BPE tokenizer: {e}")))?; + Ok(Tokenizer::new(bpe)) + }) + .map_err(|e: TtsError| TtsError::Tokenizer(format!("failed to load tokenizer: {e}")))? + }; // Load LM weights from safetensors let lm_weights_path = model_dir.join("model.safetensors"); |
