From 9ddf9a9832924d591bdbd6a78057f55857a33e88 Mon Sep 17 00:00:00 2001 From: soryu Date: Fri, 30 Jan 2026 02:16:45 +0000 Subject: Support both tokenizer.json and vocab.json+merges.txt formats --- makima/src/tts/qwen3/mod.rs | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'makima') diff --git a/makima/src/tts/qwen3/mod.rs b/makima/src/tts/qwen3/mod.rs index 9bac794..1520be6 100644 --- a/makima/src/tts/qwen3/mod.rs +++ b/makima/src/tts/qwen3/mod.rs @@ -99,10 +99,34 @@ impl Qwen3Tts { Qwen3TtsConfig::default() }; - // Load text tokenizer - let tokenizer_path = model_dir.join("tokenizer.json"); - let tokenizer = Tokenizer::from_file(&tokenizer_path) - .map_err(|e| TtsError::Tokenizer(format!("failed to load tokenizer: {e}")))?; + // Load text tokenizer (supports both tokenizer.json and vocab.json+merges.txt formats) + let tokenizer_json_path = model_dir.join("tokenizer.json"); + let tokenizer = if tokenizer_json_path.exists() { + Tokenizer::from_file(&tokenizer_json_path) + .map_err(|e| TtsError::Tokenizer(format!("failed to load tokenizer.json: {e}")))? + } else { + // Fall back to vocab.json + merges.txt (HuggingFace Qwen3-TTS format) + let vocab_path = model_dir.join("vocab.json"); + let merges_path = model_dir.join("merges.txt"); + + if !vocab_path.exists() || !merges_path.exists() { + return Err(TtsError::Tokenizer(format!( + "tokenizer files not found: need either tokenizer.json or vocab.json+merges.txt in {}", + model_dir.display() + ))); + } + + tokenizers::Tokenizer::from_file(&vocab_path) + .or_else(|_| { + // Build BPE tokenizer from vocab and merges + use tokenizers::models::bpe::BPE; + let bpe = BPE::from_file(&vocab_path.to_string_lossy(), &merges_path.to_string_lossy()) + .build() + .map_err(|e| TtsError::Tokenizer(format!("failed to build BPE tokenizer: {e}")))?; + Ok(Tokenizer::new(bpe)) + }) + .map_err(|e: TtsError| TtsError::Tokenizer(format!("failed to load tokenizer: {e}")))? 
+ }; // Load LM weights from safetensors let lm_weights_path = model_dir.join("model.safetensors"); -- cgit v1.2.3