summary | refs | log | tree | commit | diff
path: root/makima
diff options
context:
space:
mode:
author: soryu <soryu@soryu.co> 2026-01-30 02:16:45 +0000
committer: soryu <soryu@soryu.co> 2026-01-30 02:16:45 +0000
commit9ddf9a9832924d591bdbd6a78057f55857a33e88 (patch)
tree2520bed24dcb4ecfebdec521960ef977be5ce1e4 /makima
parent55bf0714a20e651ab70b1eed01ec665cfefac6b4 (diff)
downloadsoryu-9ddf9a9832924d591bdbd6a78057f55857a33e88.tar.gz
soryu-9ddf9a9832924d591bdbd6a78057f55857a33e88.zip
Support both tokenizer.json and vocab.json+merges.txt formats
Diffstat (limited to 'makima')
-rw-r--r--makima/src/tts/qwen3/mod.rs32
1 files changed, 28 insertions, 4 deletions
diff --git a/makima/src/tts/qwen3/mod.rs b/makima/src/tts/qwen3/mod.rs
index 9bac794..1520be6 100644
--- a/makima/src/tts/qwen3/mod.rs
+++ b/makima/src/tts/qwen3/mod.rs
@@ -99,10 +99,34 @@ impl Qwen3Tts {
Qwen3TtsConfig::default()
};
- // Load text tokenizer
- let tokenizer_path = model_dir.join("tokenizer.json");
- let tokenizer = Tokenizer::from_file(&tokenizer_path)
- .map_err(|e| TtsError::Tokenizer(format!("failed to load tokenizer: {e}")))?;
+ // Load text tokenizer (supports both tokenizer.json and vocab.json+merges.txt formats)
+ let tokenizer_json_path = model_dir.join("tokenizer.json");
+ let tokenizer = if tokenizer_json_path.exists() {
+ Tokenizer::from_file(&tokenizer_json_path)
+ .map_err(|e| TtsError::Tokenizer(format!("failed to load tokenizer.json: {e}")))?
+ } else {
+ // Fall back to vocab.json + merges.txt (HuggingFace Qwen3-TTS format)
+ let vocab_path = model_dir.join("vocab.json");
+ let merges_path = model_dir.join("merges.txt");
+
+ if !vocab_path.exists() || !merges_path.exists() {
+ return Err(TtsError::Tokenizer(format!(
+ "tokenizer files not found: need either tokenizer.json or vocab.json+merges.txt in {}",
+ model_dir.display()
+ )));
+ }
+
+ tokenizers::Tokenizer::from_file(&vocab_path)
+ .or_else(|_| {
+ // Build BPE tokenizer from vocab and merges
+ use tokenizers::models::bpe::BPE;
+ let bpe = BPE::from_file(&vocab_path.to_string_lossy(), &merges_path.to_string_lossy())
+ .build()
+ .map_err(|e| TtsError::Tokenizer(format!("failed to build BPE tokenizer: {e}")))?;
+ Ok(Tokenizer::new(bpe))
+ })
+ .map_err(|e: TtsError| TtsError::Tokenizer(format!("failed to load tokenizer: {e}")))?
+ };
// Load LM weights from safetensors
let lm_weights_path = model_dir.join("model.safetensors");