summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: soryu <soryu@soryu.co>, 2026-01-30 03:07:52 +0000
committer: soryu <soryu@soryu.co>, 2026-01-30 03:07:52 +0000
commit: c526f93aa4255cb581eeb3f7a495c1689683b0a2 (patch)
tree: fbdc579d04fe92dc610ec8c84b77eeffb9141622
parent: a9655dccdad116db2b92c13794ddd559f160148d (diff)
download: soryu-c526f93aa4255cb581eeb3f7a495c1689683b0a2.tar.gz
download: soryu-c526f93aa4255cb581eeb3f7a495c1689683b0a2.zip
Fix Qwen3-TTS tensor paths to match HuggingFace model structure
The HuggingFace model uses different tensor name prefixes:

- talker.model.text_embedding instead of model.embed_tokens
- talker.codec_head instead of lm_head
- talker.code_predictor.model.codec_embedding instead of code_embeddings
- talker.code_predictor.lm_head instead of output_heads

Also removed input_proj, which doesn't exist in the HF model.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
-rw-r--r--  makima/src/tts/qwen3/code_predictor.rs  66
-rw-r--r--  makima/src/tts/qwen3/model.rs  11
2 files changed, 36 insertions, 41 deletions
diff --git a/makima/src/tts/qwen3/code_predictor.rs b/makima/src/tts/qwen3/code_predictor.rs
index 0ef8a1d..363105f 100644
--- a/makima/src/tts/qwen3/code_predictor.rs
+++ b/makima/src/tts/qwen3/code_predictor.rs
@@ -10,7 +10,7 @@
//! - Input: last hidden state from main LM + zeroth codebook embedding
//! - Output: 16 codebook token predictions
-use candle_core::{Device, Module, Result, Tensor, D};
+use candle_core::{Device, Module, Result, Tensor};
use candle_nn::{embedding, linear_no_bias, rms_norm, Embedding, Linear, RmsNorm, VarBuilder};
use super::config::{CodePredictorConfig, Qwen3LmConfig};
@@ -90,15 +90,13 @@ impl CodePredictorLayer {
/// tokens. The zeroth codebook is predicted by the main LM head; this
/// module predicts the remaining 15 residual codebooks.
pub struct CodePredictor {
- /// Embedding layer for codebook tokens (shared across groups).
+ /// Embedding layer for codebook tokens (one per residual codebook group, 0-14).
code_embeddings: Vec<Embedding>,
- /// Projection from LM hidden + code embedding to predictor hidden.
- input_proj: Linear,
/// 5 transformer layers.
layers: Vec<CodePredictorLayer>,
/// Final normalization.
norm: RmsNorm,
- /// Per-codebook output heads (16 heads, each projecting to codebook_vocab_size).
+ /// Per-codebook output heads (15 heads for residual codebooks).
output_heads: Vec<Linear>,
/// RoPE for the predictor's attention layers.
rope: RotaryEmbedding,
@@ -111,47 +109,45 @@ impl CodePredictor {
lm_config: &Qwen3LmConfig,
vb: VarBuilder,
) -> Result<Self> {
- let predictor_vb = vb.pp("code_predictor");
-
- // Code embeddings for each codebook group
- let mut code_embeddings = Vec::with_capacity(config.num_code_groups);
- for i in 0..config.num_code_groups {
+ // HuggingFace Qwen3-TTS uses "talker.code_predictor.*" prefix
+ let predictor_vb = vb.pp("talker").pp("code_predictor");
+ let model_vb = predictor_vb.pp("model");
+
+ // Code embeddings for residual codebook groups (15 groups, indices 0-14)
+ // HF names them "codec_embedding" not "code_embeddings"
+ let num_residual_groups = config.num_code_groups - 1; // 15, not 16
+ let mut code_embeddings = Vec::with_capacity(num_residual_groups);
+ for i in 0..num_residual_groups {
let emb = embedding(
config.codebook_vocab_size,
config.hidden_size,
- predictor_vb.pp(format!("code_embeddings.{i}")),
+ model_vb.pp(format!("codec_embedding.{i}")),
)?;
code_embeddings.push(emb);
}
- // Input projection: LM hidden (1024) + code embedding (1024) -> predictor hidden (1024)
- let input_proj = linear_no_bias(
- config.hidden_size * 2,
- config.hidden_size,
- predictor_vb.pp("input_proj"),
- )?;
-
// Transformer layers
let mut layers = Vec::with_capacity(config.num_layers);
for i in 0..config.num_layers {
let layer =
- CodePredictorLayer::new(config, predictor_vb.pp(format!("layers.{i}")))?;
+ CodePredictorLayer::new(config, model_vb.pp(format!("layers.{i}")))?;
layers.push(layer);
}
let norm = rms_norm(
config.hidden_size,
config.rms_norm_eps,
- predictor_vb.pp("norm"),
+ model_vb.pp("norm"),
)?;
- // Output heads for each codebook
- let mut output_heads = Vec::with_capacity(config.num_code_groups);
- for i in 0..config.num_code_groups {
+ // Output heads for residual codebooks (15 heads, indices 0-14)
+ // HF names them "lm_head" not "output_heads"
+ let mut output_heads = Vec::with_capacity(num_residual_groups);
+ for i in 0..num_residual_groups {
let head = linear_no_bias(
config.hidden_size,
config.codebook_vocab_size,
- predictor_vb.pp(format!("output_heads.{i}")),
+ predictor_vb.pp(format!("lm_head.{i}")),
)?;
output_heads.push(head);
}
@@ -168,7 +164,6 @@ impl CodePredictor {
Ok(Self {
code_embeddings,
- input_proj,
layers,
norm,
output_heads,
@@ -192,28 +187,25 @@ impl CodePredictor {
let mut all_codes = Vec::with_capacity(self.config.num_code_groups);
all_codes.push(zeroth_code);
- // The code predictor iterates through codebook groups.
- // For each group i (1..16), it:
+ // The code predictor iterates through the 15 residual codebook groups.
+ // For each group i (0..15), it:
// 1. Embeds the previous codebook token
- // 2. Concatenates with LM hidden state
- // 3. Projects through the predictor layers
- // 4. Predicts the next codebook token via output_head[i]
+ // 2. Adds to LM hidden state
+ // 3. Runs through predictor layers
+ // 4. Predicts the next codebook token via lm_head[i]
let mut prev_code = zeroth_code;
- for group_idx in 1..self.config.num_code_groups {
+ for group_idx in 0..self.code_embeddings.len() {
// Embed the previous codebook token
let code_tensor = Tensor::from_vec(
vec![prev_code],
(1, 1),
device,
)?;
- let code_emb = self.code_embeddings[group_idx - 1].forward(&code_tensor)?;
-
- // Concatenate LM hidden state with code embedding
- let combined = Tensor::cat(&[lm_hidden, &code_emb], D::Minus1)?;
+ let code_emb = self.code_embeddings[group_idx].forward(&code_tensor)?;
- // Project to predictor hidden size
- let mut hidden = self.input_proj.forward(&combined)?;
+ // Add code embedding to LM hidden state (no concatenation, no projection)
+ let mut hidden = (lm_hidden + &code_emb)?;
// Run through predictor transformer layers (no KV cache needed — single step)
let mut kv_caches: Vec<KvCache> =
diff --git a/makima/src/tts/qwen3/model.rs b/makima/src/tts/qwen3/model.rs
index 8a1e986..e19e5f9 100644
--- a/makima/src/tts/qwen3/model.rs
+++ b/makima/src/tts/qwen3/model.rs
@@ -389,9 +389,12 @@ pub struct Qwen3Model {
impl Qwen3Model {
pub fn new(config: &Qwen3LmConfig, vb: VarBuilder) -> Result<Self> {
- let model_vb = vb.pp("model");
+ // HuggingFace Qwen3-TTS uses "talker.model.*" prefix
+ let talker_vb = vb.pp("talker");
+ let model_vb = talker_vb.pp("model");
- let embed_tokens = embedding(config.vocab_size, config.hidden_size, model_vb.pp("embed_tokens"))?;
+ // Text embedding (called "text_embedding" in HF, not "embed_tokens")
+ let embed_tokens = embedding(config.vocab_size, config.hidden_size, model_vb.pp("text_embedding"))?;
let mut layers = Vec::with_capacity(config.num_hidden_layers);
for i in 0..config.num_hidden_layers {
@@ -401,8 +404,8 @@ impl Qwen3Model {
let norm = rms_norm(config.hidden_size, config.rms_norm_eps, model_vb.pp("norm"))?;
- // LM head — may or may not share weights with embed_tokens
- let lm_head = linear_no_bias(config.hidden_size, config.vocab_size, vb.pp("lm_head"))?;
+ // Codec head (called "codec_head" in HF, not "lm_head")
+ let lm_head = linear_no_bias(config.hidden_size, config.vocab_size, talker_vb.pp("codec_head"))?;
let dtype = vb.dtype();
let device = vb.device().clone();