diff options
| author | soryu <soryu@soryu.co> | 2025-12-21 19:14:29 +0000 |
|---|---|---|
| committer | soryu <soryu@soryu.co> | 2025-12-23 14:47:18 +0000 |
| commit | 75f2a72a06af6f722fce1bba1d1fc2f4c5e844df (patch) | |
| tree | 7d9ba06bdade73b10be580bf09df8404e6f4ea8c /makima/sh | |
| parent | 87e6c9c49fca144e3de3ea4a3618a84b1c418536 (diff) | |
| download | soryu-75f2a72a06af6f722fce1bba1d1fc2f4c5e844df.tar.gz soryu-75f2a72a06af6f722fce1bba1d1fc2f4c5e844df.zip | |
Bump diarization version to 2.1 and fix downloading the tokenizer
Diffstat (limited to 'makima/sh')
| -rw-r--r-- | makima/sh/download-models.sh | 56 |
1 file changed, 45 insertions, 11 deletions
diff --git a/makima/sh/download-models.sh b/makima/sh/download-models.sh index 7aecefe..0381e15 100644 --- a/makima/sh/download-models.sh +++ b/makima/sh/download-models.sh @@ -28,7 +28,6 @@ download_from_url() { download_from_hf() { local dest=$1 local repo=$2 - local include=${3:-} if [ -d "$dest" ] && [ "$(ls -A $dest 2>/dev/null)" ]; then echo "Model $dest already exists, skipping..." @@ -37,21 +36,46 @@ download_from_hf() { echo "Downloading from Hugging Face ($repo)..." mkdir -p "$dest" + hf download "$repo" --local-dir "$dest" + echo "Downloaded to $dest successfully" +} - if [ -n "$include" ]; then - hf download "$repo" --include "$include" --local-dir "$dest" - else - hf download "$repo" --local-dir "$dest" +download_from_hf_subdir() { + local dest=$1 + local repo=$2 + local subdir=$3 + + if [ -d "$dest" ] && [ "$(ls -A $dest 2>/dev/null)" ]; then + echo "Model $dest already exists, skipping..." + return 0 fi + echo "Downloading $subdir from Hugging Face ($repo)..." + local tmpdir=$(mktemp -d) + hf download "$repo" --include "$subdir/*" --local-dir "$tmpdir" + + # Move subdirectory contents to destination + mkdir -p "$dest" + mv "$tmpdir/$subdir"/* "$dest"/ + rm -rf "$tmpdir" echo "Downloaded to $dest successfully" } # Check if models exist +# TDT: encoder-model.onnx, encoder-model.onnx.data, decoder_joint-model.onnx, vocab.txt +# EOU: encoder.onnx, decoder_joint.onnx, tokenizer.json +# Diarization: diar_streaming_sortformer_4spk-v2.1.onnx check_models_exist() { - [ -d "$MODELS_DIR/$PARAKEET_DIR" ] && \ - [ -d "$MODELS_DIR/$EOU_DIR" ] && \ - [ -f "$MODELS_DIR/$DIARIZATION_DIR/diar_streaming_sortformer_4spk-v2.onnx" ] + # TDT model files + [ -f "$MODELS_DIR/$PARAKEET_DIR/encoder-model.onnx" ] && \ + [ -f "$MODELS_DIR/$PARAKEET_DIR/decoder_joint-model.onnx" ] && \ + [ -f "$MODELS_DIR/$PARAKEET_DIR/vocab.txt" ] && \ + # EOU model files + [ -f "$MODELS_DIR/$EOU_DIR/encoder.onnx" ] && \ + [ -f "$MODELS_DIR/$EOU_DIR/decoder_joint.onnx" ] && \ + [ -f 
"$MODELS_DIR/$EOU_DIR/tokenizer.json" ] && \ + # Diarization model + [ -f "$MODELS_DIR/$DIARIZATION_DIR/diar_streaming_sortformer_4spk-v2.1.onnx" ] } if check_models_exist; then @@ -68,13 +92,23 @@ else echo "Downloading models from Hugging Face..." # Parakeet TDT from istupakov/parakeet-tdt-0.6b-v3-onnx + # Required: encoder-model.onnx, encoder-model.onnx.data, decoder_joint-model.onnx, vocab.txt download_from_hf "$MODELS_DIR/$PARAKEET_DIR" "istupakov/parakeet-tdt-0.6b-v3-onnx" + # Verify TDT files were downloaded + if [ ! -f "$MODELS_DIR/$PARAKEET_DIR/vocab.txt" ]; then + echo "ERROR: vocab.txt not found in parakeet TDT model" + echo "Contents of $MODELS_DIR/$PARAKEET_DIR:" + ls -la "$MODELS_DIR/$PARAKEET_DIR" + exit 1 + fi + # EOU model from altunenes/parakeet-rs (subdirectory) - download_from_hf "$MODELS_DIR/$EOU_DIR" "altunenes/parakeet-rs" "realtime_eou_120m-v1-onnx/*" + download_from_hf_subdir "$MODELS_DIR/$EOU_DIR" "altunenes/parakeet-rs" "realtime_eou_120m-v1-onnx" - # Diarization model from altunenes/parakeet-rs (subdirectory) - download_from_hf "$MODELS_DIR/$DIARIZATION_DIR" "altunenes/parakeet-rs" "diarization/*" + # Diarization model from altunenes/parakeet-rs (single file at root) + mkdir -p "$MODELS_DIR/$DIARIZATION_DIR" + hf download "altunenes/parakeet-rs" "diar_streaming_sortformer_4spk-v2.1.onnx" --local-dir "$MODELS_DIR/$DIARIZATION_DIR" fi echo "All models downloaded successfully" |
