summary refs log tree commit diff
path: root/makima/sh
diff options
context:
space:
mode:
author soryu <soryu@soryu.co> 2025-12-21 19:14:29 +0000
committer soryu <soryu@soryu.co> 2025-12-23 14:47:18 +0000
commit 75f2a72a06af6f722fce1bba1d1fc2f4c5e844df (patch)
tree 7d9ba06bdade73b10be580bf09df8404e6f4ea8c /makima/sh
parent 87e6c9c49fca144e3de3ea4a3618a84b1c418536 (diff)
download soryu-75f2a72a06af6f722fce1bba1d1fc2f4c5e844df.tar.gz
soryu-75f2a72a06af6f722fce1bba1d1fc2f4c5e844df.zip
Bump diarization version to 2.1 and fix downloading the tokenizer
Diffstat (limited to 'makima/sh')
-rw-r--r-- makima/sh/download-models.sh 56
1 files changed, 45 insertions, 11 deletions
diff --git a/makima/sh/download-models.sh b/makima/sh/download-models.sh
index 7aecefe..0381e15 100644
--- a/makima/sh/download-models.sh
+++ b/makima/sh/download-models.sh
@@ -28,7 +28,6 @@ download_from_url() {
download_from_hf() {
local dest=$1
local repo=$2
- local include=${3:-}
if [ -d "$dest" ] && [ "$(ls -A $dest 2>/dev/null)" ]; then
echo "Model $dest already exists, skipping..."
@@ -37,21 +36,46 @@ download_from_hf() {
echo "Downloading from Hugging Face ($repo)..."
mkdir -p "$dest"
+ hf download "$repo" --local-dir "$dest"
+ echo "Downloaded to $dest successfully"
+}
- if [ -n "$include" ]; then
- hf download "$repo" --include "$include" --local-dir "$dest"
- else
- hf download "$repo" --local-dir "$dest"
+download_from_hf_subdir() {
+ local dest=$1
+ local repo=$2
+ local subdir=$3
+
+ if [ -d "$dest" ] && [ "$(ls -A $dest 2>/dev/null)" ]; then
+ echo "Model $dest already exists, skipping..."
+ return 0
fi
+ echo "Downloading $subdir from Hugging Face ($repo)..."
+ local tmpdir=$(mktemp -d)
+ hf download "$repo" --include "$subdir/*" --local-dir "$tmpdir"
+
+ # Move subdirectory contents to destination
+ mkdir -p "$dest"
+ mv "$tmpdir/$subdir"/* "$dest"/
+ rm -rf "$tmpdir"
echo "Downloaded to $dest successfully"
}
# Check if models exist
+# TDT: encoder-model.onnx, encoder-model.onnx.data, decoder_joint-model.onnx, vocab.txt
+# EOU: encoder.onnx, decoder_joint.onnx, tokenizer.json
+# Diarization: diar_streaming_sortformer_4spk-v2.1.onnx
check_models_exist() {
- [ -d "$MODELS_DIR/$PARAKEET_DIR" ] && \
- [ -d "$MODELS_DIR/$EOU_DIR" ] && \
- [ -f "$MODELS_DIR/$DIARIZATION_DIR/diar_streaming_sortformer_4spk-v2.onnx" ]
+ # TDT model files
+ [ -f "$MODELS_DIR/$PARAKEET_DIR/encoder-model.onnx" ] && \
+ [ -f "$MODELS_DIR/$PARAKEET_DIR/decoder_joint-model.onnx" ] && \
+ [ -f "$MODELS_DIR/$PARAKEET_DIR/vocab.txt" ] && \
+ # EOU model files
+ [ -f "$MODELS_DIR/$EOU_DIR/encoder.onnx" ] && \
+ [ -f "$MODELS_DIR/$EOU_DIR/decoder_joint.onnx" ] && \
+ [ -f "$MODELS_DIR/$EOU_DIR/tokenizer.json" ] && \
+ # Diarization model
+ [ -f "$MODELS_DIR/$DIARIZATION_DIR/diar_streaming_sortformer_4spk-v2.1.onnx" ]
}
if check_models_exist; then
@@ -68,13 +92,23 @@ else
echo "Downloading models from Hugging Face..."
# Parakeet TDT from istupakov/parakeet-tdt-0.6b-v3-onnx
+ # Required: encoder-model.onnx, encoder-model.onnx.data, decoder_joint-model.onnx, vocab.txt
download_from_hf "$MODELS_DIR/$PARAKEET_DIR" "istupakov/parakeet-tdt-0.6b-v3-onnx"
+ # Verify TDT files were downloaded
+ if [ ! -f "$MODELS_DIR/$PARAKEET_DIR/vocab.txt" ]; then
+ echo "ERROR: vocab.txt not found in parakeet TDT model"
+ echo "Contents of $MODELS_DIR/$PARAKEET_DIR:"
+ ls -la "$MODELS_DIR/$PARAKEET_DIR"
+ exit 1
+ fi
+
# EOU model from altunenes/parakeet-rs (subdirectory)
- download_from_hf "$MODELS_DIR/$EOU_DIR" "altunenes/parakeet-rs" "realtime_eou_120m-v1-onnx/*"
+ download_from_hf_subdir "$MODELS_DIR/$EOU_DIR" "altunenes/parakeet-rs" "realtime_eou_120m-v1-onnx"
- # Diarization model from altunenes/parakeet-rs (subdirectory)
- download_from_hf "$MODELS_DIR/$DIARIZATION_DIR" "altunenes/parakeet-rs" "diarization/*"
+ # Diarization model from altunenes/parakeet-rs (single file at root)
+ mkdir -p "$MODELS_DIR/$DIARIZATION_DIR"
+ hf download "altunenes/parakeet-rs" "diar_streaming_sortformer_4spk-v2.1.onnx" --local-dir "$MODELS_DIR/$DIARIZATION_DIR"
fi
echo "All models downloaded successfully"