Create container image and move parakeet fork to vendor dir

author: soryu <soryu@soryu.co> 2025-12-21 01:27:02 +0000
committer: soryu <soryu@soryu.co> 2025-12-23 14:47:18 +0000
commit: 3c696cfc9005e73be5ed46f8941dfc8f0aca7102 (patch)
tree: 497bffd67001501a003739cfe0bb790502ffd50a /vendor/parakeet-rs/examples/raw.rs
parent: 55cacf6e1a087c0fa6950a1ddeb09060f787e541 (diff)
download: soryu-3c696cfc9005e73be5ed46f8941dfc8f0aca7102.tar.gz
soryu-3c696cfc9005e73be5ed46f8941dfc8f0aca7102.zip
1 files changed, 86 insertions, 0 deletions
diff --git a/vendor/parakeet-rs/examples/raw.rs b/vendor/parakeet-rs/examples/raw.rs
new file mode 100644
index 0000000..a1a2adc
--- /dev/null
+++ b/vendor/parakeet-rs/examples/raw.rs
@@ -0,0 +1,86 @@
+/*
+Demonstrates using transcribe_samples()
+
+This example shows manual audio loading and calling transcribe_samples() directly
+with sample_rate and channels instead of using transcribe_file()
+
+Usage:
+cargo run --example raw 6_speakers.wav
+cargo run --example raw 6_speakers.wav tdt
+
+WARNING: TDT model has sequence length limitations (~8-10 minutes max).
+For longer audio files, you must split into chunks (e.g., 5-minute segments)
+and transcribe each chunk separately. Attempting to transcribe 25+ minute
+audio files in one call will cause ONNX runtime errors.
+Otherwise you will likely get a error like:
+"Error: Ort(Error { code: RuntimeException, msg: "Non-zero status code returned while running Add node. Name:'/layers.0/self_attn/Add_2' Status Message: /Users/runner/work/ort-artifacts/ort-artifacts/onnxruntime/onnxruntime/core/providers/cpu/math/element_wise_ops.h:540 void onnxruntime::BroadcastIterator::Init(ptrdiff_t, ptrdiff_t) axis == 1 || axis == largest was false. })"
+*/
+
+use parakeet_rs::{Parakeet, ParakeetTDT, TimestampMode};
+use std::env;
+use std::time::Instant;
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let start_time = Instant::now();
+    let args: Vec<String> = env::args().collect();
+    let audio_path = if args.len() > 1 {
+        &args[1]
+    } else {
+        "6_speakers.wav"
+    };
+
+    let use_tdt = args.len() > 2 && args[2] == "tdt";
+
+    // Load audio manually using hound (or any other audio library)
+    // remember if you use raw audio API, you need to handle audio preprocessing yourself!
+    let mut reader = hound::WavReader::open(audio_path)?;
+    let spec = reader.spec();
+
+    println!("Audio info: {}Hz, {} channel(s)", spec.sample_rate, spec.channels);
+
+    let audio: Vec<f32> = match spec.sample_format {
+        hound::SampleFormat::Float => reader
+            .samples::<f32>()
+            .collect::<Result<Vec<_>, _>>()?,
+        hound::SampleFormat::Int => reader
+            .samples::<i16>()
+            .map(|s| s.map(|s| s as f32 / 32768.0))
+            .collect::<Result<Vec<_>, _>>()?,
+    };
+
+    if use_tdt {
+        println!("Loading TDT model...");
+        let mut parakeet = ParakeetTDT::from_pretrained("./tdt", None)?;
+
+        // Use transcribe_samples() with raw parameters and timestamp mode
+        let result = parakeet.transcribe_samples(audio, spec.sample_rate, spec.channels, Some(TimestampMode::Sentences))?;
+
+        println!("{}", result.text);
+        println!("\nSentencess:");
+        for segment in result.tokens.iter() {
+            println!("[{:.2}s - {:.2}s]: {}", segment.start, segment.end, segment.text);
+        }
+    } else {
+        println!("Loading CTC model...");
+        let mut parakeet = Parakeet::from_pretrained(".", None)?;
+
+        // CTC model doesn't predict punctuation (lowercase alphabet only)
+        // This means no sentence boundaries. we use Words mode instead of Sentences
+        let result = parakeet.transcribe_samples(audio, spec.sample_rate, spec.channels, Some(TimestampMode::Words))?;
+
+        println!("{}", result.text);
+
+        // Access word-level timestamps (showing first 10 for brevity)
+        // Note: CTC generates word-level timestamps but cannot segment into sentences
+        // due to lack of punctuation prediction - this is a model limitation if I not mistake
+        println!("\nWords (first 10):");
+        for word in result.tokens.iter().take(10) {
+            println!("[{:.2}s - {:.2}s]: {}", word.start, word.end, word.text);
+        }
+    }
+
+    let elapsed = start_time.elapsed();
+    println!("\n✓ Transcription completed in {:.2}s", elapsed.as_secs_f32());
+
+    Ok(())
+}
author	soryu <soryu@soryu.co>	2025-12-21 01:27:02 +0000
committer	soryu <soryu@soryu.co>	2025-12-23 14:47:18 +0000
commit	3c696cfc9005e73be5ed46f8941dfc8f0aca7102 (patch)
tree	497bffd67001501a003739cfe0bb790502ffd50a /vendor/parakeet-rs/examples/raw.rs
parent	55cacf6e1a087c0fa6950a1ddeb09060f787e541 (diff)
download	soryu-3c696cfc9005e73be5ed46f8941dfc8f0aca7102.tar.gz soryu-3c696cfc9005e73be5ed46f8941dfc8f0aca7102.zip