diff options
| author | soryu <soryu@soryu.co> | 2025-12-21 01:27:02 +0000 |
|---|---|---|
| committer | soryu <soryu@soryu.co> | 2025-12-23 14:47:18 +0000 |
| commit | 3c696cfc9005e73be5ed46f8941dfc8f0aca7102 (patch) | |
| tree | 497bffd67001501a003739cfe0bb790502ffd50a /vendor/parakeet-rs/examples/raw.rs | |
| parent | 55cacf6e1a087c0fa6950a1ddeb09060f787e541 (diff) | |
| download | soryu-3c696cfc9005e73be5ed46f8941dfc8f0aca7102.tar.gz soryu-3c696cfc9005e73be5ed46f8941dfc8f0aca7102.zip | |
Create container image and move parakeet fork to vendor dir
Diffstat (limited to 'vendor/parakeet-rs/examples/raw.rs')
| -rw-r--r-- | vendor/parakeet-rs/examples/raw.rs | 86 |
1 files changed, 86 insertions, 0 deletions
diff --git a/vendor/parakeet-rs/examples/raw.rs b/vendor/parakeet-rs/examples/raw.rs new file mode 100644 index 0000000..a1a2adc --- /dev/null +++ b/vendor/parakeet-rs/examples/raw.rs @@ -0,0 +1,86 @@ +/* +Demonstrates using transcribe_samples() + +This example shows manual audio loading and calling transcribe_samples() directly +with sample_rate and channels instead of using transcribe_file() + +Usage: +cargo run --example raw 6_speakers.wav +cargo run --example raw 6_speakers.wav tdt + +WARNING: TDT model has sequence length limitations (~8-10 minutes max). +For longer audio files, you must split into chunks (e.g., 5-minute segments) +and transcribe each chunk separately. Attempting to transcribe 25+ minute +audio files in one call will cause ONNX runtime errors. +Otherwise you will likely get a error like: +"Error: Ort(Error { code: RuntimeException, msg: "Non-zero status code returned while running Add node. Name:'/layers.0/self_attn/Add_2' Status Message: /Users/runner/work/ort-artifacts/ort-artifacts/onnxruntime/onnxruntime/core/providers/cpu/math/element_wise_ops.h:540 void onnxruntime::BroadcastIterator::Init(ptrdiff_t, ptrdiff_t) axis == 1 || axis == largest was false. })" +*/ + +use parakeet_rs::{Parakeet, ParakeetTDT, TimestampMode}; +use std::env; +use std::time::Instant; + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let start_time = Instant::now(); + let args: Vec<String> = env::args().collect(); + let audio_path = if args.len() > 1 { + &args[1] + } else { + "6_speakers.wav" + }; + + let use_tdt = args.len() > 2 && args[2] == "tdt"; + + // Load audio manually using hound (or any other audio library) + // remember if you use raw audio API, you need to handle audio preprocessing yourself! + let mut reader = hound::WavReader::open(audio_path)?; + let spec = reader.spec(); + + println!("Audio info: {}Hz, {} channel(s)", spec.sample_rate, spec.channels); + + let audio: Vec<f32> = match spec.sample_format { + hound::SampleFormat::Float => reader + .samples::<f32>() + .collect::<Result<Vec<_>, _>>()?, + hound::SampleFormat::Int => reader + .samples::<i16>() + .map(|s| s.map(|s| s as f32 / 32768.0)) + .collect::<Result<Vec<_>, _>>()?, + }; + + if use_tdt { + println!("Loading TDT model..."); + let mut parakeet = ParakeetTDT::from_pretrained("./tdt", None)?; + + // Use transcribe_samples() with raw parameters and timestamp mode + let result = parakeet.transcribe_samples(audio, spec.sample_rate, spec.channels, Some(TimestampMode::Sentences))?; + + println!("{}", result.text); + println!("\nSentencess:"); + for segment in result.tokens.iter() { + println!("[{:.2}s - {:.2}s]: {}", segment.start, segment.end, segment.text); + } + } else { + println!("Loading CTC model..."); + let mut parakeet = Parakeet::from_pretrained(".", None)?; + + // CTC model doesn't predict punctuation (lowercase alphabet only) + // This means no sentence boundaries. we use Words mode instead of Sentences + let result = parakeet.transcribe_samples(audio, spec.sample_rate, spec.channels, Some(TimestampMode::Words))?; + + println!("{}", result.text); + + // Access word-level timestamps (showing first 10 for brevity) + // Note: CTC generates word-level timestamps but cannot segment into sentences + // due to lack of punctuation prediction - this is a model limitation if I not mistake + println!("\nWords (first 10):"); + for word in result.tokens.iter().take(10) { + println!("[{:.2}s - {:.2}s]: {}", word.start, word.end, word.text); + } + } + + let elapsed = start_time.elapsed(); + println!("\n✓ Transcription completed in {:.2}s", elapsed.as_secs_f32()); + + Ok(()) +} |
