diff options
Diffstat (limited to 'parakeet-rs/examples/streaming.rs')
| -rw-r--r-- | parakeet-rs/examples/streaming.rs | 129 |
1 files changed, 0 insertions, 129 deletions
diff --git a/parakeet-rs/examples/streaming.rs b/parakeet-rs/examples/streaming.rs deleted file mode 100644 index f5d36c9..0000000 --- a/parakeet-rs/examples/streaming.rs +++ /dev/null @@ -1,129 +0,0 @@ -/* -Demonstrates streaming ASR with Parakeet RealTime EOU - -Download models files from: -https://huggingface.co/altunenes/parakeet-rs/tree/main/realtime_eou_120m-v1-onnx - -This example -- Maintains 4-second ring buffer for feature extraction context -- Processes 160ms chunks (2560 samples at 16kHz) -- Extracts features from full buffer, then slices last 25 frames -- Encoder receives: 9 frames (pre-encode cache) + 16 frames (new) = 25 total -- Cache states (cache_last_channel/time) maintain temporal context - -Model files required in ./fullstr/: - - encoder.onnx (cache_aware_stream_step export) - - decoder_joint.onnx - - tokenizer.json - -Additional notes: -let reset_on_eou: bool = false; -I must admit that this is not work very well on my real world tests :/ - - -Usage: -cargo run --release --example streaming <audio.wav> -*/ - -use hound; -use parakeet_rs::ParakeetEOU; -use std::env; -use std::time::Instant; - -fn main() -> Result<(), Box<dyn std::error::Error>> { - let start_time = Instant::now(); - - let args: Vec<String> = env::args().collect(); - let audio_path = args - .get(1) - .expect("Usage: cargo run --release --example streaming <audio.wav>"); - - println!("Loading model from ./fullstr..."); - let mut parakeet = ParakeetEOU::from_pretrained("./fullstr", None)?; - - println!("Loading audio: {}", audio_path); - let mut reader = hound::WavReader::open(audio_path)?; - let spec = reader.spec(); - - let mut audio: Vec<f32> = match spec.sample_format { - hound::SampleFormat::Float => reader - .samples::<f32>() - .collect::<Result<Vec<_>, _>>()?, - hound::SampleFormat::Int => reader - .samples::<i16>() - .map(|s| s.map(|s| s as f32 / 32768.0)) - .collect::<Result<Vec<_>, _>>()?, - }; - - if spec.sample_rate != 16000 { - return Err(format!( - "Expected 16kHz audio, got {}Hz. Please resample first.", - spec.sample_rate - ) - .into()); - } - - if spec.channels > 1 { - audio = audio - .chunks(spec.channels as usize) - .map(|chunk| chunk.iter().sum::<f32>() / spec.channels as f32) - .collect(); - } - - let max_val = audio.iter().fold(0.0f32, |a, &b| a.max(b.abs())); - if max_val > 1e-6 { - let norm_factor = max_val + 1e-5; - for sample in &mut audio { - *sample /= norm_factor; - } - } - - let duration = audio.len() as f32 / 16000.0; - // 160ms at 16kHz - const CHUNK_SIZE: usize = 2560; - let reset_on_eou: bool = false; - - println!("Streaming transcription (160ms chunks with 4s buffer)...\n"); - - let mut full_text = String::new(); - - for chunk in audio.chunks(CHUNK_SIZE) { - let chunk_vec = if chunk.len() < CHUNK_SIZE { - let mut padded = chunk.to_vec(); - padded.resize(CHUNK_SIZE, 0.0); - padded - } else { - chunk.to_vec() - }; - - let text = parakeet.transcribe(&chunk_vec, reset_on_eou)?; - if !text.is_empty() { - print!("{}", text); - std::io::Write::flush(&mut std::io::stdout())?; - full_text.push_str(&text); - } - } - - println!("\n\nFlushing decoder..."); - let silence = vec![0.0f32; CHUNK_SIZE]; - for _ in 0..3 { - let text = parakeet.transcribe(&silence, reset_on_eou)?; - if !text.is_empty() { - print!("{}", text); - std::io::Write::flush(&mut std::io::stdout())?; - full_text.push_str(&text); - } - } - - println!("\n\nFinal Transcription:\n{}", full_text.trim()); - - let elapsed = start_time.elapsed(); - println!( - "\nTranscription completed in {:.2}s (audio: {:.2}s, RTF: {:.2}x)", - elapsed.as_secs_f32(), - duration, - duration / elapsed.as_secs_f32() - ); - - Ok(()) -} |
