path: root/vendor/parakeet-rs/examples/streaming.rs



/*
Demonstrates streaming ASR with Parakeet RealTime EOU

Download the model files from:
https://huggingface.co/altunenes/parakeet-rs/tree/main/realtime_eou_120m-v1-onnx

This example:
- Maintains a 4-second ring buffer for feature extraction context
- Processes 160ms chunks (2560 samples at 16kHz)
- Extracts features from full buffer, then slices last 25 frames
- Encoder receives: 9 frames (pre-encode cache) + 16 frames (new) = 25 total
- Cache states (cache_last_channel/time) maintain temporal context

Model files required in ./fullstr/:
  - encoder.onnx (cache_aware_stream_step export)
  - decoder_joint.onnx
  - tokenizer.json

Additional notes:
let reset_on_eou: bool = false;
I must admit that this does not work very well in my real-world tests :/


Usage:
cargo run --release --example streaming <audio.wav>
*/

use hound;
use parakeet_rs::ParakeetEOU;
use std::env;
use std::time::Instant;

/// Streams a 16 kHz WAV file through the Parakeet EOU model in 160 ms chunks.
///
/// The WAV path is read from the first command-line argument. The audio is
/// decoded to `f32`, down-mixed to mono, peak-normalised, and fed to
/// `ParakeetEOU::transcribe` one fixed-size chunk at a time. Each non-empty
/// piece of text is printed as soon as it is returned, and the accumulated
/// transcript is printed again at the end together with timing information.
///
/// # Errors
///
/// Returns an error when the path argument is missing, the model or the WAV
/// file cannot be loaded, the file is not sampled at 16 kHz, a sample cannot
/// be decoded, a transcription step fails, or stdout cannot be flushed.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // 160ms at 16kHz
    const CHUNK_SIZE: usize = 2560;
    // The only sample rate this example accepts.
    const SAMPLE_RATE: u32 = 16000;
    // Number of silent chunks pushed after the audio to flush the decoder.
    const FLUSH_CHUNKS: usize = 3;
    const RESET_ON_EOU: bool = false;

    let start_time = Instant::now();

    // A missing argument is a user mistake, not a broken invariant: report it
    // as an error instead of panicking.
    let audio_path = env::args()
        .nth(1)
        .ok_or("Usage: cargo run --release --example streaming <audio.wav>")?;

    println!("Loading model from ./fullstr...");
    let mut parakeet = ParakeetEOU::from_pretrained("./fullstr", None)?;

    println!("Loading audio: {}", audio_path);
    let mut reader = hound::WavReader::open(&audio_path)?;
    let spec = reader.spec();

    // Reject unsupported files before decoding every sample.
    if spec.sample_rate != SAMPLE_RATE {
        return Err(format!(
            "Expected 16kHz audio, got {}Hz. Please resample first.",
            spec.sample_rate
        )
        .into());
    }

    let mut audio: Vec<f32> = match spec.sample_format {
        hound::SampleFormat::Float => reader
            .samples::<f32>()
            .collect::<Result<Vec<_>, _>>()?,
        hound::SampleFormat::Int => {
            // Decode as i32 so 8/16/24/32-bit PCM is accepted, and scale by
            // the full-scale value of the file's bit depth (32768 for 16-bit).
            let full_scale = 2.0f32.powi(i32::from(spec.bits_per_sample) - 1);
            reader
                .samples::<i32>()
                .map(|sample| sample.map(|value| value as f32 / full_scale))
                .collect::<Result<Vec<_>, _>>()?
        }
    };

    if spec.channels > 1 {
        audio = downmix_to_mono(&audio, usize::from(spec.channels));
    }

    peak_normalize(&mut audio);

    let duration = audio.len() as f32 / 16000.0;

    println!("Streaming transcription (160ms chunks with 4s buffer)...\n");

    let mut full_text = String::new();

    // One reusable buffer: every step receives exactly CHUNK_SIZE samples,
    // with the final short chunk zero-padded.
    let mut chunk_buf = vec![0.0f32; CHUNK_SIZE];

    for chunk in audio.chunks(CHUNK_SIZE) {
        let (head, tail) = chunk_buf.split_at_mut(chunk.len());
        head.copy_from_slice(chunk);
        tail.fill(0.0);

        let text = parakeet.transcribe(&chunk_buf, RESET_ON_EOU)?;
        emit_text(&text, &mut full_text)?;
    }

    println!("\n\nFlushing decoder...");
    chunk_buf.fill(0.0);
    for _ in 0..FLUSH_CHUNKS {
        let text = parakeet.transcribe(&chunk_buf, RESET_ON_EOU)?;
        emit_text(&text, &mut full_text)?;
    }

    println!("\n\nFinal Transcription:\n{}", full_text.trim());

    // NOTE: the value printed as "RTF" is audio duration / wall-clock time
    // (a speed-up factor), and the wall-clock time includes model loading.
    let elapsed_secs = start_time.elapsed().as_secs_f32();
    println!(
        "\nTranscription completed in {:.2}s (audio: {:.2}s, RTF: {:.2}x)",
        elapsed_secs,
        duration,
        duration / elapsed_secs
    );

    Ok(())
}

/// Averages each interleaved frame of `channels` samples into one mono sample.
///
/// `channels` must be non-zero. A trailing incomplete frame is still divided
/// by `channels`.
fn downmix_to_mono(interleaved: &[f32], channels: usize) -> Vec<f32> {
    interleaved
        .chunks(channels)
        .map(|frame| frame.iter().sum::<f32>() / channels as f32)
        .collect()
}

/// Scales `samples` in place so the largest absolute value is just below 1.0.
///
/// Near-silent input (peak at or below 1e-6) is left untouched so that noise
/// is not amplified.
fn peak_normalize(samples: &mut [f32]) {
    let peak = samples.iter().fold(0.0f32, |acc, &s| acc.max(s.abs()));
    if peak > 1e-6 {
        let norm_factor = peak + 1e-5;
        for sample in samples.iter_mut() {
            *sample /= norm_factor;
        }
    }
}

/// Prints `text` immediately and appends it to `transcript`.
///
/// Does nothing for empty text. Stdout is flushed after each print because
/// the output has no trailing newline.
fn emit_text(text: &str, transcript: &mut String) -> std::io::Result<()> {
    if text.is_empty() {
        return Ok(());
    }
    print!("{}", text);
    std::io::Write::flush(&mut std::io::stdout())?;
    transcript.push_str(text);
    Ok(())
}