makima/src/listen.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123

use std::cmp::Ordering;
use std::path::Path;

pub use parakeet_rs::sortformer::{DiarizationConfig, Sortformer, SpeakerSegment};
pub use parakeet_rs::{ParakeetEOU, ParakeetTDT, TimedToken, TimestampMode};

use crate::audio;

/// Duration, in milliseconds, of each slice of audio appended per iteration of
/// the simulated streaming loop in `listen`. It is converted to a sample count
/// via `samples_per_chunk`.
const STREAM_CHUNK_MS: u32 = 5_000;

/// A segment of dialogue with speaker identification and timing.
///
/// One segment is produced per transcription token by `align_speakers`.
/// `PartialEq` is derived so segments can be compared directly (for example in
/// assertions); `Eq` is not available because the timing fields are `f32`.
#[derive(Debug, Clone, PartialEq)]
pub struct DialogueSegment {
    /// Speaker label: `"Speaker <id>"` when a diarization segment overlaps the
    /// token, otherwise `"UNKNOWN"`.
    pub speaker: String,
    /// Start time copied from the transcription token (presumably seconds,
    /// measured from the start of the audio; confirm against `TimedToken`).
    pub start: f32,
    /// End time copied from the transcription token (same unit as `start`).
    pub end: f32,
    /// Token text with leading and trailing whitespace removed.
    pub text: String,
}

/// Transcribes and diarizes `audio-ftc.mp3`, simulating a live stream.
///
/// The audio is loaded through `audio::to_16k_mono_from_path` and then fed in
/// slices of `STREAM_CHUNK_MS` milliseconds. After every slice the *entire*
/// audio accumulated so far is diarized and transcribed again, the two results
/// are aligned, and any segments beyond the number already printed are written
/// to stdout.
///
/// # Returns
///
/// The aligned segments from the last pass (empty if the audio has no samples).
///
/// # Errors
///
/// Propagates any error from loading the audio, loading either model,
/// diarization, or transcription.
pub(crate) fn listen() -> Result<Vec<DialogueSegment>, Box<dyn std::error::Error>> {
    let normalized = audio::to_16k_mono_from_path(Path::new("audio-ftc.mp3"))?;

    let mut transcriber = ParakeetTDT::from_pretrained("models/parakeet-tdt-0.6b-v3", None)?;
    let mut diarizer = Sortformer::with_config(
        "models/diarization/diar_streaming_sortformer_4spk-v2.onnx",
        None,
        DiarizationConfig::callhome(),
    )?;

    let chunk_len = samples_per_chunk(normalized.sample_rate, STREAM_CHUNK_MS);
    let mut heard_so_far: Vec<f32> = Vec::new();
    let mut printed_count = 0usize;
    let mut latest: Vec<DialogueSegment> = Vec::new();

    for (chunk_idx, chunk) in normalized.samples.chunks(chunk_len).enumerate() {
        heard_so_far.extend_from_slice(chunk);

        // Both models take the buffer by value, and the buffer must survive
        // for the next iteration, so each call receives its own copy.
        let speaker_turns = diarizer.diarize(
            heard_so_far.clone(),
            normalized.sample_rate,
            normalized.channels,
        )?;
        let transcript = transcriber.transcribe_samples(
            heard_so_far.clone(),
            normalized.sample_rate,
            normalized.channels,
            Some(TimestampMode::Sentences),
        )?;

        latest = align_speakers(&transcript.tokens, &speaker_turns);

        // Simulate "live" output: only entries past the previously printed
        // count are shown. If this pass produced no additional tokens, there
        // is nothing to print and the counter is left untouched.
        let token_count = transcript.tokens.len();
        if token_count <= printed_count {
            continue;
        }

        for segment in &latest[printed_count..] {
            println!(
                "[chunk {}] [{:.2}s - {:.2}s] {}: {}",
                chunk_idx, segment.start, segment.end, segment.speaker, segment.text
            );
        }
        printed_count = token_count;
    }

    Ok(latest)
}

/// Pairs every transcription token with a speaker label.
///
/// For each token, the diarization segment with the largest time overlap
/// supplies the label (`"Speaker <id>"`); tokens that overlap no segment are
/// labelled `"UNKNOWN"`. The output has exactly one entry per input token, in
/// the same order, with the token's timing copied over and its text trimmed of
/// surrounding whitespace.
pub fn align_speakers(tokens: &[TimedToken], speakers: &[SpeakerSegment]) -> Vec<DialogueSegment> {
    let mut dialogue = Vec::with_capacity(tokens.len());

    for token in tokens {
        let speaker = match speaker_for_span(token.start, token.end, speakers) {
            Some(label) => label,
            None => String::from("UNKNOWN"),
        };

        dialogue.push(DialogueSegment {
            speaker,
            start: token.start,
            end: token.end,
            text: token.text.trim().to_owned(),
        });
    }

    dialogue
}

/// Calculate the number of samples in a chunk of given duration.
///
/// Computes `sample_rate * chunk_ms / 1000`, rounding down, and never returns
/// less than 1 so the result is always a valid chunk size for
/// `slice::chunks` (which panics on a chunk size of zero).
///
/// # Parameters
///
/// * `sample_rate` - samples per second.
/// * `chunk_ms` - chunk duration in milliseconds.
///
/// # Returns
///
/// The sample count, clamped to the range `1..=usize::MAX`.
pub fn samples_per_chunk(sample_rate: u32, chunk_ms: u32) -> usize {
    // The product of two `u32` values always fits in a `u64`, so the widened
    // multiplication cannot overflow.
    let samples = u64::from(sample_rate) * u64::from(chunk_ms) / 1_000;

    // Saturate before narrowing: on targets where `usize` is narrower than
    // 64 bits a plain `as usize` would wrap, possibly to 0. The lower clamp is
    // applied after the conversion so the result can never be zero.
    let narrowed = samples.min(usize::MAX as u64) as usize;
    narrowed.max(1)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `samples_per_chunk` against a small table of inputs, including
    /// the zero-duration case that must still yield a chunk size of 1.
    #[test]
    fn samples_per_chunk_rounds_down_and_clamps() {
        // Each row is (sample_rate, chunk_ms, expected sample count).
        let cases: [(u32, u32, usize); 3] = [
            (16_000, 1_000, 16_000),
            (16_000, 160, 2_560),
            (16_000, 0, 1),
        ];

        for &(sample_rate, chunk_ms, expected) in cases.iter() {
            assert_eq!(samples_per_chunk(sample_rate, chunk_ms), expected);
        }
    }
}

/// Finds the speaker whose diarization segment overlaps `[start, end]` the most.
///
/// The overlap with each segment is the length of the intersection of the two
/// time ranges; segments whose overlap is not strictly positive are ignored.
/// When several segments share the largest overlap, the one appearing last in
/// `speakers` wins.
///
/// Returns `Some("Speaker <id>")` for the winning segment, or `None` when no
/// segment overlaps the span.
fn speaker_for_span(start: f32, end: f32, speakers: &[SpeakerSegment]) -> Option<String> {
    let overlapping = speakers
        .iter()
        .map(|candidate| {
            let shared_from = start.max(candidate.start);
            let shared_to = end.min(candidate.end);
            (candidate.speaker_id, shared_to - shared_from)
        })
        // A NaN overlap fails this comparison and is dropped as well.
        .filter(|&(_, shared)| shared > 0.0);

    let (winner, _) = overlapping
        .max_by(|lhs, rhs| lhs.1.partial_cmp(&rhs.1).unwrap_or(Ordering::Equal))?;

    Some(format!("Speaker {}", winner))
}