diff options
Diffstat (limited to 'makima/src')
| -rw-r--r-- | makima/src/server/handlers/listen.rs | 124 |
1 files changed, 95 insertions, 29 deletions
diff --git a/makima/src/server/handlers/listen.rs b/makima/src/server/handlers/listen.rs index 3055cb7..5fc5cea 100644 --- a/makima/src/server/handlers/listen.rs +++ b/makima/src/server/handlers/listen.rs @@ -512,12 +512,12 @@ fn decode_audio_chunk(data: &[u8], format: &StartMessage) -> Vec<f32> { } } -/// Deduplicate transcript entries by removing entries with similar start times and text. +/// Deduplicate transcript entries by removing entries with similar times and text. /// -/// Entries are considered duplicates if: -/// - Start times are within 0.5 seconds of each other -/// - Speaker is the same -/// - Text is identical or one is a substring of the other +/// Entries are considered duplicates if any of these are true: +/// - Start times are within 1.5 seconds AND text is similar (same, substring, or high overlap) +/// - Time ranges overlap significantly AND text is similar +/// - Text is identical regardless of timing fn deduplicate_transcripts(entries: &[TranscriptEntry]) -> Vec<TranscriptEntry> { if entries.is_empty() { return vec![]; @@ -530,49 +530,115 @@ fn deduplicate_transcripts(entries: &[TranscriptEntry]) -> Vec<TranscriptEntry> let mut result: Vec<TranscriptEntry> = Vec::new(); for entry in sorted { + // Normalize text for comparison + let entry_text_normalized = normalize_text(&entry.text); + // Check if this entry is a duplicate of any existing entry - let is_duplicate = result.iter().any(|existing| { - // Check if start times are close (within 0.5 seconds) - let time_close = (existing.start - entry.start).abs() < 0.5; + let duplicate_idx = result.iter().position(|existing| { + let existing_text_normalized = normalize_text(&existing.text); // Check if same speaker let same_speaker = existing.speaker == entry.speaker; - // Check if text matches or one contains the other - let text_match = existing.text == entry.text - || existing.text.contains(&entry.text) - || entry.text.contains(&existing.text); - - time_close && same_speaker && text_match + // Check if start times are identical or very close + let start_identical = (existing.start - entry.start).abs() < 0.1; + let start_close = (existing.start - entry.start).abs() < 1.5; + + // Check if time ranges overlap + let time_overlap = existing.start < entry.end && entry.start < existing.end; + + // Check various text similarity conditions + let text_identical = existing_text_normalized == entry_text_normalized; + let text_contained = existing_text_normalized.contains(&entry_text_normalized) + || entry_text_normalized.contains(&existing_text_normalized); + let text_similar = text_similarity(&existing_text_normalized, &entry_text_normalized) > 0.7; + + // Duplicate conditions: + // 1. Same speaker + identical start time (different end times = same segment refined) + // 2. Same speaker + close start + similar text + // 3. Same speaker + overlapping time + similar text + // 4. Identical text (likely a re-transcription) + (same_speaker && start_identical) + || (same_speaker && start_close && (text_identical || text_contained || text_similar)) + || (same_speaker && time_overlap && (text_identical || text_contained)) + || text_identical }); - if !is_duplicate { - result.push(entry); - } else { - // If duplicate, check if the new entry has longer text and update - for existing in &mut result { - let time_close = (existing.start - entry.start).abs() < 0.5; - let same_speaker = existing.speaker == entry.speaker; - - if time_close && same_speaker && entry.text.len() > existing.text.len() { - // Keep the longer text version - existing.text = entry.text.clone(); - existing.end = entry.end; - break; + match duplicate_idx { + Some(idx) => { + // If the new entry has longer text, update the existing one + if entry.text.len() > result[idx].text.len() { + result[idx].text = entry.text.clone(); + result[idx].end = result[idx].end.max(entry.end); + } else { + // Extend end time if needed + result[idx].end = result[idx].end.max(entry.end); + } + } + None => { + result.push(entry); + } + } + } + + // Second pass: merge adjacent segments with same speaker and similar text + let mut merged: Vec<TranscriptEntry> = Vec::new(); + for entry in result { + if let Some(last) = merged.last_mut() { + // Check if this should be merged with the previous entry + let same_speaker = last.speaker == entry.speaker; + let adjacent = (entry.start - last.end).abs() < 0.5; + let text_overlap = normalize_text(&last.text).contains(&normalize_text(&entry.text)) + || normalize_text(&entry.text).contains(&normalize_text(&last.text)); + + if same_speaker && adjacent && text_overlap { + // Merge: keep longer text, extend time range + if entry.text.len() > last.text.len() { + last.text = entry.text; } + last.end = last.end.max(entry.end); + continue; } } + merged.push(entry); } // Reassign IDs to be sequential - for (i, entry) in result.iter_mut().enumerate() { + for (i, entry) in merged.iter_mut().enumerate() { let parts: Vec<&str> = entry.id.split('-').collect(); if let Some(session_prefix) = parts.first() { entry.id = format!("{}-{}", session_prefix, i + 1); } } - result + merged +} + +/// Normalize text for comparison by lowercasing and collapsing whitespace. +fn normalize_text(text: &str) -> String { + text.to_lowercase() + .split_whitespace() + .collect::<Vec<_>>() + .join(" ") +} + +/// Calculate text similarity as a ratio of shared words. +fn text_similarity(a: &str, b: &str) -> f32 { + if a.is_empty() || b.is_empty() { + return 0.0; + } + + let words_a: std::collections::HashSet<&str> = a.split_whitespace().collect(); + let words_b: std::collections::HashSet<&str> = b.split_whitespace().collect(); + + let intersection = words_a.intersection(&words_b).count(); + let union = words_a.union(&words_b).count(); + + if union == 0 { + 0.0 + } else { + intersection as f32 / union as f32 + } } /// Process audio using sliding window through STT and streaming diarization models. |
