1 files changed, 95 insertions, 29 deletions
diff --git a/makima/src/server/handlers/listen.rs b/makima/src/server/handlers/listen.rs
index 3055cb7..5fc5cea 100644
--- a/makima/src/server/handlers/listen.rs
+++ b/makima/src/server/handlers/listen.rs
@@ -512,12 +512,12 @@ fn decode_audio_chunk(data: &[u8], format: &StartMessage) -> Vec<f32> {
     }
 }
 
-/// Deduplicate transcript entries by removing entries with similar start times and text.
+/// Deduplicate transcript entries by removing entries with similar times and text.
 ///
-/// Entries are considered duplicates if:
-/// - Start times are within 0.5 seconds of each other
-/// - Speaker is the same
-/// - Text is identical or one is a substring of the other
+/// Entries are duplicates if any of these hold ("similar" = same, substring, or >0.7 word overlap):
+/// - Same speaker AND start times within 0.1 s (any text), or within 1.5 s with similar text
+/// - Same speaker AND time ranges overlap AND one normalized text equals or contains the other
+/// - Normalized text is identical, regardless of speaker or timing
 fn deduplicate_transcripts(entries: &[TranscriptEntry]) -> Vec<TranscriptEntry> {
     if entries.is_empty() {
         return vec![];
@@ -530,49 +530,115 @@ fn deduplicate_transcripts(entries: &[TranscriptEntry]) -> Vec<TranscriptEntry>
     let mut result: Vec<TranscriptEntry> = Vec::new();
 
     for entry in sorted {
+        // Normalize text for comparison
+        let entry_text_normalized = normalize_text(&entry.text);
+
         // Check if this entry is a duplicate of any existing entry
-        let is_duplicate = result.iter().any(|existing| {
-            // Check if start times are close (within 0.5 seconds)
-            let time_close = (existing.start - entry.start).abs() < 0.5;
+        let duplicate_idx = result.iter().position(|existing| {
+            let existing_text_normalized = normalize_text(&existing.text);
 
             // Check if same speaker
             let same_speaker = existing.speaker == entry.speaker;
 
-            // Check if text matches or one contains the other
-            let text_match = existing.text == entry.text
-                || existing.text.contains(&entry.text)
-                || entry.text.contains(&existing.text);
-
-            time_close && same_speaker && text_match
+            // Check if start times are identical or very close
+            let start_identical = (existing.start - entry.start).abs() < 0.1;
+            let start_close = (existing.start - entry.start).abs() < 1.5;
+
+            // Check if time ranges overlap
+            let time_overlap = existing.start < entry.end && entry.start < existing.end;
+
+            // Check various text similarity conditions
+            let text_identical = existing_text_normalized == entry_text_normalized;
+            let text_contained = existing_text_normalized.contains(&entry_text_normalized)
+                || entry_text_normalized.contains(&existing_text_normalized);
+            let text_similar = text_similarity(&existing_text_normalized, &entry_text_normalized) > 0.7;
+
+            // Duplicate conditions (text compared after normalization):
+            // 1. Same speaker + start within 0.1s (different end times = same segment refined)
+            // 2. Same speaker + start within 1.5s + text identical, contained, or >0.7 word overlap
+            // 3. Same speaker + overlapping time + text identical or contained (no fuzzy match)
+            // 4. Identical text, any speaker and any timing (likely a re-transcription)
+            (same_speaker && start_identical)
+                || (same_speaker && start_close && (text_identical || text_contained || text_similar))
+                || (same_speaker && time_overlap && (text_identical || text_contained))
+                || text_identical
         });
 
-        if !is_duplicate {
-            result.push(entry);
-        } else {
-            // If duplicate, check if the new entry has longer text and update
-            for existing in &mut result {
-                let time_close = (existing.start - entry.start).abs() < 0.5;
-                let same_speaker = existing.speaker == entry.speaker;
-
-                if time_close && same_speaker && entry.text.len() > existing.text.len() {
-                    // Keep the longer text version
-                    existing.text = entry.text.clone();
-                    existing.end = entry.end;
-                    break;
+        match duplicate_idx {
+            Some(idx) => {
+                // If the new entry has longer text, update the existing one
+                if entry.text.len() > result[idx].text.len() {
+                    result[idx].text = entry.text.clone();
+                    result[idx].end = result[idx].end.max(entry.end);
+                } else {
+                    // Extend end time if needed
+                    result[idx].end = result[idx].end.max(entry.end);
+                }
+            }
+            None => {
+                result.push(entry);
+            }
+        }
+    }
+
+    // Second pass: merge an entry into the previous kept one if same speaker, start within 0.5s of its end, and one text contains the other
+    let mut merged: Vec<TranscriptEntry> = Vec::new();
+    for entry in result {
+        if let Some(last) = merged.last_mut() {
+            // Check if this should be merged with the previous entry
+            let same_speaker = last.speaker == entry.speaker;
+            let adjacent = (entry.start - last.end).abs() < 0.5;
+            let text_overlap = normalize_text(&last.text).contains(&normalize_text(&entry.text))
+                || normalize_text(&entry.text).contains(&normalize_text(&last.text));
+
+            if same_speaker && adjacent && text_overlap {
+                // Merge: keep longer text, extend time range
+                if entry.text.len() > last.text.len() {
+                    last.text = entry.text;
                 }
+                last.end = last.end.max(entry.end);
+                continue;
             }
         }
+        merged.push(entry);
     }
 
     // Reassign IDs to be sequential
-    for (i, entry) in result.iter_mut().enumerate() {
+    for (i, entry) in merged.iter_mut().enumerate() {
         let parts: Vec<&str> = entry.id.split('-').collect();
         if let Some(session_prefix) = parts.first() {
             entry.id = format!("{}-{}", session_prefix, i + 1);
         }
     }
 
-    result
+    merged
+}
+
+/// Normalize text for comparison: lowercase it, drop leading/trailing whitespace, and collapse each whitespace run to one space.
+fn normalize_text(text: &str) -> String {
+    text.to_lowercase()
+        .split_whitespace()
+        .collect::<Vec<_>>()
+        .join(" ")
+}
+
+/// Calculate text similarity as unique shared words divided by total unique words (Jaccard index, 0.0..=1.0).
+fn text_similarity(a: &str, b: &str) -> f32 {
+    if a.is_empty() || b.is_empty() {
+        return 0.0;
+    }
+
+    let words_a: std::collections::HashSet<&str> = a.split_whitespace().collect();
+    let words_b: std::collections::HashSet<&str> = b.split_whitespace().collect();
+
+    let intersection = words_a.intersection(&words_b).count();
+    let union = words_a.union(&words_b).count();
+
+    if union == 0 { // both inputs contained only whitespace: avoid dividing 0 by 0
+        0.0
+    } else {
+        intersection as f32 / union as f32
+    }
 }
 
 /// Process audio using sliding window through STT and streaming diarization models.