summaryrefslogtreecommitdiff
path: root/makima/src/server/handlers
diff options
context:
space:
mode:
authorsoryu <soryu@soryu.co>2025-12-23 19:11:57 +0000
committersoryu <soryu@soryu.co>2025-12-23 19:11:57 +0000
commitf5222a7ae5ade5589436778cb01fc0abe625b3c3 (patch)
tree6e9739517d371179e6018412cba011b3f38868ef /makima/src/server/handlers
parent3c0adec8e3a9dd3bc34251e87e0fb5314793426d (diff)
downloadsoryu-f5222a7ae5ade5589436778cb01fc0abe625b3c3.tar.gz
soryu-f5222a7ae5ade5589436778cb01fc0abe625b3c3.zip
Add editable file sections and a drag&drop feature
Diffstat (limited to 'makima/src/server/handlers')
-rw-r--r--makima/src/server/handlers/listen.rs124
1 files changed, 95 insertions, 29 deletions
diff --git a/makima/src/server/handlers/listen.rs b/makima/src/server/handlers/listen.rs
index 3055cb7..5fc5cea 100644
--- a/makima/src/server/handlers/listen.rs
+++ b/makima/src/server/handlers/listen.rs
@@ -512,12 +512,12 @@ fn decode_audio_chunk(data: &[u8], format: &StartMessage) -> Vec<f32> {
}
}
-/// Deduplicate transcript entries by removing entries with similar start times and text.
+/// Deduplicate transcript entries by removing entries with similar times and text.
///
-/// Entries are considered duplicates if:
-/// - Start times are within 0.5 seconds of each other
-/// - Speaker is the same
-/// - Text is identical or one is a substring of the other
+/// Entries are considered duplicates if any of these are true:
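+/// - Same speaker AND start times within 0.1 seconds (any text), or within 1.5 seconds AND text is similar (same, substring, or word overlap above 0.7)
+/// - Same speaker AND time ranges overlap AND text is the same or one is a substring of the other
+/// - Normalized text (lowercased, whitespace-collapsed) is identical, regardless of timing or speaker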
fn deduplicate_transcripts(entries: &[TranscriptEntry]) -> Vec<TranscriptEntry> {
if entries.is_empty() {
return vec![];
@@ -530,49 +530,115 @@ fn deduplicate_transcripts(entries: &[TranscriptEntry]) -> Vec<TranscriptEntry>
let mut result: Vec<TranscriptEntry> = Vec::new();
for entry in sorted {
+ // Normalize text for comparison
+ let entry_text_normalized = normalize_text(&entry.text);
+
// Check if this entry is a duplicate of any existing entry
- let is_duplicate = result.iter().any(|existing| {
- // Check if start times are close (within 0.5 seconds)
- let time_close = (existing.start - entry.start).abs() < 0.5;
+ let duplicate_idx = result.iter().position(|existing| {
+ let existing_text_normalized = normalize_text(&existing.text);
// Check if same speaker
let same_speaker = existing.speaker == entry.speaker;
- // Check if text matches or one contains the other
- let text_match = existing.text == entry.text
- || existing.text.contains(&entry.text)
- || entry.text.contains(&existing.text);
-
- time_close && same_speaker && text_match
+ // Check if start times are identical or very close
+ let start_identical = (existing.start - entry.start).abs() < 0.1;
+ let start_close = (existing.start - entry.start).abs() < 1.5;
+
+ // Check if time ranges overlap
+ let time_overlap = existing.start < entry.end && entry.start < existing.end;
+
+ // Check various text similarity conditions
+ let text_identical = existing_text_normalized == entry_text_normalized;
+ let text_contained = existing_text_normalized.contains(&entry_text_normalized)
+ || entry_text_normalized.contains(&existing_text_normalized);
+ let text_similar = text_similarity(&existing_text_normalized, &entry_text_normalized) > 0.7;
+
+ // Duplicate conditions:
+ // 1. Same speaker + identical start time (different end times = same segment refined)
+ // 2. Same speaker + close start + similar text
+ // 3. Same speaker + overlapping time + similar text
+ // 4. Identical text (likely a re-transcription)
+ (same_speaker && start_identical)
+ || (same_speaker && start_close && (text_identical || text_contained || text_similar))
+ || (same_speaker && time_overlap && (text_identical || text_contained))
+ || text_identical
});
- if !is_duplicate {
- result.push(entry);
- } else {
- // If duplicate, check if the new entry has longer text and update
- for existing in &mut result {
- let time_close = (existing.start - entry.start).abs() < 0.5;
- let same_speaker = existing.speaker == entry.speaker;
-
- if time_close && same_speaker && entry.text.len() > existing.text.len() {
- // Keep the longer text version
- existing.text = entry.text.clone();
- existing.end = entry.end;
- break;
+ match duplicate_idx {
+ Some(idx) => {
+ // If the new entry has longer text, update the existing one
+ if entry.text.len() > result[idx].text.len() {
+ result[idx].text = entry.text.clone();
+ result[idx].end = result[idx].end.max(entry.end);
+ } else {
+ // Extend end time if needed
+ result[idx].end = result[idx].end.max(entry.end);
+ }
+ }
+ None => {
+ result.push(entry);
+ }
+ }
+ }
+
+ // Second pass: merge adjacent segments with same speaker and similar text
+ let mut merged: Vec<TranscriptEntry> = Vec::new();
+ for entry in result {
+ if let Some(last) = merged.last_mut() {
+ // Check if this should be merged with the previous entry
+ let same_speaker = last.speaker == entry.speaker;
+ let adjacent = (entry.start - last.end).abs() < 0.5;
+ let text_overlap = normalize_text(&last.text).contains(&normalize_text(&entry.text))
+ || normalize_text(&entry.text).contains(&normalize_text(&last.text));
+
+ if same_speaker && adjacent && text_overlap {
+ // Merge: keep longer text, extend time range
+ if entry.text.len() > last.text.len() {
+ last.text = entry.text;
}
+ last.end = last.end.max(entry.end);
+ continue;
}
}
+ merged.push(entry);
}
// Reassign IDs to be sequential
- for (i, entry) in result.iter_mut().enumerate() {
+ for (i, entry) in merged.iter_mut().enumerate() {
let parts: Vec<&str> = entry.id.split('-').collect();
if let Some(session_prefix) = parts.first() {
entry.id = format!("{}-{}", session_prefix, i + 1);
}
}
- result
+ merged
+}
+
+/// Normalize text for comparison: lowercase it, split it on whitespace, and re-join the words with single spaces, returning a new `String`.
+fn normalize_text(text: &str) -> String {
+ text.to_lowercase() // makes later comparisons case-insensitive
+ .split_whitespace() // splits on runs of whitespace, so leading/trailing/repeated whitespace yields no empty words
+ .collect::<Vec<_>>()
+ .join(" ") // exactly one space between words; an all-whitespace input becomes ""
+}
+
+/// Calculate text similarity as the Jaccard index of the two inputs' whitespace-separated word sets (shared distinct words / all distinct words); returns a value in 0.0..=1.0, and 0.0 if either input is empty.
+fn text_similarity(a: &str, b: &str) -> f32 {
+ if a.is_empty() || b.is_empty() { // an empty side can share no words
+ return 0.0;
+ }
+
+ let words_a: std::collections::HashSet<&str> = a.split_whitespace().collect(); // set semantics: repeated words count once
+ let words_b: std::collections::HashSet<&str> = b.split_whitespace().collect(); // words are compared exactly (case-sensitive); no normalization happens here
+
+ let intersection = words_a.intersection(&words_b).count(); // distinct words present in both inputs
+ let union = words_a.union(&words_b).count(); // distinct words present in either input
+
+ if union == 0 { // both inputs were non-empty but held only whitespace; avoids dividing 0 by 0
+ 0.0
+ } else {
+ intersection as f32 / union as f32 // ratio of shared words to all words
+ }
+}
/// Process audio using sliding window through STT and streaming diarization models.