use crate::decoder::TimedToken; /// Timestamp output mode for transcription results /// /// Determines how token-level timestamps are grouped and presented: /// - `Tokens`: Raw token-level output from the model (most detailed) /// - `Words`: Tokens grouped into individual words /// - `Sentences`: Tokens grouped by sentence boundaries (., ?, !) /// /// # Model-Specific Recommendations /// /// - **Parakeet CTC (English)**: Use `Words` mode. The CTC model only outputs lowercase /// alphabet without punctuation, so sentence segmentation is not possible. /// - **Parakeet TDT (Multilingual)**: Use `Sentences` mode. The TDT model predicts /// punctuation, enabling natural sentence boundaries. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum TimestampMode { /// Raw token-level timestamps from the model Tokens, /// Word-level timestamps (groups subword tokens) Words, /// Sentence-level timestamps (groups by punctuation) /// /// Note: Only works with models that predict punctuation (e.g., Parakeet TDT). /// CTC models don't predict punctuation, so use `Words` mode instead. Sentences, } impl Default for TimestampMode { fn default() -> Self { Self::Tokens } } /// Convert token timestamps to the requested output mode /// /// Takes raw token-level timestamps from the model and optionally groups them /// into words or sentences while preserving the original timing information. /// /// # Arguments /// /// * `tokens` - Raw token-level timestamps from model output /// * `mode` - Desired grouping level (Tokens, Words, or Sentences) /// /// # Returns /// /// Vector of TimedToken with timestamps at the requested granularity pub fn process_timestamps(tokens: &[TimedToken], mode: TimestampMode) -> Vec { match mode { TimestampMode::Tokens => tokens.to_vec(), TimestampMode::Words => group_by_words(tokens), TimestampMode::Sentences => group_by_sentences(tokens), } } // Group tokens into words based on word boundary markers fn group_by_words(tokens: &[TimedToken]) -> Vec { if tokens.is_empty() { return Vec::new(); } let mut words = Vec::new(); let mut current_word_text = String::new(); let mut current_word_start = 0.0; let mut last_word_lower = String::new(); for (i, token) in tokens.iter().enumerate() { // Skip empty tokens if token.text.trim().is_empty() { continue; } // Check if this starts a new word (SentencePiece uses ▁ or space prefix) // Also treat PURE punctuation marks (like ".", ",") as separate words // But NOT contractions like "'re" or "'s" which should attach to previous word let is_pure_punctuation = !token.text.is_empty() && token.text.chars().all(|c| c.is_ascii_punctuation()); // Check if this is a contraction suffix // These should NOT start a new word - they attach to the previous word let token_without_marker = token.text.trim_start_matches('▁').trim_start_matches(' '); let is_contraction = token_without_marker.starts_with('\''); let starts_word = (token.text.starts_with('▁') || token.text.starts_with(' ') || is_pure_punctuation) && !is_contraction || i == 0; if starts_word && !current_word_text.is_empty() { // Save previous word (with deduplication) let word_lower = current_word_text.to_lowercase(); if word_lower != last_word_lower { words.push(TimedToken { text: current_word_text.clone(), start: current_word_start, end: tokens[i - 1].end, }); last_word_lower = word_lower; } current_word_text.clear(); } // Start new word or append to current if current_word_text.is_empty() { current_word_start = token.start; } // Add token text, removing word boundary markers let token_text = token .text .trim_start_matches('▁') .trim_start_matches(' '); current_word_text.push_str(token_text); } // Add final word if !current_word_text.is_empty() { let word_lower = current_word_text.to_lowercase(); if word_lower != last_word_lower { words.push(TimedToken { text: current_word_text, start: current_word_start, end: tokens.last().unwrap().end, }); } } words } // Group words into sentences based on punctuation fn group_by_sentences(tokens: &[TimedToken]) -> Vec { // First get word-level grouping let words = group_by_words(tokens); if words.is_empty() { return Vec::new(); } let mut sentences = Vec::new(); let mut current_sentence = Vec::new(); for word in words { current_sentence.push(word.clone()); // Check if word ends with sentence terminator let ends_sentence = word.text.contains('.') || word.text.contains('?') || word.text.contains('!'); if ends_sentence { let sentence_text = format_sentence(¤t_sentence); let start = current_sentence.first().unwrap().start; let end = current_sentence.last().unwrap().end; if !sentence_text.is_empty() { sentences.push(TimedToken { text: sentence_text, start, end, }); } current_sentence.clear(); } } // Add final sentence if exists if !current_sentence.is_empty() { let sentence_text = format_sentence(¤t_sentence); let start = current_sentence.first().unwrap().start; let end = current_sentence.last().unwrap().end; if !sentence_text.is_empty() { sentences.push(TimedToken { text: sentence_text, start, end, }); } } sentences } // Join words with punctuation spacing fn format_sentence(words: &[TimedToken]) -> String { let result: Vec<&str> = words.iter().map(|w| w.text.as_str()).collect(); // Join words, but don't add space before certain punctuation let mut output = String::new(); for (i, word) in result.iter().enumerate() { // Check if this word is standalone punctuation that shouldn't have space before it // Contractions like "'re" or "'s" should have spaces before them let is_standalone_punct = word.len() == 1 && word.chars().all(|c| matches!(c, '.' | ',' | '!' | '?' | ';' | ':' | ')')); if i > 0 && !is_standalone_punct { output.push(' '); } output.push_str(word); } output } #[cfg(test)] mod tests { use super::*; #[test] fn test_word_grouping() { let tokens = vec![ TimedToken { text: "▁Hello".to_string(), start: 0.0, end: 0.5, }, TimedToken { text: "▁world".to_string(), start: 0.5, end: 1.0, }, ]; let words = group_by_words(&tokens); assert_eq!(words.len(), 2); assert_eq!(words[0].text, "Hello"); assert_eq!(words[1].text, "world"); } #[test] fn test_sentence_grouping() { let tokens = vec![ TimedToken { text: "▁Hello".to_string(), start: 0.0, end: 0.5, }, TimedToken { text: "▁world".to_string(), start: 0.5, end: 1.0, }, TimedToken { text: ".".to_string(), start: 1.0, end: 1.1, }, ]; let sentences = group_by_sentences(&tokens); assert_eq!(sentences.len(), 1); assert_eq!(sentences[0].text, "Hello world."); assert_eq!(sentences[0].start, 0.0); assert_eq!(sentences[0].end, 1.1); } #[test] fn test_repetition_preservation() { let words = vec![ TimedToken { text: "uh".to_string(), start: 0.0, end: 0.5, }, TimedToken { text: "uh".to_string(), start: 0.5, end: 1.0, }, TimedToken { text: "hello".to_string(), start: 1.0, end: 1.5, }, ]; let result = format_sentence(&words); assert_eq!(result, "uh uh hello"); } }