diff options
Diffstat (limited to 'parakeet-rs/src/timestamps.rs')
| -rw-r--r-- | parakeet-rs/src/timestamps.rs | 280 |
1 files changed, 0 insertions, 280 deletions
diff --git a/parakeet-rs/src/timestamps.rs b/parakeet-rs/src/timestamps.rs deleted file mode 100644 index 81ea600..0000000 --- a/parakeet-rs/src/timestamps.rs +++ /dev/null @@ -1,280 +0,0 @@ -use crate::decoder::TimedToken; - -/// Timestamp output mode for transcription results -/// -/// Determines how token-level timestamps are grouped and presented: -/// - `Tokens`: Raw token-level output from the model (most detailed) -/// - `Words`: Tokens grouped into individual words -/// - `Sentences`: Tokens grouped by sentence boundaries (., ?, !) -/// -/// # Model-Specific Recommendations -/// -/// - **Parakeet CTC (English)**: Use `Words` mode. The CTC model only outputs lowercase -/// alphabet without punctuation, so sentence segmentation is not possible. -/// - **Parakeet TDT (Multilingual)**: Use `Sentences` mode. The TDT model predicts -/// punctuation, enabling natural sentence boundaries. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum TimestampMode { - /// Raw token-level timestamps from the model - Tokens, - /// Word-level timestamps (groups subword tokens) - Words, - /// Sentence-level timestamps (groups by punctuation) - /// - /// Note: Only works with models that predict punctuation (e.g., Parakeet TDT). - /// CTC models don't predict punctuation, so use `Words` mode instead. - Sentences, -} - -impl Default for TimestampMode { - fn default() -> Self { - Self::Tokens - } -} - -/// Convert token timestamps to the requested output mode -/// -/// Takes raw token-level timestamps from the model and optionally groups them -/// into words or sentences while preserving the original timing information. -/// -/// # Arguments -/// -/// * `tokens` - Raw token-level timestamps from model output -/// * `mode` - Desired grouping level (Tokens, Words, or Sentences) -/// -/// # Returns -/// -/// Vector of TimedToken with timestamps at the requested granularity -pub fn process_timestamps(tokens: &[TimedToken], mode: TimestampMode) -> Vec<TimedToken> { - match mode { - TimestampMode::Tokens => tokens.to_vec(), - TimestampMode::Words => group_by_words(tokens), - TimestampMode::Sentences => group_by_sentences(tokens), - } -} - -// Group tokens into words based on word boundary markers -fn group_by_words(tokens: &[TimedToken]) -> Vec<TimedToken> { - if tokens.is_empty() { - return Vec::new(); - } - - let mut words = Vec::new(); - let mut current_word_text = String::new(); - let mut current_word_start = 0.0; - let mut last_word_lower = String::new(); - - for (i, token) in tokens.iter().enumerate() { - // Skip empty tokens - if token.text.trim().is_empty() { - continue; - } - - // Check if this starts a new word (SentencePiece uses ▁ or space prefix) - // Also treat PURE punctuation marks (like ".", ",") as separate words - // But NOT contractions like "'re" or "'s" which should attach to previous word - let is_pure_punctuation = !token.text.is_empty() && - token.text.chars().all(|c| c.is_ascii_punctuation()); - - // Check if this is a contraction suffix - // These should NOT start a new word - they attach to the previous word - let token_without_marker = token.text.trim_start_matches('▁').trim_start_matches(' '); - let is_contraction = token_without_marker.starts_with('\''); - - let starts_word = (token.text.starts_with('▁') - || token.text.starts_with(' ') - || is_pure_punctuation) - && !is_contraction - || i == 0; - - if starts_word && !current_word_text.is_empty() { - // Save previous word (with deduplication) - let word_lower = current_word_text.to_lowercase(); - if word_lower != last_word_lower { - words.push(TimedToken { - text: current_word_text.clone(), - start: current_word_start, - end: tokens[i - 1].end, - }); - last_word_lower = word_lower; - } - current_word_text.clear(); - } - - // Start new word or append to current - if current_word_text.is_empty() { - current_word_start = token.start; - } - - // Add token text, removing word boundary markers - let token_text = token - .text - .trim_start_matches('▁') - .trim_start_matches(' '); - current_word_text.push_str(token_text); - } - - // Add final word - if !current_word_text.is_empty() { - let word_lower = current_word_text.to_lowercase(); - if word_lower != last_word_lower { - words.push(TimedToken { - text: current_word_text, - start: current_word_start, - end: tokens.last().unwrap().end, - }); - } - } - - words -} - -// Group words into sentences based on punctuation -fn group_by_sentences(tokens: &[TimedToken]) -> Vec<TimedToken> { - // First get word-level grouping - let words = group_by_words(tokens); - if words.is_empty() { - return Vec::new(); - } - - let mut sentences = Vec::new(); - let mut current_sentence = Vec::new(); - - for word in words { - current_sentence.push(word.clone()); - - // Check if word ends with sentence terminator - let ends_sentence = word.text.contains('.') - || word.text.contains('?') - || word.text.contains('!'); - - if ends_sentence { - let sentence_text = format_sentence(¤t_sentence); - let start = current_sentence.first().unwrap().start; - let end = current_sentence.last().unwrap().end; - - if !sentence_text.is_empty() { - sentences.push(TimedToken { - text: sentence_text, - start, - end, - }); - } - current_sentence.clear(); - } - } - - // Add final sentence if exists - if !current_sentence.is_empty() { - let sentence_text = format_sentence(¤t_sentence); - let start = current_sentence.first().unwrap().start; - let end = current_sentence.last().unwrap().end; - - if !sentence_text.is_empty() { - sentences.push(TimedToken { - text: sentence_text, - start, - end, - }); - } - } - - sentences -} - -// Join words with punctuation spacing -fn format_sentence(words: &[TimedToken]) -> String { - let result: Vec<&str> = words.iter().map(|w| w.text.as_str()).collect(); - - // Join words, but don't add space before certain punctuation - let mut output = String::new(); - for (i, word) in result.iter().enumerate() { - // Check if this word is standalone punctuation that shouldn't have space before it - // Contractions like "'re" or "'s" should have spaces before them - let is_standalone_punct = word.len() == 1 && - word.chars().all(|c| matches!(c, '.' | ',' | '!' | '?' | ';' | ':' | ')')); - - if i > 0 && !is_standalone_punct { - output.push(' '); - } - output.push_str(word); - } - output -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_word_grouping() { - let tokens = vec![ - TimedToken { - text: "▁Hello".to_string(), - start: 0.0, - end: 0.5, - }, - TimedToken { - text: "▁world".to_string(), - start: 0.5, - end: 1.0, - }, - ]; - - let words = group_by_words(&tokens); - assert_eq!(words.len(), 2); - assert_eq!(words[0].text, "Hello"); - assert_eq!(words[1].text, "world"); - } - - #[test] - fn test_sentence_grouping() { - let tokens = vec![ - TimedToken { - text: "▁Hello".to_string(), - start: 0.0, - end: 0.5, - }, - TimedToken { - text: "▁world".to_string(), - start: 0.5, - end: 1.0, - }, - TimedToken { - text: ".".to_string(), - start: 1.0, - end: 1.1, - }, - ]; - - let sentences = group_by_sentences(&tokens); - assert_eq!(sentences.len(), 1); - assert_eq!(sentences[0].text, "Hello world."); - assert_eq!(sentences[0].start, 0.0); - assert_eq!(sentences[0].end, 1.1); - } - - #[test] - fn test_repetition_preservation() { - let words = vec![ - TimedToken { - text: "uh".to_string(), - start: 0.0, - end: 0.5, - }, - TimedToken { - text: "uh".to_string(), - start: 0.5, - end: 1.0, - }, - TimedToken { - text: "hello".to_string(), - start: 1.0, - end: 1.5, - }, - ]; - - let result = format_sentence(&words); - assert_eq!(result, "uh uh hello"); - } -} |
