summaryrefslogtreecommitdiff
path: root/parakeet-rs/src/timestamps.rs
diff options
context:
space:
mode:
Diffstat (limited to 'parakeet-rs/src/timestamps.rs')
-rw-r--r--parakeet-rs/src/timestamps.rs280
1 files changed, 0 insertions, 280 deletions
diff --git a/parakeet-rs/src/timestamps.rs b/parakeet-rs/src/timestamps.rs
deleted file mode 100644
index 81ea600..0000000
--- a/parakeet-rs/src/timestamps.rs
+++ /dev/null
@@ -1,280 +0,0 @@
-use crate::decoder::TimedToken;
-
-/// Timestamp output mode for transcription results
-///
-/// Determines how token-level timestamps are grouped and presented:
-/// - `Tokens`: Raw token-level output from the model (most detailed)
-/// - `Words`: Tokens grouped into individual words
-/// - `Sentences`: Tokens grouped by sentence boundaries (., ?, !)
-///
-/// # Model-Specific Recommendations
-///
-/// - **Parakeet CTC (English)**: Use `Words` mode. The CTC model only outputs lowercase
-/// alphabet without punctuation, so sentence segmentation is not possible.
-/// - **Parakeet TDT (Multilingual)**: Use `Sentences` mode. The TDT model predicts
-/// punctuation, enabling natural sentence boundaries.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum TimestampMode {
- /// Raw token-level timestamps from the model
- Tokens,
- /// Word-level timestamps (groups subword tokens)
- Words,
- /// Sentence-level timestamps (groups by punctuation)
- ///
- /// Note: Only works with models that predict punctuation (e.g., Parakeet TDT).
- /// CTC models don't predict punctuation, so use `Words` mode instead.
- Sentences,
-}
-
-impl Default for TimestampMode {
- fn default() -> Self {
- Self::Tokens
- }
-}
-
-/// Convert token timestamps to the requested output mode
-///
-/// Takes raw token-level timestamps from the model and optionally groups them
-/// into words or sentences while preserving the original timing information.
-///
-/// # Arguments
-///
-/// * `tokens` - Raw token-level timestamps from model output
-/// * `mode` - Desired grouping level (Tokens, Words, or Sentences)
-///
-/// # Returns
-///
-/// Vector of TimedToken with timestamps at the requested granularity
-pub fn process_timestamps(tokens: &[TimedToken], mode: TimestampMode) -> Vec<TimedToken> {
- match mode {
- TimestampMode::Tokens => tokens.to_vec(),
- TimestampMode::Words => group_by_words(tokens),
- TimestampMode::Sentences => group_by_sentences(tokens),
- }
-}
-
-// Group tokens into words based on word boundary markers
-fn group_by_words(tokens: &[TimedToken]) -> Vec<TimedToken> {
- if tokens.is_empty() {
- return Vec::new();
- }
-
- let mut words = Vec::new();
- let mut current_word_text = String::new();
- let mut current_word_start = 0.0;
- let mut last_word_lower = String::new();
-
- for (i, token) in tokens.iter().enumerate() {
- // Skip empty tokens
- if token.text.trim().is_empty() {
- continue;
- }
-
- // Check if this starts a new word (SentencePiece uses ▁ or space prefix)
- // Also treat PURE punctuation marks (like ".", ",") as separate words
- // But NOT contractions like "'re" or "'s" which should attach to previous word
- let is_pure_punctuation = !token.text.is_empty() &&
- token.text.chars().all(|c| c.is_ascii_punctuation());
-
- // Check if this is a contraction suffix
- // These should NOT start a new word - they attach to the previous word
- let token_without_marker = token.text.trim_start_matches('▁').trim_start_matches(' ');
- let is_contraction = token_without_marker.starts_with('\'');
-
- let starts_word = (token.text.starts_with('▁')
- || token.text.starts_with(' ')
- || is_pure_punctuation)
- && !is_contraction
- || i == 0;
-
- if starts_word && !current_word_text.is_empty() {
- // Save previous word (with deduplication)
- let word_lower = current_word_text.to_lowercase();
- if word_lower != last_word_lower {
- words.push(TimedToken {
- text: current_word_text.clone(),
- start: current_word_start,
- end: tokens[i - 1].end,
- });
- last_word_lower = word_lower;
- }
- current_word_text.clear();
- }
-
- // Start new word or append to current
- if current_word_text.is_empty() {
- current_word_start = token.start;
- }
-
- // Add token text, removing word boundary markers
- let token_text = token
- .text
- .trim_start_matches('▁')
- .trim_start_matches(' ');
- current_word_text.push_str(token_text);
- }
-
- // Add final word
- if !current_word_text.is_empty() {
- let word_lower = current_word_text.to_lowercase();
- if word_lower != last_word_lower {
- words.push(TimedToken {
- text: current_word_text,
- start: current_word_start,
- end: tokens.last().unwrap().end,
- });
- }
- }
-
- words
-}
-
-// Group words into sentences based on punctuation
-fn group_by_sentences(tokens: &[TimedToken]) -> Vec<TimedToken> {
- // First get word-level grouping
- let words = group_by_words(tokens);
- if words.is_empty() {
- return Vec::new();
- }
-
- let mut sentences = Vec::new();
- let mut current_sentence = Vec::new();
-
- for word in words {
- current_sentence.push(word.clone());
-
- // Check if word ends with sentence terminator
- let ends_sentence = word.text.contains('.')
- || word.text.contains('?')
- || word.text.contains('!');
-
- if ends_sentence {
- let sentence_text = format_sentence(&current_sentence);
- let start = current_sentence.first().unwrap().start;
- let end = current_sentence.last().unwrap().end;
-
- if !sentence_text.is_empty() {
- sentences.push(TimedToken {
- text: sentence_text,
- start,
- end,
- });
- }
- current_sentence.clear();
- }
- }
-
- // Add final sentence if exists
- if !current_sentence.is_empty() {
- let sentence_text = format_sentence(&current_sentence);
- let start = current_sentence.first().unwrap().start;
- let end = current_sentence.last().unwrap().end;
-
- if !sentence_text.is_empty() {
- sentences.push(TimedToken {
- text: sentence_text,
- start,
- end,
- });
- }
- }
-
- sentences
-}
-
-// Join words with punctuation spacing
-fn format_sentence(words: &[TimedToken]) -> String {
- let result: Vec<&str> = words.iter().map(|w| w.text.as_str()).collect();
-
- // Join words, but don't add space before certain punctuation
- let mut output = String::new();
- for (i, word) in result.iter().enumerate() {
- // Check if this word is standalone punctuation that shouldn't have space before it
- // Contractions like "'re" or "'s" should have spaces before them
- let is_standalone_punct = word.len() == 1 &&
- word.chars().all(|c| matches!(c, '.' | ',' | '!' | '?' | ';' | ':' | ')'));
-
- if i > 0 && !is_standalone_punct {
- output.push(' ');
- }
- output.push_str(word);
- }
- output
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn test_word_grouping() {
- let tokens = vec![
- TimedToken {
- text: "▁Hello".to_string(),
- start: 0.0,
- end: 0.5,
- },
- TimedToken {
- text: "▁world".to_string(),
- start: 0.5,
- end: 1.0,
- },
- ];
-
- let words = group_by_words(&tokens);
- assert_eq!(words.len(), 2);
- assert_eq!(words[0].text, "Hello");
- assert_eq!(words[1].text, "world");
- }
-
- #[test]
- fn test_sentence_grouping() {
- let tokens = vec![
- TimedToken {
- text: "▁Hello".to_string(),
- start: 0.0,
- end: 0.5,
- },
- TimedToken {
- text: "▁world".to_string(),
- start: 0.5,
- end: 1.0,
- },
- TimedToken {
- text: ".".to_string(),
- start: 1.0,
- end: 1.1,
- },
- ];
-
- let sentences = group_by_sentences(&tokens);
- assert_eq!(sentences.len(), 1);
- assert_eq!(sentences[0].text, "Hello world.");
- assert_eq!(sentences[0].start, 0.0);
- assert_eq!(sentences[0].end, 1.1);
- }
-
- #[test]
- fn test_repetition_preservation() {
- let words = vec![
- TimedToken {
- text: "uh".to_string(),
- start: 0.0,
- end: 0.5,
- },
- TimedToken {
- text: "uh".to_string(),
- start: 0.5,
- end: 1.0,
- },
- TimedToken {
- text: "hello".to_string(),
- start: 1.0,
- end: 1.5,
- },
- ];
-
- let result = format_sentence(&words);
- assert_eq!(result, "uh uh hello");
- }
-}