use std::fs::File;
use std::io::{self, Read, Seek};
use std::path::Path;

use symphonia::core::audio::{AudioBufferRef, Signal};
use symphonia::core::codecs::{DecoderOptions, CODEC_TYPE_NULL};
use symphonia::core::errors::Error as SymphoniaError;
use symphonia::core::formats::FormatOptions;
use symphonia::core::io::{MediaSourceStream, ReadOnlySource};
use symphonia::core::meta::MetadataOptions;
use symphonia::core::probe::Hint;

/// Sample rate, in Hz, that the decoding helpers in this file resample to.
pub const TARGET_SAMPLE_RATE: u32 = 16_000;

/// Channel count that the decoding helpers in this file mix down to.
pub const TARGET_CHANNELS: u16 = 1;

/// Decoded PCM audio held as `f32` samples.
#[derive(Debug, Clone)]
pub struct PcmAudio {
    /// Sample values; integer source formats are scaled by their full-scale
    /// magnitude, float source formats are passed through unchanged.
    pub samples: Vec<f32>,
    /// Sample rate of `samples`, in Hz.
    pub sample_rate: u32,
    /// Number of channels represented in `samples`.
    pub channels: u16,
}

/// Errors produced while opening, probing or decoding an audio source.
#[derive(Debug)]
pub enum AudioError {
    /// The underlying reader or file failed.
    Io(io::Error),
    /// The decoder or demuxer reported a failure; carries its message.
    Decode(String),
    /// The container or codec is not supported.
    UnsupportedFormat,
    /// The source has no track with a known codec.
    NoAudioTrack,
}

impl std::fmt::Display for AudioError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            AudioError::Io(err) => write!(f, "io error: {err}"),
            AudioError::Decode(err) => write!(f, "decode error: {err}"),
            AudioError::UnsupportedFormat => write!(f, "unsupported audio format"),
            AudioError::NoAudioTrack => write!(f, "no audio track found"),
        }
    }
}

impl std::error::Error for AudioError {
    /// Exposes the wrapped I/O error so callers can walk the error chain.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            AudioError::Io(err) => Some(err),
            AudioError::Decode(_) | AudioError::UnsupportedFormat | AudioError::NoAudioTrack => {
                None
            }
        }
    }
}

impl From<io::Error> for AudioError {
    fn from(value: io::Error) -> Self {
        AudioError::Io(value)
    }
}

impl From<SymphoniaError> for AudioError {
    fn from(value: SymphoniaError) -> Self {
        match value {
            SymphoniaError::IoError(e) => AudioError::Io(e),
            SymphoniaError::Unsupported(_) => AudioError::UnsupportedFormat,
            other => AudioError::Decode(other.to_string()),
        }
    }
}

/// Decodes the audio file at `path` into 16 kHz mono PCM.
///
/// The file extension, when present and valid UTF-8, is passed to the format
/// probe as a hint.
///
/// # Errors
///
/// Returns [`AudioError::Io`] if the file cannot be opened or read, and the
/// other [`AudioError`] variants if probing or decoding fails.
pub fn to_16k_mono_from_path(path: impl AsRef<Path>) -> Result<PcmAudio, AudioError> {
    let path = path.as_ref();
    let file = File::open(path)?;

    let mut hint = Hint::new();
    if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
        hint.with_extension(ext);
    }

    decode_to_16k_mono(file, hint)
}

/// Decodes audio from an arbitrary reader into 16 kHz mono PCM.
///
/// No format hint is supplied, so the container is detected from content only.
///
/// # Errors
///
/// Returns an [`AudioError`] if probing or decoding fails.
pub fn to_16k_mono_from_reader<R: Read + Seek + Send + Sync + 'static>(
    reader: R,
) -> Result<PcmAudio, AudioError> {
    decode_to_16k_mono(reader, Hint::new())
}

/// Probes `reader`, decodes the first track with a known codec, mixes it down
/// to mono and resamples it to [`TARGET_SAMPLE_RATE`].
///
/// # Errors
///
/// * [`AudioError::NoAudioTrack`] if no track has a known codec.
/// * [`AudioError::Decode`] if the track does not declare a sample rate or the
///   decoder fails with an unrecoverable error.
/// * [`AudioError::Io`] / [`AudioError::UnsupportedFormat`] as mapped from the
///   underlying symphonia error.
fn decode_to_16k_mono<R: Read + Seek + Send + Sync + 'static>(
    reader: R,
    hint: Hint,
) -> Result<PcmAudio, AudioError> {
    let source = MediaSourceStream::new(
        Box::new(ReadOnlySource::new(reader)),
        Default::default(),
    );
    let format_opts = FormatOptions::default();
    let metadata_opts = MetadataOptions::default();
    let probed =
        symphonia::default::get_probe().format(&hint, source, &format_opts, &metadata_opts)?;
    let mut format = probed.format;

    let track = format
        .tracks()
        .iter()
        .find(|t| t.codec_params.codec != CODEC_TYPE_NULL)
        .ok_or(AudioError::NoAudioTrack)?;
    let track_id = track.id;
    let codec_params = track.codec_params.clone();

    let sample_rate = codec_params
        .sample_rate
        .ok_or_else(|| AudioError::Decode("unknown sample rate".to_string()))?;
    // Channel count advertised by the container; only used if nothing decodes.
    let header_channels = codec_params
        .channels
        .map(|c| c.count() as u16)
        .unwrap_or(1);

    let decoder_opts = DecoderOptions::default();
    let mut decoder = symphonia::default::get_codecs().make(&codec_params, &decoder_opts)?;

    let mut interleaved: Vec<f32> = Vec::new();
    // `append_samples` interleaves by the decoded buffer's plane count, so the
    // mixdown must use that same count rather than trusting container metadata
    // (which may be absent or disagree with what the decoder emits).
    let mut decoded_channels: Option<u16> = None;

    loop {
        let packet = match format.next_packet() {
            Ok(p) => p,
            // The end of the stream is reported as an unexpected-EOF I/O error.
            Err(SymphoniaError::IoError(ref e)) if e.kind() == io::ErrorKind::UnexpectedEof => {
                break
            }
            Err(SymphoniaError::ResetRequired) => {
                decoder.reset();
                continue;
            }
            Err(e) => return Err(e.into()),
        };

        if packet.track_id() != track_id {
            continue;
        }

        let decoded = match decoder.decode(&packet) {
            Ok(d) => d,
            // A corrupt packet is skipped; later packets may still decode.
            Err(SymphoniaError::DecodeError(_)) => continue,
            Err(e) => return Err(e.into()),
        };

        if decoded_channels.is_none() {
            decoded_channels = Some(decoded.spec().channels.count() as u16);
        }
        append_samples(&decoded, &mut interleaved);
    }

    let channels = decoded_channels.unwrap_or(header_channels);
    let mono = mixdown_to_mono(&interleaved, channels);
    let samples = resample_sinc(&mono, sample_rate, TARGET_SAMPLE_RATE);

    Ok(PcmAudio {
        samples,
        sample_rate: TARGET_SAMPLE_RATE,
        channels: TARGET_CHANNELS,
    })
}

/// Appends every frame of `buffer` to `out` as interleaved `f32` samples.
///
/// Unsigned integer formats are re-centred around zero and scaled by half
/// their range, signed integer formats are scaled by their full-scale
/// magnitude, and float formats are copied (`f64` narrowed to `f32`).
fn append_samples(buffer: &AudioBufferRef<'_>, out: &mut Vec<f32>) {
    // Walks the planar buffer frame by frame and pushes one converted sample
    // per plane, which yields channel-interleaved output.
    macro_rules! interleave {
        ($buf:expr, |$sample:ident| $convert:expr) => {{
            let buf = $buf;
            let planes = buf.planes();
            let planes = planes.planes();
            out.reserve(buf.frames() * planes.len());
            for frame in 0..buf.frames() {
                for plane in planes {
                    let $sample = plane[frame];
                    out.push($convert);
                }
            }
        }};
    }

    match buffer {
        AudioBufferRef::U8(buf) => interleave!(buf, |s| (s as f32 - 128.0) / 128.0),
        AudioBufferRef::U16(buf) => interleave!(buf, |s| (s as f32 - 32768.0) / 32768.0),
        AudioBufferRef::U24(buf) => {
            interleave!(buf, |s| (s.inner() as f32 - 8388608.0) / 8388608.0)
        }
        // Subtract in f64: an f32 cannot represent every u32 near full scale.
        AudioBufferRef::U32(buf) => {
            interleave!(buf, |s| (s as f64 - 2147483648.0) as f32 / 2147483648.0)
        }
        AudioBufferRef::S8(buf) => interleave!(buf, |s| s as f32 / 128.0),
        AudioBufferRef::S16(buf) => interleave!(buf, |s| s as f32 / 32768.0),
        AudioBufferRef::S24(buf) => interleave!(buf, |s| s.inner() as f32 / 8388608.0),
        AudioBufferRef::S32(buf) => interleave!(buf, |s| s as f32 / 2147483648.0),
        AudioBufferRef::F32(buf) => interleave!(buf, |s| s),
        AudioBufferRef::F64(buf) => interleave!(buf, |s| s as f32),
    }
}

/// Averages each frame of `interleaved` audio into a single mono sample.
///
/// With `channels <= 1` the input is returned as a copy. Trailing samples that
/// do not form a complete frame are dropped.
fn mixdown_to_mono(interleaved: &[f32], channels: u16) -> Vec<f32> {
    if channels <= 1 {
        return interleaved.to_vec();
    }

    let divisor = f32::from(channels);
    interleaved
        .chunks_exact(usize::from(channels))
        .map(|frame| frame.iter().fold(0.0f32, |acc, &s| acc + s) / divisor)
        .collect()
}

/// Resample and mixdown audio to 16kHz mono for STT processing.
pub fn resample_and_mixdown(samples: &[f32], sample_rate: u32, channels: u16) -> Vec { let mono = mixdown_to_mono(samples, channels); resample_sinc(&mono, sample_rate, TARGET_SAMPLE_RATE) } fn resample_sinc(input: &[f32], input_rate: u32, output_rate: u32) -> Vec { if input_rate == output_rate { return input.to_vec(); } if input.is_empty() { return Vec::new(); } let ratio = input_rate as f64 / output_rate as f64; let output_len = ((input.len() as f64) / ratio).ceil() as usize; let cutoff = (output_rate as f64 / input_rate as f64).min(1.0); let radius: i32 = 32; let radius_f = radius as f64; let pi = std::f64::consts::PI; let mut output = Vec::with_capacity(output_len); for n in 0..output_len { let t = n as f64 * ratio; let center = t.floor() as i32; let frac = t - (center as f64); let mut acc = 0.0f64; let mut norm = 0.0f64; for k in -radius..=radius { let idx = center + k; if idx < 0 || (idx as usize) >= input.len() { continue; } let x = (k as f64) - frac; let d = x.abs(); if d > radius_f { continue; } let window = 0.5 * (1.0 + (pi * d / radius_f).cos()); let z = x * cutoff; let sinc = if z == 0.0 { 1.0 } else { let pz = pi * z; pz.sin() / pz }; let weight = cutoff * sinc * window; acc += input[idx as usize] as f64 * weight; norm += weight; } let y = if norm == 0.0 { 0.0 } else { acc / norm }; output.push(y as f32); } output } #[cfg(test)] mod tests { use super::*; use std::io::Cursor; fn create_wav_buffer(sample_rate: u32, channels: u16, samples: &[i16]) -> Vec { let mut buf = Vec::new(); let data_size = (samples.len() * 2) as u32; let file_size = 36 + data_size; buf.extend_from_slice(b"RIFF"); buf.extend_from_slice(&file_size.to_le_bytes()); buf.extend_from_slice(b"WAVE"); buf.extend_from_slice(b"fmt "); buf.extend_from_slice(&16u32.to_le_bytes()); buf.extend_from_slice(&1u16.to_le_bytes()); buf.extend_from_slice(&channels.to_le_bytes()); buf.extend_from_slice(&sample_rate.to_le_bytes()); let byte_rate = sample_rate * channels as u32 * 2; 
buf.extend_from_slice(&byte_rate.to_le_bytes()); let block_align = channels * 2; buf.extend_from_slice(&block_align.to_le_bytes()); buf.extend_from_slice(&16u16.to_le_bytes()); buf.extend_from_slice(b"data"); buf.extend_from_slice(&data_size.to_le_bytes()); for &s in samples { buf.extend_from_slice(&s.to_le_bytes()); } buf } #[test] fn converts_stereo_to_mono() { let mut samples = Vec::new(); for _ in 0..(TARGET_SAMPLE_RATE / 10) { samples.push(10_000i16); samples.push(0i16); } let wav = create_wav_buffer(TARGET_SAMPLE_RATE, 2, &samples); let cursor = Cursor::new(wav); let normalized = to_16k_mono_from_reader(cursor).unwrap(); assert_eq!(normalized.sample_rate, TARGET_SAMPLE_RATE); assert_eq!(normalized.channels, TARGET_CHANNELS); let mean = normalized.samples.iter().copied().sum::() / normalized.samples.len() as f32; let expected = (10_000.0 / 32768.0) / 2.0; assert!((mean - expected).abs() < 1e-3); } #[test] fn resamples_to_16k() { let samples: Vec = vec![0; 48_000]; let wav = create_wav_buffer(48_000, 1, &samples); let cursor = Cursor::new(wav); let normalized = to_16k_mono_from_reader(cursor).unwrap(); assert_eq!(normalized.sample_rate, TARGET_SAMPLE_RATE); assert_eq!(normalized.channels, TARGET_CHANNELS); assert_eq!(normalized.samples.len(), TARGET_SAMPLE_RATE as usize); let max_abs = normalized .samples .iter() .copied() .fold(0.0f32, |m, v| m.max(v.abs())); assert!(max_abs <= 1e-6); } }