use std::fs::File;
use std::io::{self, Read, Seek};
use std::path::Path;
use symphonia::core::audio::{AudioBufferRef, Signal};
use symphonia::core::codecs::{DecoderOptions, CODEC_TYPE_NULL};
use symphonia::core::errors::Error as SymphoniaError;
use symphonia::core::formats::FormatOptions;
use symphonia::core::io::{MediaSourceStream, ReadOnlySource};
use symphonia::core::meta::MetadataOptions;
use symphonia::core::probe::Hint;
/// Sample rate, in Hz, of the audio produced by this module's conversion functions.
pub const TARGET_SAMPLE_RATE: u32 = 16_000;
/// Channel count reported for converted audio (a single, mono channel).
pub const TARGET_CHANNELS: u16 = 1;
/// Decoded PCM audio together with the format it is stored in.
///
/// Values returned by the conversion functions in this file always carry
/// `TARGET_SAMPLE_RATE` and `TARGET_CHANNELS`.
#[derive(Debug, Clone)]
pub struct PcmAudio {
/// Sample values as `f32`.
// NOTE(review): this file only ever fills this with mono data; the layout for
// `channels > 1` is presumably interleaved — confirm before relying on it.
pub samples: Vec<f32>,
/// Sample rate of `samples`, in Hz.
pub sample_rate: u32,
/// Number of channels represented in `samples`.
pub channels: u16,
}
/// Errors reported while opening, probing, or decoding audio.
#[derive(Debug)]
pub enum AudioError {
/// An underlying I/O operation failed (opening the file or reading the stream).
Io(io::Error),
/// Decoding failed; the payload is a human-readable description of the cause.
Decode(String),
/// The decoding library reported the format or a feature of it as unsupported.
UnsupportedFormat,
/// The container holds no track with a usable (non-null) codec.
NoAudioTrack,
}
impl std::fmt::Display for AudioError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
AudioError::Io(err) => write!(f, "io error: {err}"),
AudioError::Decode(err) => write!(f, "decode error: {err}"),
AudioError::UnsupportedFormat => write!(f, "unsupported audio format"),
AudioError::NoAudioTrack => write!(f, "no audio track found"),
}
}
}
// Uses the trait's default methods only: no `source()` is exposed, the wrapped
// error's text is already part of the `Display` output.
impl std::error::Error for AudioError {}
impl From<io::Error> for AudioError {
fn from(value: io::Error) -> Self {
AudioError::Io(value)
}
}
/// Translates decoder-library errors into this module's error type.
///
/// I/O failures keep their original `io::Error`, "unsupported" reports collapse
/// into [`AudioError::UnsupportedFormat`], and every other error is carried as
/// its display text inside [`AudioError::Decode`].
impl From<SymphoniaError> for AudioError {
    fn from(value: SymphoniaError) -> Self {
        match value {
            SymphoniaError::IoError(io_err) => Self::Io(io_err),
            SymphoniaError::Unsupported(_) => Self::UnsupportedFormat,
            unclassified => Self::Decode(unclassified.to_string()),
        }
    }
}
/// Opens the file at `path` and converts its audio to 16 kHz mono PCM.
///
/// When the path has an extension that is valid UTF-8, it is handed to the
/// format probe as a hint.
///
/// # Errors
///
/// Returns [`AudioError::Io`] if the file cannot be opened, otherwise whatever
/// error the decoding step produces.
pub fn to_16k_mono_from_path(path: impl AsRef<Path>) -> Result<PcmAudio, AudioError> {
    let source_path = path.as_ref();
    let file = File::open(source_path)?;

    let extension = source_path.extension().and_then(|ext| ext.to_str());
    let mut hint = Hint::new();
    if let Some(ext) = extension {
        hint.with_extension(ext);
    }

    decode_to_16k_mono(file, hint)
}
/// Converts audio read from `reader` to 16 kHz mono PCM.
///
/// No format hint is supplied, so the container type is determined by probing
/// the stream contents alone.
///
/// # Errors
///
/// Propagates any error produced while probing or decoding the stream.
pub fn to_16k_mono_from_reader<R>(reader: R) -> Result<PcmAudio, AudioError>
where
    R: Read + Seek + Send + Sync + 'static,
{
    let no_hint = Hint::new();
    decode_to_16k_mono(reader, no_hint)
}
/// Probes `reader`, decodes its first usable track, and converts it to 16 kHz mono PCM.
///
/// The stream is wrapped in a `ReadOnlySource`, probed with `hint`, and the first
/// track whose codec is not `CODEC_TYPE_NULL` is decoded packet by packet. All
/// decoded samples are collected as interleaved `f32`, mixed down to mono and
/// resampled to [`TARGET_SAMPLE_RATE`].
///
/// The sample rate and channel count used for the mixdown/resample are taken from
/// the first decoded buffer, because that is the layout of the samples that were
/// actually appended. The container-level codec parameters are only a fallback for
/// streams that yield no decoded buffer.
///
/// # Errors
///
/// * [`AudioError::NoAudioTrack`] if no track has a usable codec.
/// * [`AudioError::Decode`] if the sample rate cannot be determined or is zero,
///   or if the decoder reports a fatal error.
/// * [`AudioError::Io`] / [`AudioError::UnsupportedFormat`] as mapped from the
///   decoding library's errors.
fn decode_to_16k_mono<R: Read + Seek + Send + Sync + 'static>(
    reader: R,
    hint: Hint,
) -> Result<PcmAudio, AudioError> {
    let source = MediaSourceStream::new(Box::new(ReadOnlySource::new(reader)), Default::default());
    let format_opts = FormatOptions::default();
    let metadata_opts = MetadataOptions::default();
    let probed =
        symphonia::default::get_probe().format(&hint, source, &format_opts, &metadata_opts)?;
    let mut format = probed.format;

    let track = format
        .tracks()
        .iter()
        .find(|t| t.codec_params.codec != CODEC_TYPE_NULL)
        .ok_or(AudioError::NoAudioTrack)?;
    let track_id = track.id;
    let codec_params = track.codec_params.clone();

    let decoder_opts = DecoderOptions::default();
    let mut decoder = symphonia::default::get_codecs().make(&codec_params, &decoder_opts)?;

    // (sample rate, channel count) as reported by the first decoded buffer.
    let mut decoded_spec: Option<(u32, u16)> = None;
    let mut interleaved: Vec<f32> = Vec::new();
    loop {
        let packet = match format.next_packet() {
            Ok(p) => p,
            // An unexpected-EOF I/O error is treated as the normal end of the stream.
            Err(SymphoniaError::IoError(ref e)) if e.kind() == io::ErrorKind::UnexpectedEof => {
                break
            }
            Err(SymphoniaError::ResetRequired) => {
                decoder.reset();
                continue;
            }
            Err(e) => return Err(e.into()),
        };
        // Packets belonging to other tracks are ignored.
        if packet.track_id() != track_id {
            continue;
        }
        let decoded = match decoder.decode(&packet) {
            Ok(d) => d,
            // A malformed packet is skipped rather than failing the whole file.
            Err(SymphoniaError::DecodeError(_)) => continue,
            Err(e) => return Err(e.into()),
        };
        if decoded_spec.is_none() {
            let spec = decoded.spec();
            decoded_spec = Some((spec.rate, spec.channels.count() as u16));
        }
        append_samples(&decoded, &mut interleaved);
    }

    let (sample_rate, channels) = match decoded_spec {
        Some(spec) => spec,
        // Nothing was decoded: fall back to what the container declared.
        None => {
            let rate = codec_params
                .sample_rate
                .ok_or_else(|| AudioError::Decode("unknown sample rate".to_string()))?;
            let count = codec_params
                .channels
                .map(|c| c.count() as u16)
                .unwrap_or(1);
            (rate, count)
        }
    };
    // A zero rate has no valid resampling ratio; report it instead of passing it on.
    if sample_rate == 0 {
        return Err(AudioError::Decode("invalid sample rate: 0".to_string()));
    }

    let mono = mixdown_to_mono(&interleaved, channels);
    let samples = resample_sinc(&mono, sample_rate, TARGET_SAMPLE_RATE);
    Ok(PcmAudio {
        samples,
        sample_rate: TARGET_SAMPLE_RATE,
        channels: TARGET_CHANNELS,
    })
}
/// Appends every sample in `buffer` to `out` as interleaved `f32` values.
///
/// Samples are written frame by frame, visiting the buffer's planes in order
/// within each frame. Integer formats are scaled by their half-range
/// (128, 32768, 8388608 or 2147483648); unsigned formats are first re-centred by
/// subtracting that same value. `f32` is copied as-is and `f64` is narrowed.
fn append_samples(buffer: &AudioBufferRef, out: &mut Vec<f32>) {
    // The traversal is the same for every sample format; only the expression that
    // turns one sample into an `f32` differs, so each arm supplies just that.
    macro_rules! push_interleaved {
        ($buf:ident, |$sample:ident| $to_f32:expr) => {{
            let planes = $buf.planes();
            let channels = planes.planes();
            for index in 0..$buf.frames() {
                for channel in channels {
                    let $sample = channel[index];
                    out.push($to_f32);
                }
            }
        }};
    }

    match buffer {
        AudioBufferRef::U8(buf) => push_interleaved!(buf, |s| (s as f32 - 128.0) / 128.0),
        AudioBufferRef::U16(buf) => push_interleaved!(buf, |s| (s as f32 - 32768.0) / 32768.0),
        AudioBufferRef::U24(buf) => {
            push_interleaved!(buf, |s| (s.inner() as f32 - 8388608.0) / 8388608.0)
        }
        AudioBufferRef::U32(buf) => {
            // Re-centre in f64 before narrowing to f32.
            push_interleaved!(buf, |s| (s as f64 - 2147483648.0) as f32 / 2147483648.0)
        }
        AudioBufferRef::S8(buf) => push_interleaved!(buf, |s| s as f32 / 128.0),
        AudioBufferRef::S16(buf) => push_interleaved!(buf, |s| s as f32 / 32768.0),
        AudioBufferRef::S24(buf) => push_interleaved!(buf, |s| s.inner() as f32 / 8388608.0),
        AudioBufferRef::S32(buf) => push_interleaved!(buf, |s| s as f32 / 2147483648.0),
        AudioBufferRef::F32(buf) => push_interleaved!(buf, |s| s),
        AudioBufferRef::F64(buf) => push_interleaved!(buf, |s| s as f32),
    }
}
/// Averages interleaved multi-channel samples into a single mono channel.
///
/// With `channels` of 0 or 1 the input is returned as an unchanged copy.
/// Otherwise each complete frame of `channels` samples becomes one output sample
/// (the arithmetic mean of the frame); a trailing partial frame is dropped.
fn mixdown_to_mono(interleaved: &[f32], channels: u16) -> Vec<f32> {
    let width = usize::from(channels);
    if width <= 1 {
        return interleaved.to_vec();
    }

    let divisor = width as f32;
    interleaved
        .chunks_exact(width)
        // Explicit fold from +0.0 keeps the left-to-right summation order.
        .map(|frame| frame.iter().fold(0.0f32, |sum, &sample| sum + sample) / divisor)
        .collect()
}
/// Converts interleaved PCM to 16 kHz mono for STT processing.
///
/// `samples` is interpreted as `channels` interleaved channels at `sample_rate`.
/// The channels are averaged to mono first, then the mono signal is resampled to
/// `TARGET_SAMPLE_RATE`.
pub fn resample_and_mixdown(samples: &[f32], sample_rate: u32, channels: u16) -> Vec<f32> {
    resample_sinc(&mixdown_to_mono(samples, channels), sample_rate, TARGET_SAMPLE_RATE)
}
/// Resamples a mono signal from `input_rate` to `output_rate` with a windowed-sinc kernel.
///
/// Each output sample is a weighted sum of up to 65 input samples (32 on either
/// side of the nearest earlier input sample). The weights are a sinc, low-passed
/// to the lower of the two rates, multiplied by a raised-cosine (Hann) window, and
/// normalised so they sum to one. Taps that fall outside the input are skipped, so
/// the edges are renormalised over the samples that exist.
///
/// Returns a copy of `input` when the rates are equal. Returns an empty vector
/// when `input` is empty or when either rate is zero, because no conversion ratio
/// exists for a zero rate. Otherwise the output holds
/// `ceil(input.len() * output_rate / input_rate)` samples.
fn resample_sinc(input: &[f32], input_rate: u32, output_rate: u32) -> Vec<f32> {
    if input_rate == output_rate {
        return input.to_vec();
    }
    // A zero input rate would make the ratio 0 and the output length infinite,
    // which previously panicked while allocating the output buffer.
    if input.is_empty() || input_rate == 0 || output_rate == 0 {
        return Vec::new();
    }

    // Half-width of the kernel, in input samples.
    const RADIUS: i64 = 32;
    let radius_f = RADIUS as f64;
    let pi = std::f64::consts::PI;

    let ratio = input_rate as f64 / output_rate as f64;
    let output_len = ((input.len() as f64) / ratio).ceil() as usize;
    // Normalised cutoff: below 1.0 only when downsampling.
    let cutoff = (output_rate as f64 / input_rate as f64).min(1.0);
    // 64-bit indices so very long inputs cannot saturate or overflow the tap index.
    let input_len = input.len() as i64;

    let mut output = Vec::with_capacity(output_len);
    for n in 0..output_len {
        // Position of this output sample on the input time axis.
        let t = n as f64 * ratio;
        let center = t.floor() as i64;
        let frac = t - (center as f64);
        let mut acc = 0.0f64;
        let mut norm = 0.0f64;
        for k in -RADIUS..=RADIUS {
            let idx = center + k;
            if idx < 0 || idx >= input_len {
                continue;
            }
            // Signed distance from the output position to this tap.
            let x = (k as f64) - frac;
            let d = x.abs();
            if d > radius_f {
                continue;
            }
            let window = 0.5 * (1.0 + (pi * d / radius_f).cos());
            let z = x * cutoff;
            let sinc = if z == 0.0 {
                1.0
            } else {
                let pz = pi * z;
                pz.sin() / pz
            };
            let weight = cutoff * sinc * window;
            acc += input[idx as usize] as f64 * weight;
            norm += weight;
        }
        // Guard against a degenerate all-zero weight sum.
        let y = if norm == 0.0 { 0.0 } else { acc / norm };
        output.push(y as f32);
    }
    output
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Serialises `samples` as an in-memory 16-bit PCM WAV file.
    ///
    /// `samples` is written verbatim as the data chunk, so multi-channel input
    /// must already be interleaved.
    fn create_wav_buffer(sample_rate: u32, channels: u16, samples: &[i16]) -> Vec<u8> {
        let data_size = (samples.len() * 2) as u32;
        let file_size = 36 + data_size;
        let byte_rate = sample_rate * channels as u32 * 2;
        let block_align = channels * 2;

        let mut wav = Vec::new();
        // RIFF container header.
        wav.extend_from_slice(b"RIFF");
        wav.extend_from_slice(&file_size.to_le_bytes());
        wav.extend_from_slice(b"WAVE");
        // "fmt " chunk: 16-byte body, format tag 1 (PCM), 16 bits per sample.
        wav.extend_from_slice(b"fmt ");
        wav.extend_from_slice(&16u32.to_le_bytes());
        wav.extend_from_slice(&1u16.to_le_bytes());
        wav.extend_from_slice(&channels.to_le_bytes());
        wav.extend_from_slice(&sample_rate.to_le_bytes());
        wav.extend_from_slice(&byte_rate.to_le_bytes());
        wav.extend_from_slice(&block_align.to_le_bytes());
        wav.extend_from_slice(&16u16.to_le_bytes());
        // "data" chunk: little-endian samples.
        wav.extend_from_slice(b"data");
        wav.extend_from_slice(&data_size.to_le_bytes());
        wav.extend(samples.iter().flat_map(|sample| sample.to_le_bytes()));
        wav
    }

    /// A stereo stream with one channel at 10 000 and the other silent must
    /// come out as mono at half that amplitude.
    #[test]
    fn converts_stereo_to_mono() {
        let frame_count = TARGET_SAMPLE_RATE / 10;
        let samples: Vec<i16> = (0..frame_count)
            .flat_map(|_| [10_000i16, 0i16])
            .collect();

        let wav = create_wav_buffer(TARGET_SAMPLE_RATE, 2, &samples);
        let normalized = to_16k_mono_from_reader(Cursor::new(wav)).unwrap();

        assert_eq!(normalized.sample_rate, TARGET_SAMPLE_RATE);
        assert_eq!(normalized.channels, TARGET_CHANNELS);

        let total = normalized.samples.iter().copied().sum::<f32>();
        let mean = total / normalized.samples.len() as f32;
        let expected = (10_000.0 / 32768.0) / 2.0;
        assert!((mean - expected).abs() < 1e-3);
    }

    /// One second of 48 kHz silence must become one second of 16 kHz silence.
    #[test]
    fn resamples_to_16k() {
        let silence: Vec<i16> = vec![0; 48_000];
        let wav = create_wav_buffer(48_000, 1, &silence);
        let normalized = to_16k_mono_from_reader(Cursor::new(wav)).unwrap();

        assert_eq!(normalized.sample_rate, TARGET_SAMPLE_RATE);
        assert_eq!(normalized.channels, TARGET_CHANNELS);
        assert_eq!(normalized.samples.len(), TARGET_SAMPLE_RATE as usize);

        let mut max_abs = 0.0f32;
        for value in normalized.samples.iter().copied() {
            max_abs = max_abs.max(value.abs());
        }
        assert!(max_abs <= 1e-6);
    }
}