diff options
Diffstat (limited to 'makima/src/audio.rs')
| -rw-r--r-- | makima/src/audio.rs | 375 |
1 files changed, 375 insertions, 0 deletions
diff --git a/makima/src/audio.rs b/makima/src/audio.rs new file mode 100644 index 0000000..acfe7ce --- /dev/null +++ b/makima/src/audio.rs @@ -0,0 +1,375 @@ +use std::fs::File; +use std::io::{self, Read, Seek}; +use std::path::Path; + +use symphonia::core::audio::{AudioBufferRef, Signal}; +use symphonia::core::codecs::{DecoderOptions, CODEC_TYPE_NULL}; +use symphonia::core::errors::Error as SymphoniaError; +use symphonia::core::formats::FormatOptions; +use symphonia::core::io::{MediaSourceStream, ReadOnlySource}; +use symphonia::core::meta::MetadataOptions; +use symphonia::core::probe::Hint; + +pub const TARGET_SAMPLE_RATE: u32 = 16_000; +pub const TARGET_CHANNELS: u16 = 1; + +#[derive(Debug, Clone)] +pub struct PcmAudio { + pub samples: Vec<f32>, + pub sample_rate: u32, + pub channels: u16, +} + +#[derive(Debug)] +pub enum AudioError { + Io(io::Error), + Decode(String), + UnsupportedFormat, + NoAudioTrack, +} + +impl std::fmt::Display for AudioError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + AudioError::Io(err) => write!(f, "io error: {err}"), + AudioError::Decode(err) => write!(f, "decode error: {err}"), + AudioError::UnsupportedFormat => write!(f, "unsupported audio format"), + AudioError::NoAudioTrack => write!(f, "no audio track found"), + } + } +} + +impl std::error::Error for AudioError {} + +impl From<io::Error> for AudioError { + fn from(value: io::Error) -> Self { + AudioError::Io(value) + } +} + +impl From<SymphoniaError> for AudioError { + fn from(value: SymphoniaError) -> Self { + match value { + SymphoniaError::IoError(e) => AudioError::Io(e), + SymphoniaError::Unsupported(_) => AudioError::UnsupportedFormat, + other => AudioError::Decode(other.to_string()), + } + } +} + +pub fn to_16k_mono_from_path(path: impl AsRef<Path>) -> Result<PcmAudio, AudioError> { + let path = path.as_ref(); + let file = File::open(path)?; + + let mut hint = Hint::new(); + if let Some(ext) = path.extension().and_then(|e| e.to_str()) { + hint.with_extension(ext); + } + + decode_to_16k_mono(file, hint) +} + +pub fn to_16k_mono_from_reader<R: Read + Seek + Send + Sync + 'static>( + reader: R, +) -> Result<PcmAudio, AudioError> { + decode_to_16k_mono(reader, Hint::new()) +} + +fn decode_to_16k_mono<R: Read + Seek + Send + Sync + 'static>( + reader: R, + hint: Hint, +) -> Result<PcmAudio, AudioError> { + let source = MediaSourceStream::new(Box::new(ReadOnlySource::new(reader)), Default::default()); + + let format_opts = FormatOptions::default(); + let metadata_opts = MetadataOptions::default(); + + let probed = symphonia::default::get_probe().format(&hint, source, &format_opts, &metadata_opts)?; + let mut format = probed.format; + + let track = format + .tracks() + .iter() + .find(|t| t.codec_params.codec != CODEC_TYPE_NULL) + .ok_or(AudioError::NoAudioTrack)?; + + let track_id = track.id; + let codec_params = track.codec_params.clone(); + + let sample_rate = codec_params.sample_rate.ok_or(AudioError::Decode( + "unknown sample rate".to_string(), + ))?; + let channels = codec_params + .channels + .map(|c| c.count() as u16) + .unwrap_or(1); + + let decoder_opts = DecoderOptions::default(); + let mut decoder = symphonia::default::get_codecs().make(&codec_params, &decoder_opts)?; + + let mut interleaved: Vec<f32> = Vec::new(); + + loop { + let packet = match format.next_packet() { + Ok(p) => p, + Err(SymphoniaError::IoError(ref e)) if e.kind() == io::ErrorKind::UnexpectedEof => break, + Err(SymphoniaError::ResetRequired) => { + decoder.reset(); + continue; + } + Err(e) => return Err(e.into()), + }; + + if packet.track_id() != track_id { + continue; + } + + let decoded = match decoder.decode(&packet) { + Ok(d) => d, + Err(SymphoniaError::DecodeError(_)) => continue, + Err(e) => return Err(e.into()), + }; + + append_samples(&decoded, &mut interleaved); + } + + let mono = mixdown_to_mono(&interleaved, channels); + let samples = resample_sinc(&mono, sample_rate, TARGET_SAMPLE_RATE); + + Ok(PcmAudio { + samples, + sample_rate: TARGET_SAMPLE_RATE, + channels: TARGET_CHANNELS, + }) +} + +fn append_samples(buffer: &AudioBufferRef, out: &mut Vec<f32>) { + match buffer { + AudioBufferRef::U8(buf) => { + for frame in 0..buf.frames() { + for plane in buf.planes().planes() { + out.push((plane[frame] as f32 - 128.0) / 128.0); + } + } + } + AudioBufferRef::U16(buf) => { + for frame in 0..buf.frames() { + for plane in buf.planes().planes() { + out.push((plane[frame] as f32 - 32768.0) / 32768.0); + } + } + } + AudioBufferRef::U24(buf) => { + for frame in 0..buf.frames() { + for plane in buf.planes().planes() { + out.push((plane[frame].inner() as f32 - 8388608.0) / 8388608.0); + } + } + } + AudioBufferRef::U32(buf) => { + for frame in 0..buf.frames() { + for plane in buf.planes().planes() { + out.push((plane[frame] as f64 - 2147483648.0) as f32 / 2147483648.0); + } + } + } + AudioBufferRef::S8(buf) => { + for frame in 0..buf.frames() { + for plane in buf.planes().planes() { + out.push(plane[frame] as f32 / 128.0); + } + } + } + AudioBufferRef::S16(buf) => { + for frame in 0..buf.frames() { + for plane in buf.planes().planes() { + out.push(plane[frame] as f32 / 32768.0); + } + } + } + AudioBufferRef::S24(buf) => { + for frame in 0..buf.frames() { + for plane in buf.planes().planes() { + out.push(plane[frame].inner() as f32 / 8388608.0); + } + } + } + AudioBufferRef::S32(buf) => { + for frame in 0..buf.frames() { + for plane in buf.planes().planes() { + out.push(plane[frame] as f32 / 2147483648.0); + } + } + } + AudioBufferRef::F32(buf) => { + for frame in 0..buf.frames() { + for plane in buf.planes().planes() { + out.push(plane[frame]); + } + } + } + AudioBufferRef::F64(buf) => { + for frame in 0..buf.frames() { + for plane in buf.planes().planes() { + out.push(plane[frame] as f32); + } + } + } + } +} + +fn mixdown_to_mono(interleaved: &[f32], channels: u16) -> Vec<f32> { + if channels <= 1 { + return interleaved.to_vec(); + } + + let channels = channels as usize; + let frames = interleaved.len() / channels; + + let mut mono = Vec::with_capacity(frames); + for frame in 0..frames { + let base = frame * channels; + let mut acc = 0.0f32; + for c in 0..channels { + acc += interleaved[base + c]; + } + mono.push(acc / channels as f32); + } + + mono +} + +fn resample_sinc(input: &[f32], input_rate: u32, output_rate: u32) -> Vec<f32> { + if input_rate == output_rate { + return input.to_vec(); + } + if input.is_empty() { + return Vec::new(); + } + + let ratio = input_rate as f64 / output_rate as f64; + let output_len = ((input.len() as f64) / ratio).ceil() as usize; + + let cutoff = (output_rate as f64 / input_rate as f64).min(1.0); + + let radius: i32 = 32; + let radius_f = radius as f64; + let pi = std::f64::consts::PI; + + let mut output = Vec::with_capacity(output_len); + for n in 0..output_len { + let t = n as f64 * ratio; + let center = t.floor() as i32; + let frac = t - (center as f64); + + let mut acc = 0.0f64; + let mut norm = 0.0f64; + + for k in -radius..=radius { + let idx = center + k; + if idx < 0 || (idx as usize) >= input.len() { + continue; + } + + let x = (k as f64) - frac; + let d = x.abs(); + if d > radius_f { + continue; + } + + let window = 0.5 * (1.0 + (pi * d / radius_f).cos()); + + let z = x * cutoff; + let sinc = if z == 0.0 { + 1.0 + } else { + let pz = pi * z; + pz.sin() / pz + }; + + let weight = cutoff * sinc * window; + acc += input[idx as usize] as f64 * weight; + norm += weight; + } + + let y = if norm == 0.0 { 0.0 } else { acc / norm }; + output.push(y as f32); + } + + output +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Cursor; + + fn create_wav_buffer(sample_rate: u32, channels: u16, samples: &[i16]) -> Vec<u8> { + let mut buf = Vec::new(); + let data_size = (samples.len() * 2) as u32; + let file_size = 36 + data_size; + + buf.extend_from_slice(b"RIFF"); + buf.extend_from_slice(&file_size.to_le_bytes()); + buf.extend_from_slice(b"WAVE"); + + buf.extend_from_slice(b"fmt "); + buf.extend_from_slice(&16u32.to_le_bytes()); + buf.extend_from_slice(&1u16.to_le_bytes()); + buf.extend_from_slice(&channels.to_le_bytes()); + buf.extend_from_slice(&sample_rate.to_le_bytes()); + let byte_rate = sample_rate * channels as u32 * 2; + buf.extend_from_slice(&byte_rate.to_le_bytes()); + let block_align = channels * 2; + buf.extend_from_slice(&block_align.to_le_bytes()); + buf.extend_from_slice(&16u16.to_le_bytes()); + + buf.extend_from_slice(b"data"); + buf.extend_from_slice(&data_size.to_le_bytes()); + for &s in samples { + buf.extend_from_slice(&s.to_le_bytes()); + } + + buf + } + + #[test] + fn converts_stereo_to_mono() { + let mut samples = Vec::new(); + for _ in 0..(TARGET_SAMPLE_RATE / 10) { + samples.push(10_000i16); + samples.push(0i16); + } + + let wav = create_wav_buffer(TARGET_SAMPLE_RATE, 2, &samples); + let cursor = Cursor::new(wav); + + let normalized = to_16k_mono_from_reader(cursor).unwrap(); + + assert_eq!(normalized.sample_rate, TARGET_SAMPLE_RATE); + assert_eq!(normalized.channels, TARGET_CHANNELS); + let mean = + normalized.samples.iter().copied().sum::<f32>() / normalized.samples.len() as f32; + let expected = (10_000.0 / 32768.0) / 2.0; + assert!((mean - expected).abs() < 1e-3); + } + + #[test] + fn resamples_to_16k() { + let samples: Vec<i16> = vec![0; 48_000]; + let wav = create_wav_buffer(48_000, 1, &samples); + let cursor = Cursor::new(wav); + + let normalized = to_16k_mono_from_reader(cursor).unwrap(); + + assert_eq!(normalized.sample_rate, TARGET_SAMPLE_RATE); + assert_eq!(normalized.channels, TARGET_CHANNELS); + assert_eq!(normalized.samples.len(), TARGET_SAMPLE_RATE as usize); + let max_abs = normalized + .samples + .iter() + .copied() + .fold(0.0f32, |m, v| m.max(v.abs())); + assert!(max_abs <= 1e-6); + } +} |
