summaryrefslogtreecommitdiff
path: root/makima
diff options
context:
space:
mode:
authorsoryu <soryu@soryu.co>2025-12-19 04:43:59 +0000
committersoryu <soryu@soryu.co>2025-12-23 14:47:18 +0000
commitab9166170043ba5e0ce974e5b7accf0939d686e3 (patch)
treed65be5b7df0dda330fbb2c03f444a5ee02009dd5 /makima
parentb065e5d6a7cd157dad858b12ecae4624df172ee0 (diff)
downloadsoryu-ab9166170043ba5e0ce974e5b7accf0939d686e3.tar.gz
soryu-ab9166170043ba5e0ce974e5b7accf0939d686e3.zip
Experiment: ChatterBoxTTS
Diffstat (limited to 'makima')
-rw-r--r--makima/Cargo.lock1388
-rw-r--r--makima/Cargo.toml6
-rw-r--r--makima/src/audio.rs375
-rw-r--r--makima/src/listen.rs97
-rw-r--r--makima/src/main.rs38
-rw-r--r--makima/src/tts.rs580
6 files changed, 2427 insertions, 57 deletions
diff --git a/makima/Cargo.lock b/makima/Cargo.lock
index 1fcd019..f3fea81 100644
--- a/makima/Cargo.lock
+++ b/makima/Cargo.lock
@@ -9,6 +9,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
[[package]]
+name = "ahash"
+version = "0.8.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
+dependencies = [
+ "cfg-if",
+ "getrandom 0.3.4",
+ "once_cell",
+ "serde",
+ "version_check",
+ "zerocopy",
+]
+
+[[package]]
name = "aho-corasick"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -18,6 +32,18 @@ dependencies = [
]
[[package]]
+name = "arrayvec"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
+
+[[package]]
+name = "atomic-waker"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
+
+[[package]]
name = "autocfg"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -43,6 +69,12 @@ checksum = "0e050f626429857a27ddccb31e0aca21356bfa709c04041aefddac081a8f068a"
[[package]]
name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
+[[package]]
+name = "bitflags"
version = "2.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
@@ -63,6 +95,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43"
[[package]]
+name = "bytemuck"
+version = "1.24.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fbdf580320f38b612e485521afda1ee26d10cc9884efaaa750d383e13e3c5f4"
+
+[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -75,6 +113,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3"
[[package]]
+name = "castaway"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a"
+dependencies = [
+ "rustversion",
+]
+
+[[package]]
name = "cc"
version = "1.2.49"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -91,6 +138,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
+name = "compact_str"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fdb1325a1cece981e8a296ab8f0f9b63ae357bd0784a9faaf548cc7b480707a"
+dependencies = [
+ "castaway",
+ "cfg-if",
+ "itoa",
+ "rustversion",
+ "ryu",
+ "serde",
+ "static_assertions",
+]
+
+[[package]]
name = "console"
version = "0.15.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -208,6 +270,15 @@ dependencies = [
]
[[package]]
+name = "dary_heap"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06d2e3287df1c007e74221c49ca10a95d557349e54b3a75dc2fb14712c751f04"
+dependencies = [
+ "serde",
+]
+
+[[package]]
name = "der"
version = "0.7.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -259,6 +330,38 @@ dependencies = [
]
[[package]]
+name = "dirs"
+version = "6.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e"
+dependencies = [
+ "dirs-sys",
+]
+
+[[package]]
+name = "dirs-sys"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab"
+dependencies = [
+ "libc",
+ "option-ext",
+ "redox_users",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "displaydoc"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
name = "either"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -271,6 +374,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
[[package]]
+name = "encoding_rs"
+version = "0.8.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "equivalent"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
+
+[[package]]
name = "errno"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -290,6 +408,12 @@ dependencies = [
]
[[package]]
+name = "extended"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af9673d8203fcb076b19dfd17e38b3d4ae9f44959416ea532ce72415a6020365"
+
+[[package]]
name = "eyre"
version = "0.6.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -355,6 +479,104 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
[[package]]
+name = "form_urlencoded"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf"
+dependencies = [
+ "percent-encoding",
+]
+
+[[package]]
+name = "futures"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-executor",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-channel"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+]
+
+[[package]]
+name = "futures-core"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
+
+[[package]]
+name = "futures-executor"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-io"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
+
+[[package]]
+name = "futures-macro"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "futures-sink"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
+
+[[package]]
+name = "futures-task"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
+
+[[package]]
+name = "futures-util"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-io",
+ "futures-macro",
+ "futures-sink",
+ "futures-task",
+ "memchr",
+ "pin-project-lite",
+ "pin-utils",
+ "slab",
+]
+
+[[package]]
name = "generic-array"
version = "0.14.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -388,6 +610,61 @@ dependencies = [
]
[[package]]
+name = "h2"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "fnv",
+ "futures-core",
+ "futures-sink",
+ "http",
+ "indexmap",
+ "slab",
+ "tokio",
+ "tokio-util",
+ "tracing",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+
+[[package]]
+name = "hermit-abi"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
+
+[[package]]
+name = "hf-hub"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97"
+dependencies = [
+ "dirs",
+ "futures",
+ "http",
+ "indicatif",
+ "libc",
+ "log",
+ "native-tls",
+ "num_cpus",
+ "rand 0.9.2",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "thiserror 2.0.17",
+ "tokio",
+ "ureq 2.12.1",
+ "windows-sys 0.60.2",
+]
+
+[[package]]
name = "hound"
version = "3.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -404,24 +681,239 @@ dependencies = [
]
[[package]]
+name = "http-body"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
+dependencies = [
+ "bytes",
+ "http",
+]
+
+[[package]]
+name = "http-body-util"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "pin-project-lite",
+]
+
+[[package]]
name = "httparse"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
[[package]]
+name = "hyper"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "futures-channel",
+ "futures-core",
+ "h2",
+ "http",
+ "http-body",
+ "httparse",
+ "itoa",
+ "pin-project-lite",
+ "pin-utils",
+ "smallvec 1.15.1",
+ "tokio",
+ "want",
+]
+
+[[package]]
+name = "hyper-rustls"
+version = "0.27.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
+dependencies = [
+ "http",
+ "hyper",
+ "hyper-util",
+ "rustls",
+ "rustls-pki-types",
+ "tokio",
+ "tokio-rustls",
+ "tower-service",
+]
+
+[[package]]
+name = "hyper-tls"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
+dependencies = [
+ "bytes",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "native-tls",
+ "tokio",
+ "tokio-native-tls",
+ "tower-service",
+]
+
+[[package]]
+name = "hyper-util"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "727805d60e7938b76b826a6ef209eb70eaa1812794f9424d4a4e2d740662df5f"
+dependencies = [
+ "base64 0.22.1",
+ "bytes",
+ "futures-channel",
+ "futures-core",
+ "futures-util",
+ "http",
+ "http-body",
+ "hyper",
+ "ipnet",
+ "libc",
+ "percent-encoding",
+ "pin-project-lite",
+ "socket2",
+ "system-configuration",
+ "tokio",
+ "tower-service",
+ "tracing",
+ "windows-registry",
+]
+
+[[package]]
+name = "icu_collections"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
+dependencies = [
+ "displaydoc",
+ "potential_utf",
+ "yoke",
+ "zerofrom",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_locale_core"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
+dependencies = [
+ "displaydoc",
+ "litemap",
+ "tinystr",
+ "writeable",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599"
+dependencies = [
+ "icu_collections",
+ "icu_normalizer_data",
+ "icu_properties",
+ "icu_provider",
+ "smallvec 1.15.1",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer_data"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a"
+
+[[package]]
+name = "icu_properties"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec"
+dependencies = [
+ "icu_collections",
+ "icu_locale_core",
+ "icu_properties_data",
+ "icu_provider",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_properties_data"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af"
+
+[[package]]
+name = "icu_provider"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
+dependencies = [
+ "displaydoc",
+ "icu_locale_core",
+ "writeable",
+ "yoke",
+ "zerofrom",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
name = "ident_case"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
+name = "idna"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de"
+dependencies = [
+ "idna_adapter",
+ "smallvec 1.15.1",
+ "utf8_iter",
+]
+
+[[package]]
+name = "idna_adapter"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344"
+dependencies = [
+ "icu_normalizer",
+ "icu_properties",
+]
+
+[[package]]
name = "indenter"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "964de6e86d545b246d84badc0fef527924ace5134f30641c203ef52ba83f58d5"
[[package]]
+name = "indexmap"
+version = "2.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2"
+dependencies = [
+ "equivalent",
+ "hashbrown",
+]
+
+[[package]]
name = "indicatif"
version = "0.17.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -435,6 +927,22 @@ dependencies = [
]
[[package]]
+name = "ipnet"
+version = "2.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
+
+[[package]]
+name = "iri-string"
+version = "0.7.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f867b9d1d896b67beb18518eda36fdb77a32ea590de864f1325b294a6d14397"
+dependencies = [
+ "memchr",
+ "serde",
+]
+
+[[package]]
name = "itertools"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -453,6 +961,15 @@ dependencies = [
]
[[package]]
+name = "itertools"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
+dependencies = [
+ "either",
+]
+
+[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -486,7 +1003,7 @@ version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb"
dependencies = [
- "bitflags",
+ "bitflags 2.10.0",
"libc",
"redox_syscall",
]
@@ -498,6 +1015,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039"
[[package]]
+name = "litemap"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
+
+[[package]]
name = "log"
version = "0.4.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -523,8 +1046,12 @@ checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30"
name = "makima"
version = "0.1.0"
dependencies = [
- "hound",
+ "hf-hub",
+ "ndarray",
+ "ort",
"parakeet-rs",
+ "symphonia",
+ "tokenizers 0.21.4",
]
[[package]]
@@ -544,6 +1071,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
[[package]]
+name = "mime"
+version = "0.3.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
+
+[[package]]
name = "minimal-lexical"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -560,6 +1093,17 @@ dependencies = [
]
[[package]]
+name = "mio"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc"
+dependencies = [
+ "libc",
+ "wasi",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
name = "monostate"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -651,6 +1195,16 @@ dependencies = [
]
[[package]]
+name = "num_cpus"
+version = "1.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
+dependencies = [
+ "hermit-abi",
+ "libc",
+]
+
+[[package]]
name = "number_prefix"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -668,7 +1222,7 @@ version = "6.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "336b9c63443aceef14bea841b899035ae3abe89b7c486aaf4c5bd8aafedac3f0"
dependencies = [
- "bitflags",
+ "bitflags 2.10.0",
"libc",
"once_cell",
"onig_sys",
@@ -690,7 +1244,7 @@ version = "0.10.75"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328"
dependencies = [
- "bitflags",
+ "bitflags 2.10.0",
"cfg-if",
"foreign-types",
"libc",
@@ -729,6 +1283,12 @@ dependencies = [
]
[[package]]
+name = "option-ext"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
+
+[[package]]
name = "ort"
version = "2.0.0-rc.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -750,7 +1310,7 @@ dependencies = [
"pkg-config",
"sha2",
"tar",
- "ureq",
+ "ureq 3.1.4",
]
[[package]]
@@ -766,7 +1326,7 @@ dependencies = [
"rustfft",
"serde",
"serde_json",
- "tokenizers",
+ "tokenizers 0.20.4",
]
[[package]]
@@ -797,6 +1357,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
[[package]]
+name = "pin-utils"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
+
+[[package]]
name = "pkg-config"
version = "0.3.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -818,6 +1384,15 @@ dependencies = [
]
[[package]]
+name = "potential_utf"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77"
+dependencies = [
+ "zerovec",
+]
+
+[[package]]
name = "ppv-lite86"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -866,8 +1441,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
- "rand_chacha",
- "rand_core",
+ "rand_chacha 0.3.1",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand"
+version = "0.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
+dependencies = [
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.3",
]
[[package]]
@@ -877,7 +1462,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
- "rand_core",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.3",
]
[[package]]
@@ -890,6 +1485,15 @@ dependencies = [
]
[[package]]
+name = "rand_core"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
+dependencies = [
+ "getrandom 0.3.4",
+]
+
+[[package]]
name = "rawpointer"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -917,6 +1521,17 @@ dependencies = [
]
[[package]]
+name = "rayon-cond"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f"
+dependencies = [
+ "either",
+ "itertools 0.14.0",
+ "rayon",
+]
+
+[[package]]
name = "rayon-core"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -932,7 +1547,18 @@ version = "0.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
dependencies = [
- "bitflags",
+ "bitflags 2.10.0",
+]
+
+[[package]]
+name = "redox_users"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac"
+dependencies = [
+ "getrandom 0.2.16",
+ "libredox",
+ "thiserror 2.0.17",
]
[[package]]
@@ -965,6 +1591,63 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
[[package]]
+name = "reqwest"
+version = "0.12.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b4c14b2d9afca6a60277086b0cc6a6ae0b568f6f7916c943a8cdc79f8be240f"
+dependencies = [
+ "base64 0.22.1",
+ "bytes",
+ "encoding_rs",
+ "futures-core",
+ "futures-util",
+ "h2",
+ "http",
+ "http-body",
+ "http-body-util",
+ "hyper",
+ "hyper-rustls",
+ "hyper-tls",
+ "hyper-util",
+ "js-sys",
+ "log",
+ "mime",
+ "native-tls",
+ "percent-encoding",
+ "pin-project-lite",
+ "rustls-pki-types",
+ "serde",
+ "serde_json",
+ "serde_urlencoded",
+ "sync_wrapper",
+ "tokio",
+ "tokio-native-tls",
+ "tokio-util",
+ "tower",
+ "tower-http",
+ "tower-service",
+ "url",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "wasm-streams",
+ "web-sys",
+]
+
+[[package]]
+name = "ring"
+version = "0.17.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
+dependencies = [
+ "cc",
+ "cfg-if",
+ "getrandom 0.2.16",
+ "libc",
+ "untrusted",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
name = "rustfft"
version = "6.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -984,7 +1667,7 @@ version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e"
dependencies = [
- "bitflags",
+ "bitflags 2.10.0",
"errno",
"libc",
"linux-raw-sys",
@@ -992,6 +1675,21 @@ dependencies = [
]
[[package]]
+name = "rustls"
+version = "0.23.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "533f54bc6a7d4f647e46ad909549eda97bf5afc1585190ef692b4286b198bd8f"
+dependencies = [
+ "log",
+ "once_cell",
+ "ring",
+ "rustls-pki-types",
+ "rustls-webpki",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
name = "rustls-pki-types"
version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1001,6 +1699,17 @@ dependencies = [
]
[[package]]
+name = "rustls-webpki"
+version = "0.103.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52"
+dependencies = [
+ "ring",
+ "rustls-pki-types",
+ "untrusted",
+]
+
+[[package]]
name = "rustversion"
version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1027,7 +1736,7 @@ version = "2.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
dependencies = [
- "bitflags",
+ "bitflags 2.10.0",
"core-foundation",
"core-foundation-sys",
"libc",
@@ -1088,6 +1797,18 @@ dependencies = [
]
[[package]]
+name = "serde_urlencoded"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd"
+dependencies = [
+ "form_urlencoded",
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
name = "sha2"
version = "0.10.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1111,6 +1832,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2"
[[package]]
+name = "slab"
+version = "0.4.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589"
+
+[[package]]
name = "smallvec"
version = "1.15.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1123,6 +1850,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d44cfb396c3caf6fbfd0ab422af02631b69ddd96d2eff0b0f0724f9024051b"
[[package]]
+name = "socket2"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "17129e116933cf371d018bb80ae557e889637989d8638274fb25622827b03881"
+dependencies = [
+ "libc",
+ "windows-sys 0.60.2",
+]
+
+[[package]]
name = "socks"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1146,6 +1883,18 @@ dependencies = [
]
[[package]]
+name = "stable_deref_trait"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
+
+[[package]]
+name = "static_assertions"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
+
+[[package]]
name = "strength_reduce"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1158,6 +1907,170 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
+name = "subtle"
+version = "2.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
+
+[[package]]
+name = "symphonia"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5773a4c030a19d9bfaa090f49746ff35c75dfddfa700df7a5939d5e076a57039"
+dependencies = [
+ "lazy_static",
+ "symphonia-bundle-flac",
+ "symphonia-bundle-mp3",
+ "symphonia-codec-aac",
+ "symphonia-codec-adpcm",
+ "symphonia-codec-pcm",
+ "symphonia-codec-vorbis",
+ "symphonia-core",
+ "symphonia-format-mkv",
+ "symphonia-format-ogg",
+ "symphonia-format-riff",
+ "symphonia-metadata",
+]
+
+[[package]]
+name = "symphonia-bundle-flac"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c91565e180aea25d9b80a910c546802526ffd0072d0b8974e3ebe59b686c9976"
+dependencies = [
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-bundle-mp3"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4872dd6bb56bf5eac799e3e957aa1981086c3e613b27e0ac23b176054f7c57ed"
+dependencies = [
+ "lazy_static",
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+]
+
+[[package]]
+name = "symphonia-codec-aac"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c263845aa86881416849c1729a54c7f55164f8b96111dba59de46849e73a790"
+dependencies = [
+ "lazy_static",
+ "log",
+ "symphonia-core",
+]
+
+[[package]]
+name = "symphonia-codec-adpcm"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2dddc50e2bbea4cfe027441eece77c46b9f319748605ab8f3443350129ddd07f"
+dependencies = [
+ "log",
+ "symphonia-core",
+]
+
+[[package]]
+name = "symphonia-codec-pcm"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e89d716c01541ad3ebe7c91ce4c8d38a7cf266a3f7b2f090b108fb0cb031d95"
+dependencies = [
+ "log",
+ "symphonia-core",
+]
+
+[[package]]
+name = "symphonia-codec-vorbis"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f025837c309cd69ffef572750b4a2257b59552c5399a5e49707cc5b1b85d1c73"
+dependencies = [
+ "log",
+ "symphonia-core",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-core"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea00cc4f79b7f6bb7ff87eddc065a1066f3a43fe1875979056672c9ef948c2af"
+dependencies = [
+ "arrayvec",
+ "bitflags 1.3.2",
+ "bytemuck",
+ "lazy_static",
+ "log",
+]
+
+[[package]]
+name = "symphonia-format-mkv"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "122d786d2c43a49beb6f397551b4a050d8229eaa54c7ddf9ee4b98899b8742d0"
+dependencies = [
+ "lazy_static",
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-format-ogg"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b4955c67c1ed3aa8ae8428d04ca8397fbef6a19b2b051e73b5da8b1435639cb"
+dependencies = [
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+ "symphonia-utils-xiph",
+]
+
+[[package]]
+name = "symphonia-format-riff"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2d7c3df0e7d94efb68401d81906eae73c02b40d5ec1a141962c592d0f11a96f"
+dependencies = [
+ "extended",
+ "log",
+ "symphonia-core",
+ "symphonia-metadata",
+]
+
+[[package]]
+name = "symphonia-metadata"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "36306ff42b9ffe6e5afc99d49e121e0bd62fe79b9db7b9681d48e29fa19e6b16"
+dependencies = [
+ "encoding_rs",
+ "lazy_static",
+ "log",
+ "symphonia-core",
+]
+
+[[package]]
+name = "symphonia-utils-xiph"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee27c85ab799a338446b68eec77abf42e1a6f1bb490656e121c6e27bfbab9f16"
+dependencies = [
+ "symphonia-core",
+ "symphonia-metadata",
+]
+
+[[package]]
name = "syn"
version = "2.0.111"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1169,6 +2082,47 @@ dependencies = [
]
[[package]]
+name = "sync_wrapper"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
+dependencies = [
+ "futures-core",
+]
+
+[[package]]
+name = "synstructure"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "system-configuration"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b"
+dependencies = [
+ "bitflags 2.10.0",
+ "core-foundation",
+ "system-configuration-sys",
+]
+
+[[package]]
+name = "system-configuration-sys"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
name = "tar"
version = "0.4.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1198,7 +2152,16 @@ version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
dependencies = [
- "thiserror-impl",
+ "thiserror-impl 1.0.69",
+]
+
+[[package]]
+name = "thiserror"
+version = "2.0.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8"
+dependencies = [
+ "thiserror-impl 2.0.17",
]
[[package]]
@@ -1213,6 +2176,27 @@ dependencies = [
]
[[package]]
+name = "thiserror-impl"
+version = "2.0.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tinystr"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869"
+dependencies = [
+ "displaydoc",
+ "zerovec",
+]
+
+[[package]]
name = "tokenizers"
version = "0.20.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1230,21 +2214,159 @@ dependencies = [
"monostate",
"onig",
"paste",
- "rand",
+ "rand 0.8.5",
+ "rayon",
+ "rayon-cond 0.3.0",
+ "regex",
+ "regex-syntax",
+ "serde",
+ "serde_json",
+ "spm_precompiled",
+ "thiserror 1.0.69",
+ "unicode-normalization-alignments",
+ "unicode-segmentation",
+ "unicode_categories",
+]
+
+[[package]]
+name = "tokenizers"
+version = "0.21.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a620b996116a59e184c2fa2dfd8251ea34a36d0a514758c6f966386bd2e03476"
+dependencies = [
+ "ahash",
+ "aho-corasick",
+ "compact_str",
+ "dary_heap",
+ "derive_builder",
+ "esaxx-rs",
+ "getrandom 0.3.4",
+ "indicatif",
+ "itertools 0.14.0",
+ "log",
+ "macro_rules_attribute",
+ "monostate",
+ "onig",
+ "paste",
+ "rand 0.9.2",
"rayon",
- "rayon-cond",
+ "rayon-cond 0.4.0",
"regex",
"regex-syntax",
"serde",
"serde_json",
"spm_precompiled",
- "thiserror",
+ "thiserror 2.0.17",
"unicode-normalization-alignments",
"unicode-segmentation",
"unicode_categories",
]
[[package]]
+name = "tokio"
+version = "1.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408"
+dependencies = [
+ "bytes",
+ "libc",
+ "mio",
+ "pin-project-lite",
+ "socket2",
+ "tokio-macros",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "tokio-macros"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tokio-native-tls"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
+dependencies = [
+ "native-tls",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-rustls"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61"
+dependencies = [
+ "rustls",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-util"
+version = "0.7.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2efa149fe76073d6e8fd97ef4f4eca7b67f599660115591483572e406e165594"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tower"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "pin-project-lite",
+ "sync_wrapper",
+ "tokio",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-http"
+version = "0.6.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8"
+dependencies = [
+ "bitflags 2.10.0",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "iri-string",
+ "pin-project-lite",
+ "tower",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+
+[[package]]
name = "tracing"
version = "0.1.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1274,6 +2396,12 @@ dependencies = [
]
[[package]]
+name = "try-lock"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
+
+[[package]]
name = "typenum"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1313,6 +2441,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
[[package]]
+name = "untrusted"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
+
+[[package]]
+name = "ureq"
+version = "2.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d"
+dependencies = [
+ "base64 0.22.1",
+ "flate2",
+ "log",
+ "native-tls",
+ "once_cell",
+ "rustls",
+ "rustls-pki-types",
+ "serde",
+ "serde_json",
+ "socks",
+ "url",
+ "webpki-roots 0.26.11",
+]
+
+[[package]]
name = "ureq"
version = "3.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1343,12 +2497,30 @@ dependencies = [
]
[[package]]
+name = "url"
+version = "2.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b"
+dependencies = [
+ "form_urlencoded",
+ "idna",
+ "percent-encoding",
+ "serde",
+]
+
+[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
+name = "utf8_iter"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
+
+[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1361,6 +2533,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
+name = "want"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e"
+dependencies = [
+ "try-lock",
+]
+
+[[package]]
name = "wasi"
version = "0.11.1+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1389,6 +2570,19 @@ dependencies = [
]
[[package]]
+name = "wasm-bindgen-futures"
+version = "0.4.56"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "once_cell",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
name = "wasm-bindgen-macro"
version = "0.2.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1421,6 +2615,29 @@ dependencies = [
]
[[package]]
+name = "wasm-streams"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65"
+dependencies = [
+ "futures-util",
+ "js-sys",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "web-sys",
+]
+
+[[package]]
+name = "web-sys"
+version = "0.3.83"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
name = "web-time"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1440,6 +2657,24 @@ dependencies = [
]
[[package]]
+name = "webpki-roots"
+version = "0.26.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9"
+dependencies = [
+ "webpki-roots 1.0.4",
+]
+
+[[package]]
+name = "webpki-roots"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2878ef029c47c6e8cf779119f20fcf52bde7ad42a731b2a304bc221df17571e"
+dependencies = [
+ "rustls-pki-types",
+]
+
+[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1468,6 +2703,44 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
[[package]]
+name = "windows-registry"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720"
+dependencies = [
+ "windows-link",
+ "windows-result",
+ "windows-strings",
+]
+
+[[package]]
+name = "windows-result"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-strings"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
+[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1630,6 +2903,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
[[package]]
+name = "writeable"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
+
+[[package]]
name = "xattr"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1640,6 +2919,29 @@ dependencies = [
]
[[package]]
+name = "yoke"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
+dependencies = [
+ "stable_deref_trait",
+ "yoke-derive",
+ "zerofrom",
+]
+
+[[package]]
+name = "yoke-derive"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "synstructure",
+]
+
+[[package]]
name = "zerocopy"
version = "0.8.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1660,7 +2962,61 @@ dependencies = [
]
[[package]]
+name = "zerofrom"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
+dependencies = [
+ "zerofrom-derive",
+]
+
+[[package]]
+name = "zerofrom-derive"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "synstructure",
+]
+
+[[package]]
name = "zeroize"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
+
+[[package]]
+name = "zerotrie"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
+dependencies = [
+ "displaydoc",
+ "yoke",
+ "zerofrom",
+]
+
+[[package]]
+name = "zerovec"
+version = "0.11.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
+dependencies = [
+ "yoke",
+ "zerofrom",
+ "zerovec-derive",
+]
+
+[[package]]
+name = "zerovec-derive"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
diff --git a/makima/Cargo.toml b/makima/Cargo.toml
index 3f3abb7..df7384f 100644
--- a/makima/Cargo.toml
+++ b/makima/Cargo.toml
@@ -5,4 +5,8 @@ edition = "2024"
[dependencies]
parakeet-rs = { version = "0.2.5", features = ["sortformer"] }
-hound = "3.5"
+symphonia = { version = "0.5", features = ["mp3", "aac", "flac", "ogg", "vorbis", "wav", "pcm"] }
+ort = "2.0.0-rc.10"
+tokenizers = "0.21"
+hf-hub = "0.4"
+ndarray = "0.16"
diff --git a/makima/src/audio.rs b/makima/src/audio.rs
new file mode 100644
index 0000000..acfe7ce
--- /dev/null
+++ b/makima/src/audio.rs
@@ -0,0 +1,375 @@
+use std::fs::File;
+use std::io::{self, Read, Seek};
+use std::path::Path;
+
+use symphonia::core::audio::{AudioBufferRef, Signal};
+use symphonia::core::codecs::{DecoderOptions, CODEC_TYPE_NULL};
+use symphonia::core::errors::Error as SymphoniaError;
+use symphonia::core::formats::FormatOptions;
+use symphonia::core::io::{MediaSourceStream, ReadOnlySource};
+use symphonia::core::meta::MetadataOptions;
+use symphonia::core::probe::Hint;
+
+/// Sample rate that decoded audio is resampled to by this module.
+pub const TARGET_SAMPLE_RATE: u32 = 16_000;
+/// Channel count reported for audio normalized by this module (mono).
+pub const TARGET_CHANNELS: u16 = 1;
+
+/// Decoded PCM audio held in memory as `f32` samples.
+#[derive(Debug, Clone)]
+pub struct PcmAudio {
+ /// Sample values; the decoder in this module stores one mixed-down channel here.
+ pub samples: Vec<f32>,
+ /// Sample rate that `samples` is expressed in.
+ pub sample_rate: u32,
+ /// Number of channels represented by `samples`.
+ pub channels: u16,
+}
+
+/// Failures that can occur while opening, probing or decoding an audio source.
+#[derive(Debug)]
+pub enum AudioError {
+ /// An underlying I/O operation failed.
+ Io(io::Error),
+ /// Decoding failed; the payload is a human-readable description.
+ Decode(String),
+ /// The container or codec is not supported.
+ UnsupportedFormat,
+ /// The source contains no track with a usable codec.
+ NoAudioTrack,
+}
+
+impl std::fmt::Display for AudioError {
+    /// Writes a short lowercase description of the error, including the wrapped
+    /// error or message where the variant carries one.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Io(source) => write!(f, "io error: {source}"),
+            Self::Decode(message) => write!(f, "decode error: {message}"),
+            // Payload-free variants have a fixed message, so no formatting is needed.
+            Self::UnsupportedFormat => f.write_str("unsupported audio format"),
+            Self::NoAudioTrack => f.write_str("no audio track found"),
+        }
+    }
+}
+
+// Marker implementation: relies on the trait's default methods (no `source()` override).
+impl std::error::Error for AudioError {}
+
+impl From<io::Error> for AudioError {
+    /// Wraps an I/O failure so `?` can be used on `io::Result` values.
+    fn from(err: io::Error) -> Self {
+        Self::Io(err)
+    }
+}
+
+impl From<SymphoniaError> for AudioError {
+    /// Translates a symphonia error: I/O errors keep their payload, unsupported
+    /// features map to `UnsupportedFormat`, and everything else becomes a
+    /// `Decode` error carrying the original message.
+    fn from(err: SymphoniaError) -> Self {
+        match err {
+            SymphoniaError::IoError(io_err) => Self::Io(io_err),
+            SymphoniaError::Unsupported(_) => Self::UnsupportedFormat,
+            decode_err => Self::Decode(decode_err.to_string()),
+        }
+    }
+}
+
+/// Opens the file at `path` and decodes it to 16 kHz mono PCM.
+///
+/// The file extension, when present and valid UTF-8, is passed to the format
+/// probe as a hint.
+///
+/// # Errors
+///
+/// Returns `AudioError::Io` if the file cannot be opened, or whatever error the
+/// decoder reports.
+pub fn to_16k_mono_from_path(path: impl AsRef<Path>) -> Result<PcmAudio, AudioError> {
+    let source_path = path.as_ref();
+    let handle = File::open(source_path)?;
+
+    let mut format_hint = Hint::new();
+    let extension = source_path.extension().and_then(std::ffi::OsStr::to_str);
+    if let Some(ext) = extension {
+        format_hint.with_extension(ext);
+    }
+
+    decode_to_16k_mono(handle, format_hint)
+}
+
+/// Decodes audio from an arbitrary reader to 16 kHz mono PCM.
+///
+/// No format hint is supplied, so the container is detected by probing alone.
+pub fn to_16k_mono_from_reader<R>(reader: R) -> Result<PcmAudio, AudioError>
+where
+    R: Read + Seek + Send + Sync + 'static,
+{
+    let no_hint = Hint::new();
+    decode_to_16k_mono(reader, no_hint)
+}
+
+/// Decodes the first track with a usable codec and normalizes it to
+/// `TARGET_SAMPLE_RATE` mono.
+///
+/// The stream is probed with `hint`, every packet of the selected track is
+/// decoded to interleaved `f32`, and the result is mixed down to one channel
+/// and resampled.
+///
+/// # Errors
+///
+/// * `AudioError::NoAudioTrack` if no track has a non-null codec.
+/// * `AudioError::Decode` if the track does not declare a sample rate.
+/// * Any probe, demux or decode error converted from symphonia.
+fn decode_to_16k_mono<R: Read + Seek + Send + Sync + 'static>(
+    reader: R,
+    hint: Hint,
+) -> Result<PcmAudio, AudioError> {
+    let source = MediaSourceStream::new(Box::new(ReadOnlySource::new(reader)), Default::default());
+
+    let format_opts = FormatOptions::default();
+    let metadata_opts = MetadataOptions::default();
+
+    let probed =
+        symphonia::default::get_probe().format(&hint, source, &format_opts, &metadata_opts)?;
+    let mut format = probed.format;
+
+    let track = format
+        .tracks()
+        .iter()
+        .find(|t| t.codec_params.codec != CODEC_TYPE_NULL)
+        .ok_or(AudioError::NoAudioTrack)?;
+
+    let track_id = track.id;
+    let codec_params = track.codec_params.clone();
+
+    // Build the error message only on the failure path.
+    let sample_rate = codec_params
+        .sample_rate
+        .ok_or_else(|| AudioError::Decode("unknown sample rate".to_string()))?;
+    // Channel count as declared by the container; may be absent until decoding starts.
+    let mut channels = codec_params.channels.map(|c| c.count() as u16);
+
+    let decoder_opts = DecoderOptions::default();
+    let mut decoder = symphonia::default::get_codecs().make(&codec_params, &decoder_opts)?;
+
+    let mut interleaved: Vec<f32> = Vec::new();
+
+    loop {
+        let packet = match format.next_packet() {
+            Ok(p) => p,
+            // An unexpected-EOF I/O error is treated as the normal end of the stream.
+            Err(SymphoniaError::IoError(ref e)) if e.kind() == io::ErrorKind::UnexpectedEof => break,
+            Err(SymphoniaError::ResetRequired) => {
+                decoder.reset();
+                continue;
+            }
+            Err(e) => return Err(e.into()),
+        };
+
+        if packet.track_id() != track_id {
+            continue;
+        }
+
+        let decoded = match decoder.decode(&packet) {
+            Ok(d) => d,
+            // A packet that fails to decode is skipped instead of aborting the whole file.
+            Err(SymphoniaError::DecodeError(_)) => continue,
+            Err(e) => return Err(e.into()),
+        };
+
+        // When the container did not declare a layout, trust the decoder's signal
+        // spec instead of assuming mono; otherwise interleaved multi-channel data
+        // would be passed through the mixdown untouched.
+        if channels.is_none() {
+            channels = Some(decoded.spec().channels.count() as u16);
+        }
+
+        append_samples(&decoded, &mut interleaved);
+    }
+
+    // Falls back to mono only when nothing was decoded and nothing was declared.
+    let mono = mixdown_to_mono(&interleaved, channels.unwrap_or(1));
+    let samples = resample_sinc(&mono, sample_rate, TARGET_SAMPLE_RATE);
+
+    Ok(PcmAudio {
+        samples,
+        sample_rate: TARGET_SAMPLE_RATE,
+        channels: TARGET_CHANNELS,
+    })
+}
+
+/// Appends every sample of `buffer` to `out` as interleaved `f32`
+/// (frame by frame, channel by channel).
+///
+/// Unsigned formats are re-centred around zero and integer formats are divided
+/// by half of their range; float formats are copied (`f64` is narrowed).
+fn append_samples(buffer: &AudioBufferRef, out: &mut Vec<f32>) {
+    // Expands to the frame-major copy loop shared by all sample formats. The
+    // plane list is fetched once per buffer rather than once per frame.
+    macro_rules! push_interleaved {
+        ($buf:ident, |$sample:ident| $to_f32:expr) => {{
+            let planes = $buf.planes();
+            let channel_planes = planes.planes();
+            out.reserve($buf.frames() * channel_planes.len());
+            for frame in 0..$buf.frames() {
+                for plane in channel_planes {
+                    let $sample = plane[frame];
+                    out.push($to_f32);
+                }
+            }
+        }};
+    }
+
+    match buffer {
+        AudioBufferRef::U8(buf) => push_interleaved!(buf, |s| (s as f32 - 128.0) / 128.0),
+        AudioBufferRef::U16(buf) => push_interleaved!(buf, |s| (s as f32 - 32768.0) / 32768.0),
+        AudioBufferRef::U24(buf) => {
+            push_interleaved!(buf, |s| (s.inner() as f32 - 8388608.0) / 8388608.0)
+        }
+        // The offset is applied in f64 so that large u32 values keep their low bits.
+        AudioBufferRef::U32(buf) => {
+            push_interleaved!(buf, |s| (s as f64 - 2147483648.0) as f32 / 2147483648.0)
+        }
+        AudioBufferRef::S8(buf) => push_interleaved!(buf, |s| s as f32 / 128.0),
+        AudioBufferRef::S16(buf) => push_interleaved!(buf, |s| s as f32 / 32768.0),
+        AudioBufferRef::S24(buf) => push_interleaved!(buf, |s| s.inner() as f32 / 8388608.0),
+        AudioBufferRef::S32(buf) => push_interleaved!(buf, |s| s as f32 / 2147483648.0),
+        AudioBufferRef::F32(buf) => push_interleaved!(buf, |s| s),
+        AudioBufferRef::F64(buf) => push_interleaved!(buf, |s| s as f32),
+    }
+}
+
+/// Averages each frame of interleaved samples into a single mono sample.
+///
+/// With `channels` of 0 or 1 the input is returned unchanged (copied). A
+/// trailing partial frame is dropped.
+fn mixdown_to_mono(interleaved: &[f32], channels: u16) -> Vec<f32> {
+    let width = usize::from(channels);
+    if width <= 1 {
+        return interleaved.to_vec();
+    }
+
+    let divisor = width as f32;
+    interleaved
+        .chunks_exact(width)
+        .map(|frame| frame.iter().fold(0.0f32, |sum, &sample| sum + sample) / divisor)
+        .collect()
+}
+
+/// Resamples `input` from `input_rate` to `output_rate` with a Hann-windowed
+/// sinc kernel spanning 32 input samples on each side.
+///
+/// * Equal rates return a copy of the input.
+/// * Empty input, or a rate of zero, returns an empty vector. A zero rate has
+///   no meaningful ratio, and `input_rate == 0` used to request an effectively
+///   infinite output buffer and panic.
+/// * When downsampling, the kernel cutoff is lowered to the output Nyquist
+///   frequency.
+///
+/// Each output sample is divided by the sum of the kernel weights actually
+/// used, so a constant signal keeps its level, including at the edges where
+/// part of the kernel falls outside the input.
+fn resample_sinc(input: &[f32], input_rate: u32, output_rate: u32) -> Vec<f32> {
+    if input_rate == output_rate {
+        return input.to_vec();
+    }
+    if input.is_empty() || input_rate == 0 || output_rate == 0 {
+        return Vec::new();
+    }
+
+    // Input samples advanced per output sample.
+    let ratio = input_rate as f64 / output_rate as f64;
+    let output_len = ((input.len() as f64) / ratio).ceil() as usize;
+
+    // Normalized cutoff: 1.0 when upsampling, below 1.0 when downsampling.
+    let cutoff = (output_rate as f64 / input_rate as f64).min(1.0);
+
+    let radius: i32 = 32;
+    let radius_f = radius as f64;
+    let pi = std::f64::consts::PI;
+
+    let mut output = Vec::with_capacity(output_len);
+    for n in 0..output_len {
+        // Position of this output sample on the input time axis.
+        let t = n as f64 * ratio;
+        let center = t.floor() as i32;
+        let frac = t - (center as f64);
+
+        let mut acc = 0.0f64;
+        let mut norm = 0.0f64;
+
+        for k in -radius..=radius {
+            let idx = center + k;
+            if idx < 0 || (idx as usize) >= input.len() {
+                continue;
+            }
+
+            // Signed distance from the tap to the output position.
+            let x = (k as f64) - frac;
+            let d = x.abs();
+            if d > radius_f {
+                continue;
+            }
+
+            // Hann window over [-radius, radius].
+            let window = 0.5 * (1.0 + (pi * d / radius_f).cos());
+
+            let z = x * cutoff;
+            let sinc = if z == 0.0 {
+                1.0
+            } else {
+                let pz = pi * z;
+                pz.sin() / pz
+            };
+
+            let weight = cutoff * sinc * window;
+            acc += input[idx as usize] as f64 * weight;
+            norm += weight;
+        }
+
+        let y = if norm == 0.0 { 0.0 } else { acc / norm };
+        output.push(y as f32);
+    }
+
+    output
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use std::io::Cursor;
+
+ /// Builds an in-memory WAV file holding `samples` as little-endian 16-bit values.
+ fn create_wav_buffer(sample_rate: u32, channels: u16, samples: &[i16]) -> Vec<u8> {
+ let mut buf = Vec::new();
+ let data_size = (samples.len() * 2) as u32;
+ // 36 bytes of header follow the RIFF size field before the sample data starts.
+ let file_size = 36 + data_size;
+
+ // RIFF container header.
+ buf.extend_from_slice(b"RIFF");
+ buf.extend_from_slice(&file_size.to_le_bytes());
+ buf.extend_from_slice(b"WAVE");
+
+ // `fmt ` chunk: chunk size 16, format tag 1 (PCM in the WAVE format), then channel
+ // count, sample rate, byte rate, block alignment and bits per sample (16).
+ buf.extend_from_slice(b"fmt ");
+ buf.extend_from_slice(&16u32.to_le_bytes());
+ buf.extend_from_slice(&1u16.to_le_bytes());
+ buf.extend_from_slice(&channels.to_le_bytes());
+ buf.extend_from_slice(&sample_rate.to_le_bytes());
+ let byte_rate = sample_rate * channels as u32 * 2;
+ buf.extend_from_slice(&byte_rate.to_le_bytes());
+ let block_align = channels * 2;
+ buf.extend_from_slice(&block_align.to_le_bytes());
+ buf.extend_from_slice(&16u16.to_le_bytes());
+
+ // `data` chunk with the raw samples.
+ buf.extend_from_slice(b"data");
+ buf.extend_from_slice(&data_size.to_le_bytes());
+ for &s in samples {
+ buf.extend_from_slice(&s.to_le_bytes());
+ }
+
+ buf
+ }
+
+ /// Stereo input with 10_000 in the first channel and 0 in the second should average to
+ /// half of the first channel's normalized level.
+ #[test]
+ fn converts_stereo_to_mono() {
+ let mut samples = Vec::new();
+ for _ in 0..(TARGET_SAMPLE_RATE / 10) {
+ samples.push(10_000i16);
+ samples.push(0i16);
+ }
+
+ let wav = create_wav_buffer(TARGET_SAMPLE_RATE, 2, &samples);
+ let cursor = Cursor::new(wav);
+
+ let normalized = to_16k_mono_from_reader(cursor).unwrap();
+
+ assert_eq!(normalized.sample_rate, TARGET_SAMPLE_RATE);
+ assert_eq!(normalized.channels, TARGET_CHANNELS);
+ let mean =
+ normalized.samples.iter().copied().sum::<f32>() / normalized.samples.len() as f32;
+ let expected = (10_000.0 / 32768.0) / 2.0;
+ assert!((mean - expected).abs() < 1e-3);
+ }
+
+ /// 48_000 silent mono samples at 48 kHz should come out as `TARGET_SAMPLE_RATE` samples
+ /// that are all (numerically) zero.
+ #[test]
+ fn resamples_to_16k() {
+ let samples: Vec<i16> = vec![0; 48_000];
+ let wav = create_wav_buffer(48_000, 1, &samples);
+ let cursor = Cursor::new(wav);
+
+ let normalized = to_16k_mono_from_reader(cursor).unwrap();
+
+ assert_eq!(normalized.sample_rate, TARGET_SAMPLE_RATE);
+ assert_eq!(normalized.channels, TARGET_CHANNELS);
+ assert_eq!(normalized.samples.len(), TARGET_SAMPLE_RATE as usize);
+ let max_abs = normalized
+ .samples
+ .iter()
+ .copied()
+ .fold(0.0f32, |m, v| m.max(v.abs()));
+ assert!(max_abs <= 1e-6);
+ }
+}
diff --git a/makima/src/listen.rs b/makima/src/listen.rs
index a0f4246..cd0a394 100644
--- a/makima/src/listen.rs
+++ b/makima/src/listen.rs
@@ -1,11 +1,12 @@
use std::cmp::Ordering;
use std::path::Path;
-use hound::WavReader;
use parakeet_rs::sortformer::{DiarizationConfig, Sortformer, SpeakerSegment};
use parakeet_rs::{ParakeetTDT, TimedToken, TimestampMode};
-const SAMPLE_RATE: u32 = 16_000;
+use crate::audio;
+
+const STREAM_CHUNK_MS: u32 = 5_000;
pub struct DialogueSegment {
pub speaker: String,
@@ -15,57 +16,54 @@ pub struct DialogueSegment {
}
+/// Decodes `audio-ftc.mp3`, then feeds it to the diarizer and the transcriber in growing
+/// prefixes of `STREAM_CHUNK_MS` milliseconds, printing segments as they appear.
+///
+/// Every iteration re-runs both models on all audio accumulated so far; the segments
+/// produced by the last iteration are returned.
pub(crate) fn listen() -> Result<Vec<DialogueSegment>, Box<dyn std::error::Error>> {
- let audio_path = Path::new("audio.wav");
+ let audio_path = Path::new("audio-ftc.mp3");
- let (audio, sample_rate, channels) = load_audio(audio_path)?;
+ let normalized = audio::to_16k_mono_from_path(audio_path)?;
let mut parakeet = ParakeetTDT::from_pretrained("models/parakeet-tdt-0.6b-v3", None)?;
- let transcription = parakeet.transcribe_samples(
- audio.clone(),
- sample_rate,
- channels,
- Some(TimestampMode::Sentences),
- )?;
-
let mut sortformer = Sortformer::with_config(
"models/diarization/diar_streaming_sortformer_4spk-v2.onnx",
None,
DiarizationConfig::callhome(),
)?;
- let diarization_segments = sortformer.diarize(audio, sample_rate, channels)?;
-
- let segments = align_speakers(&transcription.tokens, &diarization_segments);
- for segment in &segments {
- println!(
- "[{:.2}s - {:.2}s] {}: {}",
- segment.start, segment.end, segment.speaker, segment.text
- );
- }
- Ok(segments)
-}
+ let chunk_samples = samples_per_chunk(normalized.sample_rate, STREAM_CHUNK_MS);
+ let mut cumulative_audio: Vec<f32> = Vec::new();
+ let mut last_printed_tokens = 0usize;
+ let mut final_segments: Vec<DialogueSegment> = Vec::new();
-fn load_audio(path: &Path) -> Result<(Vec<f32>, u32, u16), Box<dyn std::error::Error>> {
- let mut reader = WavReader::open(path)?;
- let spec = reader.spec();
+ for (chunk_idx, chunk) in normalized.samples.chunks(chunk_samples).enumerate() {
+ cumulative_audio.extend_from_slice(chunk);
- if spec.sample_rate != SAMPLE_RATE {
- return Err(format!(
- "Expected {} Hz audio, got {} Hz",
- SAMPLE_RATE, spec.sample_rate
- )
- .into());
- }
+ let diarization_segments = sortformer.diarize(
+ cumulative_audio.clone(),
+ normalized.sample_rate,
+ normalized.channels,
+ )?;
+
+ let transcription = parakeet.transcribe_samples(
+ cumulative_audio.clone(),
+ normalized.sample_rate,
+ normalized.channels,
+ Some(TimestampMode::Sentences),
+ )?;
+
+ final_segments = align_speakers(&transcription.tokens, &diarization_segments);
- let samples = match spec.sample_format {
- hound::SampleFormat::Float => reader.samples::<f32>().collect::<Result<Vec<_>, _>>()?,
- hound::SampleFormat::Int => reader
- .samples::<i16>()
- .map(|s| s.map(|s| s as f32 / 32768.0))
- .collect::<Result<Vec<_>, _>>()?,
- };
+ // Simulate "live" output by printing only the segments past the token count already reported.
+ // NOTE(review): `last_printed_tokens` counts tokens but is used to slice `final_segments`;
+ // this presumes `align_speakers` yields one segment per token — confirm, otherwise the
+ // slice below can panic.
+ if transcription.tokens.len() > last_printed_tokens {
+ let new_segments = &final_segments[last_printed_tokens..];
+ for segment in new_segments {
+ println!(
+ "[chunk {}] [{:.2}s - {:.2}s] {}: {}",
+ chunk_idx, segment.start, segment.end, segment.speaker, segment.text
+ );
+ }
+ last_printed_tokens = transcription.tokens.len();
+ }
+ }
- Ok((samples, spec.sample_rate, spec.channels))
+ Ok(final_segments)
}
fn align_speakers(tokens: &[TimedToken], speakers: &[SpeakerSegment]) -> Vec<DialogueSegment> {
@@ -84,6 +82,25 @@ fn align_speakers(tokens: &[TimedToken], speakers: &[SpeakerSegment]) -> Vec<Dia
.collect()
}
+/// Returns how many samples make up `chunk_ms` milliseconds at `sample_rate`,
+/// rounded down and never less than 1.
+fn samples_per_chunk(sample_rate: u32, chunk_ms: u32) -> usize {
+    // The product of two u32 values always fits in a u64, so widening first
+    // makes the multiplication exact.
+    let per_chunk = u64::from(sample_rate) * u64::from(chunk_ms) / 1_000;
+    if per_chunk == 0 {
+        1
+    } else {
+        per_chunk as usize
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Checks whole-second and sub-second chunk sizes and the minimum of one sample.
+    #[test]
+    fn samples_per_chunk_rounds_down_and_clamps() {
+        // ((sample_rate, chunk_ms), expected samples)
+        let cases = [
+            ((16_000, 1_000), 16_000),
+            ((16_000, 160), 2_560),
+            ((16_000, 0), 1),
+        ];
+        for ((rate, millis), expected) in cases {
+            assert_eq!(samples_per_chunk(rate, millis), expected);
+        }
+    }
+}
+
fn speaker_for_span(start: f32, end: f32, speakers: &[SpeakerSegment]) -> Option<String> {
speakers
.iter()
diff --git a/makima/src/main.rs b/makima/src/main.rs
index 9097ef6..2348b23 100644
--- a/makima/src/main.rs
+++ b/makima/src/main.rs
@@ -1,6 +1,44 @@
+use std::path::Path;
+use crate::tts::{save_wav, ChatterboxTTS};
+
+mod audio;
mod listen;
+pub mod tts;
fn main() -> Result<(), Box<dyn std::error::Error>> {
+ println!("Loading ChatterboxTTS...");
+ let mut tts = ChatterboxTTS::from_pretrained(None)?;
+ println!("Model loaded successfully!");
+
+ // // Voice cloning using existing audio file
+ // println!("Generating TTS with voice cloning...");
+ // let audio = tts.generate_tts_with_voice(
+ // "Hello, this is a test of the voice cloning system.",
+ // Path::new("audio.wav")
+ // )?;
+ //
+ // println!("Generated {} samples", audio.len());
+ // save_wav(&audio, Path::new("output.wav"))?;
+ // println!("Saved to output.wav");
+
+
+ // Load reference audio from mp3
+ println!("Loading reference audio...");
+ let reference = audio::to_16k_mono_from_path(Path::new("audio.mp3"))?;
+ let samples = &reference.samples;
+ let sample_rate = reference.sample_rate;
+
+ // Voice cloning using audio samples
+ println!("Generating TTS with voice cloning...");
+ let audio = tts.generate_tts_with_samples(
+ "Hello, this is a test of the voice cloning system [chuckles]. Repeat after me \" I am Steve Jobs!\"",
+ samples,
+ sample_rate,
+ )?;
+
+ println!("Generated {} samples", audio.len());
+ save_wav(&audio, Path::new("output.wav"))?;
+ println!("Saved to output.wav");
let segments = listen::listen()?;
println!("Captured {} diarized segments", segments.len());
Ok(())
diff --git a/makima/src/tts.rs b/makima/src/tts.rs
new file mode 100644
index 0000000..5198938
--- /dev/null
+++ b/makima/src/tts.rs
@@ -0,0 +1,580 @@
+use std::path::{Path, PathBuf};
+use std::fs;
+
+use hf_hub::api::sync::Api;
+use std::borrow::Cow;
+
+use ndarray::{ArrayD, Array2, Array3, Array4, IxDyn};
+use ort::session::Session;
+use ort::value::{Value, DynValue};
+use tokenizers::Tokenizer;
+
+use crate::audio;
+
+/// Sample rate (Hz) that reference audio is resampled to and that `save_wav` writes into the WAV header.
+pub const SAMPLE_RATE: u32 = 24_000;
+/// Token that seeds the generated sequence in `generate_speech_tokens`.
+const START_SPEECH_TOKEN: i64 = 6561;
+/// Token that ends the generation loop in `generate_speech_tokens`.
+const STOP_SPEECH_TOKEN: i64 = 6562;
+/// Token appended three times after the generated tokens before decoding.
+const SILENCE_TOKEN: i64 = 4299;
+/// Number of key/value tensor pairs exchanged with the language model.
+const NUM_LAYERS: usize = 24;
+/// Second dimension of every KV-cache tensor.
+const NUM_KV_HEADS: usize = 16;
+/// Last dimension of every KV-cache tensor.
+const HEAD_DIM: usize = 64;
+
+/// Hugging Face repository the model files are fetched from.
+const MODEL_ID: &str = "ResembleAI/chatterbox-turbo-ONNX";
+/// Directory used by `from_pretrained` when the caller passes `None`.
+const DEFAULT_MODEL_DIR: &str = "models/chatterbox-turbo";
+
+/// Errors returned by the text-to-speech functions in this module.
+#[derive(Debug)]
+pub enum TtsError {
+ /// Failure while fetching or loading model files; also the target of the `ort::Error` conversion.
+ ModelLoad(String),
+ /// Failure while shaping tensors or extracting model outputs.
+ Inference(String),
+ /// Failure while loading the tokenizer or encoding text.
+ Tokenizer(String),
+ /// Error reported by the `audio` module.
+ Audio(audio::AudioError),
+ /// Underlying file-system or stream error.
+ Io(std::io::Error),
+ /// Returned by `generate_tts`, which has no reference audio to work with.
+ VoiceRequired,
+}
+
+impl std::fmt::Display for TtsError {
+    /// Renders a short, lower-case description of the error, prefixed by its category.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::VoiceRequired => {
+                f.write_str("voice reference audio is required for chatterbox-turbo")
+            }
+            Self::ModelLoad(msg) => write!(f, "model load error: {msg}"),
+            Self::Inference(msg) => write!(f, "inference error: {msg}"),
+            Self::Tokenizer(msg) => write!(f, "tokenizer error: {msg}"),
+            Self::Audio(err) => write!(f, "audio error: {err}"),
+            Self::Io(err) => write!(f, "io error: {err}"),
+        }
+    }
+}
+
+// Marker impl with no overridden methods; the `Display` impl already prints any wrapped error.
+impl std::error::Error for TtsError {}
+
+impl From<audio::AudioError> for TtsError {
+    /// Wraps an audio-module error so `?` can be used on audio calls.
+    fn from(err: audio::AudioError) -> Self {
+        Self::Audio(err)
+    }
+}
+
+impl From<std::io::Error> for TtsError {
+    /// Wraps an I/O error so `?` can be used on file-system calls.
+    fn from(err: std::io::Error) -> Self {
+        Self::Io(err)
+    }
+}
+
+impl From<ort::Error> for TtsError {
+    /// Converts any ONNX Runtime error into `ModelLoad`, keeping only its message text.
+    fn from(err: ort::Error) -> Self {
+        Self::ModelLoad(err.to_string())
+    }
+}
+
+/// The four ONNX sessions plus the tokenizer that together make up the speech pipeline.
+pub struct ChatterboxTTS {
+ /// Loaded from `speech_encoder.onnx`; run on the reference audio.
+ speech_encoder: Session,
+ /// Loaded from `embed_tokens.onnx`; turns token ids into embeddings.
+ embed_tokens: Session,
+ /// Loaded from `language_model.onnx`; produces logits and the updated KV cache.
+ language_model: Session,
+ /// Loaded from `conditional_decoder.onnx`; turns speech tokens into a waveform.
+ conditional_decoder: Session,
+ /// Loaded from `tokenizer.json`; encodes the input text.
+ tokenizer: Tokenizer,
+}
+
+/// The four outputs of the speech encoder for one reference recording.
+struct VoiceCondition {
+ /// Encoder output 0; prepended to the text embeddings on the first language-model step.
+ audio_features: ArrayD<f32>,
+ /// Encoder output 1; placed in front of the generated speech tokens before decoding.
+ prompt_tokens: ArrayD<i64>,
+ /// Encoder output 2; passed to the decoder as `speaker_embeddings`.
+ speaker_embeddings: ArrayD<f32>,
+ /// Encoder output 3; passed to the decoder as `speaker_features`.
+ speaker_features: ArrayD<f32>,
+}
+
+/// Copies an `f32` tensor out of an ONNX Runtime value into an owned `ArrayD`.
+///
+/// # Errors
+/// Returns `TtsError::Inference` if the value is not an `f32` tensor, if a
+/// reported dimension does not fit in `usize` (e.g. is negative), or if the
+/// shape does not match the number of elements.
+fn extract_f32_tensor(value: &Value) -> Result<ArrayD<f32>, TtsError> {
+    let (shape, data) = value
+        .try_extract_tensor::<f32>()
+        .map_err(|e| TtsError::Inference(e.to_string()))?;
+
+    // Checked conversion: an `as usize` cast would silently wrap a negative dimension.
+    let dims = shape
+        .iter()
+        .map(|&d| {
+            usize::try_from(d)
+                .map_err(|_| TtsError::Inference(format!("invalid tensor dimension: {d}")))
+        })
+        .collect::<Result<Vec<usize>, TtsError>>()?;
+
+    ArrayD::from_shape_vec(IxDyn(&dims), data.to_vec())
+        .map_err(|e| TtsError::Inference(e.to_string()))
+}
+
+/// Copies an `i64` tensor out of an ONNX Runtime value into an owned `ArrayD`.
+///
+/// # Errors
+/// Returns `TtsError::Inference` if the value is not an `i64` tensor, if a
+/// reported dimension does not fit in `usize` (e.g. is negative), or if the
+/// shape does not match the number of elements.
+fn extract_i64_tensor(value: &Value) -> Result<ArrayD<i64>, TtsError> {
+    let (shape, data) = value
+        .try_extract_tensor::<i64>()
+        .map_err(|e| TtsError::Inference(e.to_string()))?;
+
+    // Checked conversion: an `as usize` cast would silently wrap a negative dimension.
+    let dims = shape
+        .iter()
+        .map(|&d| {
+            usize::try_from(d)
+                .map_err(|_| TtsError::Inference(format!("invalid tensor dimension: {d}")))
+        })
+        .collect::<Result<Vec<usize>, TtsError>>()?;
+
+    ArrayD::from_shape_vec(IxDyn(&dims), data.to_vec())
+        .map_err(|e| TtsError::Inference(e.to_string()))
+}
+
+impl ChatterboxTTS {
+ /// Loads the model from `model_dir` (or `DEFAULT_MODEL_DIR` when `None`),
+ /// downloading the files first if any of the ones the loader opens is missing.
+ ///
+ /// Checking the individual files rather than only the directory lets a
+ /// failed or interrupted earlier download recover on the next call.
+ ///
+ /// # Errors
+ /// Propagates download, session-creation and tokenizer-loading failures.
+ pub fn from_pretrained(model_dir: Option<&str>) -> Result<Self, TtsError> {
+     // Exactly the files `load_from_path` opens.
+     const REQUIRED_FILES: [&str; 5] = [
+         "speech_encoder.onnx",
+         "embed_tokens.onnx",
+         "language_model.onnx",
+         "conditional_decoder.onnx",
+         "tokenizer.json",
+     ];
+
+     let model_path = PathBuf::from(model_dir.unwrap_or(DEFAULT_MODEL_DIR));
+
+     let all_present = REQUIRED_FILES
+         .iter()
+         .all(|name| model_path.join(name).is_file());
+     if !all_present {
+         download_models(&model_path)?;
+     }
+
+     Self::load_from_path(&model_path)
+ }
+
+ /// Creates the four ONNX sessions and the tokenizer from files inside `model_dir`.
+ ///
+ /// # Errors
+ /// Session failures surface through the `ort::Error` conversion; tokenizer
+ /// failures are reported as `TtsError::Tokenizer`.
+ pub fn load_from_path(model_dir: &Path) -> Result<Self, TtsError> {
+     // Every session is built the same way: four intra-op threads, loaded from one file.
+     let open_session = |file_name: &str| -> Result<Session, TtsError> {
+         let session = Session::builder()?
+             .with_intra_threads(4)?
+             .commit_from_file(model_dir.join(file_name))?;
+         Ok(session)
+     };
+
+     let speech_encoder = open_session("speech_encoder.onnx")?;
+     let embed_tokens = open_session("embed_tokens.onnx")?;
+     let language_model = open_session("language_model.onnx")?;
+     let conditional_decoder = open_session("conditional_decoder.onnx")?;
+
+     let tokenizer = Tokenizer::from_file(model_dir.join("tokenizer.json"))
+         .map_err(|e| TtsError::Tokenizer(e.to_string()))?;
+
+     Ok(Self {
+         speech_encoder,
+         embed_tokens,
+         language_model,
+         conditional_decoder,
+         tokenizer,
+     })
+ }
+
+ /// Always fails with `TtsError::VoiceRequired`: the text is ignored because no
+ /// reference audio is available through this entry point.
+ pub fn generate_tts(&mut self, _text: &str) -> Result<Vec<f32>, TtsError> {
+ // Chatterbox TTS requires voice reference audio
+ Err(TtsError::VoiceRequired)
+ }
+
+ /// Synthesises `text` in the voice of the recording at `sample_audio_path`.
+ ///
+ /// The file is decoded by `audio::to_16k_mono_from_path`; conversion to
+ /// `SAMPLE_RATE` is left to `generate_tts_with_samples`, which resamples
+ /// whenever the rate it is given differs from `SAMPLE_RATE`.
+ pub fn generate_tts_with_voice(
+     &mut self,
+     text: &str,
+     sample_audio_path: &Path,
+ ) -> Result<Vec<f32>, TtsError> {
+     let reference = audio::to_16k_mono_from_path(sample_audio_path)?;
+     self.generate_tts_with_samples(text, &reference.samples, reference.sample_rate)
+ }
+
+ /// Synthesises `text` in the voice described by `samples`.
+ ///
+ /// `samples` is mono audio at `sample_rate` Hz; it is resampled to
+ /// `SAMPLE_RATE` when the rates differ. Returns the decoder's waveform.
+ ///
+ /// # Errors
+ /// * `TtsError::VoiceRequired` when `samples` is empty.
+ /// * `TtsError::Inference` when `sample_rate` is zero.
+ /// * Any tokenizer or model failure from the pipeline stages.
+ pub fn generate_tts_with_samples(
+     &mut self,
+     text: &str,
+     samples: &[f32],
+     sample_rate: u32,
+ ) -> Result<Vec<f32>, TtsError> {
+     // Reject unusable reference audio up front instead of failing deep inside the model.
+     if samples.is_empty() {
+         return Err(TtsError::VoiceRequired);
+     }
+     if sample_rate == 0 {
+         return Err(TtsError::Inference(
+             "reference audio sample rate must be non-zero".to_string(),
+         ));
+     }
+
+     // Borrow the caller's buffer when it is already at the target rate.
+     let resampled: Cow<[f32]> = if sample_rate != SAMPLE_RATE {
+         Cow::Owned(resample_to_24k(samples, sample_rate))
+     } else {
+         Cow::Borrowed(samples)
+     };
+
+     // 1. Encode reference audio
+     let voice_condition = self.encode_voice(&resampled)?;
+
+     // 2. Tokenize text
+     let encoding = self
+         .tokenizer
+         .encode(text, true)
+         .map_err(|e| TtsError::Tokenizer(e.to_string()))?;
+     let text_input_ids: Vec<i64> = encoding.get_ids().iter().map(|&id| i64::from(id)).collect();
+
+     // 3. Generate speech tokens
+     let generated_tokens =
+         self.generate_speech_tokens(&text_input_ids, &voice_condition.audio_features)?;
+
+     // 4. Final token sequence: prompt tokens + generated tokens + three silence tokens
+     let final_tokens: Vec<i64> = voice_condition
+         .prompt_tokens
+         .iter()
+         .copied()
+         .chain(generated_tokens)
+         .chain(std::iter::repeat(SILENCE_TOKEN).take(3))
+         .collect();
+
+     // 5. Decode to audio
+     self.decode_speech_tokens(
+         &final_tokens,
+         &voice_condition.speaker_embeddings,
+         &voice_condition.speaker_features,
+     )
+ }
+
+ /// Runs the speech encoder on reference audio and collects its four outputs.
+ ///
+ /// `samples` is fed as a `(1, len)` tensor under the input name `audio_values`.
+ ///
+ /// # Errors
+ /// Returns `TtsError::Inference` for tensor-construction, session-run and
+ /// output-extraction failures (these happen at inference time, not at load time).
+ fn encode_voice(&mut self, samples: &[f32]) -> Result<VoiceCondition, TtsError> {
+     let audio_arr = Array2::from_shape_vec((1, samples.len()), samples.to_vec())
+         .map_err(|e| TtsError::Inference(e.to_string()))?;
+     let audio_tensor =
+         Value::from_array(audio_arr).map_err(|e| TtsError::Inference(e.to_string()))?;
+
+     let outputs = self
+         .speech_encoder
+         .run(ort::inputs!["audio_values" => audio_tensor])
+         .map_err(|e| TtsError::Inference(e.to_string()))?;
+
+     // Output order: audio_features, audio_tokens (prompt tokens), speaker_embeddings, speaker_features
+     Ok(VoiceCondition {
+         audio_features: extract_f32_tensor(&outputs[0])?,
+         prompt_tokens: extract_i64_tensor(&outputs[1])?,
+         speaker_embeddings: extract_f32_tensor(&outputs[2])?,
+         speaker_features: extract_f32_tensor(&outputs[3])?,
+     })
+ }
+
+ /// Greedily generates speech tokens for `text_input_ids`, conditioned on `audio_features`.
+ ///
+ /// The first step feeds the audio features followed by the text embeddings;
+ /// every later step feeds only the embedding of the previously chosen token
+ /// together with the KV cache returned by the previous step. Generation ends
+ /// at `STOP_SPEECH_TOKEN` or after 1024 tokens.
+ ///
+ /// Returns the generated tokens without the leading start token and without
+ /// the trailing stop token. When the token budget runs out before a stop
+ /// token appears, every generated token is kept.
+ ///
+ /// # Errors
+ /// Returns `TtsError::Inference` for tensor-shape, session-run and
+ /// output-extraction failures.
+ fn generate_speech_tokens(
+     &mut self,
+     text_input_ids: &[i64],
+     audio_features: &ArrayD<f32>,
+ ) -> Result<Vec<i64>, TtsError> {
+     let max_new_tokens: usize = 1024;
+     let repetition_penalty: f32 = 1.2;
+
+     // The sequence starts with START so the repetition penalty sees it as well.
+     let mut generate_tokens: Vec<i64> = vec![START_SPEECH_TOKEN];
+     let mut last_token = START_SPEECH_TOKEN;
+
+     // Empty KV cache (sequence length 0) for the first step.
+     let mut past_key_values = self.init_kv_cache(0)?;
+
+     let mut first_iteration = true;
+     let mut total_seq_len: usize = 0;
+
+     for _ in 0..max_new_tokens {
+         // First step: the whole text. Later steps: only the last generated token.
+         let current_input_ids = if first_iteration {
+             text_input_ids.to_vec()
+         } else {
+             vec![last_token]
+         };
+
+         let input_ids_arr =
+             Array2::from_shape_vec((1, current_input_ids.len()), current_input_ids)
+                 .map_err(|e| TtsError::Inference(e.to_string()))?;
+         let input_ids_tensor = Value::from_array(input_ids_arr)
+             .map_err(|e| TtsError::Inference(e.to_string()))?;
+
+         let token_embeds = {
+             let embed_outputs = self
+                 .embed_tokens
+                 .run(ort::inputs![input_ids_tensor])
+                 .map_err(|e| TtsError::Inference(e.to_string()))?;
+             extract_f32_tensor(&embed_outputs[0])?
+         };
+
+         // On the first step the audio features are concatenated in front of the text embeddings.
+         let inputs_embeds = if first_iteration {
+             let audio_feat_3d = audio_features
+                 .view()
+                 .into_dimensionality::<ndarray::Ix3>()
+                 .map_err(|e| TtsError::Inference(e.to_string()))?;
+             let text_emb_3d = token_embeds
+                 .view()
+                 .into_dimensionality::<ndarray::Ix3>()
+                 .map_err(|e| TtsError::Inference(e.to_string()))?;
+
+             ndarray::concatenate(ndarray::Axis(1), &[audio_feat_3d, text_emb_3d])
+                 .map_err(|e| TtsError::Inference(e.to_string()))?
+         } else {
+             token_embeds
+                 .into_dimensionality::<ndarray::Ix3>()
+                 .map_err(|e| TtsError::Inference(e.to_string()))?
+         };
+
+         let seq_len = inputs_embeds.shape()[1];
+
+         // The attention mask spans everything seen so far; position ids cover only the new inputs.
+         let (attention_mask, position_ids) = if first_iteration {
+             total_seq_len = seq_len;
+             let attention_mask: Array2<i64> = Array2::ones((1, seq_len));
+             let position_ids = Array2::from_shape_fn((1, seq_len), |(_, j)| j as i64);
+             (attention_mask, position_ids)
+         } else {
+             total_seq_len += 1;
+             let attention_mask: Array2<i64> = Array2::ones((1, total_seq_len));
+             let position_ids =
+                 Array2::from_shape_vec((1, 1), vec![(total_seq_len - 1) as i64])
+                     .map_err(|e| TtsError::Inference(e.to_string()))?;
+             (attention_mask, position_ids)
+         };
+
+         let (logits, new_kv) = self.run_language_model(
+             inputs_embeds,
+             position_ids,
+             attention_mask,
+             past_key_values,
+         )?;
+         past_key_values = new_kv;
+
+         // Only the logits of the last position decide the next token.
+         let logits_3d = logits
+             .view()
+             .into_dimensionality::<ndarray::Ix3>()
+             .map_err(|e| TtsError::Inference(e.to_string()))?;
+         let last_idx = logits_3d.shape()[1].checked_sub(1).ok_or_else(|| {
+             TtsError::Inference("language model returned no logits".to_string())
+         })?;
+
+         let mut current_logits: Vec<f32> = logits_3d
+             .slice(ndarray::s![0, last_idx, ..])
+             .iter()
+             .copied()
+             .collect();
+
+         apply_repetition_penalty(&mut current_logits, &generate_tokens, repetition_penalty);
+
+         let next_token = argmax(&current_logits);
+         generate_tokens.push(next_token);
+         last_token = next_token;
+
+         if next_token == STOP_SPEECH_TOKEN {
+             break;
+         }
+
+         first_iteration = false;
+     }
+
+     // Drop the leading START token, and the trailing STOP token only if one was
+     // actually produced: when the budget ran out the last token is real speech.
+     let body_end = if last_token == STOP_SPEECH_TOKEN {
+         generate_tokens.len() - 1
+     } else {
+         generate_tokens.len()
+     };
+     Ok(generate_tokens[1..body_end].to_vec())
+ }
+
+ /// Builds a zero-filled KV cache: `NUM_LAYERS` key/value pairs, stored as
+ /// key, value, key, value, ... with shape `(1, NUM_KV_HEADS, seq_len, HEAD_DIM)` each.
+ fn init_kv_cache(&self, seq_len: usize) -> Result<Vec<Array4<f32>>, TtsError> {
+     let cache: Vec<Array4<f32>> = std::iter::repeat_with(|| {
+         Array4::<f32>::zeros((1, NUM_KV_HEADS, seq_len, HEAD_DIM))
+     })
+     .take(NUM_LAYERS * 2)
+     .collect();
+     Ok(cache)
+ }
+
+ /// Runs one language-model step.
+ ///
+ /// `past_key_values` must hold `NUM_LAYERS * 2` tensors ordered key, value
+ /// per layer; they are moved into the session inputs without copying.
+ /// Returns the logits (output 0) and the updated cache (outputs 1 onwards,
+ /// in the same key/value order).
+ ///
+ /// # Errors
+ /// Returns `TtsError::Inference` when the cache has the wrong length or when
+ /// tensor construction, the session run or output extraction fails.
+ fn run_language_model(
+     &mut self,
+     inputs_embeds: Array3<f32>,
+     position_ids: Array2<i64>,
+     attention_mask: Array2<i64>,
+     past_key_values: Vec<Array4<f32>>,
+ ) -> Result<(ArrayD<f32>, Vec<Array4<f32>>), TtsError> {
+     if past_key_values.len() != NUM_LAYERS * 2 {
+         return Err(TtsError::Inference(format!(
+             "expected {} KV cache tensors, got {}",
+             NUM_LAYERS * 2,
+             past_key_values.len()
+         )));
+     }
+
+     let mut inputs: Vec<(Cow<str>, DynValue)> = Vec::with_capacity(3 + NUM_LAYERS * 2);
+
+     inputs.push((
+         Cow::from("inputs_embeds"),
+         Value::from_array(inputs_embeds)
+             .map_err(|e| TtsError::Inference(e.to_string()))?
+             .into_dyn(),
+     ));
+     inputs.push((
+         Cow::from("position_ids"),
+         Value::from_array(position_ids)
+             .map_err(|e| TtsError::Inference(e.to_string()))?
+             .into_dyn(),
+     ));
+     inputs.push((
+         Cow::from("attention_mask"),
+         Value::from_array(attention_mask)
+             .map_err(|e| TtsError::Inference(e.to_string()))?
+             .into_dyn(),
+     ));
+
+     // The cache is owned, so each tensor is moved into its input slot instead of cloned.
+     // Even positions are keys, odd positions are values of layer `position / 2`.
+     for (position, tensor) in past_key_values.into_iter().enumerate() {
+         let layer_idx = position / 2;
+         let kind = if position % 2 == 0 { "key" } else { "value" };
+         let name = format!("past_key_values.{}.{}", layer_idx, kind);
+         let value = Value::from_array(tensor)
+             .map_err(|e| TtsError::Inference(e.to_string()))?
+             .into_dyn();
+         inputs.push((Cow::from(name), value));
+     }
+
+     let outputs = self
+         .language_model
+         .run(inputs)
+         .map_err(|e| TtsError::Inference(e.to_string()))?;
+
+     let logits = extract_f32_tensor(&outputs[0])?;
+
+     // Outputs 1..=NUM_LAYERS*2 are the updated cache, already in key/value order.
+     let mut new_kv = Vec::with_capacity(NUM_LAYERS * 2);
+     for output_idx in 1..=NUM_LAYERS * 2 {
+         let tensor = extract_f32_tensor(&outputs[output_idx])?
+             .into_dimensionality::<ndarray::Ix4>()
+             .map_err(|e| TtsError::Inference(e.to_string()))?;
+         new_kv.push(tensor);
+     }
+
+     Ok((logits, new_kv))
+ }
+
+ /// Turns speech tokens into a waveform with the conditional decoder.
+ ///
+ /// Returns an empty vector when `speech_tokens` is empty; otherwise the
+ /// decoder's first output flattened into samples.
+ ///
+ /// # Errors
+ /// Returns `TtsError::Inference` for tensor-construction, session-run and
+ /// output-extraction failures.
+ fn decode_speech_tokens(
+     &mut self,
+     speech_tokens: &[i64],
+     speaker_embeddings: &ArrayD<f32>,
+     speaker_features: &ArrayD<f32>,
+ ) -> Result<Vec<f32>, TtsError> {
+     if speech_tokens.is_empty() {
+         return Ok(Vec::new());
+     }
+
+     let tokens_arr = Array2::from_shape_vec((1, speech_tokens.len()), speech_tokens.to_vec())
+         .map_err(|e| TtsError::Inference(e.to_string()))?;
+
+     let inputs: Vec<(Cow<str>, DynValue)> = vec![
+         (
+             Cow::from("speech_tokens"),
+             Value::from_array(tokens_arr)
+                 .map_err(|e| TtsError::Inference(e.to_string()))?
+                 .into_dyn(),
+         ),
+         (
+             Cow::from("speaker_embeddings"),
+             Value::from_array(speaker_embeddings.clone())
+                 .map_err(|e| TtsError::Inference(e.to_string()))?
+                 .into_dyn(),
+         ),
+         (
+             Cow::from("speaker_features"),
+             Value::from_array(speaker_features.clone())
+                 .map_err(|e| TtsError::Inference(e.to_string()))?
+                 .into_dyn(),
+         ),
+     ];
+
+     let outputs = self
+         .conditional_decoder
+         .run(inputs)
+         .map_err(|e| TtsError::Inference(e.to_string()))?;
+
+     let waveform = extract_f32_tensor(&outputs[0])?;
+
+     Ok(waveform.iter().copied().collect())
+ }
+}
+
+/// Fetches the model files from the `MODEL_ID` repository into `target_dir`.
+///
+/// Files already present in `target_dir` are neither fetched nor overwritten.
+/// Each new file is copied under a temporary `.part` name and renamed once
+/// the copy has finished, so an interrupted copy never leaves a truncated
+/// file that a later run would mistake for a complete one.
+///
+/// # Errors
+/// Returns `TtsError::ModelLoad` for hub failures and `TtsError::Io` for
+/// file-system failures.
+fn download_models(target_dir: &Path) -> Result<(), TtsError> {
+    fs::create_dir_all(target_dir)?;
+
+    let api = Api::new().map_err(|e| TtsError::ModelLoad(e.to_string()))?;
+    let repo = api.model(MODEL_ID.to_string());
+
+    let model_files = [
+        "onnx/speech_encoder.onnx",
+        "onnx/speech_encoder.onnx_data",
+        "onnx/embed_tokens.onnx",
+        "onnx/embed_tokens.onnx_data",
+        "onnx/language_model.onnx",
+        "onnx/language_model.onnx_data",
+        "onnx/conditional_decoder.onnx",
+        "onnx/conditional_decoder.onnx_data",
+        "tokenizer.json",
+    ];
+
+    for file in &model_files {
+        // Files are stored flat in `target_dir`, without the repository's `onnx/` prefix.
+        let filename = Path::new(file)
+            .file_name()
+            .ok_or_else(|| TtsError::ModelLoad(format!("invalid model file path: {}", file)))?;
+        let target_path = target_dir.join(filename);
+
+        // Check before touching the hub so present files cost no network access.
+        if target_path.exists() {
+            continue;
+        }
+
+        println!("Downloading {}...", file);
+        let downloaded_path = repo.get(file).map_err(|e| TtsError::ModelLoad(e.to_string()))?;
+
+        let mut partial_name = filename.to_os_string();
+        partial_name.push(".part");
+        let partial_path = target_dir.join(partial_name);
+
+        fs::copy(&downloaded_path, &partial_path)?;
+        fs::rename(&partial_path, &target_path)?;
+    }
+
+    println!("Models downloaded to {:?}", target_dir);
+    Ok(())
+}
+
+/// Resamples mono audio from `input_rate` Hz to `SAMPLE_RATE` using linear interpolation.
+///
+/// Returns a copy of `samples` when the rates already match, and an empty
+/// vector when `samples` is empty or `input_rate` is zero (a zero rate would
+/// otherwise produce an unbounded output length). Positions past the last
+/// input sample repeat that sample. No low-pass filter is applied, so
+/// downsampling can alias.
+fn resample_to_24k(samples: &[f32], input_rate: u32) -> Vec<f32> {
+    if input_rate == SAMPLE_RATE {
+        return samples.to_vec();
+    }
+    if samples.is_empty() || input_rate == 0 {
+        return Vec::new();
+    }
+
+    // Input samples advanced per output sample.
+    let ratio = f64::from(input_rate) / f64::from(SAMPLE_RATE);
+    let output_len = ((samples.len() as f64) / ratio).ceil() as usize;
+    let last = samples.len() - 1;
+
+    (0..output_len)
+        .map(|i| {
+            let position = i as f64 * ratio;
+            let lower = (position as usize).min(last);
+            let upper = (lower + 1).min(last);
+            let fraction = (position - lower as f64) as f32;
+            // Blend the two neighbouring input samples instead of repeating the nearest one.
+            samples[lower] + (samples[upper] - samples[lower]) * fraction
+        })
+        .collect()
+}
+
+/// Lowers the logits of tokens that were already generated.
+///
+/// Each distinct token in `generated` is penalised exactly once, however
+/// often it occurs: a negative score is multiplied by `penalty`, any other
+/// score is divided by it, so for `penalty > 1` the score always moves down.
+/// Tokens that are negative or outside `logits` are ignored.
+fn apply_repetition_penalty(logits: &mut [f32], generated: &[i64], penalty: f32) {
+    // Deduplicate first; otherwise a token seen k times would be penalised by penalty^k.
+    let mut distinct: Vec<i64> = generated.to_vec();
+    distinct.sort_unstable();
+    distinct.dedup();
+
+    for token in distinct {
+        let slot = usize::try_from(token)
+            .ok()
+            .and_then(|idx| logits.get_mut(idx));
+        if let Some(score) = slot {
+            *score = if *score < 0.0 {
+                *score * penalty
+            } else {
+                *score / penalty
+            };
+        }
+    }
+}
+
+/// Returns the index of the largest logit, or 0 for an empty slice.
+///
+/// The current best is kept only while it compares strictly greater than the
+/// next value, so among equal maxima the last one wins and an incomparable
+/// pair (NaN involved) also moves on to the later element.
+fn argmax(logits: &[f32]) -> i64 {
+    let mut best: Option<(usize, f32)> = None;
+
+    for (idx, &value) in logits.iter().enumerate() {
+        let keep_current = match best {
+            Some((_, top)) => top.partial_cmp(&value) == Some(std::cmp::Ordering::Greater),
+            None => false,
+        };
+        if !keep_current {
+            best = Some((idx, value));
+        }
+    }
+
+    best.map_or(0, |(idx, _)| idx as i64)
+}
+
+/// Writes `samples` to `path` as a mono 16-bit PCM WAV file at `SAMPLE_RATE`.
+///
+/// The file is wrapped in a buffered writer because the encoder emits one
+/// small write per sample; the buffer is flushed explicitly so that a failed
+/// final write is reported instead of being lost when the writer is dropped.
+///
+/// # Errors
+/// Returns `TtsError::Io` if the file cannot be created or written.
+pub fn save_wav(samples: &[f32], path: &Path) -> Result<(), TtsError> {
+    let mut writer = std::io::BufWriter::new(fs::File::create(path)?);
+    write_wav(&mut writer, samples, SAMPLE_RATE)?;
+    std::io::Write::flush(&mut writer)?;
+    Ok(())
+}
+
+/// Encodes `samples` as a mono 16-bit PCM WAV stream (44-byte header followed by the data).
+///
+/// Samples are clamped to `[-1.0, 1.0]` and scaled by 32767.
+///
+/// # Errors
+/// Returns `InvalidInput` when the audio is too long for the 32-bit size
+/// fields of the header or when the byte rate overflows, and propagates any
+/// error from `writer`.
+fn write_wav<W: std::io::Write>(writer: &mut W, samples: &[f32], sample_rate: u32) -> Result<(), std::io::Error> {
+    fn invalid_input(message: &str) -> std::io::Error {
+        std::io::Error::new(std::io::ErrorKind::InvalidInput, message)
+    }
+
+    let num_channels: u16 = 1;
+    let bits_per_sample: u16 = 16;
+    let block_align = num_channels * bits_per_sample / 8;
+
+    // Checked arithmetic: the header stores sizes as u32, and a silently
+    // truncated or wrapped size would produce a corrupt file.
+    let data_size = u32::try_from(samples.len())
+        .ok()
+        .and_then(|count| count.checked_mul(u32::from(block_align)))
+        .ok_or_else(|| invalid_input("audio is too long for a WAV file"))?;
+    let file_size = data_size
+        .checked_add(36)
+        .ok_or_else(|| invalid_input("audio is too long for a WAV file"))?;
+    let byte_rate = sample_rate
+        .checked_mul(u32::from(block_align))
+        .ok_or_else(|| invalid_input("sample rate is too large for a WAV file"))?;
+
+    // RIFF container header.
+    writer.write_all(b"RIFF")?;
+    writer.write_all(&file_size.to_le_bytes())?;
+    writer.write_all(b"WAVE")?;
+
+    // Format chunk: 16 bytes, format tag 1.
+    writer.write_all(b"fmt ")?;
+    writer.write_all(&16u32.to_le_bytes())?;
+    writer.write_all(&1u16.to_le_bytes())?;
+    writer.write_all(&num_channels.to_le_bytes())?;
+    writer.write_all(&sample_rate.to_le_bytes())?;
+    writer.write_all(&byte_rate.to_le_bytes())?;
+    writer.write_all(&block_align.to_le_bytes())?;
+    writer.write_all(&bits_per_sample.to_le_bytes())?;
+
+    // Data chunk.
+    writer.write_all(b"data")?;
+    writer.write_all(&data_size.to_le_bytes())?;
+
+    for &sample in samples {
+        let clamped = sample.clamp(-1.0, 1.0);
+        let int_sample = (clamped * 32767.0) as i16;
+        writer.write_all(&int_sample.to_le_bytes())?;
+    }
+
+    Ok(())
+}
+
+// Unit tests for the pure helpers of this module; nothing here loads a model.
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // The largest value (0.8) sits at index 3.
+ #[test]
+ fn test_argmax() {
+ let logits = vec![0.1, 0.5, 0.3, 0.8, 0.2];
+ assert_eq!(argmax(&logits), 3);
+ }
+
+ // Input already at SAMPLE_RATE must come back unchanged.
+ #[test]
+ fn test_resample_same_rate() {
+ let samples = vec![0.1, 0.2, 0.3];
+ let resampled = resample_to_24k(&samples, SAMPLE_RATE);
+ assert_eq!(resampled, samples);
+ }
+
+ // Only the positive-score branch is covered: both penalised logits are positive.
+ #[test]
+ fn test_repetition_penalty() {
+ let mut logits = vec![1.0, 2.0, 3.0, 4.0];
+ let generated = vec![1, 3];
+ apply_repetition_penalty(&mut logits, &generated, 1.2);
+ // score > 0 -> divide
+ assert!((logits[1] - 2.0 / 1.2).abs() < 1e-6);
+ assert!((logits[3] - 4.0 / 1.2).abs() < 1e-6);
+ }
+}