diff --git a/rust-whisper-baked/Cargo.lock b/rust-whisper-baked/Cargo.lock index 12b0928..9471794 100644 --- a/rust-whisper-baked/Cargo.lock +++ b/rust-whisper-baked/Cargo.lock @@ -383,6 +383,12 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "itoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" + [[package]] name = "jni" version = "0.19.0" @@ -765,6 +771,7 @@ dependencies = [ "listen-lib", "rust-whisper-baked-lib", "rust-whisper-lib", + "stop-words", ] [[package]] @@ -802,6 +809,12 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "ryu" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" + [[package]] name = "same-file" version = "1.0.6" @@ -817,6 +830,37 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "serde" +version = "1.0.193" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.193" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "serde_json" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb0652c533506ad7a2e353cce269330d6afd8bdfb6d75e0ace5b35aacbd7b9e9" +dependencies = [ + "itoa", + "ryu", + "serde", +] + [[package]] name = "shlex" version = "1.2.0" @@ -848,6 +892,15 @@ version = "1.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970" +[[package]] +name = "stop-words" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8500024d809de02ecbf998472b7bed3c4fca380df2be68917f6a473bdb28ddcc" +dependencies = [ + "serde_json", +] + [[package]] name = "strsim" version = "0.10.0" diff --git a/rust-whisper-baked/Cargo.toml b/rust-whisper-baked/Cargo.toml index 7cc57b4..2356bac 100644 --- a/rust-whisper-baked/Cargo.toml +++ b/rust-whisper-baked/Cargo.toml @@ -10,3 +10,4 @@ rust-whisper-lib = { path = "../rust-whisper-lib" } rust-whisper-baked-lib = { path = "../rust-whisper-baked-lib" } listen-lib = { path = "../listen-lib" } clap = { version = "4.4.10", features = ["derive"] } +stop-words = "0.8.0" diff --git a/rust-whisper-baked/src/main.rs b/rust-whisper-baked/src/main.rs index b88c327..9104bc8 100644 --- a/rust-whisper-baked/src/main.rs +++ b/rust-whisper-baked/src/main.rs @@ -101,8 +101,10 @@ impl Destutterer { let mut n = self.prevs.len().clamp(0, nexts.len()); while n > 0 { - let prev_s = Word::to_comparable_string(self.prevs[self.prevs.len() - n..].to_vec()); - let next_s = Word::to_comparable_string(nexts[..n].to_vec()); + let prev_s = Word::to_comparable_string(self.prevs, self.prevs.len(), n); + eprintln!("prevs: {:?} => '{}'", self.prevs[self.prevs.len() - n..].to_vec(), &prev_s); + let next_s = Word::to_comparable_string(nexts, 0, n); // TODO indexes skip stop words + eprintln!("nexts: {:?} => '{}'", nexts[..n].to_vec(), &next_s); if prev_s == next_s { break; } @@ -113,7 +115,7 @@ impl Destutterer { } } -#[derive(Clone)] +#[derive(Clone, Debug)] struct Word { raw: String, } @@ -131,8 +133,11 @@ impl Word { } fn to_comparable_string(v: Vec) -> String { + let skips = stop_words::get("en"); v.iter() - .map(|x| x.raw.chars().filter(|c| c.is_ascii_alphanumeric()).collect()) + .map(|w| w.raw.to_lowercase()) + .filter(|word| !skips.contains(word)) + .map(|word| word.chars().filter(|c| c.is_ascii_alphanumeric()).collect::()) .collect::>() .join(" ") } @@ -149,6 +154,13 @@ impl Word { mod tests { use super::*; + #[test] + fn test_destutterer_stop_words() { + let mut w = new_destutterer(); + assert_eq!("welcome to the internet".to_string(), w.step("welcome to the internet".to_string())); + assert_eq!("have a look around".to_string(), w.step("welcome to the a internet; have a look around".to_string())); + } + #[test] fn test_destutterer_punctuation() { let mut w = new_destutterer();