diff --git a/hotwords/transcript.sh b/hotwords/transcript.sh index 33bdeed..f07cd65 100644 --- a/hotwords/transcript.sh +++ b/hotwords/transcript.sh @@ -13,6 +13,7 @@ if ! which rust-whisper-baked; then fi >&2 cat < /dev/null rust-whisper-baked --stream-device 'BlackHole 2ch' --stream-step 30 --stream-retain 1 --stream-{head,tail}=0.25 --threads 9 2> /dev/null | tee -a "$HOME/Sync/drawful/DnD/bdoob/__log.d/$(date +%Y.%m.%d).transcript.txt" | tee -a "$HOME/Sync/drawful/DnD/nessira.d/_log.d/$(date +%Y.%m.%d).transcript.txt" diff --git a/rust-whisper-baked/Cargo.lock b/rust-whisper-baked/Cargo.lock index 9471794..d394f6d 100644 --- a/rust-whisper-baked/Cargo.lock +++ b/rust-whisper-baked/Cargo.lock @@ -763,12 +763,23 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9b1a3d5f46d53f4a3478e2be4a5a5ce5108ea58b100dcd139830eae7f79a3a1" +[[package]] +name = "rust-stemmers" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" +dependencies = [ + "serde", + "serde_derive", +] + [[package]] name = "rust-whisper-baked" version = "0.1.0" dependencies = [ "clap", "listen-lib", + "rust-stemmers", "rust-whisper-baked-lib", "rust-whisper-lib", "stop-words", diff --git a/rust-whisper-baked/Cargo.toml b/rust-whisper-baked/Cargo.toml index 2356bac..04ae675 100644 --- a/rust-whisper-baked/Cargo.toml +++ b/rust-whisper-baked/Cargo.toml @@ -11,3 +11,4 @@ rust-whisper-baked-lib = { path = "../rust-whisper-baked-lib" } listen-lib = { path = "../listen-lib" } clap = { version = "4.4.10", features = ["derive"] } stop-words = "0.8.0" +rust-stemmers = "1.2.0" diff --git a/rust-whisper-baked/src/main.rs b/rust-whisper-baked/src/main.rs index 6d3bacd..ea9ca68 100644 --- a/rust-whisper-baked/src/main.rs +++ b/rust-whisper-baked/src/main.rs @@ -167,14 +167,19 @@ impl Words { } fn to_comparable_words(&self) -> Vec { - self.to_words().iter().filter(|x| x.s.is_some()).map(|x| x.clone()).collect() + self.to_words().iter() + .filter(|x| x.s.is_some()) + .map(|x| x.clone()) + .collect() } fn to_words(&self) -> Vec { let skips = stop_words::get("en"); + let stemmer = rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::English); let strs = self.raw.iter() .map(|w| w.to_lowercase()) - .map(|word| word.chars().filter(|c| c.is_ascii_alphanumeric()).collect::()) + .map(|w| w.chars().filter(|c| c.is_ascii_alphanumeric()).collect::()) + .map(|w| stemmer.stem(&w).into_owned()) .collect::>(); let mut result = vec![]; for i in 0..strs.len() {