stem words for destuttering

master
bel 2024-01-03 20:40:35 -07:00
parent 4c80247ab9
commit 5fdc60e32c
4 changed files with 20 additions and 2 deletions

View File

@ -13,6 +13,7 @@ if ! which rust-whisper-baked; then
fi >&2 fi >&2
cat <<EOF cat <<EOF
rust-whisper-baked --stream-device pulse_monitor --stream-step 16 --stream-retain 8 --stream-{head,tail}=0.25 2> /dev/null
rust-whisper-baked --stream-device 'BlackHole 2ch' --stream-step 30 --stream-retain 1 --stream-{head,tail}=0.25 --threads 9 2> /dev/null rust-whisper-baked --stream-device 'BlackHole 2ch' --stream-step 30 --stream-retain 1 --stream-{head,tail}=0.25 --threads 9 2> /dev/null
| tee -a "$HOME/Sync/drawful/DnD/bdoob/__log.d/$(date +%Y.%m.%d).transcript.txt" | tee -a "$HOME/Sync/drawful/DnD/bdoob/__log.d/$(date +%Y.%m.%d).transcript.txt"
| tee -a "$HOME/Sync/drawful/DnD/nessira.d/_log.d/$(date +%Y.%m.%d).transcript.txt" | tee -a "$HOME/Sync/drawful/DnD/nessira.d/_log.d/$(date +%Y.%m.%d).transcript.txt"

View File

@ -763,12 +763,23 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9b1a3d5f46d53f4a3478e2be4a5a5ce5108ea58b100dcd139830eae7f79a3a1" checksum = "b9b1a3d5f46d53f4a3478e2be4a5a5ce5108ea58b100dcd139830eae7f79a3a1"
[[package]]
name = "rust-stemmers"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54"
dependencies = [
"serde",
"serde_derive",
]
[[package]] [[package]]
name = "rust-whisper-baked" name = "rust-whisper-baked"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"clap", "clap",
"listen-lib", "listen-lib",
"rust-stemmers",
"rust-whisper-baked-lib", "rust-whisper-baked-lib",
"rust-whisper-lib", "rust-whisper-lib",
"stop-words", "stop-words",

View File

@ -11,3 +11,4 @@ rust-whisper-baked-lib = { path = "../rust-whisper-baked-lib" }
listen-lib = { path = "../listen-lib" } listen-lib = { path = "../listen-lib" }
clap = { version = "4.4.10", features = ["derive"] } clap = { version = "4.4.10", features = ["derive"] }
stop-words = "0.8.0" stop-words = "0.8.0"
rust-stemmers = "1.2.0"

View File

@ -167,14 +167,19 @@ impl Words {
} }
fn to_comparable_words(&self) -> Vec<Word> { fn to_comparable_words(&self) -> Vec<Word> {
self.to_words().iter().filter(|x| x.s.is_some()).map(|x| x.clone()).collect() self.to_words().iter()
.filter(|x| x.s.is_some())
.map(|x| x.clone())
.collect()
} }
fn to_words(&self) -> Vec<Word> { fn to_words(&self) -> Vec<Word> {
let skips = stop_words::get("en"); let skips = stop_words::get("en");
let stemmer = rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::English);
let strs = self.raw.iter() let strs = self.raw.iter()
.map(|w| w.to_lowercase()) .map(|w| w.to_lowercase())
.map(|word| word.chars().filter(|c| c.is_ascii_alphanumeric()).collect::<String>()) .map(|w| w.chars().filter(|c| c.is_ascii_alphanumeric()).collect::<String>())
.map(|w| stemmer.stem(&w).into_owned())
.collect::<Vec<String>>(); .collect::<Vec<String>>();
let mut result = vec![]; let mut result = vec![];
for i in 0..strs.len() { for i in 0..strs.len() {