stem words for destuttering

This commit is contained in:
bel
2024-01-03 20:40:35 -07:00
parent 4c80247ab9
commit 5fdc60e32c
4 changed files with 20 additions and 2 deletions

View File

@@ -763,12 +763,23 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9b1a3d5f46d53f4a3478e2be4a5a5ce5108ea58b100dcd139830eae7f79a3a1"
[[package]]
name = "rust-stemmers"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54"
dependencies = [
"serde",
"serde_derive",
]
[[package]]
name = "rust-whisper-baked"
version = "0.1.0"
dependencies = [
"clap",
"listen-lib",
"rust-stemmers",
"rust-whisper-baked-lib",
"rust-whisper-lib",
"stop-words",

View File

@@ -11,3 +11,4 @@ rust-whisper-baked-lib = { path = "../rust-whisper-baked-lib" }
listen-lib = { path = "../listen-lib" }
clap = { version = "4.4.10", features = ["derive"] }
stop-words = "0.8.0"
rust-stemmers = "1.2.0"

View File

@@ -167,14 +167,19 @@ impl Words {
}
/// Returns the entries produced by `to_words` whose `s` field is `Some`,
/// cloned into a new `Vec<Word>`; entries whose `s` is `None` are dropped.
fn to_comparable_words(&self) -> Vec<Word> {
// NOTE(review): the one-line chain directly below and the multi-line chain after
// it are the same expression. This looks like the removed and added sides of a
// diff hunk shown together with the +/- markers lost — confirm that only one of
// them exists in the real file.
self.to_words().iter().filter(|x| x.s.is_some()).map(|x| x.clone()).collect()
self.to_words().iter()
// Keep only the words that carry a value in `s`.
.filter(|x| x.s.is_some())
// `iter()` yields references, so each surviving word is cloned to own it.
.map(|x| x.clone())
.collect()
}
fn to_words(&self) -> Vec<Word> {
let skips = stop_words::get("en");
let stemmer = rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::English);
let strs = self.raw.iter()
.map(|w| w.to_lowercase())
.map(|word| word.chars().filter(|c| c.is_ascii_alphanumeric()).collect::<String>())
.map(|w| w.chars().filter(|c| c.is_ascii_alphanumeric()).collect::<String>())
.map(|w| stemmer.stem(&w).into_owned())
.collect::<Vec<String>>();
let mut result = vec![];
for i in 0..strs.len() {