todo
parent
b08e055dac
commit
fb7892b52b
|
|
@ -383,6 +383,12 @@ dependencies = [
|
|||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c"
|
||||
|
||||
[[package]]
|
||||
name = "jni"
|
||||
version = "0.19.0"
|
||||
|
|
@ -765,6 +771,7 @@ dependencies = [
|
|||
"listen-lib",
|
||||
"rust-whisper-baked-lib",
|
||||
"rust-whisper-lib",
|
||||
"stop-words",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -802,6 +809,12 @@ dependencies = [
|
|||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c"
|
||||
|
||||
[[package]]
|
||||
name = "same-file"
|
||||
version = "1.0.6"
|
||||
|
|
@ -817,6 +830,37 @@ version = "1.2.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.193"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.193"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.41",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.109"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cb0652c533506ad7a2e353cce269330d6afd8bdfb6d75e0ace5b35aacbd7b9e9"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "1.2.0"
|
||||
|
|
@ -848,6 +892,15 @@ version = "1.11.2"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970"
|
||||
|
||||
[[package]]
|
||||
name = "stop-words"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8500024d809de02ecbf998472b7bed3c4fca380df2be68917f6a473bdb28ddcc"
|
||||
dependencies = [
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.10.0"
|
||||
|
|
|
|||
|
|
@ -10,3 +10,4 @@ rust-whisper-lib = { path = "../rust-whisper-lib" }
|
|||
rust-whisper-baked-lib = { path = "../rust-whisper-baked-lib" }
|
||||
listen-lib = { path = "../listen-lib" }
|
||||
clap = { version = "4.4.10", features = ["derive"] }
|
||||
stop-words = "0.8.0"
|
||||
|
|
|
|||
|
|
@ -101,8 +101,10 @@ impl Destutterer {
|
|||
|
||||
let mut n = self.prevs.len().clamp(0, nexts.len());
|
||||
while n > 0 {
|
||||
let prev_s = Word::to_comparable_string(self.prevs[self.prevs.len() - n..].to_vec());
|
||||
let next_s = Word::to_comparable_string(nexts[..n].to_vec());
|
||||
let prev_s = Word::to_comparable_string(self.prevs, self.prevs.len(), n);
|
||||
eprintln!("prevs: {:?} => '{}'", self.prevs[self.prevs.len() - n..].to_vec(), &prev_s);
|
||||
let next_s = Word::to_comparable_string(nexts, 0, n); // TODO indexes skip stop words
|
||||
eprintln!("nexts: {:?} => '{}'", nexts[..n].to_vec(), &next_s);
|
||||
if prev_s == next_s {
|
||||
break;
|
||||
}
|
||||
|
|
@ -113,7 +115,7 @@ impl Destutterer {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, Debug)]
|
||||
struct Word {
|
||||
raw: String,
|
||||
}
|
||||
|
|
@ -131,8 +133,11 @@ impl Word {
|
|||
}
|
||||
|
||||
fn to_comparable_string(v: Vec<Word>) -> String {
|
||||
let skips = stop_words::get("en");
|
||||
v.iter()
|
||||
.map(|x| x.raw.chars().filter(|c| c.is_ascii_alphanumeric()).collect())
|
||||
.map(|w| w.raw.to_lowercase())
|
||||
.filter(|word| !skips.contains(word))
|
||||
.map(|word| word.chars().filter(|c| c.is_ascii_alphanumeric()).collect::<String>())
|
||||
.collect::<Vec<String>>()
|
||||
.join(" ")
|
||||
}
|
||||
|
|
@ -149,6 +154,13 @@ impl Word {
|
|||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_destutterer_stop_words() {
|
||||
let mut w = new_destutterer();
|
||||
assert_eq!("welcome to the internet".to_string(), w.step("welcome to the internet".to_string()));
|
||||
assert_eq!("have a look around".to_string(), w.step("welcome to the a internet; have a look around".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_destutterer_punctuation() {
|
||||
let mut w = new_destutterer();
|
||||
|
|
|
|||
Loading…
Reference in New Issue