diff --git a/rust-whisper-baked/src/main.rs b/rust-whisper-baked/src/main.rs index 9104bc8..45efcf7 100644 --- a/rust-whisper-baked/src/main.rs +++ b/rust-whisper-baked/src/main.rs @@ -84,7 +84,7 @@ fn channel(flags: rust_whisper_lib::Flags) { } struct Destutterer { - prevs: Vec, + prev: Words, } fn new_destutterer() -> Destutterer { @@ -97,59 +97,89 @@ impl Destutterer { return next; } - let nexts = Word::from_string(next.clone()); - - let mut n = self.prevs.len().clamp(0, nexts.len()); + let next_words = Words::from_string(next.clone()); + let mut n = self.prevs.len().clamp(0, next_words.len()); while n > 0 { - let prev_s = Word::to_comparable_string(self.prevs, self.prevs.len(), n); - eprintln!("prevs: {:?} => '{}'", self.prevs[self.prevs.len() - n..].to_vec(), &prev_s); - let next_s = Word::to_comparable_string(nexts, 0, n); // TODO indexes skip stop words - eprintln!("nexts: {:?} => '{}'", nexts[..n].to_vec(), &next_s); + let prev_s = (self.prevs.len()-n..self.prevs.len()).map(|i| self.prevs.comparable_string(i)).collect().join(" "); + let next_s = (0-n).map(|i| next_words.comparable_string(i)).collect().join(" "); + eprintln!("prevs => '{}'", &prev_s); + eprintln!("nexts => '{}'", &next_s); if prev_s == next_s { break; } n -= 1; } - self.prevs = nexts.clone(); + self.prevs = next_words; Word::to_string(nexts[n..].to_vec()) } } #[derive(Clone, Debug)] -struct Word { - raw: String, +struct Words { + raw: Vec, } -impl Word { - fn from_string(s: String) -> Vec { - let mut result = vec![]; +impl Words { + fn from_string(s: String) -> Words { + let mut result = Words{raw: vec![]}; for word in s.split(" ") { let word = word.trim(); if word.len() > 0 { - result.push(Word{raw: word.to_string()}); + result.raw.push(word.to_string()); } } result } - fn to_comparable_string(v: Vec) -> String { - let skips = stop_words::get("en"); - v.iter() - .map(|w| w.raw.to_lowercase()) - .filter(|word| !skips.contains(word)) - .map(|word| word.chars().filter(|c| c.is_ascii_alphanumeric()).collect::()) - .collect::>() - .join(" ") + fn last_n_comparable_to_string(&self, n: usize) -> (String, usize) { + TODO } - fn to_string(v: Vec) -> String { - v.iter() - .map(|x| x.raw.clone()) + fn first_n_comparable_to_string(&self, n: usize) -> (String, usize){ + TODO + } + + fn comparable_len(&self) -> usize { + self.to_comparable_words().len() + } + + fn to_comparable_words(&self) -> Vec { + self.to_words().iter().filter(|x| x.s.is_some()).collect() + } + + fn to_words(&self) -> Vec { + let skips = stop_words::get("en"); + let strs = self.raw.iter() + .map(|w| w.to_lowercase()) + .map(|word| word.chars().filter(|c| c.is_ascii_alphanumeric()).collect::()) + .collect::>(); + let mut result = vec![]; + for i in 0..strs.len() { + result.push(Word{ + s: match skips.contains(&strs[i]) { + true => None, + false => Some(strs[i]), + }, + idx: i as usize, + }); + } + result + } + + fn to_string(&self) -> String { + self.raw.iter() + .map(|x| x.clone()) .collect::>() .join(" ") } } +#[derive(Debug)] +struct Word { + s: Option, + idx: usize, +} + #[cfg(test)] mod tests { use super::*;