todo
This commit is contained in:
@@ -101,8 +101,10 @@ impl Destutterer {
|
||||
|
||||
let mut n = self.prevs.len().clamp(0, nexts.len());
|
||||
while n > 0 {
|
||||
let prev_s = Word::to_comparable_string(self.prevs[self.prevs.len() - n..].to_vec());
|
||||
let next_s = Word::to_comparable_string(nexts[..n].to_vec());
|
||||
let prev_s = Word::to_comparable_string(self.prevs, self.prevs.len(), n);
|
||||
eprintln!("prevs: {:?} => '{}'", self.prevs[self.prevs.len() - n..].to_vec(), &prev_s);
|
||||
let next_s = Word::to_comparable_string(nexts, 0, n); // TODO indexes skip stop words
|
||||
eprintln!("nexts: {:?} => '{}'", nexts[..n].to_vec(), &next_s);
|
||||
if prev_s == next_s {
|
||||
break;
|
||||
}
|
||||
@@ -113,7 +115,7 @@ impl Destutterer {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, Debug)]
|
||||
struct Word {
|
||||
raw: String,
|
||||
}
|
||||
@@ -131,8 +133,11 @@ impl Word {
|
||||
}
|
||||
|
||||
fn to_comparable_string(v: Vec<Word>) -> String {
|
||||
let skips = stop_words::get("en");
|
||||
v.iter()
|
||||
.map(|x| x.raw.chars().filter(|c| c.is_ascii_alphanumeric()).collect())
|
||||
.map(|w| w.raw.to_lowercase())
|
||||
.filter(|word| !skips.contains(word))
|
||||
.map(|word| word.chars().filter(|c| c.is_ascii_alphanumeric()).collect::<String>())
|
||||
.collect::<Vec<String>>()
|
||||
.join(" ")
|
||||
}
|
||||
@@ -149,6 +154,13 @@ impl Word {
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_destutterer_stop_words() {
|
||||
let mut w = new_destutterer();
|
||||
assert_eq!("welcome to the internet".to_string(), w.step("welcome to the internet".to_string()));
|
||||
assert_eq!("have a look around".to_string(), w.step("welcome to the a internet; have a look around".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_destutterer_punctuation() {
|
||||
let mut w = new_destutterer();
|
||||
|
||||
Reference in New Issue
Block a user