Cargo fmt pass

Clément Renault
2019-10-18 13:05:28 +02:00
parent 47d777c8f7
commit ca26a0f2e4
48 changed files with 1599 additions and 979 deletions
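
The diff below is a pure formatting pass; no logic changes. For reference, a pass like this is typically produced and verified with the standard cargo-fmt commands (a sketch of the usual workflow, not taken from this commit):

    cargo fmt --all            # rewrite every crate in the workspace in place
    cargo fmt --all -- --check # exit non-zero if any file is left unformatted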


@@ -1,10 +1,10 @@
 use std::collections::{BTreeMap, HashMap};
 use std::convert::TryFrom;
 
+use crate::{DocIndex, DocumentId};
 use deunicode::deunicode_with_tofu;
-use crate::{DocumentId, DocIndex};
 use meilidb_schema::SchemaAttr;
-use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
+use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
 use sdset::SetBuf;
 
 type Word = Vec<u8>; // TODO make it be a SmallVec
@@ -60,7 +60,9 @@ impl RawIndexer {
                 &mut self.docs_words,
             );
 
-            if !must_continue { break }
+            if !must_continue {
+                break;
+            }
 
             number_of_words += 1;
         }
@@ -70,8 +72,9 @@ impl RawIndexer {
     }
 
     pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
-    where I: IntoIterator<Item=&'a str, IntoIter=IT>,
-          IT: Iterator<Item = &'a str> + Clone,
+    where
+        I: IntoIterator<Item = &'a str, IntoIter = IT>,
+        IT: Iterator<Item = &'a str> + Clone,
     {
         // TODO serialize this to one call to the SeqTokenizer loop
 
@@ -88,14 +91,25 @@ impl RawIndexer {
                 &mut self.docs_words,
             );
 
-            if !must_continue { break }
+            if !must_continue {
+                break;
+            }
         }
 
-        let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| {
-            if lowercase_text.contains(is_cjk) { return lowercase_text }
-            let deunicoded = deunicode_with_tofu(&lowercase_text, "");
-            if lowercase_text != deunicoded { deunicoded } else { lowercase_text }
-        }).collect();
+        let deunicoded: Vec<_> = lowercased
+            .into_iter()
+            .map(|lowercase_text| {
+                if lowercase_text.contains(is_cjk) {
+                    return lowercase_text;
+                }
+                let deunicoded = deunicode_with_tofu(&lowercase_text, "");
+                if lowercase_text != deunicoded {
+                    deunicoded
+                } else {
+                    lowercase_text
+                }
+            })
+            .collect();
 
         let iter = deunicoded.iter().map(|t| t.as_str());
         for token in SeqTokenizer::new(iter) {
@@ -108,17 +122,21 @@ impl RawIndexer {
                 &mut self.docs_words,
             );
 
-            if !must_continue { break }
+            if !must_continue {
+                break;
+            }
         }
     }
 
     pub fn build(self) -> Indexed {
-        let words_doc_indexes = self.words_doc_indexes
+        let words_doc_indexes = self
+            .words_doc_indexes
            .into_iter()
            .map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
            .collect();
 
-        let docs_words = self.docs_words
+        let docs_words = self
+            .docs_words
            .into_iter()
            .map(|(id, mut words)| {
                words.sort_unstable();
@@ -127,7 +145,10 @@ impl RawIndexer {
             })
             .collect();
 
-        Indexed { words_doc_indexes, docs_words }
+        Indexed {
+            words_doc_indexes,
+            docs_words,
+        }
     }
 }
 
@@ -138,16 +159,20 @@ fn index_token(
     word_limit: usize,
     words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
     docs_words: &mut HashMap<DocumentId, Vec<Word>>,
-) -> bool
-{
-    if token.word_index >= word_limit { return false }
+) -> bool {
+    if token.word_index >= word_limit {
+        return false;
+    }
 
     match token_to_docindex(id, attr, token) {
         Some(docindex) => {
             let word = Vec::from(token.word);
-            words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
+            words_doc_indexes
+                .entry(word.clone())
+                .or_insert_with(Vec::new)
+                .push(docindex);
             docs_words.entry(id).or_insert_with(Vec::new).push(word);
-        },
+        }
         None => return false,
     }
 
@@ -183,7 +208,9 @@ mod tests {
         let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
         indexer.index_text(docid, attr, text);
 
-        let Indexed { words_doc_indexes, .. } = indexer.build();
+        let Indexed {
+            words_doc_indexes, ..
+        } = indexer.build();
 
         assert!(words_doc_indexes.get(&b"l"[..]).is_some());
         assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
@@ -191,7 +218,9 @@ mod tests {
         assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
 
         // with the ugly apostrophe...
-        assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some());
+        assert!(words_doc_indexes
+            .get(&"l’éteindre".to_owned().into_bytes())
+            .is_some());
     }
 
     #[test]
@@ -203,7 +232,9 @@ mod tests {
         let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
        indexer.index_text_seq(docid, attr, text);
 
-        let Indexed { words_doc_indexes, .. } = indexer.build();
+        let Indexed {
+            words_doc_indexes, ..
+        } = indexer.build();
 
         assert!(words_doc_indexes.get(&b"l"[..]).is_some());
         assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
@@ -211,6 +242,8 @@ mod tests {
         assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
 
         // with the ugly apostrophe...
-        assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some());
+        assert!(words_doc_indexes
+            .get(&"l’éteindre".to_owned().into_bytes())
+            .is_some());
     }
 }