Make the RawIndexer support stop words

2025-09-20 19:56:25 +00:00 · 2019-10-29 15:53:45 +01:00
parent a226fd23c3
commit ff7dde7522
2 changed files with 28 additions and 13 deletions
--- a/meilidb-core/src/raw_indexer.rs
+++ b/meilidb-core/src/raw_indexer.rs
@@ -11,6 +11,7 @@ type Word = Vec<u8>; // TODO make it be a SmallVec

 pub struct RawIndexer {
    word_limit: usize, // the maximum number of indexed words
+    stop_words: fst::Set,
    words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
    docs_words: HashMap<DocumentId, Vec<Word>>,
 }
@@ -21,13 +22,14 @@ pub struct Indexed {
 }

 impl RawIndexer {
-    pub fn new() -> RawIndexer {
-        RawIndexer::with_word_limit(1000)
+    pub fn new(stop_words: fst::Set) -> RawIndexer {
+        RawIndexer::with_word_limit(stop_words, 1000)
    }

-    pub fn with_word_limit(limit: usize) -> RawIndexer {
+    pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
        RawIndexer {
            word_limit: limit,
+            stop_words,
            words_doc_indexes: BTreeMap::new(),
            docs_words: HashMap::new(),
        }
@@ -56,6 +58,7 @@ impl RawIndexer {
                    id,
                    attr,
                    self.word_limit,
+                    &self.stop_words,
                    &mut self.words_doc_indexes,
                    &mut self.docs_words,
                );
@@ -87,6 +90,7 @@ impl RawIndexer {
                id,
                attr,
                self.word_limit,
+                &self.stop_words,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );
@@ -118,6 +122,7 @@ impl RawIndexer {
                id,
                attr,
                self.word_limit,
+                &self.stop_words,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );
@@ -152,17 +157,12 @@ impl RawIndexer {
    }
 }

-impl Default for RawIndexer {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 fn index_token(
    token: Token,
    id: DocumentId,
    attr: SchemaAttr,
    word_limit: usize,
+    stop_words: &fst::Set,
    words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
    docs_words: &mut HashMap<DocumentId, Vec<Word>>,
 ) -> bool {
@@ -170,6 +170,10 @@ fn index_token(
        return false;
    }

+    if stop_words.contains(&token.word) {
+        return false;
+    }
+
    match token_to_docindex(id, attr, token) {
        Some(docindex) => {
            let word = Vec::from(token.word);
@@ -207,7 +211,7 @@ mod tests {

    #[test]
    fn strange_apostrophe() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());

        let docid = DocumentId(0);
        let attr = SchemaAttr(0);
@@ -231,7 +235,7 @@ mod tests {

    #[test]
    fn strange_apostrophe_in_sequence() {
-        let mut indexer = RawIndexer::new();
+        let mut indexer = RawIndexer::new(fst::Set::default());

        let docid = DocumentId(0);
        let attr = SchemaAttr(0);