Ignore words that are too long

2025-09-20 19:56:25 +00:00 · 2019-11-10 17:41:32 +01:00
parent 78381f1818
commit d18e775bec
2 changed files with 71 additions and 14 deletions
--- a/meilidb-core/src/raw_indexer.rs
+++ b/meilidb-core/src/raw_indexer.rs
@@ -7,6 +7,8 @@ use meilidb_schema::SchemaAttr;
 use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
 use sdset::SetBuf;

+const WORD_LENGTH_LIMIT: usize = 80;
+
 type Word = Vec<u8>; // TODO make it be a SmallVec

 pub struct RawIndexer {
@@ -128,21 +130,26 @@ fn index_token(
        match token_to_docindex(id, attr, token) {
            Some(docindex) => {
                let word = Vec::from(token.word);
-                words_doc_indexes
-                    .entry(word.clone())
-                    .or_insert_with(Vec::new)
-                    .push(docindex);
-                docs_words.entry(id).or_insert_with(Vec::new).push(word);

-                if !lower.contains(is_cjk) {
-                    let unidecoded = deunicode_with_tofu(&lower, "");
-                    if unidecoded != lower && !unidecoded.is_empty() {
-                        let word = Vec::from(unidecoded);
-                        words_doc_indexes
-                            .entry(word.clone())
-                            .or_insert_with(Vec::new)
-                            .push(docindex);
-                        docs_words.entry(id).or_insert_with(Vec::new).push(word);
+                if word.len() <= WORD_LENGTH_LIMIT {
+                    words_doc_indexes
+                        .entry(word.clone())
+                        .or_insert_with(Vec::new)
+                        .push(docindex);
+                    docs_words.entry(id).or_insert_with(Vec::new).push(word);
+
+                    if !lower.contains(is_cjk) {
+                        let unidecoded = deunicode_with_tofu(&lower, "");
+                        if unidecoded != lower && !unidecoded.is_empty() {
+                            let word = Vec::from(unidecoded);
+                            if word.len() <= WORD_LENGTH_LIMIT {
+                                words_doc_indexes
+                                    .entry(word.clone())
+                                    .or_insert_with(Vec::new)
+                                    .push(docindex);
+                                docs_words.entry(id).or_insert_with(Vec::new).push(word);
+                            }
+                        }
                    }
                }
            }