integrate with meilisearch tokenizer

mpostma
2020-12-23 19:09:01 +01:00
parent 7e1c94ab9c
commit 1ae761311e
10 changed files with 460 additions and 269 deletions

@@ -370,6 +370,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         let readers = rayon::iter::repeatn(documents, num_threads)
             .enumerate()
             .map(|(i, documents)| {
+                let stop_words = fst::Set::default();
                 let store = Store::new(
                     searchable_fields.clone(),
                     faceted_fields.clone(),
@@ -379,6 +380,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
                     chunk_compression_type,
                     chunk_compression_level,
                     chunk_fusing_shrink_size,
+                    &stop_words,
                 )?;
                 store.index(
                     documents,
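
The call site above passes an empty stop-word set built with fst::Set::default(). As a hedged sketch (not part of this commit), a populated set could be produced with the fst crate's Set::from_iter, which expects its keys in lexicographic order without duplicates; the helper name build_stop_words is made up for illustration.

// Illustration only: building a non-empty stop-word set to hand to Store::new.
// fst::Set::from_iter requires sorted, deduplicated byte strings.
fn build_stop_words() -> fst::Set<Vec<u8>> {
    let mut words = vec!["the", "of", "an"];
    words.sort_unstable();
    words.dedup();
    fst::Set::from_iter(words).expect("stop words are sorted and deduplicated")
}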

@@ -16,12 +16,13 @@
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 use serde_json::Value;
 use tempfile::tempfile;
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, TokenKind};
+use fst::Set;
 
 use crate::facet::FacetType;
 use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec};
 use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
-use crate::tokenizer::{simple_tokenizer, only_token};
 use crate::update::UpdateIndexingStep;
 use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId};
@@ -47,7 +48,7 @@ pub struct Readers {
     pub documents: Reader<FileFuse>,
 }
 
-pub struct Store {
+pub struct Store<'s, A> {
     // Indexing parameters
     searchable_fields: HashSet<FieldId>,
     faceted_fields: HashMap<FieldId, FacetType>,
@@ -71,9 +72,11 @@ pub struct Store {
     // MTBL writers
     docid_word_positions_writer: Writer<File>,
     documents_writer: Writer<File>,
+    // tokenizer
+    analyzer: Analyzer<'s, A>,
 }
 
-impl Store {
+impl<'s, A: AsRef<[u8]>> Store<'s, A> {
     pub fn new(
         searchable_fields: HashSet<FieldId>,
         faceted_fields: HashMap<FieldId, FacetType>,
@@ -83,7 +86,8 @@ impl Store {
         chunk_compression_type: CompressionType,
         chunk_compression_level: Option<u32>,
         chunk_fusing_shrink_size: Option<u64>,
-    ) -> anyhow::Result<Store>
+        stop_words: &'s Set<A>,
+    ) -> anyhow::Result<Self>
     {
         // We divide the max memory by the number of sorter the Store have.
         let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4));
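
The new stop_words: &'s Set<A> parameter, together with the A: AsRef<[u8]> bound on the impl above, lets the Store borrow a stop-word set backed by any byte container. As a rough illustration (assumed usage, not taken from this commit), both an owned and a borrowed fst set satisfy the bound.

// Illustration of the A: AsRef<[u8]> bound (not part of this commit):
// an fst::Set can be backed by owned bytes or by a borrowed slice,
// e.g. bytes read straight out of the index database.
fn example() -> Result<(), fst::Error> {
    let owned: fst::Set<Vec<u8>> = fst::Set::default();       // A = Vec<u8>
    let bytes: &[u8] = owned.as_fst().as_bytes();
    let borrowed: fst::Set<&[u8]> = fst::Set::new(bytes)?;    // A = &[u8]
    // Either one could be passed to Store::new as `&Set<A>`.
    let _ = (&owned, &borrowed);
    Ok(())
}
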
@@ -137,6 +141,8 @@ impl Store {
             create_writer(chunk_compression_type, chunk_compression_level, f)
         })?;
 
+        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+
         Ok(Store {
             // Indexing parameters.
             searchable_fields,
@@ -161,6 +167,8 @@ impl Store {
             // MTBL writers
             docid_word_positions_writer,
             documents_writer,
+            //tokenizer
+            analyzer,
         })
     }
@@ -462,9 +470,13 @@ impl Store {
                     None => continue,
                 };
 
-                let tokens = simple_tokenizer(&content).filter_map(only_token);
-                for (pos, token) in tokens.enumerate().take(MAX_POSITION) {
-                    let word = token.to_lowercase();
+                let analyzed = self.analyzer.analyze(&content);
+                let tokens = analyzed
+                    .tokens()
+                    .filter(|t| t.is_word())
+                    .map(|t| t.text().to_string());
+
+                for (pos, word) in tokens.enumerate().take(MAX_POSITION) {
                     let position = (attr as usize * MAX_POSITION + pos) as u32;
                     words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
                 }
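
Pulled out of the indexing loop, the tokenization path introduced above looks roughly like the sketch below. It only uses calls that appear in this diff (Analyzer::new, AnalyzerConfig::default_with_stopwords, analyze, tokens, is_word, text); the free function extract_words, the usage helper, and the example strings are illustrative.

use fst::Set;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};

// Sketch only: mirrors the new Store logic outside of the Store itself.
fn extract_words(content: &str, stop_words: &Set<Vec<u8>>) -> Vec<String> {
    // Same construction as in Store::new above.
    let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
    // Same filtering as in the indexing loop: keep tokens classified as words
    // and take their normalized text.
    let analyzed = analyzer.analyze(content);
    analyzed
        .tokens()
        .filter(|t| t.is_word())
        .map(|t| t.text().to_string())
        .collect()
}

// Hypothetical usage with an empty stop-word set, as in IndexDocuments above.
fn usage() -> Vec<String> {
    let stop_words = fst::Set::default();
    extract_words("Hello world", &stop_words)
}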