mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	Use fst 0.4.4 in the project
This commit is contained in:
		| @@ -79,12 +79,8 @@ where | ||||
|  | ||||
|     let mut result = SortResult::default(); | ||||
|  | ||||
|     let words_set = match unsafe { main_store.static_words_fst(reader)? } { | ||||
|         Some(words) => words, | ||||
|         None => return Ok(SortResult::default()), | ||||
|     }; | ||||
|  | ||||
|     let stop_words = main_store.stop_words_fst(reader)?.unwrap_or_default(); | ||||
|     let words_set = main_store.words_fst(reader)?; | ||||
|     let stop_words = main_store.stop_words_fst(reader)?; | ||||
|  | ||||
|     let context = QTContext { | ||||
|         words_set, | ||||
| @@ -230,12 +226,8 @@ where | ||||
| { | ||||
|     let mut result = SortResult::default(); | ||||
|  | ||||
|     let words_set = match unsafe { main_store.static_words_fst(reader)? } { | ||||
|         Some(words) => words, | ||||
|         None => return Ok(SortResult::default()), | ||||
|     }; | ||||
|  | ||||
|     let stop_words = main_store.stop_words_fst(reader)?.unwrap_or_default(); | ||||
|     let words_set = main_store.words_fst(reader)?; | ||||
|     let stop_words = main_store.stop_words_fst(reader)?; | ||||
|  | ||||
|     let context = QTContext { | ||||
|         words_set, | ||||
|   | ||||
| @@ -38,16 +38,20 @@ pub use meilisearch_types::{DocIndex, DocumentId, Highlight}; | ||||
| pub use meilisearch_schema::Schema; | ||||
| pub use query_words_mapper::QueryWordsMapper; | ||||
|  | ||||
| use std::convert::TryFrom; | ||||
| use std::collections::HashMap; | ||||
| use compact_arena::SmallArena; | ||||
| use log::{error, trace}; | ||||
| use std::borrow::Cow; | ||||
| use std::collections::HashMap; | ||||
| use std::convert::TryFrom; | ||||
|  | ||||
| use crate::bucket_sort::PostingsListView; | ||||
| use crate::levenshtein::prefix_damerau_levenshtein; | ||||
| use crate::query_tree::{QueryId, QueryKind}; | ||||
| use crate::reordered_attrs::ReorderedAttrs; | ||||
|  | ||||
| type FstSetCow<'a> = fst::Set<Cow<'a, [u8]>>; | ||||
| type FstMapCow<'a> = fst::Map<Cow<'a, [u8]>>; | ||||
|  | ||||
| #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] | ||||
| pub struct Document { | ||||
|     pub id: DocumentId, | ||||
|   | ||||
| @@ -186,7 +186,7 @@ mod tests { | ||||
|     use std::collections::{BTreeSet, HashMap}; | ||||
|     use std::iter::FromIterator; | ||||
|  | ||||
|     use fst::{IntoStreamer, Set}; | ||||
|     use fst::IntoStreamer; | ||||
|     use meilisearch_schema::IndexedPos; | ||||
|     use sdset::SetBuf; | ||||
|     use tempfile::TempDir; | ||||
| @@ -199,21 +199,21 @@ mod tests { | ||||
|     use crate::store::Index; | ||||
|     use meilisearch_schema::Schema; | ||||
|  | ||||
|     fn set_from_stream<'f, I, S>(stream: I) -> Set | ||||
|     fn set_from_stream<'f, I, S>(stream: I) -> fst::Set<Vec<u8>> | ||||
|     where | ||||
|         I: for<'a> fst::IntoStreamer<'a, Into = S, Item = &'a [u8]>, | ||||
|         S: 'f + for<'a> fst::Streamer<'a, Item = &'a [u8]>, | ||||
|     { | ||||
|         let mut builder = fst::SetBuilder::memory(); | ||||
|         builder.extend_stream(stream).unwrap(); | ||||
|         builder.into_inner().and_then(Set::from_bytes).unwrap() | ||||
|         builder.into_set() | ||||
|     } | ||||
|  | ||||
|     fn insert_key(set: &Set, key: &[u8]) -> Set { | ||||
|     fn insert_key<A: AsRef<[u8]>>(set: &fst::Set<A>, key: &[u8]) -> fst::Set<Vec<u8>> { | ||||
|         let unique_key = { | ||||
|             let mut builder = fst::SetBuilder::memory(); | ||||
|             builder.insert(key).unwrap(); | ||||
|             builder.into_inner().and_then(Set::from_bytes).unwrap() | ||||
|             builder.into_set() | ||||
|         }; | ||||
|  | ||||
|         let union_ = set.op().add(unique_key.into_stream()).r#union(); | ||||
| @@ -221,11 +221,11 @@ mod tests { | ||||
|         set_from_stream(union_) | ||||
|     } | ||||
|  | ||||
|     fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set { | ||||
|     fn sdset_into_fstset(set: &sdset::Set<&str>) -> fst::Set<Vec<u8>> { | ||||
|         let mut builder = fst::SetBuilder::memory(); | ||||
|         let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect()); | ||||
|         builder.extend_iter(set.into_iter()).unwrap(); | ||||
|         builder.into_inner().and_then(Set::from_bytes).unwrap() | ||||
|         builder.into_set() | ||||
|     } | ||||
|  | ||||
|     const fn doc_index(document_id: u32, word_index: u16) -> DocIndex { | ||||
| @@ -265,15 +265,11 @@ mod tests { | ||||
|  | ||||
|             let word = normalize_str(word); | ||||
|  | ||||
|             let alternatives = match self | ||||
|             let alternatives = self | ||||
|                 .index | ||||
|                 .synonyms | ||||
|                 .synonyms(&writer, word.as_bytes()) | ||||
|                 .unwrap() | ||||
|             { | ||||
|                 Some(alternatives) => alternatives, | ||||
|                 None => fst::Set::default(), | ||||
|             }; | ||||
|                 .unwrap(); | ||||
|  | ||||
|             let new = sdset_into_fstset(&new); | ||||
|             let new_alternatives = | ||||
| @@ -283,10 +279,7 @@ mod tests { | ||||
|                 .put_synonyms(&mut writer, word.as_bytes(), &new_alternatives) | ||||
|                 .unwrap(); | ||||
|  | ||||
|             let synonyms = match self.index.main.synonyms_fst(&writer).unwrap() { | ||||
|                 Some(synonyms) => synonyms, | ||||
|                 None => fst::Set::default(), | ||||
|             }; | ||||
|             let synonyms = self.index.main.synonyms_fst(&writer).unwrap(); | ||||
|  | ||||
|             let synonyms_fst = insert_key(&synonyms, word.as_bytes()); | ||||
|             self.index | ||||
| @@ -339,7 +332,7 @@ mod tests { | ||||
|  | ||||
|             index.main.put_schema(&mut writer, &schema).unwrap(); | ||||
|  | ||||
|             let words_fst = Set::from_iter(words_fst).unwrap(); | ||||
|             let words_fst = fst::Set::from_iter(words_fst).unwrap(); | ||||
|  | ||||
|             index.main.put_words_fst(&mut writer, &words_fst).unwrap(); | ||||
|  | ||||
|   | ||||
| @@ -12,7 +12,7 @@ use sdset::{Set, SetBuf, SetOperation}; | ||||
| use log::debug; | ||||
|  | ||||
| use crate::database::MainT; | ||||
| use crate::{store, DocumentId, DocIndex, MResult}; | ||||
| use crate::{store, DocumentId, DocIndex, MResult, FstSetCow}; | ||||
| use crate::automaton::{normalize_str, build_dfa, build_prefix_dfa, build_exact_dfa}; | ||||
| use crate::QueryWordsMapper; | ||||
|  | ||||
| @@ -112,9 +112,9 @@ pub struct PostingsList { | ||||
|     matches: SetBuf<DocIndex>, | ||||
| } | ||||
|  | ||||
| pub struct Context { | ||||
|     pub words_set: fst::Set, | ||||
|     pub stop_words: fst::Set, | ||||
| pub struct Context<'a> { | ||||
|     pub words_set: FstSetCow<'a>, | ||||
|     pub stop_words: FstSetCow<'a>, | ||||
|     pub synonyms: store::Synonyms, | ||||
|     pub postings_lists: store::PostingsLists, | ||||
|     pub prefix_postings_lists: store::PrefixPostingsListsCache, | ||||
| @@ -147,7 +147,7 @@ fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &' | ||||
|  | ||||
| fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> { | ||||
|     let words = normalize_str(&words.join(" ")); | ||||
|     let set = ctx.synonyms.synonyms(reader, words.as_bytes())?.unwrap_or_default(); | ||||
|     let set = ctx.synonyms.synonyms(reader, words.as_bytes())?; | ||||
|  | ||||
|     let mut strings = Vec::new(); | ||||
|     let mut stream = set.stream(); | ||||
|   | ||||
| @@ -1,34 +1,37 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::{BTreeMap, HashMap}; | ||||
| use std::convert::TryFrom; | ||||
|  | ||||
| use crate::{DocIndex, DocumentId}; | ||||
| use deunicode::deunicode_with_tofu; | ||||
| use meilisearch_schema::IndexedPos; | ||||
| use meilisearch_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer}; | ||||
| use sdset::SetBuf; | ||||
|  | ||||
| use crate::{DocIndex, DocumentId}; | ||||
| use crate::FstSetCow; | ||||
|  | ||||
| const WORD_LENGTH_LIMIT: usize = 80; | ||||
|  | ||||
| type Word = Vec<u8>; // TODO make it be a SmallVec | ||||
|  | ||||
| pub struct RawIndexer { | ||||
| pub struct RawIndexer<A> { | ||||
|     word_limit: usize, // the maximum number of indexed words | ||||
|     stop_words: fst::Set, | ||||
|     stop_words: fst::Set<A>, | ||||
|     words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>, | ||||
|     docs_words: HashMap<DocumentId, Vec<Word>>, | ||||
| } | ||||
|  | ||||
| pub struct Indexed { | ||||
| pub struct Indexed<'a> { | ||||
|     pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>, | ||||
|     pub docs_words: HashMap<DocumentId, fst::Set>, | ||||
|     pub docs_words: HashMap<DocumentId, FstSetCow<'a>>, | ||||
| } | ||||
|  | ||||
| impl RawIndexer { | ||||
|     pub fn new(stop_words: fst::Set) -> RawIndexer { | ||||
| impl<A> RawIndexer<A> { | ||||
|     pub fn new(stop_words: fst::Set<A>) -> RawIndexer<A> { | ||||
|         RawIndexer::with_word_limit(stop_words, 1000) | ||||
|     } | ||||
|  | ||||
|     pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer { | ||||
|     pub fn with_word_limit(stop_words: fst::Set<A>, limit: usize) -> RawIndexer<A> { | ||||
|         RawIndexer { | ||||
|             word_limit: limit, | ||||
|             stop_words, | ||||
| @@ -36,7 +39,9 @@ impl RawIndexer { | ||||
|             docs_words: HashMap::new(), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<A: AsRef<[u8]>> RawIndexer<A> { | ||||
|     pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize { | ||||
|         let mut number_of_words = 0; | ||||
|  | ||||
| @@ -61,9 +66,9 @@ impl RawIndexer { | ||||
|         number_of_words | ||||
|     } | ||||
|  | ||||
|     pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I) | ||||
|     pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I) | ||||
|     where | ||||
|         I: IntoIterator<Item = &'a str>, | ||||
|         I: IntoIterator<Item = &'s str>, | ||||
|     { | ||||
|         let iter = iter.into_iter(); | ||||
|         for token in SeqTokenizer::new(iter) { | ||||
| @@ -83,7 +88,7 @@ impl RawIndexer { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn build(self) -> Indexed { | ||||
|     pub fn build(self) -> Indexed<'static> { | ||||
|         let words_doc_indexes = self | ||||
|             .words_doc_indexes | ||||
|             .into_iter() | ||||
| @@ -96,7 +101,8 @@ impl RawIndexer { | ||||
|             .map(|(id, mut words)| { | ||||
|                 words.sort_unstable(); | ||||
|                 words.dedup(); | ||||
|                 (id, fst::Set::from_iter(words).unwrap()) | ||||
|                 let fst = fst::Set::from_iter(words).unwrap().map_data(Cow::Owned).unwrap(); | ||||
|                 (id, fst) | ||||
|             }) | ||||
|             .collect(); | ||||
|  | ||||
| @@ -107,15 +113,17 @@ impl RawIndexer { | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn index_token( | ||||
| fn index_token<A>( | ||||
|     token: Token, | ||||
|     id: DocumentId, | ||||
|     indexed_pos: IndexedPos, | ||||
|     word_limit: usize, | ||||
|     stop_words: &fst::Set, | ||||
|     stop_words: &fst::Set<A>, | ||||
|     words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>, | ||||
|     docs_words: &mut HashMap<DocumentId, Vec<Word>>, | ||||
| ) -> bool { | ||||
| ) -> bool | ||||
| where A: AsRef<[u8]>, | ||||
| { | ||||
|     if token.word_index >= word_limit { | ||||
|         return false; | ||||
|     } | ||||
|   | ||||
| @@ -1,9 +1,11 @@ | ||||
| use super::BEU32; | ||||
| use crate::database::MainT; | ||||
| use crate::DocumentId; | ||||
| use heed::types::{ByteSlice, OwnedType}; | ||||
| use std::borrow::Cow; | ||||
|  | ||||
| use heed::Result as ZResult; | ||||
| use std::sync::Arc; | ||||
| use heed::types::{ByteSlice, OwnedType}; | ||||
|  | ||||
| use crate::database::MainT; | ||||
| use crate::{DocumentId, FstSetCow}; | ||||
| use super::BEU32; | ||||
|  | ||||
| #[derive(Copy, Clone)] | ||||
| pub struct DocsWords { | ||||
| @@ -15,7 +17,7 @@ impl DocsWords { | ||||
|         self, | ||||
|         writer: &mut heed::RwTxn<MainT>, | ||||
|         document_id: DocumentId, | ||||
|         words: &fst::Set, | ||||
|         words: &FstSetCow, | ||||
|     ) -> ZResult<()> { | ||||
|         let document_id = BEU32::new(document_id.0); | ||||
|         let bytes = words.as_fst().as_bytes(); | ||||
| @@ -31,20 +33,11 @@ impl DocsWords { | ||||
|         self.docs_words.clear(writer) | ||||
|     } | ||||
|  | ||||
|     pub fn doc_words( | ||||
|         self, | ||||
|         reader: &heed::RoTxn<MainT>, | ||||
|         document_id: DocumentId, | ||||
|     ) -> ZResult<Option<fst::Set>> { | ||||
|     pub fn doc_words(self, reader: &heed::RoTxn<MainT>, document_id: DocumentId) -> ZResult<FstSetCow> { | ||||
|         let document_id = BEU32::new(document_id.0); | ||||
|         match self.docs_words.get(reader, &document_id)? { | ||||
|             Some(bytes) => { | ||||
|                 let len = bytes.len(); | ||||
|                 let bytes = Arc::new(bytes.to_owned()); | ||||
|                 let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); | ||||
|                 Ok(Some(fst::Set::from(fst))) | ||||
|             } | ||||
|             None => Ok(None), | ||||
|             Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()), | ||||
|             None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,5 +1,4 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::sync::Arc; | ||||
| use std::collections::HashMap; | ||||
|  | ||||
| use chrono::{DateTime, Utc}; | ||||
| @@ -12,6 +11,7 @@ use sdset::Set; | ||||
| use crate::database::MainT; | ||||
| use crate::RankedMap; | ||||
| use crate::settings::RankingRule; | ||||
| use crate::{FstSetCow, FstMapCow}; | ||||
| use super::{CowSet, DocumentsIds}; | ||||
|  | ||||
| const ATTRIBUTES_FOR_FACETING_KEY: &str = "attributes-for-faceting"; | ||||
| @@ -103,11 +103,15 @@ impl Main { | ||||
|         self.put_internal_docids(writer, &internal_docids) | ||||
|     } | ||||
|  | ||||
|     pub fn put_external_docids(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map) -> ZResult<()> { | ||||
|     pub fn put_external_docids<A>(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map<A>) -> ZResult<()> | ||||
|     where A: AsRef<[u8]>, | ||||
|     { | ||||
|         self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, ids.as_fst().as_bytes()) | ||||
|     } | ||||
|  | ||||
|     pub fn merge_external_docids(self, writer: &mut heed::RwTxn<MainT>, new_docids: &fst::Map) -> ZResult<()> { | ||||
|     pub fn merge_external_docids<A>(self, writer: &mut heed::RwTxn<MainT>, new_docids: &fst::Map<A>) -> ZResult<()> | ||||
|     where A: AsRef<[u8]>, | ||||
|     { | ||||
|         use fst::{Streamer, IntoStreamer}; | ||||
|  | ||||
|         // Do an union of the old and the new set of external docids. | ||||
| @@ -117,13 +121,15 @@ impl Main { | ||||
|         while let Some((docid, values)) = op.next() { | ||||
|             build.insert(docid, values[0].value).unwrap(); | ||||
|         } | ||||
|         let external_docids = build.into_inner().unwrap(); | ||||
|         drop(op); | ||||
|  | ||||
|         // TODO prefer using self.put_user_ids | ||||
|         self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, external_docids.as_slice()) | ||||
|         let external_docids = build.into_map(); | ||||
|         self.put_external_docids(writer, &external_docids) | ||||
|     } | ||||
|  | ||||
|     pub fn remove_external_docids(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map) -> ZResult<()> { | ||||
|     pub fn remove_external_docids<A>(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map<A>) -> ZResult<()> | ||||
|     where A: AsRef<[u8]>, | ||||
|     { | ||||
|         use fst::{Streamer, IntoStreamer}; | ||||
|  | ||||
|         // Do an union of the old and the new set of external docids. | ||||
| @@ -133,21 +139,16 @@ impl Main { | ||||
|         while let Some((docid, values)) = op.next() { | ||||
|             build.insert(docid, values[0].value).unwrap(); | ||||
|         } | ||||
|         let external_docids = build.into_inner().unwrap(); | ||||
|         drop(op); | ||||
|  | ||||
|         // TODO prefer using self.put_external_docids | ||||
|         self.main.put::<_, Str, ByteSlice>(writer, EXTERNAL_DOCIDS_KEY, external_docids.as_slice()) | ||||
|         let external_docids = build.into_map(); | ||||
|         self.put_external_docids(writer, &external_docids) | ||||
|     } | ||||
|  | ||||
|     pub fn external_docids(self, reader: &heed::RoTxn<MainT>) -> ZResult<fst::Map> { | ||||
|     pub fn external_docids(self, reader: &heed::RoTxn<MainT>) -> ZResult<FstMapCow> { | ||||
|         match self.main.get::<_, Str, ByteSlice>(reader, EXTERNAL_DOCIDS_KEY)? { | ||||
|             Some(bytes) => { | ||||
|                 let len = bytes.len(); | ||||
|                 let bytes = Arc::new(bytes.to_owned()); | ||||
|                 let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); | ||||
|                 Ok(fst::Map::from(fst)) | ||||
|             }, | ||||
|             None => Ok(fst::Map::default()), | ||||
|             Some(bytes) => Ok(fst::Map::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()), | ||||
|             None => Ok(fst::Map::default().map_data(Cow::Owned).unwrap()), | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -156,30 +157,14 @@ impl Main { | ||||
|         Ok(external_ids.get(external_docid).map(|id| DocumentId(id as u32))) | ||||
|     } | ||||
|  | ||||
|     pub fn put_words_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> { | ||||
|     pub fn put_words_fst<A: AsRef<[u8]>>(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set<A>) -> ZResult<()> { | ||||
|         self.main.put::<_, Str, ByteSlice>(writer, WORDS_KEY, fst.as_fst().as_bytes()) | ||||
|     } | ||||
|  | ||||
|     pub unsafe fn static_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> { | ||||
|     pub fn words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<FstSetCow> { | ||||
|         match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? { | ||||
|             Some(bytes) => { | ||||
|                 let bytes: &'static [u8] = std::mem::transmute(bytes); | ||||
|                 let set = fst::Set::from_static_slice(bytes).unwrap(); | ||||
|                 Ok(Some(set)) | ||||
|             }, | ||||
|             None => Ok(None), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> { | ||||
|         match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? { | ||||
|             Some(bytes) => { | ||||
|                 let len = bytes.len(); | ||||
|                 let bytes = Arc::new(bytes.to_owned()); | ||||
|                 let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); | ||||
|                 Ok(Some(fst::Set::from(fst))) | ||||
|             }, | ||||
|             None => Ok(None), | ||||
|             Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()), | ||||
|             None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()), | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -203,37 +188,27 @@ impl Main { | ||||
|         self.main.get::<_, Str, SerdeBincode<RankedMap>>(reader, RANKED_MAP_KEY) | ||||
|     } | ||||
|  | ||||
|     pub fn put_synonyms_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> { | ||||
|     pub fn put_synonyms_fst<A: AsRef<[u8]>>(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set<A>) -> ZResult<()> { | ||||
|         let bytes = fst.as_fst().as_bytes(); | ||||
|         self.main.put::<_, Str, ByteSlice>(writer, SYNONYMS_KEY, bytes) | ||||
|     } | ||||
|  | ||||
|     pub fn synonyms_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> { | ||||
|     pub fn synonyms_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<FstSetCow> { | ||||
|         match self.main.get::<_, Str, ByteSlice>(reader, SYNONYMS_KEY)? { | ||||
|             Some(bytes) => { | ||||
|                 let len = bytes.len(); | ||||
|                 let bytes = Arc::new(bytes.to_owned()); | ||||
|                 let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); | ||||
|                 Ok(Some(fst::Set::from(fst))) | ||||
|             } | ||||
|             None => Ok(None), | ||||
|             Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()), | ||||
|             None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn put_stop_words_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> { | ||||
|     pub fn put_stop_words_fst<A: AsRef<[u8]>>(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set<A>) -> ZResult<()> { | ||||
|         let bytes = fst.as_fst().as_bytes(); | ||||
|         self.main.put::<_, Str, ByteSlice>(writer, STOP_WORDS_KEY, bytes) | ||||
|     } | ||||
|  | ||||
|     pub fn stop_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> { | ||||
|     pub fn stop_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<FstSetCow> { | ||||
|         match self.main.get::<_, Str, ByteSlice>(reader, STOP_WORDS_KEY)? { | ||||
|             Some(bytes) => { | ||||
|                 let len = bytes.len(); | ||||
|                 let bytes = Arc::new(bytes.to_owned()); | ||||
|                 let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); | ||||
|                 Ok(Some(fst::Set::from(fst))) | ||||
|             } | ||||
|             None => Ok(None), | ||||
|             Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()), | ||||
|             None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -1,7 +1,10 @@ | ||||
| use heed::types::ByteSlice; | ||||
| use crate::database::MainT; | ||||
| use std::borrow::Cow; | ||||
|  | ||||
| use heed::Result as ZResult; | ||||
| use std::sync::Arc; | ||||
| use heed::types::ByteSlice; | ||||
|  | ||||
| use crate::database::MainT; | ||||
| use crate::FstSetCow; | ||||
|  | ||||
| #[derive(Copy, Clone)] | ||||
| pub struct Synonyms { | ||||
| @@ -9,12 +12,9 @@ pub struct Synonyms { | ||||
| } | ||||
|  | ||||
| impl Synonyms { | ||||
|     pub fn put_synonyms( | ||||
|         self, | ||||
|         writer: &mut heed::RwTxn<MainT>, | ||||
|         word: &[u8], | ||||
|         synonyms: &fst::Set, | ||||
|     ) -> ZResult<()> { | ||||
|     pub fn put_synonyms<A>(self, writer: &mut heed::RwTxn<MainT>, word: &[u8], synonyms: &fst::Set<A>) -> ZResult<()> | ||||
|     where A: AsRef<[u8]>, | ||||
|     { | ||||
|         let bytes = synonyms.as_fst().as_bytes(); | ||||
|         self.synonyms.put(writer, word, bytes) | ||||
|     } | ||||
| @@ -27,15 +27,10 @@ impl Synonyms { | ||||
|         self.synonyms.clear(writer) | ||||
|     } | ||||
|  | ||||
|     pub fn synonyms(self, reader: &heed::RoTxn<MainT>, word: &[u8]) -> ZResult<Option<fst::Set>> { | ||||
|     pub fn synonyms<'txn>(self, reader: &'txn heed::RoTxn<MainT>, word: &[u8]) -> ZResult<FstSetCow<'txn>> { | ||||
|         match self.synonyms.get(reader, word)? { | ||||
|             Some(bytes) => { | ||||
|                 let len = bytes.len(); | ||||
|                 let bytes = Arc::new(bytes.to_owned()); | ||||
|                 let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap(); | ||||
|                 Ok(Some(fst::Set::from(fst))) | ||||
|             } | ||||
|             None => Ok(None), | ||||
|             Some(bytes) => Ok(fst::Set::new(bytes).unwrap().map_data(Cow::Borrowed).unwrap()), | ||||
|             None => Ok(fst::Set::default().map_data(Cow::Owned).unwrap()), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,3 +1,4 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::{HashMap, BTreeMap}; | ||||
|  | ||||
| use fst::{set::OpBuilder, SetBuilder}; | ||||
| @@ -108,17 +109,18 @@ pub fn push_documents_addition<D: serde::Serialize>( | ||||
|     Ok(last_update_id) | ||||
| } | ||||
|  | ||||
| fn index_document( | ||||
| fn index_document<A>( | ||||
|     writer: &mut heed::RwTxn<MainT>, | ||||
|     documents_fields: DocumentsFields, | ||||
|     documents_fields_counts: DocumentsFieldsCounts, | ||||
|     ranked_map: &mut RankedMap, | ||||
|     indexer: &mut RawIndexer, | ||||
|     indexer: &mut RawIndexer<A>, | ||||
|     schema: &Schema, | ||||
|     field_id: FieldId, | ||||
|     document_id: DocumentId, | ||||
|     value: &Value, | ||||
| ) -> MResult<()> | ||||
| where A: AsRef<[u8]>, | ||||
| { | ||||
|     let serialized = serde_json::to_vec(value)?; | ||||
|     documents_fields.put_document_field(writer, document_id, field_id, &serialized)?; | ||||
| @@ -208,10 +210,7 @@ pub fn apply_addition<'a, 'b>( | ||||
|         None => RankedMap::default(), | ||||
|     }; | ||||
|  | ||||
|     let stop_words = match index.main.stop_words_fst(writer)? { | ||||
|         Some(stop_words) => stop_words, | ||||
|         None => fst::Set::default(), | ||||
|     }; | ||||
|     let stop_words = index.main.stop_words_fst(writer)?.map_data(Cow::into_owned)?; | ||||
|  | ||||
|     // 3. index the documents fields in the stores | ||||
|     if let Some(attributes_for_facetting) = index.main.attributes_for_faceting(writer)? { | ||||
| @@ -297,10 +296,10 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind | ||||
|     index.postings_lists.clear(writer)?; | ||||
|     index.docs_words.clear(writer)?; | ||||
|  | ||||
|     let stop_words = match index.main.stop_words_fst(writer)? { | ||||
|         Some(stop_words) => stop_words, | ||||
|         None => fst::Set::default(), | ||||
|     }; | ||||
|     let stop_words = index.main | ||||
|         .stop_words_fst(writer)? | ||||
|         .map_data(Cow::into_owned) | ||||
|         .unwrap(); | ||||
|  | ||||
|     let number_of_inserted_documents = documents_ids_to_reindex.len(); | ||||
|     let mut indexer = RawIndexer::new(stop_words); | ||||
| @@ -348,13 +347,15 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Ind | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| pub fn write_documents_addition_index( | ||||
| pub fn write_documents_addition_index<A>( | ||||
|     writer: &mut heed::RwTxn<MainT>, | ||||
|     index: &store::Index, | ||||
|     ranked_map: &RankedMap, | ||||
|     number_of_inserted_documents: usize, | ||||
|     indexer: RawIndexer, | ||||
| ) -> MResult<()> { | ||||
|     indexer: RawIndexer<A>, | ||||
| ) -> MResult<()> | ||||
| where A: AsRef<[u8]>, | ||||
| { | ||||
|     let indexed = indexer.build(); | ||||
|     let mut delta_words_builder = SetBuilder::memory(); | ||||
|  | ||||
| @@ -373,33 +374,27 @@ pub fn write_documents_addition_index( | ||||
|         index.docs_words.put_doc_words(writer, id, &words)?; | ||||
|     } | ||||
|  | ||||
|     let delta_words = delta_words_builder | ||||
|         .into_inner() | ||||
|         .and_then(fst::Set::from_bytes) | ||||
|         .unwrap(); | ||||
|     let delta_words = delta_words_builder.into_set(); | ||||
|  | ||||
|     let words = match index.main.words_fst(writer)? { | ||||
|         Some(words) => { | ||||
|             let op = OpBuilder::new() | ||||
|                 .add(words.stream()) | ||||
|                 .add(delta_words.stream()) | ||||
|                 .r#union(); | ||||
|     let words_fst = index.main.words_fst(writer)?; | ||||
|     let words = if !words_fst.is_empty() { | ||||
|         let op = OpBuilder::new() | ||||
|             .add(words_fst.stream()) | ||||
|             .add(delta_words.stream()) | ||||
|             .r#union(); | ||||
|  | ||||
|             let mut words_builder = SetBuilder::memory(); | ||||
|             words_builder.extend_stream(op).unwrap(); | ||||
|             words_builder | ||||
|                 .into_inner() | ||||
|                 .and_then(fst::Set::from_bytes) | ||||
|                 .unwrap() | ||||
|         } | ||||
|         None => delta_words, | ||||
|         let mut words_builder = SetBuilder::memory(); | ||||
|         words_builder.extend_stream(op).unwrap(); | ||||
|         words_builder.into_set() | ||||
|     } else { | ||||
|         delta_words | ||||
|     }; | ||||
|  | ||||
|     index.main.put_words_fst(writer, &words)?; | ||||
|     index.main.put_ranked_map(writer, ranked_map)?; | ||||
|     index.main.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?; | ||||
|  | ||||
|     compute_short_prefixes(writer, index)?; | ||||
|     compute_short_prefixes(writer, &words, index)?; | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|   | ||||
| @@ -114,7 +114,8 @@ pub fn apply_documents_deletion( | ||||
|             ranked_map.remove(id, *ranked_attr); | ||||
|         } | ||||
|  | ||||
|         if let Some(words) = index.docs_words.doc_words(writer, id)? { | ||||
|         let words = index.docs_words.doc_words(writer, id)?; | ||||
|         if !words.is_empty() { | ||||
|             let mut stream = words.stream(); | ||||
|             while let Some(word) = stream.next() { | ||||
|                 let word = word.to_vec(); | ||||
| @@ -157,21 +158,16 @@ pub fn apply_documents_deletion( | ||||
|     } | ||||
|  | ||||
|     let removed_words = fst::Set::from_iter(removed_words).unwrap(); | ||||
|     let words = match index.main.words_fst(writer)? { | ||||
|         Some(words_set) => { | ||||
|             let op = fst::set::OpBuilder::new() | ||||
|                 .add(words_set.stream()) | ||||
|                 .add(removed_words.stream()) | ||||
|                 .difference(); | ||||
|     let words = { | ||||
|         let words_set = index.main.words_fst(writer)?; | ||||
|         let op = fst::set::OpBuilder::new() | ||||
|             .add(words_set.stream()) | ||||
|             .add(removed_words.stream()) | ||||
|             .difference(); | ||||
|  | ||||
|             let mut words_builder = SetBuilder::memory(); | ||||
|             words_builder.extend_stream(op).unwrap(); | ||||
|             words_builder | ||||
|                 .into_inner() | ||||
|                 .and_then(fst::Set::from_bytes) | ||||
|                 .unwrap() | ||||
|         } | ||||
|         None => fst::Set::default(), | ||||
|         let mut words_builder = SetBuilder::memory(); | ||||
|         words_builder.extend_stream(op).unwrap(); | ||||
|         words_builder.into_set() | ||||
|     }; | ||||
|  | ||||
|     index.main.put_words_fst(writer, &words)?; | ||||
| @@ -182,7 +178,7 @@ pub fn apply_documents_deletion( | ||||
|     index.main.remove_external_docids(writer, &external_docids)?; | ||||
|     index.main.remove_internal_docids(writer, &internal_docids)?; | ||||
|  | ||||
|     compute_short_prefixes(writer, index)?; | ||||
|     compute_short_prefixes(writer, &words, index)?; | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|   | ||||
| @@ -6,18 +6,19 @@ use meilisearch_types::DocumentId; | ||||
| use ordered_float::OrderedFloat; | ||||
| use serde_json::Value; | ||||
|  | ||||
| use crate::Number; | ||||
| use crate::{Number, FstMapCow}; | ||||
| use crate::raw_indexer::RawIndexer; | ||||
| use crate::serde::SerializerError; | ||||
| use crate::store::DiscoverIds; | ||||
|  | ||||
| /// Returns the number of words indexed or `None` if the type is unindexable. | ||||
| pub fn index_value( | ||||
|     indexer: &mut RawIndexer, | ||||
| pub fn index_value<A>( | ||||
|     indexer: &mut RawIndexer<A>, | ||||
|     document_id: DocumentId, | ||||
|     indexed_pos: IndexedPos, | ||||
|     value: &Value, | ||||
| ) -> Option<usize> | ||||
| where A: AsRef<[u8]>, | ||||
| { | ||||
|     match value { | ||||
|         Value::Null => None, | ||||
| @@ -99,7 +100,7 @@ pub fn value_to_number(value: &Value) -> Option<Number> { | ||||
| /// the corresponding id or generate a new one; this is the way we produce document ids. | ||||
| pub fn discover_document_id( | ||||
|     docid: &str, | ||||
|     external_docids: &fst::Map, | ||||
|     external_docids: &FstMapCow, | ||||
|     available_docids: &mut DiscoverIds<'_>, | ||||
| ) -> Result<DocumentId, SerializerError> | ||||
| { | ||||
| @@ -120,7 +121,7 @@ pub fn discover_document_id( | ||||
| pub fn extract_document_id( | ||||
|     primary_key: &str, | ||||
|     document: &IndexMap<String, Value>, | ||||
|     external_docids: &fst::Map, | ||||
|     external_docids: &FstMapCow, | ||||
|     available_docids: &mut DiscoverIds<'_>, | ||||
| ) -> Result<(DocumentId, String), SerializerError> | ||||
| { | ||||
|   | ||||
| @@ -297,13 +297,13 @@ pub fn update_task<'a, 'b>( | ||||
|     Ok(status) | ||||
| } | ||||
|  | ||||
| fn compute_short_prefixes(writer: &mut heed::RwTxn<MainT>, index: &store::Index) -> MResult<()> { | ||||
|     // retrieve the words fst to compute all those prefixes | ||||
|     let words_fst = match index.main.words_fst(writer)? { | ||||
|         Some(fst) => fst, | ||||
|         None => return Ok(()), | ||||
|     }; | ||||
|  | ||||
| fn compute_short_prefixes<A>( | ||||
|     writer: &mut heed::RwTxn<MainT>, | ||||
|     words_fst: &fst::Set<A>, | ||||
|     index: &store::Index, | ||||
| ) -> MResult<()> | ||||
| where A: AsRef<[u8]>, | ||||
| { | ||||
|     // clear the prefixes | ||||
|     let pplc_store = index.prefix_postings_lists_cache; | ||||
|     pplc_store.clear(writer)?; | ||||
|   | ||||
| @@ -168,7 +168,6 @@ pub fn apply_stop_words_update( | ||||
|  | ||||
|     let old_stop_words: BTreeSet<String> = index.main | ||||
|         .stop_words_fst(writer)? | ||||
|         .unwrap_or_default() | ||||
|         .stream() | ||||
|         .into_strs()? | ||||
|         .into_iter() | ||||
| @@ -186,7 +185,8 @@ pub fn apply_stop_words_update( | ||||
|         apply_stop_words_deletion(writer, index, deletion)?; | ||||
|     } | ||||
|  | ||||
|     if let Some(words_fst) = index.main.words_fst(writer)? { | ||||
|     let words_fst = index.main.words_fst(writer)?; | ||||
|     if !words_fst.is_empty() { | ||||
|         let stop_words = fst::Set::from_iter(stop_words)?; | ||||
|         let op = OpBuilder::new() | ||||
|             .add(&words_fst) | ||||
| @@ -195,7 +195,7 @@ pub fn apply_stop_words_update( | ||||
|  | ||||
|         let mut builder = fst::SetBuilder::memory(); | ||||
|         builder.extend_stream(op)?; | ||||
|         let words_fst = builder.into_inner().and_then(fst::Set::from_bytes)?; | ||||
|         let words_fst = builder.into_set(); | ||||
|  | ||||
|         index.main.put_words_fst(writer, &words_fst)?; | ||||
|         index.main.put_stop_words_fst(writer, &stop_words)?; | ||||
| @@ -222,28 +222,25 @@ fn apply_stop_words_addition( | ||||
|     } | ||||
|  | ||||
|     // create the new delta stop words fst | ||||
|     let delta_stop_words = stop_words_builder | ||||
|         .into_inner() | ||||
|         .and_then(fst::Set::from_bytes)?; | ||||
|     let delta_stop_words = stop_words_builder.into_set(); | ||||
|  | ||||
|     // we also need to remove all the stop words from the main fst | ||||
|     if let Some(word_fst) = main_store.words_fst(writer)? { | ||||
|     let words_fst = main_store.words_fst(writer)?; | ||||
|     if !words_fst.is_empty() { | ||||
|         let op = OpBuilder::new() | ||||
|             .add(&word_fst) | ||||
|             .add(&words_fst) | ||||
|             .add(&delta_stop_words) | ||||
|             .difference(); | ||||
|  | ||||
|         let mut word_fst_builder = SetBuilder::memory(); | ||||
|         word_fst_builder.extend_stream(op)?; | ||||
|         let word_fst = word_fst_builder | ||||
|             .into_inner() | ||||
|             .and_then(fst::Set::from_bytes)?; | ||||
|         let word_fst = word_fst_builder.into_set(); | ||||
|  | ||||
|         main_store.put_words_fst(writer, &word_fst)?; | ||||
|     } | ||||
|  | ||||
|     // now we add all of these stop words to the main store | ||||
|     let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default(); | ||||
|     let stop_words_fst = main_store.stop_words_fst(writer)?; | ||||
|  | ||||
|     let op = OpBuilder::new() | ||||
|         .add(&stop_words_fst) | ||||
| @@ -252,9 +249,7 @@ fn apply_stop_words_addition( | ||||
|  | ||||
|     let mut stop_words_builder = SetBuilder::memory(); | ||||
|     stop_words_builder.extend_stream(op)?; | ||||
|     let stop_words_fst = stop_words_builder | ||||
|         .into_inner() | ||||
|         .and_then(fst::Set::from_bytes)?; | ||||
|     let stop_words_fst = stop_words_builder.into_set(); | ||||
|  | ||||
|     main_store.put_stop_words_fst(writer, &stop_words_fst)?; | ||||
|  | ||||
| @@ -274,12 +269,10 @@ fn apply_stop_words_deletion( | ||||
|     } | ||||
|  | ||||
|     // create the new delta stop words fst | ||||
|     let delta_stop_words = stop_words_builder | ||||
|         .into_inner() | ||||
|         .and_then(fst::Set::from_bytes)?; | ||||
|     let delta_stop_words = stop_words_builder.into_set(); | ||||
|  | ||||
|     // now we delete all of these stop words from the main store | ||||
|     let stop_words_fst = index.main.stop_words_fst(writer)?.unwrap_or_default(); | ||||
|     let stop_words_fst = index.main.stop_words_fst(writer)?; | ||||
|  | ||||
|     let op = OpBuilder::new() | ||||
|         .add(&stop_words_fst) | ||||
| @@ -288,7 +281,7 @@ fn apply_stop_words_deletion( | ||||
|  | ||||
|     let mut stop_words_builder = SetBuilder::memory(); | ||||
|     stop_words_builder.extend_stream(op)?; | ||||
|     let stop_words_fst = stop_words_builder.into_inner().and_then(fst::Set::from_bytes)?; | ||||
|     let stop_words_fst = stop_words_builder.into_set(); | ||||
|  | ||||
|     Ok(index.main.put_stop_words_fst(writer, &stop_words_fst)?) | ||||
| } | ||||
| @@ -311,16 +304,13 @@ pub fn apply_synonyms_update( | ||||
|             let alternatives = SetBuf::from_dirty(alternatives); | ||||
|             let mut alternatives_builder = SetBuilder::memory(); | ||||
|             alternatives_builder.extend_iter(alternatives)?; | ||||
|             let bytes = alternatives_builder.into_inner()?; | ||||
|             fst::Set::from_bytes(bytes)? | ||||
|             alternatives_builder.into_set() | ||||
|         }; | ||||
|  | ||||
|         synonyms_store.put_synonyms(writer, word.as_bytes(), &alternatives)?; | ||||
|     } | ||||
|  | ||||
|     let synonyms_set = synonyms_builder | ||||
|         .into_inner() | ||||
|         .and_then(fst::Set::from_bytes)?; | ||||
|     let synonyms_set = synonyms_builder.into_set(); | ||||
|  | ||||
|     main_store.put_synonyms_fst(writer, &synonyms_set)?; | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user