mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 21:16:28 +00:00 
			
		
		
		
	Make the FieldsIdsMap serialization more stable by using a BTreeMap
This commit is contained in:
		| @@ -1,9 +1,9 @@ | ||||
| use std::collections::{HashMap, BTreeMap}; | ||||
| use std::collections::BTreeMap; | ||||
| use serde::{Serialize, Deserialize}; | ||||
|  | ||||
| #[derive(Debug, Clone, Serialize, Deserialize)] | ||||
| pub struct FieldsIdsMap { | ||||
|     names_ids: HashMap<String, u8>, | ||||
|     names_ids: BTreeMap<String, u8>, | ||||
|     ids_names: BTreeMap<u8, String>, | ||||
|     next_id: Option<u8>, | ||||
| } | ||||
| @@ -11,7 +11,7 @@ pub struct FieldsIdsMap { | ||||
| impl FieldsIdsMap { | ||||
|     pub fn new() -> FieldsIdsMap { | ||||
|         FieldsIdsMap { | ||||
|             names_ids: HashMap::new(), | ||||
|             names_ids: BTreeMap::new(), | ||||
|             ids_names: BTreeMap::new(), | ||||
|             next_id: Some(0), | ||||
|         } | ||||
| @@ -66,6 +66,12 @@ impl FieldsIdsMap { | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Default for FieldsIdsMap { | ||||
|     fn default() -> FieldsIdsMap { | ||||
|         FieldsIdsMap::new() | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use super::*; | ||||
|   | ||||
| @@ -1,26 +0,0 @@ | ||||
| use std::borrow::Cow; | ||||
| use csv::{StringRecord, Writer, ReaderBuilder}; | ||||
|  | ||||
| pub struct CsvStringRecordCodec; | ||||
|  | ||||
| impl heed::BytesDecode<'_> for CsvStringRecordCodec { | ||||
|     type DItem = StringRecord; | ||||
|  | ||||
|     fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> { | ||||
|         let mut reader = ReaderBuilder::new() | ||||
|             .has_headers(false) | ||||
|             .buffer_capacity(bytes.len()) // we will just read this record | ||||
|             .from_reader(bytes); | ||||
|         reader.records().next()?.ok() // it return an Option of Result | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl heed::BytesEncode<'_> for CsvStringRecordCodec { | ||||
|     type EItem = StringRecord; | ||||
|  | ||||
|     fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         let mut writer = Writer::from_writer(Vec::new()); | ||||
|         writer.write_record(item).ok()?; | ||||
|         writer.into_inner().ok().map(Cow::Owned) | ||||
|     } | ||||
| } | ||||
| @@ -1,7 +1,6 @@ | ||||
| mod beu32_str_codec; | ||||
| mod bo_roaring_bitmap_codec; | ||||
| mod cbo_roaring_bitmap_codec; | ||||
| mod csv_string_record_codec; | ||||
| mod obkv_codec; | ||||
| mod roaring_bitmap_codec; | ||||
| mod str_str_u8_codec; | ||||
| @@ -9,7 +8,6 @@ mod str_str_u8_codec; | ||||
| pub use self::beu32_str_codec::BEU32StrCodec; | ||||
| pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec; | ||||
| pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec; | ||||
| pub use self::csv_string_record_codec::CsvStringRecordCodec; | ||||
| pub use self::obkv_codec::ObkvCodec; | ||||
| pub use self::roaring_bitmap_codec::RoaringBitmapCodec; | ||||
| pub use self::str_str_u8_codec::StrStrU8Codec; | ||||
|   | ||||
							
								
								
									
										24
									
								
								src/index.rs
									
									
									
									
									
								
							
							
						
						
									
										24
									
								
								src/index.rs
									
									
									
									
									
								
							| @@ -1,23 +1,23 @@ | ||||
| use anyhow::Context; | ||||
| use csv::StringRecord; | ||||
| use heed::types::*; | ||||
| use heed::{PolyDatabase, Database}; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::Search; | ||||
| use crate::{BEU32, DocumentId}; | ||||
| use crate::fields_ids_map::FieldsIdsMap; | ||||
| use crate::{ | ||||
|     RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, | ||||
|     CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, | ||||
|     BoRoaringBitmapCodec, CboRoaringBitmapCodec, | ||||
| }; | ||||
|  | ||||
| pub const WORDS_FST_KEY: &str = "words-fst"; | ||||
| pub const HEADERS_KEY: &str = "headers"; | ||||
| pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; | ||||
| pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; | ||||
|  | ||||
| #[derive(Clone)] | ||||
| pub struct Index { | ||||
|     /// Contains many different types (e.g. the documents CSV headers). | ||||
|     /// Contains many different types (e.g. the fields ids map). | ||||
|     pub main: PolyDatabase, | ||||
|     /// A word and all the documents ids containing the word. | ||||
|     pub word_docids: Database<Str, RoaringBitmapCodec>, | ||||
| @@ -25,7 +25,7 @@ pub struct Index { | ||||
|     pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>, | ||||
|     /// Maps the proximity between a pair of words with all the docids where this relation appears. | ||||
|     pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>, | ||||
|     /// Maps the document id to the document as a CSV line. | ||||
|     /// Maps the document id to the document as an obkv store. | ||||
|     pub documents: Database<OwnedType<BEU32>, ObkvCodec>, | ||||
| } | ||||
|  | ||||
| @@ -44,17 +44,17 @@ impl Index { | ||||
|         Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?) | ||||
|     } | ||||
|  | ||||
|     pub fn put_headers(&self, wtxn: &mut heed::RwTxn, headers: &StringRecord) -> heed::Result<()> { | ||||
|         self.main.put::<_, Str, CsvStringRecordCodec>(wtxn, HEADERS_KEY, headers) | ||||
|     pub fn put_fields_ids_map(&self, wtxn: &mut heed::RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { | ||||
|         self.main.put::<_, Str, SerdeJson<FieldsIdsMap>>(wtxn, FIELDS_IDS_MAP_KEY, map) | ||||
|     } | ||||
|  | ||||
|     pub fn headers(&self, rtxn: &heed::RoTxn) -> heed::Result<Option<StringRecord>> { | ||||
|         self.main.get::<_, Str, CsvStringRecordCodec>(rtxn, HEADERS_KEY) | ||||
|     pub fn fields_ids_map(&self, rtxn: &heed::RoTxn) -> heed::Result<Option<FieldsIdsMap>> { | ||||
|         self.main.get::<_, Str, SerdeJson<FieldsIdsMap>>(rtxn, FIELDS_IDS_MAP_KEY) | ||||
|     } | ||||
|  | ||||
|     pub fn number_of_attributes(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<usize>> { | ||||
|         match self.headers(rtxn)? { | ||||
|             Some(headers) => Ok(Some(headers.len())), | ||||
|     pub fn number_of_fields(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<usize>> { | ||||
|         match self.fields_ids_map(rtxn)? { | ||||
|             Some(map) => Ok(Some(map.len())), | ||||
|             None => Ok(None), | ||||
|         } | ||||
|     } | ||||
|   | ||||
| @@ -1,6 +1,6 @@ | ||||
| use std::borrow::Cow; | ||||
|  | ||||
| use anyhow::bail; | ||||
| use anyhow::{bail, ensure}; | ||||
| use bstr::ByteSlice as _; | ||||
| use fst::IntoStreamer; | ||||
| use roaring::RoaringBitmap; | ||||
| @@ -8,7 +8,7 @@ use roaring::RoaringBitmap; | ||||
| use crate::heed_codec::CboRoaringBitmapCodec; | ||||
|  | ||||
| const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes(); | ||||
| const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes(); | ||||
| const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes(); | ||||
| const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes(); | ||||
|  | ||||
| pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> { | ||||
| @@ -25,8 +25,8 @@ pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> { | ||||
|             build.extend_stream(op.into_stream()).unwrap(); | ||||
|             Ok(build.into_inner().unwrap()) | ||||
|         }, | ||||
|         HEADERS_KEY => { | ||||
|             assert!(values.windows(2).all(|vs| vs[0] == vs[1])); | ||||
|         FIELDS_IDS_MAP_KEY => { | ||||
|             ensure!(values.windows(2).all(|vs| vs[0] == vs[1]), "fields ids map doesn't match"); | ||||
|             Ok(values[0].to_vec()) | ||||
|         }, | ||||
|         DOCUMENTS_IDS_KEY => word_docids_merge(&[], values), | ||||
|   | ||||
| @@ -16,7 +16,8 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; | ||||
| use roaring::RoaringBitmap; | ||||
| use tempfile::tempfile; | ||||
|  | ||||
| use crate::heed_codec::{CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; | ||||
| use crate::fields_ids_map::FieldsIdsMap; | ||||
| use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; | ||||
| use crate::tokenizer::{simple_tokenizer, only_token}; | ||||
| use crate::{SmallVec32, Position, DocumentId}; | ||||
|  | ||||
| @@ -30,7 +31,7 @@ const MAX_POSITION: usize = 1000; | ||||
| const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION; | ||||
|  | ||||
| const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes(); | ||||
| const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes(); | ||||
| const FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes(); | ||||
| const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes(); | ||||
|  | ||||
| pub struct Readers { | ||||
| @@ -182,10 +183,10 @@ impl Store { | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_headers(&mut self, headers: &StringRecord) -> anyhow::Result<()> { | ||||
|         let headers = CsvStringRecordCodec::bytes_encode(headers) | ||||
|             .with_context(|| format!("could not encode csv record"))?; | ||||
|         Ok(self.main_sorter.insert(HEADERS_KEY, headers)?) | ||||
|     fn write_fields_ids_map(&mut self, map: &FieldsIdsMap) -> anyhow::Result<()> { | ||||
|         let bytes = serde_json::to_vec(&map)?; | ||||
|         self.main_sorter.insert(FIELDS_IDS_MAP_KEY, bytes)?; | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_document( | ||||
| @@ -320,7 +321,12 @@ impl Store { | ||||
|  | ||||
|         // Write the headers into the store. | ||||
|         let headers = rdr.headers()?; | ||||
|         self.write_headers(&headers)?; | ||||
|  | ||||
|         let mut fields_ids_map = FieldsIdsMap::new(); | ||||
|         for header in headers.iter() { | ||||
|             fields_ids_map.insert(header).context("no more field id available")?; | ||||
|         } | ||||
|         self.write_fields_ids_map(&fields_ids_map)?; | ||||
|  | ||||
|         let mut before = Instant::now(); | ||||
|         let mut document_id: usize = base_document_id; | ||||
|   | ||||
| @@ -20,8 +20,8 @@ pub use self::index::Index; | ||||
| pub use self::search::{Search, SearchResult}; | ||||
| pub use self::update_store::UpdateStore; | ||||
| pub use self::heed_codec::{ | ||||
|     RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, | ||||
|     CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, | ||||
|     RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, | ||||
|     ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, | ||||
| }; | ||||
|  | ||||
| pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>; | ||||
|   | ||||
| @@ -1,8 +1,10 @@ | ||||
| use std::collections::HashMap; | ||||
| use std::io::{self, BufRead}; | ||||
| use std::iter::once; | ||||
| use std::path::PathBuf; | ||||
| use std::time::Instant; | ||||
|  | ||||
| use anyhow::Context; | ||||
| use heed::EnvOpenOptions; | ||||
| use log::debug; | ||||
| use structopt::StructOpt; | ||||
| @@ -59,18 +61,22 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { | ||||
|         let query = result?; | ||||
|         let result = index.search(&rtxn).query(query).execute().unwrap(); | ||||
|  | ||||
|         let headers = match index.headers(&rtxn)? { | ||||
|             Some(headers) => headers, | ||||
|             None => return Ok(()), | ||||
|         }; | ||||
|         let mut stdout = io::stdout(); | ||||
|         let fields_ids_map = index.fields_ids_map(&rtxn)?.unwrap_or_default(); | ||||
|         let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?; | ||||
|  | ||||
|         let mut wtr = csv::Writer::from_writer(io::stdout()); | ||||
|         wtr.write_record(&headers)?; | ||||
|         for (_id, record) in documents { | ||||
|             wtr.write_record(record.iter().map(|(_, v)| v))?; | ||||
|             let document: anyhow::Result<HashMap<_, _>> = record.iter() | ||||
|                 .map(|(k, v)| { | ||||
|                     let key = fields_ids_map.name(k).context("field id not found")?; | ||||
|                     let val = std::str::from_utf8(v)?; | ||||
|                     Ok((key, val)) | ||||
|                 }) | ||||
|                 .collect(); | ||||
|  | ||||
|             let document = document?; | ||||
|             serde_json::to_writer(&mut stdout, &document)?; | ||||
|         } | ||||
|         wtr.flush()?; | ||||
|  | ||||
|         debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len()); | ||||
|     } | ||||
|   | ||||
| @@ -382,22 +382,22 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { | ||||
|             let SearchResult { found_words, documents_ids } = search.execute().unwrap(); | ||||
|  | ||||
|             let mut documents = Vec::new(); | ||||
|             if let Some(headers) = index.headers(&rtxn).unwrap() { | ||||
|                 for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() { | ||||
|                     let mut record = record.iter() | ||||
|                         .map(|(key_id, value)| { | ||||
|                             let key = headers[key_id as usize].to_owned(); | ||||
|                             let value = std::str::from_utf8(value).unwrap().to_owned(); | ||||
|                             (key, value) | ||||
|                         }) | ||||
|                         .collect(); | ||||
|             let fields_ids_map = index.fields_ids_map(&rtxn).unwrap().unwrap_or_default(); | ||||
|  | ||||
|                     if !disable_highlighting { | ||||
|                         highlight_record(&mut record, &found_words); | ||||
|                     } | ||||
|             for (_id, record) in index.documents(&rtxn, documents_ids).unwrap() { | ||||
|                 let mut record = record.iter() | ||||
|                     .map(|(key_id, value)| { | ||||
|                         let key = fields_ids_map.name(key_id).unwrap().to_owned(); | ||||
|                         let value = std::str::from_utf8(value).unwrap().to_owned(); | ||||
|                         (key, value) | ||||
|                     }) | ||||
|                     .collect(); | ||||
|  | ||||
|                     documents.push(record); | ||||
|                 if !disable_highlighting { | ||||
|                     highlight_record(&mut record, &found_words); | ||||
|                 } | ||||
|  | ||||
|                 documents.push(record); | ||||
|             } | ||||
|  | ||||
|             Response::builder() | ||||
|   | ||||
		Reference in New Issue
	
	Block a user