mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 05:26:27 +00:00 
			
		
		
		
	feat: Introduce an Index system based on RocksDB
This commit is contained in:
		
							
								
								
									
										12
									
								
								Cargo.toml
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								Cargo.toml
									
									
									
									
									
								
							| @@ -6,10 +6,11 @@ authors = ["Kerollmops <renault.cle@gmail.com>"] | |||||||
|  |  | ||||||
| [dependencies] | [dependencies] | ||||||
| byteorder = "1.2" | byteorder = "1.2" | ||||||
|  | fnv = "1.0" | ||||||
|  | fs2 = "0.4" | ||||||
| lazy_static = "1.1" | lazy_static = "1.1" | ||||||
| sdset = "0.2" | sdset = "0.2" | ||||||
| fs2 = "0.4" | unidecode = "0.3" | ||||||
| fnv = "1.0" |  | ||||||
|  |  | ||||||
| [dependencies.fst] | [dependencies.fst] | ||||||
| git = "https://github.com/Kerollmops/fst.git" | git = "https://github.com/Kerollmops/fst.git" | ||||||
| @@ -27,12 +28,11 @@ git = "https://github.com/pingcap/rust-rocksdb.git" | |||||||
| git = "https://github.com/Kerollmops/group-by.git" | git = "https://github.com/Kerollmops/group-by.git" | ||||||
|  |  | ||||||
| [dev-dependencies] | [dev-dependencies] | ||||||
|  | csv = "1.0" | ||||||
|  | elapsed = "0.1" | ||||||
| moby-name-gen = "0.1" | moby-name-gen = "0.1" | ||||||
|  | serde = "1.0" | ||||||
| serde_derive = "1.0" | serde_derive = "1.0" | ||||||
| serde_json = "1.0" | serde_json = "1.0" | ||||||
| structopt = "0.2" | structopt = "0.2" | ||||||
| unidecode = "0.3" |  | ||||||
| elapsed = "0.1" |  | ||||||
| serde = "1.0" |  | ||||||
| warp = "0.1" | warp = "0.1" | ||||||
| csv = "1.0" |  | ||||||
|   | |||||||
| @@ -79,8 +79,7 @@ impl CsvIndexer { | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             { |             { | ||||||
|                 let title = Tokenizer::new(&product.title); |                 let title = Tokenizer::new(&product.title).filter(|&(_, w)| !self.common_words.contains(w)); | ||||||
|                 let title = title.iter().filter(|&(_, w)| !self.common_words.contains(w)); |  | ||||||
|                 insert_document_words(&mut builder, product.id, 1, title); |                 insert_document_words(&mut builder, product.id, 1, title); | ||||||
|  |  | ||||||
|                 let key = format!("{}-title", product.id); |                 let key = format!("{}-title", product.id); | ||||||
| @@ -89,8 +88,7 @@ impl CsvIndexer { | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             { |             { | ||||||
|                 let description = Tokenizer::new(&product.description); |                 let description = Tokenizer::new(&product.description).filter(|&(_, w)| !self.common_words.contains(w)); | ||||||
|                 let description = description.iter().filter(|&(_, w)| !self.common_words.contains(w)); |  | ||||||
|                 insert_document_words(&mut builder, product.id, 2, description); |                 insert_document_words(&mut builder, product.id, 2, description); | ||||||
|  |  | ||||||
|                 let key = format!("{}-description", product.id); |                 let key = format!("{}-description", product.id); | ||||||
|   | |||||||
| @@ -84,8 +84,7 @@ impl JsonLinesIndexer { | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             { |             { | ||||||
|                 let title = Tokenizer::new(&product.title); |                 let title = Tokenizer::new(&product.title).filter(|&(_, w)| !self.common_words.contains(w)); | ||||||
|                 let title = title.iter().filter(|&(_, w)| !self.common_words.contains(w)); |  | ||||||
|                 insert_document_words(&mut builder, product.id, 1, title); |                 insert_document_words(&mut builder, product.id, 1, title); | ||||||
|  |  | ||||||
|                 let key = format!("{}-title", product.id); |                 let key = format!("{}-title", product.id); | ||||||
| @@ -94,8 +93,7 @@ impl JsonLinesIndexer { | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             { |             { | ||||||
|                 let description = Tokenizer::new(&product.description); |                 let description = Tokenizer::new(&product.description).filter(|&(_, w)| !self.common_words.contains(w)); | ||||||
|                 let description = description.iter().filter(|&(_, w)| !self.common_words.contains(w)); |  | ||||||
|                 insert_document_words(&mut builder, product.id, 2, description); |                 insert_document_words(&mut builder, product.id, 2, description); | ||||||
|  |  | ||||||
|                 let key = format!("{}-description", product.id); |                 let key = format!("{}-description", product.id); | ||||||
|   | |||||||
| @@ -40,7 +40,7 @@ impl<W: Write> NegativeBlobBuilder<W> { | |||||||
|         Self { doc_ids: DocIdsBuilder::new(wrt) } |         Self { doc_ids: DocIdsBuilder::new(wrt) } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn insert(&mut self, doc: DocumentId) { |     pub fn insert(&mut self, doc: DocumentId) -> bool { | ||||||
|         self.doc_ids.insert(doc) |         self.doc_ids.insert(doc) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -47,7 +47,7 @@ impl DocIds { | |||||||
| } | } | ||||||
|  |  | ||||||
| pub struct DocIdsBuilder<W> { | pub struct DocIdsBuilder<W> { | ||||||
|     doc_ids: BTreeSet<DocumentId>, |     doc_ids: BTreeSet<DocumentId>, // TODO: prefer a linked-list | ||||||
|     wrt: W, |     wrt: W, | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -59,8 +59,8 @@ impl<W: io::Write> DocIdsBuilder<W> { | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn insert(&mut self, doc: DocumentId) { |     pub fn insert(&mut self, doc: DocumentId) -> bool { | ||||||
|         self.doc_ids.insert(doc); |         self.doc_ids.insert(doc) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn into_inner(mut self) -> io::Result<W> { |     pub fn into_inner(mut self) -> io::Result<W> { | ||||||
|   | |||||||
							
								
								
									
										40
									
								
								src/index.rs
									
									
									
									
									
								
							
							
						
						
									
										40
									
								
								src/index.rs
									
									
									
									
									
								
							| @@ -1,40 +0,0 @@ | |||||||
| use std::path::{Path, PathBuf}; |  | ||||||
| use std::error::Error; |  | ||||||
| use std::fs::{self, File}; |  | ||||||
|  |  | ||||||
| use fs2::FileExt; |  | ||||||
|  |  | ||||||
| use crate::rank::Document; |  | ||||||
| use crate::blob::Blob; |  | ||||||
|  |  | ||||||
| pub struct Index { |  | ||||||
|     path: PathBuf, |  | ||||||
|     lock_file: File, |  | ||||||
|     blobs: Vec<Blob>, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl Index { |  | ||||||
|     pub fn open<P: Into<PathBuf>>(path: P) -> Result<Self, Box<Error>> { |  | ||||||
|         let path = path.into(); |  | ||||||
|  |  | ||||||
|         let lock_file = File::create(path.join(".lock"))?; |  | ||||||
|         lock_file.try_lock_exclusive()?; |  | ||||||
|  |  | ||||||
|         let blobs = Vec::new(); |  | ||||||
|  |  | ||||||
|         Ok(Self { path, lock_file, blobs }) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     pub fn create<P: Into<PathBuf>>(path: P) -> Result<Self, Box<Error>> { |  | ||||||
|         let path = path.into(); |  | ||||||
|  |  | ||||||
|         fs::create_dir_all(&path)?; |  | ||||||
|         File::create(path.join(".lock"))?; |  | ||||||
|  |  | ||||||
|         Self::open(path) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     pub fn blobs(&self) -> &[Blob] { |  | ||||||
|         &self.blobs |  | ||||||
|     } |  | ||||||
| } |  | ||||||
							
								
								
									
										16
									
								
								src/index/blob_name.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								src/index/blob_name.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | |||||||
|  | use std::fmt; | ||||||
|  |  | ||||||
|  | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||||
|  | pub struct BlobName; | ||||||
|  |  | ||||||
|  | impl BlobName { | ||||||
|  |     pub fn new() -> BlobName { | ||||||
|  |         unimplemented!() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl fmt::Display for BlobName { | ||||||
|  |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||||
|  |         unimplemented!() | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										175
									
								
								src/index/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										175
									
								
								src/index/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,175 @@ | |||||||
|  | pub mod blob_name; | ||||||
|  | pub mod schema; | ||||||
|  | pub mod search; | ||||||
|  | pub mod update; | ||||||
|  |  | ||||||
|  | use std::io; | ||||||
|  | use std::rc::Rc; | ||||||
|  | use std::error::Error; | ||||||
|  | use std::fs::{self, File}; | ||||||
|  | use std::fmt::{self, Write}; | ||||||
|  | use std::ops::{Deref, BitOr}; | ||||||
|  | use std::path::{Path, PathBuf}; | ||||||
|  | use std::collections::{BTreeSet, BTreeMap}; | ||||||
|  |  | ||||||
|  | use fs2::FileExt; | ||||||
|  | use ::rocksdb::{rocksdb, rocksdb_options}; | ||||||
|  | use ::rocksdb::merge_operator::MergeOperands; | ||||||
|  |  | ||||||
|  | use crate::rank::Document; | ||||||
|  | use crate::data::DocIdsBuilder; | ||||||
|  | use crate::{DocIndex, DocumentId}; | ||||||
|  | use crate::index::{update::Update, search::Search}; | ||||||
|  | use crate::blob::{PositiveBlobBuilder, Blob, Sign}; | ||||||
|  | use crate::tokenizer::{TokenizerBuilder, DefaultBuilder, Tokenizer}; | ||||||
|  |  | ||||||
|  | fn simple_vec_append(key: &[u8], value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> { | ||||||
|  |     let mut output = Vec::new(); | ||||||
|  |     for bytes in operands.chain(value) { | ||||||
|  |         output.extend_from_slice(bytes); | ||||||
|  |     } | ||||||
|  |     output | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub struct Index { | ||||||
|  |     database: rocksdb::DB, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl Index { | ||||||
|  |     pub fn open<P: AsRef<Path>>(path: P) -> Result<Index, Box<Error>> { | ||||||
|  |         let path = path.as_ref().to_string_lossy(); | ||||||
|  |  | ||||||
|  |         let mut opts = rocksdb_options::DBOptions::new(); | ||||||
|  |         opts.create_if_missing(true); | ||||||
|  |  | ||||||
|  |         let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new(); | ||||||
|  |         cf_opts.add_merge_operator("blobs order operator", simple_vec_append); | ||||||
|  |  | ||||||
|  |         let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?; | ||||||
|  |  | ||||||
|  |         // check if index is a valid RocksDB and | ||||||
|  |         // contains the right key-values (i.e. "blobs-order") | ||||||
|  |  | ||||||
|  |         Ok(Self { database }) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn ingest_update(&self, update: Update) -> Result<(), Box<Error>> { | ||||||
|  |         let path = update.into_path_buf(); | ||||||
|  |         let path = path.to_string_lossy(); | ||||||
|  |  | ||||||
|  |         let mut options = rocksdb_options::IngestExternalFileOptions::new(); | ||||||
|  |         // options.move_files(true); | ||||||
|  |  | ||||||
|  |         let cf_handle = self.database.cf_handle("default").unwrap(); | ||||||
|  |         self.database.ingest_external_file_optimized(&cf_handle, &options, &[&path])?; | ||||||
|  |  | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn snapshot(&self) -> Snapshot<&rocksdb::DB> { | ||||||
|  |         Snapshot::new(&self.database) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl Search for Index { | ||||||
|  |     fn search(&self, text: &str) -> Vec<Document> { | ||||||
|  |         unimplemented!() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub struct Snapshot<D> | ||||||
|  | where D: Deref<Target=rocksdb::DB>, | ||||||
|  | { | ||||||
|  |     inner: rocksdb::Snapshot<D>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<D> Snapshot<D> | ||||||
|  | where D: Deref<Target=rocksdb::DB>, | ||||||
|  | { | ||||||
|  |     pub fn new(inner: D) -> Snapshot<D> { | ||||||
|  |         Self { inner: rocksdb::Snapshot::new(inner) } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<D> Search for Snapshot<D> | ||||||
|  | where D: Deref<Target=rocksdb::DB>, | ||||||
|  | { | ||||||
|  |     fn search(&self, text: &str) -> Vec<Document> { | ||||||
|  |         unimplemented!() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[cfg(test)] | ||||||
|  | mod tests { | ||||||
|  |     use super::*; | ||||||
|  |     use crate::index::schema::Schema; | ||||||
|  |     use crate::index::update::{PositiveUpdateBuilder, NegativeUpdateBuilder}; | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn generate_negative_update() -> Result<(), Box<Error>> { | ||||||
|  |  | ||||||
|  |         let schema = Schema::open("/meili/default.sch")?; | ||||||
|  |         let mut builder = NegativeUpdateBuilder::new("update-delete-0001.sst"); | ||||||
|  |  | ||||||
|  |         // you can insert documents in any order, it is sorted internally | ||||||
|  |         builder.remove(1); | ||||||
|  |         builder.remove(5); | ||||||
|  |         builder.remove(2); | ||||||
|  |  | ||||||
|  |         let update = builder.build()?; | ||||||
|  |  | ||||||
|  |         assert_eq!(update.info().sign, Sign::Negative); | ||||||
|  |  | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn generate_positive_update() -> Result<(), Box<Error>> { | ||||||
|  |  | ||||||
|  |         let schema = Schema::open("/meili/default.sch")?; | ||||||
|  |         let tokenizer_builder = DefaultBuilder::new(); | ||||||
|  |         let mut builder = PositiveUpdateBuilder::new("update-positive-0001.sst", schema.clone(), tokenizer_builder); | ||||||
|  |  | ||||||
|  |         // you can insert documents in any order, it is sorted internally | ||||||
|  |         let title_field = schema.field("title").unwrap(); | ||||||
|  |         builder.update_field(1, title_field, "hallo!".to_owned()); | ||||||
|  |         builder.update_field(5, title_field, "hello!".to_owned()); | ||||||
|  |         builder.update_field(2, title_field, "hi!".to_owned()); | ||||||
|  |  | ||||||
|  |         let name_field = schema.field("name").unwrap(); | ||||||
|  |         builder.remove_field(4, name_field); | ||||||
|  |  | ||||||
|  |         let update = builder.build()?; | ||||||
|  |  | ||||||
|  |         assert_eq!(update.info().sign, Sign::Positive); | ||||||
|  |  | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn execution() -> Result<(), Box<Error>> { | ||||||
|  |  | ||||||
|  |         let index = Index::open("/meili/data")?; | ||||||
|  |         let update = Update::open("update-0001.sst")?; | ||||||
|  |         index.ingest_update(update)?; | ||||||
|  |         // directly apply changes to the database and see new results | ||||||
|  |         let results = index.search("helo"); | ||||||
|  |  | ||||||
|  |         ////////////// | ||||||
|  |  | ||||||
|  |         let index = Index::open("/meili/data")?; | ||||||
|  |         let update = Update::open("update-0001.sst")?; | ||||||
|  |  | ||||||
|  |         // if you create a snapshot before an update | ||||||
|  |         let snapshot = index.snapshot(); | ||||||
|  |         index.ingest_update(update)?; | ||||||
|  |  | ||||||
|  |         // the snapshot does not see the updates | ||||||
|  |         let results = snapshot.search("helo"); | ||||||
|  |  | ||||||
|  |         // the raw index itself see new results | ||||||
|  |         let results = index.search("helo"); | ||||||
|  |  | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										82
									
								
								src/index/schema.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										82
									
								
								src/index/schema.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,82 @@ | |||||||
|  | use std::error::Error; | ||||||
|  | use std::path::Path; | ||||||
|  | use std::ops::BitOr; | ||||||
|  | use std::fmt; | ||||||
|  |  | ||||||
|  | pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false }; | ||||||
|  | pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true }; | ||||||
|  |  | ||||||
|  | #[derive(Copy, Clone)] | ||||||
|  | pub struct SchemaProps { | ||||||
|  |     stored: bool, | ||||||
|  |     indexed: bool, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl SchemaProps { | ||||||
|  |     pub fn is_stored(&self) -> bool { | ||||||
|  |         self.stored | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn is_indexed(&self) -> bool { | ||||||
|  |         self.indexed | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl BitOr for SchemaProps { | ||||||
|  |     type Output = Self; | ||||||
|  |  | ||||||
|  |     fn bitor(self, other: Self) -> Self::Output { | ||||||
|  |         SchemaProps { | ||||||
|  |             stored: self.stored | other.stored, | ||||||
|  |             indexed: self.indexed | other.indexed, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub struct SchemaBuilder; | ||||||
|  |  | ||||||
|  | impl SchemaBuilder { | ||||||
|  |     pub fn new() -> SchemaBuilder { | ||||||
|  |         unimplemented!() | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn field(&mut self, name: &str, props: SchemaProps) -> SchemaField { | ||||||
|  |         unimplemented!() | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn build(self) -> Schema { | ||||||
|  |         unimplemented!() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[derive(Copy, Clone, PartialOrd, Ord, PartialEq, Eq)] | ||||||
|  | pub struct SchemaField(u32); | ||||||
|  |  | ||||||
|  | impl SchemaField { | ||||||
|  |     pub fn as_u32(&self) -> u32 { | ||||||
|  |         self.0 | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl fmt::Display for SchemaField { | ||||||
|  |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||||
|  |         write!(f, "{}", self.0) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[derive(Clone)] | ||||||
|  | pub struct Schema; | ||||||
|  |  | ||||||
|  | impl Schema { | ||||||
|  |     pub fn open<P: AsRef<Path>>(path: P) -> Result<Schema, Box<Error>> { | ||||||
|  |         unimplemented!() | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn props(&self, field: SchemaField) -> SchemaProps { | ||||||
|  |         unimplemented!() | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn field(&self, name: &str) -> Option<SchemaField> { | ||||||
|  |         unimplemented!() | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										5
									
								
								src/index/search.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								src/index/search.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,5 @@ | |||||||
|  | use crate::rank::Document; | ||||||
|  |  | ||||||
|  | pub trait Search { | ||||||
|  |     fn search(&self, text: &str) -> Vec<Document>; | ||||||
|  | } | ||||||
							
								
								
									
										55
									
								
								src/index/update/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										55
									
								
								src/index/update/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,55 @@ | |||||||
|  | use std::path::PathBuf; | ||||||
|  | use std::error::Error; | ||||||
|  |  | ||||||
|  | use ::rocksdb::rocksdb_options; | ||||||
|  |  | ||||||
|  | use crate::index::blob_name::BlobName; | ||||||
|  | use crate::blob::Sign; | ||||||
|  |  | ||||||
|  | mod negative_update; | ||||||
|  | mod positive_update; | ||||||
|  |  | ||||||
|  | pub use self::negative_update::{NegativeUpdateBuilder}; | ||||||
|  | pub use self::positive_update::{PositiveUpdateBuilder, NewState}; | ||||||
|  |  | ||||||
|  | // These prefixes are here to make sure the documents fields | ||||||
|  | // and the internal data doesn't collide and the internal data are | ||||||
|  | // at the top of the sst file. | ||||||
|  | const FIELD_BLOBS_ORDER: &str = "00-blobs-order"; | ||||||
|  |  | ||||||
|  | pub struct Update { | ||||||
|  |     path: PathBuf, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl Update { | ||||||
|  |     pub fn open<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> { | ||||||
|  |         let path = path.into(); | ||||||
|  |  | ||||||
|  |         let env_options = rocksdb_options::EnvOptions::new(); | ||||||
|  |         let column_family_options = rocksdb_options::ColumnFamilyOptions::new(); | ||||||
|  |         let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options); | ||||||
|  |         file_writer.open(&path.to_string_lossy())?; | ||||||
|  |         let infos = file_writer.finish()?; | ||||||
|  |  | ||||||
|  |         if infos.smallest_key() != FIELD_BLOBS_ORDER.as_bytes() { | ||||||
|  |             // FIXME return a nice error | ||||||
|  |             panic!("Invalid update file: the blobs-order field is not the smallest key") | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(Update { path }) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn into_path_buf(self) -> PathBuf { | ||||||
|  |         self.path | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn info(&self) -> UpdateInfo { | ||||||
|  |         unimplemented!() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] | ||||||
|  | pub struct UpdateInfo { | ||||||
|  |     pub sign: Sign, | ||||||
|  |     pub id: BlobName, | ||||||
|  | } | ||||||
							
								
								
									
										59
									
								
								src/index/update/negative_update.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										59
									
								
								src/index/update/negative_update.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,59 @@ | |||||||
|  | use std::path::PathBuf; | ||||||
|  | use std::error::Error; | ||||||
|  |  | ||||||
|  | use ::rocksdb::rocksdb_options; | ||||||
|  |  | ||||||
|  | use crate::index::update::{FIELD_BLOBS_ORDER, Update}; | ||||||
|  | use crate::index::blob_name::BlobName; | ||||||
|  | use crate::data::DocIdsBuilder; | ||||||
|  | use crate::DocumentId; | ||||||
|  |  | ||||||
|  | pub struct NegativeUpdateBuilder { | ||||||
|  |     path: PathBuf, | ||||||
|  |     doc_ids: DocIdsBuilder<Vec<u8>>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl NegativeUpdateBuilder { | ||||||
|  |     pub fn new<P: Into<PathBuf>>(path: P) -> NegativeUpdateBuilder { | ||||||
|  |         NegativeUpdateBuilder { | ||||||
|  |             path: path.into(), | ||||||
|  |             doc_ids: DocIdsBuilder::new(Vec::new()), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn remove(&mut self, id: DocumentId) -> bool { | ||||||
|  |         self.doc_ids.insert(id) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn build(self) -> Result<Update, Box<Error>> { | ||||||
|  |         let blob_name = BlobName::new(); | ||||||
|  |  | ||||||
|  |         let env_options = rocksdb_options::EnvOptions::new(); | ||||||
|  |         let column_family_options = rocksdb_options::ColumnFamilyOptions::new(); | ||||||
|  |         let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options); | ||||||
|  |  | ||||||
|  |         file_writer.open(&self.path.to_string_lossy())?; | ||||||
|  |  | ||||||
|  |         // TODO the blob-name must be written in bytes (16 bytes) | ||||||
|  |         //      along with the sign | ||||||
|  |         unimplemented!("write the blob sign and name"); | ||||||
|  |  | ||||||
|  |         // write the blob name to be merged | ||||||
|  |         let blob_name = blob_name.to_string(); | ||||||
|  |         file_writer.merge(FIELD_BLOBS_ORDER.as_bytes(), blob_name.as_bytes())?; | ||||||
|  |  | ||||||
|  |         // write the doc ids | ||||||
|  |         let blob_key = format!("0b-{}-doc-ids", blob_name); | ||||||
|  |         let blob_doc_ids = self.doc_ids.into_inner()?; | ||||||
|  |         file_writer.put(blob_key.as_bytes(), &blob_doc_ids)?; | ||||||
|  |  | ||||||
|  |         for id in blob_doc_ids { | ||||||
|  |             let start = format!("5d-{}", id); | ||||||
|  |             let end = format!("5d-{}", id + 1); | ||||||
|  |             file_writer.delete_range(start.as_bytes(), end.as_bytes())?; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         file_writer.finish()?; | ||||||
|  |         Update::open(self.path) | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										124
									
								
								src/index/update/positive_update.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										124
									
								
								src/index/update/positive_update.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,124 @@ | |||||||
|  | use std::collections::BTreeMap; | ||||||
|  | use std::path::PathBuf; | ||||||
|  | use std::error::Error; | ||||||
|  | use std::fmt::Write; | ||||||
|  |  | ||||||
|  | use ::rocksdb::rocksdb_options; | ||||||
|  |  | ||||||
|  | use crate::index::schema::{SchemaProps, Schema, SchemaField}; | ||||||
|  | use crate::index::update::{FIELD_BLOBS_ORDER, Update}; | ||||||
|  | use crate::tokenizer::TokenizerBuilder; | ||||||
|  | use crate::index::blob_name::BlobName; | ||||||
|  | use crate::blob::PositiveBlobBuilder; | ||||||
|  | use crate::{DocIndex, DocumentId}; | ||||||
|  |  | ||||||
|  | pub enum NewState { | ||||||
|  |     Updated { | ||||||
|  |         value: String, | ||||||
|  |         props: SchemaProps, | ||||||
|  |     }, | ||||||
|  |     Removed, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub struct PositiveUpdateBuilder<B> { | ||||||
|  |     path: PathBuf, | ||||||
|  |     schema: Schema, | ||||||
|  |     tokenizer_builder: B, | ||||||
|  |     new_states: BTreeMap<(DocumentId, SchemaField), NewState>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<B> PositiveUpdateBuilder<B> { | ||||||
|  |     pub fn new<P: Into<PathBuf>>(path: P, schema: Schema, tokenizer_builder: B) -> PositiveUpdateBuilder<B> { | ||||||
|  |         PositiveUpdateBuilder { | ||||||
|  |             path: path.into(), | ||||||
|  |             schema: schema, | ||||||
|  |             tokenizer_builder: tokenizer_builder, | ||||||
|  |             new_states: BTreeMap::new(), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // TODO value must be a field that can be indexed | ||||||
|  |     pub fn update_field(&mut self, id: DocumentId, field: SchemaField, value: String) { | ||||||
|  |         let state = NewState::Updated { value, props: self.schema.props(field) }; | ||||||
|  |         self.new_states.insert((id, field), state); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn remove_field(&mut self, id: DocumentId, field: SchemaField) { | ||||||
|  |         self.new_states.insert((id, field), NewState::Removed); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<B> PositiveUpdateBuilder<B> | ||||||
|  | where B: TokenizerBuilder | ||||||
|  | { | ||||||
|  |     pub fn build(self) -> Result<Update, Box<Error>> { | ||||||
|  |         let blob_name = BlobName::new(); | ||||||
|  |  | ||||||
|  |         let env_options = rocksdb_options::EnvOptions::new(); | ||||||
|  |         let column_family_options = rocksdb_options::ColumnFamilyOptions::new(); | ||||||
|  |         let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options); | ||||||
|  |  | ||||||
|  |         file_writer.open(&self.path.to_string_lossy())?; | ||||||
|  |  | ||||||
|  |         // TODO the blob-name must be written in bytes (16 bytes) | ||||||
|  |         //      along with the sign | ||||||
|  |         unimplemented!("write the blob sign and name"); | ||||||
|  |  | ||||||
|  |         // write the blob name to be merged | ||||||
|  |         let blob_name = blob_name.to_string(); | ||||||
|  |         file_writer.put(FIELD_BLOBS_ORDER.as_bytes(), blob_name.as_bytes())?; | ||||||
|  |  | ||||||
|  |         let mut builder = PositiveBlobBuilder::new(Vec::new(), Vec::new()); | ||||||
|  |         for ((document_id, field), state) in &self.new_states { | ||||||
|  |             let value = match state { | ||||||
|  |                 NewState::Updated { value, props } if props.is_indexed() => value, | ||||||
|  |                 _ => continue, | ||||||
|  |             }; | ||||||
|  |  | ||||||
|  |             for (index, word) in self.tokenizer_builder.build(value) { | ||||||
|  |                 let doc_index = DocIndex { | ||||||
|  |                     document_id: *document_id, | ||||||
|  |                     attribute: field.as_u32() as u8, | ||||||
|  |                     attribute_index: index as u32, | ||||||
|  |                 }; | ||||||
|  |                 // insert the exact representation | ||||||
|  |                 let word_lower = word.to_lowercase(); | ||||||
|  |  | ||||||
|  |                 // and the unidecoded lowercased version | ||||||
|  |                 let word_unidecoded = unidecode::unidecode(word).to_lowercase(); | ||||||
|  |                 if word_lower != word_unidecoded { | ||||||
|  |                     builder.insert(word_unidecoded, doc_index); | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 builder.insert(word_lower, doc_index); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         let (blob_fst_map, blob_doc_idx) = builder.into_inner()?; | ||||||
|  |  | ||||||
|  |         // write the fst | ||||||
|  |         let blob_key = format!("0b-{}-fst", blob_name); | ||||||
|  |         file_writer.put(blob_key.as_bytes(), &blob_fst_map)?; | ||||||
|  |  | ||||||
|  |         // write the doc-idx | ||||||
|  |         let blob_key = format!("0b-{}-doc-idx", blob_name); | ||||||
|  |         file_writer.put(blob_key.as_bytes(), &blob_doc_idx)?; | ||||||
|  |  | ||||||
|  |         // write all the documents fields updates | ||||||
|  |         let mut key = String::from("5d-"); | ||||||
|  |         let prefix_len = key.len(); | ||||||
|  |  | ||||||
|  |         for ((id, field), state) in self.new_states { | ||||||
|  |             key.truncate(prefix_len); | ||||||
|  |             write!(&mut key, "{}-{}", id, field)?; | ||||||
|  |             match state { | ||||||
|  |                 NewState::Updated { value, props } => if props.is_stored() { | ||||||
|  |                     file_writer.put(key.as_bytes(), value.as_bytes())? | ||||||
|  |                 }, | ||||||
|  |                 NewState::Removed => file_writer.delete(key.as_bytes())?, | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         file_writer.finish()?; | ||||||
|  |         Update::open(self.path) | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -1,28 +1,32 @@ | |||||||
| use std::mem; | use std::mem; | ||||||
| use self::Separator::*; | use self::Separator::*; | ||||||
|  |  | ||||||
|  | pub trait TokenizerBuilder { | ||||||
|  |     fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a>; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub struct DefaultBuilder; | ||||||
|  |  | ||||||
|  | impl DefaultBuilder { | ||||||
|  |     pub fn new() -> DefaultBuilder { | ||||||
|  |         DefaultBuilder | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl TokenizerBuilder for DefaultBuilder { | ||||||
|  |     fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a> { | ||||||
|  |         Box::new(Tokenizer::new(text)) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| pub struct Tokenizer<'a> { | pub struct Tokenizer<'a> { | ||||||
|  |     index: usize, | ||||||
|     inner: &'a str, |     inner: &'a str, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a> Tokenizer<'a> { | impl<'a> Tokenizer<'a> { | ||||||
|     pub fn new(string: &str) -> Tokenizer { |     pub fn new(string: &str) -> Tokenizer { | ||||||
|         Tokenizer { inner: string } |         Tokenizer { | ||||||
|     } |  | ||||||
|  |  | ||||||
|     pub fn iter(&self) -> Tokens { |  | ||||||
|         Tokens::new(self.inner) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct Tokens<'a> { |  | ||||||
|     index: usize, |  | ||||||
|     inner: &'a str, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<'a> Tokens<'a> { |  | ||||||
|     fn new(string: &str) -> Tokens { |  | ||||||
|         Tokens { |  | ||||||
|             index: 0, |             index: 0, | ||||||
|             inner: string.trim_matches(&[' ', '.', ';', ',', '!', '?', '-', '\'', '"'][..]), |             inner: string.trim_matches(&[' ', '.', ';', ',', '!', '?', '-', '\'', '"'][..]), | ||||||
|         } |         } | ||||||
| @@ -52,7 +56,7 @@ impl Separator { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a> Iterator for Tokens<'a> { | impl<'a> Iterator for Tokenizer<'a> { | ||||||
|     type Item = (usize, &'a str); |     type Item = (usize, &'a str); | ||||||
|  |  | ||||||
|     fn next(&mut self) -> Option<Self::Item> { |     fn next(&mut self) -> Option<Self::Item> { | ||||||
| @@ -101,37 +105,33 @@ mod tests { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn easy() { |     fn easy() { | ||||||
|         let tokenizer = Tokenizer::new("salut"); |         let mut tokenizer = Tokenizer::new("salut"); | ||||||
|         let mut tokens = tokenizer.iter(); |  | ||||||
|  |  | ||||||
|         assert_eq!(tokens.next(), Some((0, "salut"))); |         assert_eq!(tokenizer.next(), Some((0, "salut"))); | ||||||
|         assert_eq!(tokens.next(), None); |         assert_eq!(tokenizer.next(), None); | ||||||
|  |  | ||||||
|         let tokenizer = Tokenizer::new("yo    "); |         let mut tokenizer = Tokenizer::new("yo    "); | ||||||
|         let mut tokens = tokenizer.iter(); |  | ||||||
|  |  | ||||||
|         assert_eq!(tokens.next(), Some((0, "yo"))); |         assert_eq!(tokenizer.next(), Some((0, "yo"))); | ||||||
|         assert_eq!(tokens.next(), None); |         assert_eq!(tokenizer.next(), None); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn hard() { |     fn hard() { | ||||||
|         let tokenizer = Tokenizer::new(" .? yo lolo. aïe"); |         let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe"); | ||||||
|         let mut tokens = tokenizer.iter(); |  | ||||||
|  |  | ||||||
|         assert_eq!(tokens.next(), Some((0, "yo"))); |         assert_eq!(tokenizer.next(), Some((0, "yo"))); | ||||||
|         assert_eq!(tokens.next(), Some((1, "lolo"))); |         assert_eq!(tokenizer.next(), Some((1, "lolo"))); | ||||||
|         assert_eq!(tokens.next(), Some((9, "aïe"))); |         assert_eq!(tokenizer.next(), Some((9, "aïe"))); | ||||||
|         assert_eq!(tokens.next(), None); |         assert_eq!(tokenizer.next(), None); | ||||||
|  |  | ||||||
|         let tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,"); |         let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,"); | ||||||
|         let mut tokens = tokenizer.iter(); |  | ||||||
|  |  | ||||||
|         assert_eq!(tokens.next(), Some((0, "yo"))); |         assert_eq!(tokenizer.next(), Some((0, "yo"))); | ||||||
|         assert_eq!(tokens.next(), Some((8, "lolo"))); |         assert_eq!(tokenizer.next(), Some((8, "lolo"))); | ||||||
|         assert_eq!(tokens.next(), Some((16, "wtf"))); |         assert_eq!(tokenizer.next(), Some((16, "wtf"))); | ||||||
|         assert_eq!(tokens.next(), Some((24, "lol"))); |         assert_eq!(tokenizer.next(), Some((24, "lol"))); | ||||||
|         assert_eq!(tokens.next(), Some((32, "aïe"))); |         assert_eq!(tokenizer.next(), Some((32, "aïe"))); | ||||||
|         assert_eq!(tokens.next(), None); |         assert_eq!(tokenizer.next(), None); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user