mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	feat: Introduce the Schema
This commit is contained in:
		| @@ -5,11 +5,15 @@ version = "0.1.0" | ||||
| authors = ["Kerollmops <renault.cle@gmail.com>"] | ||||
|  | ||||
| [dependencies] | ||||
| bincode = "1.0" | ||||
| byteorder = "1.2" | ||||
| fnv = "1.0" | ||||
| fs2 = "0.4" | ||||
| lazy_static = "1.1" | ||||
| linked-hash-map = { version = "0.5", features = ["serde_impl"] } | ||||
| sdset = "0.2" | ||||
| serde = "1.0" | ||||
| serde_derive = "1.0" | ||||
| unidecode = "0.3" | ||||
|  | ||||
| [dependencies.fst] | ||||
| @@ -31,8 +35,6 @@ git = "https://github.com/Kerollmops/group-by.git" | ||||
| csv = "1.0" | ||||
| elapsed = "0.1" | ||||
| moby-name-gen = "0.1" | ||||
| serde = "1.0" | ||||
| serde_derive = "1.0" | ||||
| serde_json = "1.0" | ||||
| structopt = "0.2" | ||||
| tempfile = "3.0" | ||||
|   | ||||
| @@ -1,146 +0,0 @@ | ||||
| #[macro_use] extern crate serde_derive; | ||||
|  | ||||
| use std::collections::BTreeMap; | ||||
| use std::path::PathBuf; | ||||
| use std::fs::File; | ||||
| use std::io; | ||||
|  | ||||
| use csv::ReaderBuilder; | ||||
| use pentium::{MetadataBuilder, DocIndex, Tokenizer, CommonWords}; | ||||
| use rocksdb::{SstFileWriter, EnvOptions, ColumnFamilyOptions}; | ||||
| use structopt::StructOpt; | ||||
|  | ||||
| #[derive(Debug, StructOpt)] | ||||
| pub struct CommandCsv { | ||||
|     /// The stop word file, each word must be separated by a newline. | ||||
|     #[structopt(long = "stop-words", parse(from_os_str))] | ||||
|     pub stop_words: PathBuf, | ||||
|  | ||||
|     /// The csv file to index. | ||||
|     #[structopt(parse(from_os_str))] | ||||
|     pub products: PathBuf, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Deserialize)] | ||||
| struct Product { | ||||
|     id: u64, | ||||
|     title: String, | ||||
|     description: String, | ||||
|     image: String, | ||||
| } | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct CsvIndexer { | ||||
|     common_words: CommonWords, | ||||
|     products: PathBuf, | ||||
| } | ||||
|  | ||||
| impl CsvIndexer { | ||||
|     pub fn from_command(command: CommandCsv) -> io::Result<CsvIndexer> { | ||||
|         let common_words = CommonWords::from_file(command.stop_words)?; | ||||
|         let products = command.products; | ||||
|  | ||||
|         Ok(CsvIndexer { common_words, products }) | ||||
|     } | ||||
|  | ||||
|     pub fn index(self) { | ||||
|         let random_name = PathBuf::from(moby_name_gen::random_name()); | ||||
|         let map_file = random_name.with_extension("map"); | ||||
|         let idx_file = random_name.with_extension("idx"); | ||||
|         let sst_file = random_name.with_extension("sst"); | ||||
|  | ||||
|         let env_options = EnvOptions::new(); | ||||
|         let cf_options = ColumnFamilyOptions::new(); | ||||
|         let mut sst_file_writer = SstFileWriter::new(env_options, cf_options); | ||||
|         let sst_file = sst_file.to_str().unwrap(); | ||||
|         sst_file_writer.open(&sst_file).expect("open the sst file"); | ||||
|  | ||||
|         let map = File::create(&map_file).unwrap(); | ||||
|         let indexes = File::create(&idx_file).unwrap(); | ||||
|         let mut builder = MetadataBuilder::new(map, indexes); | ||||
|         let mut fields = BTreeMap::new(); | ||||
|  | ||||
|         let mut rdr = ReaderBuilder::new().from_path(&self.products).expect("reading product file"); | ||||
|         let mut errors = 0; | ||||
|  | ||||
|         for result in rdr.deserialize() { | ||||
|             let product: Product = match result { | ||||
|                 Ok(product) => product, | ||||
|                 Err(e) => { eprintln!("{:?}", e); errors += 1; continue }, | ||||
|             }; | ||||
|  | ||||
|             { | ||||
|                 let string_id = product.id.to_string(); | ||||
|                 insert_document_words(&mut builder, product.id, 0, Some((0, string_id.as_str()))); | ||||
|  | ||||
|                 let key = format!("{}-id", product.id); | ||||
|                 let value = string_id; | ||||
|                 fields.insert(key, value); | ||||
|             } | ||||
|  | ||||
|             { | ||||
|                 let title = Tokenizer::new(&product.title).filter(|&(_, w)| !self.common_words.contains(w)); | ||||
|                 insert_document_words(&mut builder, product.id, 1, title); | ||||
|  | ||||
|                 let key = format!("{}-title", product.id); | ||||
|                 let value = product.title; | ||||
|                 fields.insert(key, value); | ||||
|             } | ||||
|  | ||||
|             { | ||||
|                 let description = Tokenizer::new(&product.description).filter(|&(_, w)| !self.common_words.contains(w)); | ||||
|                 insert_document_words(&mut builder, product.id, 2, description); | ||||
|  | ||||
|                 let key = format!("{}-description", product.id); | ||||
|                 let value = product.description; | ||||
|                 fields.insert(key, value); | ||||
|             } | ||||
|  | ||||
|             { | ||||
|                 let key = format!("{}-image", product.id); | ||||
|                 let value = product.image; | ||||
|                 fields.insert(key, value); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         for (key, value) in fields { | ||||
|             sst_file_writer.put(key.as_bytes(), value.as_bytes()).unwrap(); | ||||
|         } | ||||
|         let _sst_file_info = sst_file_writer.finish().unwrap(); | ||||
|  | ||||
|         builder.finish().unwrap(); | ||||
|  | ||||
|         println!("Found {} errorneous lines", errors); | ||||
|         println!("Succesfully created {:?} dump.", random_name); | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn insert_document_words<'a, I, A, B>(builder: &mut MetadataBuilder<A, B>, doc_id: u64, attr: u8, words: I) | ||||
| where A: io::Write, | ||||
|       B: io::Write, | ||||
|       I: IntoIterator<Item=(usize, &'a str)>, | ||||
| { | ||||
|     for (index, word) in words { | ||||
|         let doc_index = DocIndex { | ||||
|             document_id: doc_id, | ||||
|             attribute: attr, | ||||
|             attribute_index: index as u32, | ||||
|         }; | ||||
|         // insert the exact representation | ||||
|         let word_lower = word.to_lowercase(); | ||||
|  | ||||
|         // and the unidecoded lowercased version | ||||
|         let word_unidecoded = unidecode::unidecode(word).to_lowercase(); | ||||
|         if word_lower != word_unidecoded { | ||||
|             builder.insert(word_unidecoded, doc_index); | ||||
|         } | ||||
|  | ||||
|         builder.insert(word_lower, doc_index); | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn main() { | ||||
|     let command = CommandCsv::from_args(); | ||||
|     let indexer = CsvIndexer::from_command(command).unwrap(); | ||||
|     indexer.index(); | ||||
| } | ||||
| @@ -1,151 +0,0 @@ | ||||
| #[macro_use] extern crate serde_derive; | ||||
|  | ||||
| use std::collections::BTreeMap; | ||||
| use std::io::{self, BufReader, BufRead}; | ||||
| use std::fs::File; | ||||
| use std::path::PathBuf; | ||||
|  | ||||
| use serde_json::from_str; | ||||
| use rocksdb::{SstFileWriter, EnvOptions, ColumnFamilyOptions}; | ||||
| use pentium::{MetadataBuilder, DocIndex, Tokenizer, CommonWords}; | ||||
| use structopt::StructOpt; | ||||
|  | ||||
| #[derive(Debug, StructOpt)] | ||||
| pub struct CommandJsonLines { | ||||
|     /// The stop word file, each word must be separated by a newline. | ||||
|     #[structopt(long = "stop-words", parse(from_os_str))] | ||||
|     pub stop_words: PathBuf, | ||||
|  | ||||
|     /// The csv file to index. | ||||
|     #[structopt(parse(from_os_str))] | ||||
|     pub products: PathBuf, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Deserialize)] | ||||
| struct Product { | ||||
|     id: u64, | ||||
|     title: String, | ||||
|     description: String, | ||||
|     image: String, | ||||
| } | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct JsonLinesIndexer { | ||||
|     common_words: CommonWords, | ||||
|     products: PathBuf, | ||||
| } | ||||
|  | ||||
| impl JsonLinesIndexer { | ||||
|     pub fn from_command(command: CommandJsonLines) -> io::Result<JsonLinesIndexer> { | ||||
|         let common_words = CommonWords::from_file(command.stop_words)?; | ||||
|         let products = command.products; | ||||
|  | ||||
|         Ok(JsonLinesIndexer { common_words, products }) | ||||
|     } | ||||
|  | ||||
|     pub fn index(self) { | ||||
|         let data = File::open(&self.products).unwrap(); | ||||
|         let data = BufReader::new(data); | ||||
|  | ||||
|         // TODO add a subcommand to pack these files in a tar.xxx archive | ||||
|         let random_name = PathBuf::from(moby_name_gen::random_name()); | ||||
|         let map_file = random_name.with_extension("map"); | ||||
|         let idx_file = random_name.with_extension("idx"); | ||||
|         let sst_file = random_name.with_extension("sst"); | ||||
|  | ||||
|         let env_options = EnvOptions::new(); | ||||
|         let cf_options = ColumnFamilyOptions::new(); | ||||
|         let mut sst_file_writer = SstFileWriter::new(env_options, cf_options); | ||||
|         let sst_file = sst_file.to_str().unwrap(); | ||||
|         sst_file_writer.open(&sst_file).expect("open the sst file"); | ||||
|  | ||||
|         let map = File::create(&map_file).unwrap(); | ||||
|         let indexes = File::create(&idx_file).unwrap(); | ||||
|         let mut builder = MetadataBuilder::new(map, indexes); | ||||
|         let mut fields = BTreeMap::new(); | ||||
|         let mut errors = 0; | ||||
|  | ||||
|         for result in data.lines() { | ||||
|             let product: Product = match result { | ||||
|                 Ok(product) => match from_str(&product) { | ||||
|                     Ok(product) => product, | ||||
|                     Err(e) => { eprintln!("{:?}", e); errors += 1; continue }, | ||||
|                 }, | ||||
|                 Err(e) => { eprintln!("{:?}", e); errors += 1; continue }, | ||||
|             }; | ||||
|  | ||||
|             { | ||||
|                 let string_id = product.id.to_string(); | ||||
|                 insert_document_words(&mut builder, product.id, 0, Some((0, string_id.as_str()))); | ||||
|  | ||||
|                 let key = format!("{}-id", product.id); | ||||
|                 let value = string_id; | ||||
|                 fields.insert(key, value); | ||||
|             } | ||||
|  | ||||
|             { | ||||
|                 let title = Tokenizer::new(&product.title).filter(|&(_, w)| !self.common_words.contains(w)); | ||||
|                 insert_document_words(&mut builder, product.id, 1, title); | ||||
|  | ||||
|                 let key = format!("{}-title", product.id); | ||||
|                 let value = product.title; | ||||
|                 fields.insert(key, value); | ||||
|             } | ||||
|  | ||||
|             { | ||||
|                 let description = Tokenizer::new(&product.description).filter(|&(_, w)| !self.common_words.contains(w)); | ||||
|                 insert_document_words(&mut builder, product.id, 2, description); | ||||
|  | ||||
|                 let key = format!("{}-description", product.id); | ||||
|                 let value = product.description; | ||||
|                 fields.insert(key, value); | ||||
|             } | ||||
|  | ||||
|             { | ||||
|                 let key = format!("{}-image", product.id); | ||||
|                 let value = product.image; | ||||
|                 fields.insert(key, value); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         for (key, value) in fields { | ||||
|             sst_file_writer.put(key.as_bytes(), value.as_bytes()).unwrap(); | ||||
|         } | ||||
|         let _sst_file_info = sst_file_writer.finish().unwrap(); | ||||
|  | ||||
|         builder.finish().unwrap(); | ||||
|  | ||||
|         println!("Found {} errorneous lines", errors); | ||||
|         println!("Succesfully created {:?} dump.", random_name); | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn insert_document_words<'a, I, A, B>(builder: &mut MetadataBuilder<A, B>, doc_id: u64, attr: u8, words: I) | ||||
| where A: io::Write, | ||||
|       B: io::Write, | ||||
|       I: IntoIterator<Item=(usize, &'a str)>, | ||||
| { | ||||
|     for (index, word) in words { | ||||
|         let doc_index = DocIndex { | ||||
|             document_id: doc_id, | ||||
|             attribute: attr, | ||||
|             attribute_index: index as u32, | ||||
|         }; | ||||
|         // insert the exact representation | ||||
|         let word_lower = word.to_lowercase(); | ||||
|  | ||||
|         // and the unidecoded lowercased version | ||||
|         let word_unidecoded = unidecode::unidecode(word).to_lowercase(); | ||||
|         if word_lower != word_unidecoded { | ||||
|             builder.insert(word_unidecoded, doc_index); | ||||
|         } | ||||
|  | ||||
|         builder.insert(word_lower, doc_index); | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn main() { | ||||
|     let command = CommandJsonLines::from_args(); | ||||
|     let indexer = JsonLinesIndexer::from_command(command).unwrap(); | ||||
|     indexer.index(); | ||||
| } | ||||
| @@ -1,98 +0,0 @@ | ||||
| use std::error::Error; | ||||
| use std::str::from_utf8_unchecked; | ||||
| use std::io::{self, Write}; | ||||
| use structopt::StructOpt; | ||||
| use std::path::PathBuf; | ||||
|  | ||||
| use elapsed::measure_time; | ||||
| use rocksdb::{DB, DBOptions, IngestExternalFileOptions}; | ||||
| use pentium::index::Index; | ||||
| use pentium::rank::{criterion, Config, RankedStream}; | ||||
| use pentium::{automaton, DocumentId}; | ||||
|  | ||||
| #[derive(Debug, StructOpt)] | ||||
| pub struct CommandConsole { | ||||
|     /// Meta file name (e.g. relaxed-colden). | ||||
|     #[structopt(parse(from_os_str))] | ||||
|     pub index_path: PathBuf, | ||||
| } | ||||
|  | ||||
| pub struct ConsoleSearch { | ||||
|     index: Index, | ||||
| } | ||||
|  | ||||
| impl ConsoleSearch { | ||||
|     pub fn from_command(command: CommandConsole) -> Result<ConsoleSearch, Box<Error>> { | ||||
|         let index = Index::open(command.index_path)?; | ||||
|         Ok(ConsoleSearch { index }) | ||||
|     } | ||||
|  | ||||
|     pub fn serve(self) { | ||||
|         loop { | ||||
|             print!("Searching for: "); | ||||
|             io::stdout().flush().unwrap(); | ||||
|  | ||||
|             let mut query = String::new(); | ||||
|             io::stdin().read_line(&mut query).unwrap(); | ||||
|  | ||||
|             if query.is_empty() { break } | ||||
|  | ||||
|             let (elapsed, _) = measure_time(|| search(&self.index, &query)); | ||||
|             println!("Finished in {}", elapsed); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn search(index: &Index, query: &str) { | ||||
|     let mut automatons = Vec::new(); | ||||
|     for query in query.split_whitespace().map(str::to_lowercase) { | ||||
|         let lev = automaton::build_prefix_dfa(&query); | ||||
|         automatons.push(lev); | ||||
|     } | ||||
|  | ||||
|     let distinct_by_title_first_four_chars = |id: &DocumentId| { | ||||
|         let title_key = format!("{}-title", id); | ||||
|         match database.get(title_key.as_bytes()) { | ||||
|             Ok(Some(value)) => { | ||||
|                 value.to_utf8().map(|s| s.chars().take(4).collect::<String>()) | ||||
|             }, | ||||
|             Ok(None) => None, | ||||
|             Err(err) => { | ||||
|                 eprintln!("{:?}", err); | ||||
|                 None | ||||
|             } | ||||
|         } | ||||
|     }; | ||||
|  | ||||
|     let index: Index = unimplemented!(); | ||||
|  | ||||
|     // "Sony" "PlayStation 4 500GB" | ||||
|     let config = Config { | ||||
|         blobs: &index.blobs().unwrap(), | ||||
|         automatons: automatons, | ||||
|         criteria: criterion::default(), | ||||
|         distinct: (distinct_by_title_first_four_chars, 1), | ||||
|     }; | ||||
|     let stream = RankedStream::new(config); | ||||
|  | ||||
|     let documents = stream.retrieve_distinct_documents(0..20); | ||||
|     // let documents = stream.retrieve_documents(0..20); | ||||
|  | ||||
|     for document in documents { | ||||
|         let id_key = format!("{}-id", document.id); | ||||
|         let id = database.get(id_key.as_bytes()).unwrap().unwrap(); | ||||
|         let id = unsafe { from_utf8_unchecked(&id) }; | ||||
|         print!("{} ", id); | ||||
|  | ||||
|         let title_key = format!("{}-title", document.id); | ||||
|         let title = database.get(title_key.as_bytes()).unwrap().unwrap(); | ||||
|         let title = unsafe { from_utf8_unchecked(&title) }; | ||||
|         println!("{:?}", title); | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn main() { | ||||
|     let command = CommandConsole::from_args(); | ||||
|     let console = ConsoleSearch::from_command(command).unwrap(); | ||||
|     console.serve() | ||||
| } | ||||
| @@ -1,140 +0,0 @@ | ||||
| #[macro_use] extern crate serde_derive; | ||||
|  | ||||
| use std::str::from_utf8_unchecked; | ||||
| use std::io::{self, Write}; | ||||
| use std::net::SocketAddr; | ||||
| use std::path::PathBuf; | ||||
| use std::error::Error; | ||||
| use std::sync::Arc; | ||||
|  | ||||
| use pentium::rank::{criterion, Config, RankedStream}; | ||||
| use pentium::{automaton, Metadata}; | ||||
| use rocksdb::{DB, DBOptions, IngestExternalFileOptions}; | ||||
| use warp::Filter; | ||||
| use structopt::StructOpt; | ||||
|  | ||||
| #[derive(Debug, StructOpt)] | ||||
| pub struct CommandHttp { | ||||
|     /// The address and port to bind the server to. | ||||
|     #[structopt(short = "l", default_value = "127.0.0.1:3030")] | ||||
|     pub listen_addr: SocketAddr, | ||||
|  | ||||
|     /// Meta file name (e.g. relaxed-colden). | ||||
|     #[structopt(parse(from_os_str))] | ||||
|     pub meta_name: PathBuf, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Serialize)] | ||||
| struct Document<'a> { | ||||
|     id: u64, | ||||
|     title: &'a str, | ||||
|     description: &'a str, | ||||
|     image: &'a str, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Deserialize)] | ||||
| struct SearchQuery { q: String } | ||||
|  | ||||
| pub struct HttpServer { | ||||
|     listen_addr: SocketAddr, | ||||
|     metadata: Arc<Metadata>, | ||||
|     db: Arc<DB>, | ||||
| } | ||||
|  | ||||
| impl HttpServer { | ||||
|     pub fn from_command(command: CommandHttp) -> io::Result<HttpServer> { | ||||
|         let map_file = command.meta_name.with_extension("map"); | ||||
|         let idx_file = command.meta_name.with_extension("idx"); | ||||
|         let sst_file = command.meta_name.with_extension("sst"); | ||||
|         let metadata = unsafe { Metadata::from_paths(map_file, idx_file).unwrap() }; | ||||
|  | ||||
|         let rocksdb = "rocksdb/storage"; | ||||
|         let db = DB::open_default(rocksdb).unwrap(); | ||||
|         let sst_file = sst_file.to_str().unwrap(); | ||||
|         db.ingest_external_file(&IngestExternalFileOptions::new(), &[&sst_file]).unwrap(); | ||||
|         drop(db); | ||||
|         let db = DB::open_for_read_only(DBOptions::default(), rocksdb, false).unwrap(); | ||||
|  | ||||
|         Ok(HttpServer { | ||||
|             listen_addr: command.listen_addr, | ||||
|             metadata: Arc::new(metadata), | ||||
|             db: Arc::new(db), | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     pub fn serve(self) { | ||||
|         let HttpServer { listen_addr, metadata, db } = self; | ||||
|  | ||||
|         let routes = warp::path("search") | ||||
|             .and(warp::query()) | ||||
|             .map(move |query: SearchQuery| { | ||||
|                 let body = search(metadata.clone(), db.clone(), &query.q).unwrap(); | ||||
|                 body | ||||
|             }) | ||||
|             .with(warp::reply::with::header("Content-Type", "application/json")) | ||||
|             .with(warp::reply::with::header("Access-Control-Allow-Origin", "*")); | ||||
|  | ||||
|         warp::serve(routes).run(listen_addr) | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn search<M, D>(metadata: M, database: D, query: &str) -> Result<String, Box<Error>> | ||||
| where M: AsRef<Metadata>, | ||||
|       D: AsRef<DB>, | ||||
| { | ||||
|     let mut automatons = Vec::new(); | ||||
|     for query in query.split_whitespace().map(str::to_lowercase) { | ||||
|         let lev = automaton::build_prefix_dfa(&query); | ||||
|         automatons.push(lev); | ||||
|     } | ||||
|  | ||||
|     let config = Config { | ||||
|         index: unimplemented!(), | ||||
|         automatons: automatons, | ||||
|         criteria: criterion::default(), | ||||
|         distinct: ((), 1), | ||||
|     }; | ||||
|     let stream = RankedStream::new(config); | ||||
|  | ||||
|     let documents = stream.retrieve_documents(0..20); | ||||
|  | ||||
|     let mut body = Vec::new(); | ||||
|     write!(&mut body, "[")?; | ||||
|  | ||||
|     let mut first = true; | ||||
|     for document in documents { | ||||
|         let title_key = format!("{}-title", document.id); | ||||
|         let title = database.as_ref().get(title_key.as_bytes()).unwrap().unwrap(); | ||||
|         let title = unsafe { from_utf8_unchecked(&title) }; | ||||
|  | ||||
|         let description_key = format!("{}-description", document.id); | ||||
|         let description = database.as_ref().get(description_key.as_bytes()).unwrap().unwrap(); | ||||
|         let description = unsafe { from_utf8_unchecked(&description) }; | ||||
|  | ||||
|         let image_key = format!("{}-image", document.id); | ||||
|         let image = database.as_ref().get(image_key.as_bytes()).unwrap().unwrap(); | ||||
|         let image = unsafe { from_utf8_unchecked(&image) }; | ||||
|  | ||||
|         let document = Document { | ||||
|             id: document.id, | ||||
|             title: title, | ||||
|             description: description, | ||||
|             image: image, | ||||
|         }; | ||||
|  | ||||
|         if !first { write!(&mut body, ",")? } | ||||
|         serde_json::to_writer(&mut body, &document)?; | ||||
|  | ||||
|         first = false; | ||||
|     } | ||||
|  | ||||
|     write!(&mut body, "]")?; | ||||
|  | ||||
|     Ok(String::from_utf8(body)?) | ||||
| } | ||||
|  | ||||
| fn main() { | ||||
|     let command = CommandHttp::from_args(); | ||||
|     let server = HttpServer::from_command(command).unwrap(); | ||||
|     server.serve(); | ||||
| } | ||||
| @@ -136,12 +136,12 @@ mod tests { | ||||
|         let mut builder = PositiveUpdateBuilder::new("update-positive-0001.sst", schema.clone(), tokenizer_builder); | ||||
|  | ||||
|         // you can insert documents in any order, it is sorted internally | ||||
|         let title_field = schema.field("title").unwrap(); | ||||
|         let title_field = schema.attribute("title").unwrap(); | ||||
|         builder.update_field(1, title_field, "hallo!".to_owned()); | ||||
|         builder.update_field(5, title_field, "hello!".to_owned()); | ||||
|         builder.update_field(2, title_field, "hi!".to_owned()); | ||||
|  | ||||
|         let name_field = schema.field("name").unwrap(); | ||||
|         let name_field = schema.attribute("name").unwrap(); | ||||
|         builder.remove_field(4, name_field); | ||||
|  | ||||
|         let update = builder.build()?; | ||||
|   | ||||
| @@ -1,14 +1,16 @@ | ||||
| use std::collections::{HashMap, BTreeMap}; | ||||
| use std::io::{Read, Write}; | ||||
| use std::error::Error; | ||||
| use std::path::Path; | ||||
| use std::ops::BitOr; | ||||
| use std::fs::File; | ||||
| use std::fmt; | ||||
|  | ||||
| use linked_hash_map::LinkedHashMap; | ||||
|  | ||||
| pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false }; | ||||
| pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true }; | ||||
|  | ||||
| #[derive(Copy, Clone)] | ||||
| #[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] | ||||
| pub struct SchemaProps { | ||||
|     stored: bool, | ||||
|     indexed: bool, | ||||
| @@ -36,66 +38,110 @@ impl BitOr for SchemaProps { | ||||
| } | ||||
|  | ||||
| pub struct SchemaBuilder { | ||||
|     fields: Vec<(String, SchemaProps)>, | ||||
|     attrs: LinkedHashMap<String, SchemaProps>, | ||||
| } | ||||
|  | ||||
| impl SchemaBuilder { | ||||
|     pub fn new() -> SchemaBuilder { | ||||
|         SchemaBuilder { fields: Vec::new() } | ||||
|         SchemaBuilder { attrs: LinkedHashMap::new() } | ||||
|     } | ||||
|  | ||||
|     pub fn field<N>(&mut self, name: N, props: SchemaProps) -> SchemaField | ||||
|     where N: Into<String>, | ||||
|     { | ||||
|         let len = self.fields.len(); | ||||
|         let name = name.into(); | ||||
|         self.fields.push((name, props)); | ||||
|  | ||||
|         SchemaField(len as u32) | ||||
|     pub fn new_field<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr { | ||||
|         let len = self.attrs.len(); | ||||
|         self.attrs.insert(name.into(), props); | ||||
|         SchemaAttr(len as u32) | ||||
|     } | ||||
|  | ||||
|     pub fn build(self) -> Schema { | ||||
|         unimplemented!() | ||||
|         let mut attrs = HashMap::new(); | ||||
|         let mut props = Vec::new(); | ||||
|  | ||||
|         for (i, (name, prop)) in self.attrs.into_iter().enumerate() { | ||||
|             attrs.insert(name, SchemaAttr(i as u32)); | ||||
|             props.push(prop); | ||||
|         } | ||||
|  | ||||
|         Schema { attrs, props } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Clone)] | ||||
| pub struct Schema; | ||||
| #[derive(Debug, Clone, PartialEq, Eq)] | ||||
| pub struct Schema { | ||||
|     attrs: HashMap<String, SchemaAttr>, | ||||
|     props: Vec<SchemaProps>, | ||||
| } | ||||
|  | ||||
| impl Schema { | ||||
|     pub fn open<P: AsRef<Path>>(path: P) -> Result<Schema, Box<Error>> { | ||||
|     pub fn open<P: AsRef<Path>>(path: P) -> bincode::Result<Schema> { | ||||
|         let file = File::open(path)?; | ||||
|         Schema::read_from(file) | ||||
|     } | ||||
|  | ||||
|     pub fn read_from<R: Read>(reader: R) -> Result<Schema, Box<Error>> { | ||||
|         unimplemented!() | ||||
|     pub fn read_from<R: Read>(reader: R) -> bincode::Result<Schema> { | ||||
|         let attrs = bincode::deserialize_from(reader)?; | ||||
|         let builder = SchemaBuilder { attrs }; | ||||
|         Ok(builder.build()) | ||||
|     } | ||||
|  | ||||
|     pub fn write_to<W: Write>(writer: W) -> Result<(), Box<Error>> { | ||||
|         unimplemented!() | ||||
|     pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> { | ||||
|         let mut ordered = BTreeMap::new(); | ||||
|         for (name, field) in &self.attrs { | ||||
|             let index = field.as_u32(); | ||||
|             let props = self.props[index as usize]; | ||||
|             ordered.insert(index, (name, props)); | ||||
|         } | ||||
|  | ||||
|     pub fn props(&self, field: SchemaField) -> SchemaProps { | ||||
|         unimplemented!() | ||||
|         let mut attrs = LinkedHashMap::with_capacity(ordered.len()); | ||||
|         for (_, (name, props)) in ordered { | ||||
|             attrs.insert(name, props); | ||||
|         } | ||||
|  | ||||
|     pub fn field(&self, name: &str) -> Option<SchemaField> { | ||||
|         unimplemented!() | ||||
|         bincode::serialize_into(writer, &attrs) | ||||
|     } | ||||
|  | ||||
|     pub fn props(&self, attr: SchemaAttr) -> SchemaProps { | ||||
|         self.props[attr.as_u32() as usize] | ||||
|     } | ||||
|  | ||||
|     pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> { | ||||
|         self.attrs.get(name.as_ref()).cloned() | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Copy, Clone, PartialOrd, Ord, PartialEq, Eq)] | ||||
| pub struct SchemaField(u32); | ||||
| #[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)] | ||||
| pub struct SchemaAttr(u32); | ||||
|  | ||||
| impl SchemaField { | ||||
| impl SchemaAttr { | ||||
|     pub fn as_u32(&self) -> u32 { | ||||
|         self.0 | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl fmt::Display for SchemaField { | ||||
| impl fmt::Display for SchemaAttr { | ||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||
|         write!(f, "{}", self.0) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use super::*; | ||||
|  | ||||
|     #[test] | ||||
|     fn serialize_deserialize() -> bincode::Result<()> { | ||||
|         let mut builder = SchemaBuilder::new(); | ||||
|         builder.new_field("alphabet", STORED); | ||||
|         builder.new_field("beta", STORED | INDEXED); | ||||
|         builder.new_field("gamma", INDEXED); | ||||
|         let schema = builder.build(); | ||||
|  | ||||
|         let mut buffer = Vec::new(); | ||||
|  | ||||
|         schema.write_to(&mut buffer)?; | ||||
|         let schema2 = Schema::read_from(buffer.as_slice())?; | ||||
|  | ||||
|         assert_eq!(schema, schema2); | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -5,7 +5,7 @@ use std::fmt::Write; | ||||
|  | ||||
| use ::rocksdb::rocksdb_options; | ||||
|  | ||||
| use crate::index::schema::{SchemaProps, Schema, SchemaField}; | ||||
| use crate::index::schema::{SchemaProps, Schema, SchemaAttr}; | ||||
| use crate::index::update::{FIELD_BLOBS_ORDER, Update}; | ||||
| use crate::tokenizer::TokenizerBuilder; | ||||
| use crate::index::blob_name::BlobName; | ||||
| @@ -24,7 +24,7 @@ pub struct PositiveUpdateBuilder<B> { | ||||
|     path: PathBuf, | ||||
|     schema: Schema, | ||||
|     tokenizer_builder: B, | ||||
|     new_states: BTreeMap<(DocumentId, SchemaField), NewState>, | ||||
|     new_states: BTreeMap<(DocumentId, SchemaAttr), NewState>, | ||||
| } | ||||
|  | ||||
| impl<B> PositiveUpdateBuilder<B> { | ||||
| @@ -38,12 +38,12 @@ impl<B> PositiveUpdateBuilder<B> { | ||||
|     } | ||||
|  | ||||
|     // TODO value must be a field that can be indexed | ||||
|     pub fn update_field(&mut self, id: DocumentId, field: SchemaField, value: String) { | ||||
|     pub fn update_field(&mut self, id: DocumentId, field: SchemaAttr, value: String) { | ||||
|         let state = NewState::Updated { value, props: self.schema.props(field) }; | ||||
|         self.new_states.insert((id, field), state); | ||||
|     } | ||||
|  | ||||
|     pub fn remove_field(&mut self, id: DocumentId, field: SchemaField) { | ||||
|     pub fn remove_field(&mut self, id: DocumentId, field: SchemaAttr) { | ||||
|         self.new_states.insert((id, field), NewState::Removed); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,6 +1,7 @@ | ||||
| #![feature(range_contains)] | ||||
|  | ||||
| #[macro_use] extern crate lazy_static; | ||||
| #[macro_use] extern crate serde_derive; | ||||
|  | ||||
| pub mod index; | ||||
| pub mod blob; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user