mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 05:26:27 +00:00 
			
		
		
		
	dump: Make the data less prone to memory indirections
This commit is contained in:
		| @@ -13,7 +13,7 @@ use std::io::{BufReader, BufRead}; | ||||
| use fst::Streamer; | ||||
| use serde_json::from_str; | ||||
|  | ||||
| use raptor::{MultiMapBuilder, MultiMap}; | ||||
| use raptor::{FstMapBuilder, FstMap}; | ||||
|  | ||||
| #[derive(Debug, Deserialize)] | ||||
| struct Product { | ||||
| @@ -42,7 +42,7 @@ fn main() { | ||||
|         set | ||||
|     }; | ||||
|  | ||||
|     let mut builder = MultiMapBuilder::new(); | ||||
|     let mut builder = FstMapBuilder::new(); | ||||
|     for line in data.lines() { | ||||
|         let line = line.unwrap(); | ||||
|  | ||||
| @@ -65,11 +65,6 @@ fn main() { | ||||
|     let values = File::create("values.vecs").unwrap(); | ||||
|     let (map, values) = builder.build(map, values).unwrap(); | ||||
|  | ||||
|     // just to check if the dump is valid | ||||
|     let map = unsafe { MultiMap::from_paths("map.fst", "values.vecs").unwrap() }; | ||||
|  | ||||
|     // let mut stream = map.stream(); | ||||
|     // while let Some(x) = stream.next() { | ||||
|     //     println!("{:?}", x); | ||||
|     // } | ||||
|     eprintln!("Checking the dump consistency..."); | ||||
|     unsafe { FstMap::<u64>::from_paths("map.fst", "values.vecs").unwrap() }; | ||||
| } | ||||
|   | ||||
| @@ -21,19 +21,19 @@ use tokio_minihttp::{Request, Response, Http}; | ||||
| use tokio_proto::TcpServer; | ||||
| use tokio_service::Service; | ||||
|  | ||||
| use raptor::MultiMap; | ||||
| use raptor::FstMap; | ||||
|  | ||||
| lazy_static! { | ||||
|     static ref MAP: MultiMap = { | ||||
|     static ref MAP: FstMap<u64> = { | ||||
|         let map = read_to_vec("map.fst").unwrap(); | ||||
|         let values = read_to_vec("values.vecs").unwrap(); | ||||
|  | ||||
|         MultiMap::from_bytes(map, &values).unwrap() | ||||
|         FstMap::from_bytes(map, &values).unwrap() | ||||
|     }; | ||||
| } | ||||
|  | ||||
| struct MainService { | ||||
|     map: &'static MultiMap, | ||||
|     map: &'static FstMap<u64>, | ||||
| } | ||||
|  | ||||
| impl Service for MainService { | ||||
|   | ||||
							
								
								
									
										161
									
								
								src/fst_map.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										161
									
								
								src/fst_map.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,161 @@ | ||||
| use bincode; | ||||
| use fst::{self, Map, MapBuilder, Automaton}; | ||||
| use serde::de::DeserializeOwned; | ||||
| use serde::ser::Serialize; | ||||
| use std::fs::File; | ||||
| use std::io::{Write, BufReader}; | ||||
| use std::ops::{Range, Deref, DerefMut}; | ||||
| use std::path::Path; | ||||
| use {StreamBuilder, Stream}; | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct FstMap<T> { | ||||
|     inner: Map, | ||||
|     values: Values<T>, | ||||
| } | ||||
|  | ||||
| impl<T> FstMap<T> { | ||||
|     pub unsafe fn from_paths<P, Q>(map: P, values: Q) -> fst::Result<Self> | ||||
|     where | ||||
|         T: DeserializeOwned, | ||||
|         P: AsRef<Path>, | ||||
|         Q: AsRef<Path> | ||||
|     { | ||||
|         let inner = Map::from_path(map)?; | ||||
|  | ||||
|         // TODO handle errors !!! | ||||
|         let values = File::open(values).unwrap(); | ||||
|         let values = BufReader::new(values); | ||||
|         let values = bincode::deserialize_from(values).unwrap(); | ||||
|  | ||||
|         Ok(Self { inner, values }) | ||||
|     } | ||||
|  | ||||
|     pub fn from_bytes(map: Vec<u8>, values: &[u8]) -> fst::Result<Self> | ||||
|     where | ||||
|         T: DeserializeOwned | ||||
|     { | ||||
|         let inner = Map::from_bytes(map)?; | ||||
|         let values = bincode::deserialize(values).unwrap(); | ||||
|  | ||||
|         Ok(Self { inner, values }) | ||||
|     } | ||||
|  | ||||
|     pub fn stream(&self) -> Stream<T> { | ||||
|         Stream { | ||||
|             inner: self.inner.stream(), | ||||
|             values: &self.values, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool { | ||||
|         self.inner.contains_key(key) | ||||
|     } | ||||
|  | ||||
|     pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[T]> { | ||||
|         self.inner.get(key).map(|i| unsafe { self.values.get_unchecked(i as usize) }) | ||||
|     } | ||||
|  | ||||
|     pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<T, A> { | ||||
|         StreamBuilder { | ||||
|             inner: self.inner.search(aut), | ||||
|             values: &self.values, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Serialize, Deserialize)] | ||||
| pub struct Values<T> { | ||||
|     ranges: Box<[Range<u64>]>, | ||||
|     values: Box<[T]>, | ||||
| } | ||||
|  | ||||
| impl<T> Values<T> { | ||||
|     fn new(raw: Vec<Vec<T>>) -> Self { | ||||
|         let cap = raw.len(); | ||||
|         let mut ranges = Vec::with_capacity(cap); | ||||
|         let cap = raw.iter().map(Vec::len).sum(); | ||||
|         let mut values = Vec::with_capacity(cap); | ||||
|  | ||||
|         for v in &raw { | ||||
|             let len = v.len() as u64; | ||||
|             let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0); | ||||
|  | ||||
|             let range = Range { start, end: start + len }; | ||||
|             ranges.push(range); | ||||
|         } | ||||
|  | ||||
|         values.extend(raw.into_iter().flat_map(IntoIterator::into_iter)); | ||||
|  | ||||
|         let ranges = ranges.into_boxed_slice(); | ||||
|         let values = values.into_boxed_slice(); | ||||
|  | ||||
|         Self { ranges, values } | ||||
|     } | ||||
|  | ||||
|     pub unsafe fn get_unchecked(&self, index: usize) -> &[T] { | ||||
|         let range = self.ranges.get_unchecked(index); | ||||
|         let range = Range { start: range.start as usize, end: range.end as usize }; | ||||
|         self.values.get_unchecked(range) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct FstMapBuilder<T> { | ||||
|     map: Vec<(String, u64)>, | ||||
|     // This makes many memory indirections but it is only used | ||||
|     // at index time, not kept for query time. | ||||
|     values: Vec<Vec<T>>, | ||||
| } | ||||
|  | ||||
| impl<T> FstMapBuilder<T> { | ||||
|     pub fn new() -> Self { | ||||
|         Self { | ||||
|             map: Vec::new(), | ||||
|             values: Vec::new(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn insert<S: Into<String>>(&mut self, key: S, value: T) { | ||||
|         let key = key.into(); | ||||
|         match self.map.binary_search_by_key(&key.as_str(), |&(ref k, _)| k) { | ||||
|             Ok(index) => { | ||||
|                 let (_, index) = self.map[index]; | ||||
|                 let values = &mut self.values[index as usize]; | ||||
|  | ||||
|                 values.push(value); | ||||
|             }, | ||||
|             Err(index) => { | ||||
|                 self.values.push(vec![value]); | ||||
|                 let values_index = (self.values.len() - 1) as u64; | ||||
|  | ||||
|                 let value = (key, values_index); | ||||
|                 self.map.insert(index, value); | ||||
|             }, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn build_memory(self) -> fst::Result<FstMap<T>> { | ||||
|         Ok(FstMap { | ||||
|             inner: Map::from_iter(self.map)?, | ||||
|             values: Values::new(self.values), | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     pub fn build<W, X>(self, map_wrt: W, mut values_wrt: X) -> fst::Result<(W, X)> | ||||
|     where | ||||
|         T: Serialize, | ||||
|         W: Write, | ||||
|         X: Write | ||||
|     { | ||||
|         let mut builder = MapBuilder::new(map_wrt)?; | ||||
|         builder.extend_iter(self.map)?; | ||||
|         let map = builder.into_inner()?; | ||||
|         let values = Values::new(self.values); | ||||
|  | ||||
|         // TODO handle that error !!! | ||||
|         bincode::serialize_into(&mut values_wrt, &values).unwrap(); | ||||
|  | ||||
|         Ok((map, values_wrt)) | ||||
|     } | ||||
| } | ||||
							
								
								
									
										153
									
								
								src/lib.rs
									
									
									
									
									
								
							
							
						
						
									
										153
									
								
								src/lib.rs
									
									
									
									
									
								
							| @@ -1,76 +1,26 @@ | ||||
| #[macro_use] extern crate serde_derive; | ||||
| extern crate bincode; | ||||
| extern crate fst; | ||||
| extern crate smallvec; | ||||
| extern crate serde; | ||||
|  | ||||
| use std::ops::{Deref, DerefMut}; | ||||
| mod fst_map; | ||||
|  | ||||
| use std::ops::{Range, Deref, DerefMut}; | ||||
| use std::io::{Write, BufReader}; | ||||
| use std::fs::File; | ||||
| use std::path::Path; | ||||
| use std::str::from_utf8_unchecked; | ||||
| use fst::Automaton; | ||||
|  | ||||
| pub use fst::MapBuilder; | ||||
| use smallvec::SmallVec; | ||||
| pub use self::fst_map::{FstMap, FstMapBuilder}; | ||||
| use self::fst_map::Values; | ||||
|  | ||||
| type SmallVec32<T> = SmallVec<[T; 16]>; | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct MultiMap { | ||||
|     map: fst::Map, | ||||
|     values: Box<[SmallVec32<u64>]>, | ||||
| } | ||||
|  | ||||
| impl MultiMap { | ||||
|     pub unsafe fn from_paths<P, Q>(map: P, values: Q) -> fst::Result<MultiMap> | ||||
|     where | ||||
|         P: AsRef<Path>, | ||||
|         Q: AsRef<Path> | ||||
|     { | ||||
|         let map = fst::Map::from_path(map)?; | ||||
|  | ||||
|         // TODO handle errors !!! | ||||
|         let values = File::open(values).unwrap(); | ||||
|         let values = BufReader::new(values); | ||||
|         let values = bincode::deserialize_from(values).unwrap(); | ||||
|  | ||||
|         Ok(MultiMap { map, values }) | ||||
|     } | ||||
|  | ||||
|     pub fn from_bytes(map: Vec<u8>, values: &[u8]) -> fst::Result<MultiMap> { | ||||
|         let map = fst::Map::from_bytes(map)?; | ||||
|         let values = bincode::deserialize(values).unwrap(); | ||||
|  | ||||
|         Ok(MultiMap { map, values }) | ||||
|     } | ||||
|  | ||||
|     pub fn stream(&self) -> Stream { | ||||
|         Stream { | ||||
|             inner: self.map.stream(), | ||||
|             values: &self.values, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool { | ||||
|         self.map.contains_key(key) | ||||
|     } | ||||
|  | ||||
|     pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[u64]> { | ||||
|         self.map.get(key).map(|i| &*self.values[i as usize]) | ||||
|     } | ||||
|  | ||||
|     pub fn search<A: fst::Automaton>(&self, aut: A) -> StreamBuilder<A> { | ||||
|         StreamBuilder { | ||||
|             inner: self.map.search(aut), | ||||
|             values: &self.values, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct StreamBuilder<'a, A: fst::Automaton> { | ||||
| pub struct StreamBuilder<'a, T: 'a, A: Automaton> { | ||||
|     inner: fst::map::StreamBuilder<'a, A>, | ||||
|     values: &'a [SmallVec32<u64>], | ||||
|     values: &'a Values<T>, | ||||
| } | ||||
|  | ||||
| impl<'a, A: fst::Automaton> Deref for StreamBuilder<'a, A> { | ||||
| impl<'a, T, A: Automaton> Deref for StreamBuilder<'a, T, A> { | ||||
|     type Target = fst::map::StreamBuilder<'a, A>; | ||||
|  | ||||
|     fn deref(&self) -> &Self::Target { | ||||
| @@ -78,16 +28,16 @@ impl<'a, A: fst::Automaton> Deref for StreamBuilder<'a, A> { | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a, A: fst::Automaton> DerefMut for StreamBuilder<'a, A> { | ||||
| impl<'a, T, A: Automaton> DerefMut for StreamBuilder<'a, T, A> { | ||||
|     fn deref_mut(&mut self) -> &mut Self::Target { | ||||
|         &mut self.inner | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a, A: fst::Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, A> { | ||||
|     type Item = (&'a str, &'a [u64]); | ||||
| impl<'a, T: 'a, A: Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, T, A> { | ||||
|     type Item = (&'a str, &'a [T]); | ||||
|  | ||||
|     type Into = Stream<'a, A>; | ||||
|     type Into = Stream<'a, T, A>; | ||||
|  | ||||
|     fn into_stream(self) -> Self::Into { | ||||
|         Stream { | ||||
| @@ -97,84 +47,23 @@ impl<'a, A: fst::Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, A> { | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct Stream<'a, A: fst::Automaton = fst::automaton::AlwaysMatch> { | ||||
| pub struct Stream<'a, T: 'a, A: Automaton = fst::automaton::AlwaysMatch> { | ||||
|     inner: fst::map::Stream<'a, A>, | ||||
|     values: &'a [SmallVec32<u64>], | ||||
|     values: &'a Values<T>, | ||||
| } | ||||
|  | ||||
| impl<'a, 'm, A: fst::Automaton> fst::Streamer<'a> for Stream<'m, A> { | ||||
|     type Item = (&'a str, &'a [u64]); | ||||
| impl<'a, 'm, T: 'a, A: Automaton> fst::Streamer<'a> for Stream<'m, T, A> { | ||||
|     type Item = (&'a str, &'a [T]); | ||||
|  | ||||
|     fn next(&'a mut self) -> Option<Self::Item> { | ||||
|         // Here we can't just `map` because of some borrow rules | ||||
|         match self.inner.next() { | ||||
|             Some((key, i)) => { | ||||
|                 let key = unsafe { from_utf8_unchecked(key) }; | ||||
|                 Some((key, &*self.values[i as usize])) | ||||
|                 let values = unsafe { self.values.get_unchecked(i as usize) }; | ||||
|                 Some((key, values)) | ||||
|             }, | ||||
|             None => None, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct MultiMapBuilder { | ||||
|     map: Vec<(String, u64)>, | ||||
|     values: Vec<SmallVec32<u64>>, | ||||
| } | ||||
|  | ||||
| impl<'a> MultiMapBuilder { | ||||
|     pub fn new() -> MultiMapBuilder { | ||||
|         MultiMapBuilder { | ||||
|             map: Vec::new(), | ||||
|             values: Vec::new(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn insert<S: Into<String>>(&mut self, key: S, value: u64) { | ||||
|         let key = key.into(); | ||||
|         match self.map.binary_search_by_key(&key.as_str(), |&(ref k, _)| k) { | ||||
|             Ok(index) => { | ||||
|                 let (_, index) = self.map[index]; | ||||
|                 let values = &mut self.values[index as usize]; | ||||
|                 if let Err(index) = values.binary_search(&value) { | ||||
|                     values.insert(index, value) | ||||
|                 } | ||||
|             }, | ||||
|             Err(index) => { | ||||
|                 let values = { | ||||
|                     let mut vec = SmallVec32::new(); | ||||
|                     vec.push(value); | ||||
|                     vec | ||||
|                 }; | ||||
|                 self.values.push(values); | ||||
|                 let values_index = (self.values.len() - 1) as u64; | ||||
|  | ||||
|                 let value = (key, values_index); | ||||
|                 self.map.insert(index, value); | ||||
|             }, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn build_memory(self) -> fst::Result<MultiMap> { | ||||
|         Ok(MultiMap { | ||||
|             map: fst::Map::from_iter(self.map)?, | ||||
|             values: self.values.into_boxed_slice(), | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     pub fn build<W, X>(self, map_wrt: W, mut values_wrt: X) -> fst::Result<(W, X)> | ||||
|     where | ||||
|         W: Write, | ||||
|         X: Write | ||||
|     { | ||||
|         let mut builder = MapBuilder::new(map_wrt)?; | ||||
|         builder.extend_iter(self.map)?; | ||||
|         let map = builder.into_inner()?; | ||||
|  | ||||
|         // TODO handle that !!! | ||||
|         bincode::serialize_into(&mut values_wrt, &self.values).unwrap(); | ||||
|  | ||||
|         Ok((map, values_wrt)) | ||||
|     } | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user