mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 13:36:27 +00:00 
			
		
		
		
	feat(search): Accept multiple words and do a simple union
This commit is contained in:
		
				
					committed by
					
						 Clément Renault
						Clément Renault
					
				
			
			
				
	
			
			
			
						parent
						
							758baeb8e1
						
					
				
				
					commit
					1476aa3dba
				
			
							
								
								
									
										10
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										10
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -85,7 +85,6 @@ dependencies = [ | ||||
| [[package]] | ||||
| name = "fst" | ||||
| version = "0.3.0" | ||||
| source = "git+https://github.com/Kerollmops/fst.git?branch=stream-with-state#a969462433944a22f1356a8bf2affb8e9bde6f67" | ||||
| dependencies = [ | ||||
|  "byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
|  "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
| @@ -161,9 +160,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| [[package]] | ||||
| name = "levenshtein_automata" | ||||
| version = "0.1.0" | ||||
| source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#13a685e087efcf253936342c055166fa5d5c9b9c" | ||||
| dependencies = [ | ||||
|  "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)", | ||||
|  "fst 0.3.0", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| @@ -305,9 +303,9 @@ version = "0.1.0" | ||||
| dependencies = [ | ||||
|  "bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
|  "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
|  "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)", | ||||
|  "fst 0.3.0", | ||||
|  "futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
|  "levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)", | ||||
|  "levenshtein_automata 0.1.0", | ||||
|  "serde 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
|  "serde_derive 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
|  "serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", | ||||
| @@ -651,7 +649,6 @@ dependencies = [ | ||||
| "checksum crossbeam-utils 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d636a8b3bcc1b409d7ffd3facef8f21dcb4009626adbd0c5e6c4305c07253c7b" | ||||
| "checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab" | ||||
| "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" | ||||
| "checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)" = "<none>" | ||||
| "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" | ||||
| "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" | ||||
| "checksum futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)" = "1a70b146671de62ec8c8ed572219ca5d594d9b06c0b364d5e67b722fc559b48c" | ||||
| @@ -662,7 +659,6 @@ dependencies = [ | ||||
| "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" | ||||
| "checksum lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c8f31047daa365f19be14b47c29df4f7c3b581832407daabe6ae77397619237d" | ||||
| "checksum lazycell 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a6f08839bc70ef4a3fe1d566d5350f519c5912ea86be0df1740a7d247c7fc0ef" | ||||
| "checksum levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>" | ||||
| "checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b" | ||||
| "checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" | ||||
| "checksum log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "89f010e843f2b1a31dbd316b3b8d443758bc634bed37aabade59c686d644e0a2" | ||||
|   | ||||
							
								
								
									
										14
									
								
								Cargo.toml
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								Cargo.toml
									
									
									
									
									
								
							| @@ -16,13 +16,15 @@ tokio-service = "0.1" | ||||
| url = "1.7" | ||||
|  | ||||
| [dependencies.fst] | ||||
| git = "https://github.com/Kerollmops/fst.git" | ||||
| branch = "stream-with-state" | ||||
| path = "../../fst" | ||||
| # git = "https://github.com/Kerollmops/fst.git" | ||||
| # branch = "stream-with-state" | ||||
|  | ||||
| [dependencies.levenshtein_automata] | ||||
| git = "https://github.com/Kerollmops/levenshtein-automata.git" | ||||
| branch = "custom-fst" | ||||
| path = "../../levenshtein-automata" | ||||
| # git = "https://github.com/Kerollmops/levenshtein-automata.git" | ||||
| # branch = "custom-fst" | ||||
| features = ["fst_automaton"] | ||||
|  | ||||
| [profile.release] | ||||
| lto = true | ||||
| # [profile.release] | ||||
| # lto = true | ||||
|   | ||||
| @@ -13,14 +13,14 @@ use std::path::Path; | ||||
| use std::fs::File; | ||||
| use std::io::{Read, BufReader}; | ||||
|  | ||||
| use fst::{IntoStreamer, Streamer}; | ||||
| use fst::Streamer; | ||||
| use futures::future; | ||||
| use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; | ||||
| use tokio_minihttp::{Request, Response, Http}; | ||||
| use tokio_proto::TcpServer; | ||||
| use tokio_service::Service; | ||||
|  | ||||
| use raptor::FstMap; | ||||
| use raptor::{FstMap, OpWithStateBuilder}; | ||||
|  | ||||
| static mut MAP: Option<FstMap<u64>> = None; | ||||
| static mut LEV_BUILDER_0: Option<LevBuilder> = None; | ||||
| @@ -52,25 +52,40 @@ impl<'a> Service for MainService<'a> { | ||||
|         if let Some((_, query)) = url.query_pairs().find(|&(ref k, _)| k == "q") { | ||||
|             let query = query.to_lowercase(); | ||||
|  | ||||
|             let lev = if query.len() <= 4 { | ||||
|                 self.lev_builder_0.build_dfa(&query) | ||||
|             } else if query.len() <= 8 { | ||||
|                 self.lev_builder_1.build_dfa(&query) | ||||
|             } else { | ||||
|                 self.lev_builder_2.build_dfa(&query) | ||||
|             }; | ||||
|             let mut automatons = Vec::new(); | ||||
|  | ||||
|             let mut stream = self.map.search(&lev).with_state().into_stream(); | ||||
|             for query in query.split_whitespace() { | ||||
|                 let lev = if query.len() <= 4 { | ||||
|                     self.lev_builder_0.build_dfa(&query) | ||||
|                 } else if query.len() <= 8 { | ||||
|                     self.lev_builder_1.build_dfa(&query) | ||||
|                 } else { | ||||
|                     self.lev_builder_2.build_dfa(&query) | ||||
|                 }; | ||||
|                 automatons.push(lev); | ||||
|             } | ||||
|  | ||||
|             let mut op = OpWithStateBuilder::new(self.map.values()); | ||||
|  | ||||
|             for automaton in automatons.iter().cloned() { | ||||
|                 let stream = self.map.as_map().search(automaton).with_state(); | ||||
|                 op.push(stream); | ||||
|             } | ||||
|  | ||||
|             let mut stream = op.union(); | ||||
|  | ||||
|             let mut body = String::new(); | ||||
|             body.push_str("<html><body>"); | ||||
|  | ||||
|             while let Some((key, values, state)) = stream.next() { | ||||
|             while let Some((key, ivalues)) = stream.next() { | ||||
|                 match std::str::from_utf8(key) { | ||||
|                     Ok(key) => { | ||||
|                         let values = &values[..values.len().min(10)]; | ||||
|                         let distance = lev.distance(state); | ||||
|                         body.push_str(&format!("<p>{:?} (dist: {:?}) {:?}</p>", key, distance, values)); | ||||
|                         for ivalue in ivalues { | ||||
|                             let i = ivalue.index; | ||||
|                             let state = ivalue.state; | ||||
|                             let distance = automatons[i].distance(state); | ||||
|                             body.push_str(&format!("<p>{:?} (dist: {:?}) {:?}</p>", key, distance, ivalue.values)); | ||||
|                         } | ||||
|                     }, | ||||
|                     Err(e) => eprintln!("{:?}", e), | ||||
|                 } | ||||
|   | ||||
							
								
								
									
										170
									
								
								src/fst_map.rs
									
									
									
									
									
								
							
							
						
						
									
										170
									
								
								src/fst_map.rs
									
									
									
									
									
								
							| @@ -1,5 +1,5 @@ | ||||
| use bincode; | ||||
| use fst::{self, Map, MapBuilder, Automaton}; | ||||
| use fst::{self, Automaton}; | ||||
| use serde::de::DeserializeOwned; | ||||
| use serde::ser::Serialize; | ||||
| use std::fs::File; | ||||
| @@ -10,7 +10,7 @@ use {StreamBuilder, Stream}; | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct FstMap<T> { | ||||
|     inner: Map, | ||||
|     inner: fst::Map, | ||||
|     values: Values<T>, | ||||
| } | ||||
|  | ||||
| @@ -21,7 +21,7 @@ impl<T> FstMap<T> { | ||||
|         P: AsRef<Path>, | ||||
|         Q: AsRef<Path> | ||||
|     { | ||||
|         let inner = Map::from_path(map)?; | ||||
|         let inner = fst::Map::from_path(map)?; | ||||
|  | ||||
|         // TODO handle errors !!! | ||||
|         let values = File::open(values).unwrap(); | ||||
| @@ -35,7 +35,7 @@ impl<T> FstMap<T> { | ||||
|     where | ||||
|         T: DeserializeOwned | ||||
|     { | ||||
|         let inner = Map::from_bytes(map)?; | ||||
|         let inner = fst::Map::from_bytes(map)?; | ||||
|         let values = bincode::deserialize(values).unwrap(); | ||||
|  | ||||
|         Ok(Self { inner, values }) | ||||
| @@ -62,6 +62,19 @@ impl<T> FstMap<T> { | ||||
|             values: &self.values, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn op(&self) -> OpBuilder<T> { | ||||
|         // OpBuilder::new(&self.values).add(self.as_inner()) | ||||
|         unimplemented!() | ||||
|     } | ||||
|  | ||||
|     pub fn as_map(&self) -> &fst::Map { | ||||
|         &self.inner | ||||
|     } | ||||
|  | ||||
|     pub fn values(&self) -> &Values<T> { | ||||
|         &self.values | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Serialize, Deserialize)] | ||||
| @@ -137,7 +150,7 @@ impl<T> FstMapBuilder<T> { | ||||
|  | ||||
|     pub fn build_memory(self) -> fst::Result<FstMap<T>> { | ||||
|         Ok(FstMap { | ||||
|             inner: Map::from_iter(self.map)?, | ||||
|             inner: fst::Map::from_iter(self.map)?, | ||||
|             values: Values::new(self.values), | ||||
|         }) | ||||
|     } | ||||
| @@ -148,7 +161,7 @@ impl<T> FstMapBuilder<T> { | ||||
|         W: Write, | ||||
|         X: Write | ||||
|     { | ||||
|         let mut builder = MapBuilder::new(map_wrt)?; | ||||
|         let mut builder = fst::MapBuilder::new(map_wrt)?; | ||||
|         builder.extend_iter(self.map)?; | ||||
|         let map = builder.into_inner()?; | ||||
|         let values = Values::new(self.values); | ||||
| @@ -159,3 +172,148 @@ impl<T> FstMapBuilder<T> { | ||||
|         Ok((map, values_wrt)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct OpBuilder<'m, 'v, T: 'v> { | ||||
|     inner: fst::map::OpBuilder<'m>, | ||||
|     values: &'v Values<T>, | ||||
| } | ||||
|  | ||||
| impl<'m, 'v, T: 'v> OpBuilder<'m, 'v, T> { | ||||
|     pub fn new(values: &'v Values<T>) -> Self { | ||||
|         OpBuilder { | ||||
|             inner: fst::map::OpBuilder::new(), | ||||
|             values: values, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn add<I, S>(mut self, streamable: I) -> Self | ||||
|     where | ||||
|         I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, | ||||
|         S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>, | ||||
|     { | ||||
|         self.push(streamable); | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn push<I, S>(&mut self, streamable: I) | ||||
|     where | ||||
|         I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, | ||||
|         S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>, | ||||
|     { | ||||
|         self.inner.push(streamable); | ||||
|     } | ||||
|  | ||||
|     pub fn union(self) -> Union<'m, 'v, T> { | ||||
|         Union { | ||||
|             inner: self.inner.union(), | ||||
|             outs: Vec::new(), | ||||
|             values: self.values, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct Union<'m, 'v, T: 'v> { | ||||
|     inner: fst::map::Union<'m>, | ||||
|     outs: Vec<IndexedValues<'v, T>>, | ||||
|     values: &'v Values<T>, | ||||
| } | ||||
|  | ||||
| impl<'a, 'm, 'v, T: 'v + 'a> fst::Streamer<'a> for Union<'m, 'v, T> { | ||||
|     type Item = (&'a [u8], &'a [IndexedValues<'a, T>]); | ||||
|  | ||||
|     fn next(&'a mut self) -> Option<Self::Item> { | ||||
|         match self.inner.next() { | ||||
|             Some((s, ivalues)) => { | ||||
|                 self.outs.clear(); | ||||
|                 for ivalue in ivalues { | ||||
|                     let index = ivalue.index; | ||||
|                     let values = unsafe { self.values.get_unchecked(ivalue.value as usize) }; | ||||
|                     self.outs.push(IndexedValues { index, values }) | ||||
|                 } | ||||
|                 Some((s, &self.outs)) | ||||
|             }, | ||||
|             None => None, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct IndexedValues<'a, T: 'a> { | ||||
|     pub index: usize, | ||||
|     pub values: &'a [T], | ||||
| } | ||||
|  | ||||
| pub struct OpWithStateBuilder<'m, 'v, T: 'v, U> { | ||||
|     inner: fst::map::OpWithStateBuilder<'m, U>, | ||||
|     values: &'v Values<T>, | ||||
| } | ||||
|  | ||||
| impl<'m, 'v, T: 'v, U: 'static> OpWithStateBuilder<'m, 'v, T, U> { | ||||
|     pub fn new(values: &'v Values<T>) -> Self { | ||||
|         Self { | ||||
|             inner: fst::map::OpWithStateBuilder::new(), | ||||
|             values: values, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn add<I, S>(mut self, streamable: I) -> Self | ||||
|     where | ||||
|         I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>, | ||||
|         S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>, | ||||
|     { | ||||
|         self.push(streamable); | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn push<I, S>(&mut self, streamable: I) | ||||
|     where | ||||
|         I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>, | ||||
|         S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>, | ||||
|     { | ||||
|         self.inner.push(streamable); | ||||
|     } | ||||
|  | ||||
|     pub fn union(self) -> UnionWithState<'m, 'v, T, U> { | ||||
|         UnionWithState { | ||||
|             inner: self.inner.union(), | ||||
|             outs: Vec::new(), | ||||
|             values: self.values, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct UnionWithState<'m, 'v, T: 'v, U> { | ||||
|     inner: fst::map::UnionWithState<'m, U>, | ||||
|     outs: Vec<IndexedValuesWithState<'v, T, U>>, | ||||
|     values: &'v Values<T>, | ||||
| } | ||||
|  | ||||
| impl<'a, 'm, 'v, T: 'v + 'a, U: 'a> fst::Streamer<'a> for UnionWithState<'m, 'v, T, U> | ||||
| where | ||||
|     U: Clone, | ||||
| { | ||||
|     type Item = (&'a [u8], &'a [IndexedValuesWithState<'a, T, U>]); | ||||
|  | ||||
|     fn next(&'a mut self) -> Option<Self::Item> { | ||||
|         match self.inner.next() { | ||||
|             Some((s, ivalues)) => { | ||||
|                 self.outs.clear(); | ||||
|                 for ivalue in ivalues { | ||||
|                     let index = ivalue.index; | ||||
|                     let values = unsafe { self.values.get_unchecked(ivalue.value as usize) }; | ||||
|                     let state = ivalue.state.clone(); | ||||
|                     self.outs.push(IndexedValuesWithState { index, values, state }) | ||||
|                 } | ||||
|                 Some((s, &self.outs)) | ||||
|             }, | ||||
|             None => None, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct IndexedValuesWithState<'a, T: 'a, U> { | ||||
|     pub index: usize, | ||||
|     pub values: &'a [T], | ||||
|     pub state: U, | ||||
| } | ||||
|   | ||||
| @@ -5,13 +5,13 @@ extern crate serde; | ||||
|  | ||||
| mod fst_map; | ||||
|  | ||||
| use std::ops::Range; | ||||
| use std::io::{Write, BufReader}; | ||||
| use std::fs::File; | ||||
| use std::path::Path; | ||||
| use fst::Automaton; | ||||
|  | ||||
| pub use self::fst_map::{FstMap, FstMapBuilder}; | ||||
| pub use self::fst_map::{ | ||||
|     OpBuilder, IndexedValues, | ||||
|     OpWithStateBuilder, IndexedValuesWithState, | ||||
| }; | ||||
| use self::fst_map::Values; | ||||
|  | ||||
| pub struct StreamBuilder<'m, 'v, T: 'v, A> { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user