mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 13:36:27 +00:00 
			
		
		
		
	fix: Remove stop-words from the serve examples
This commit is contained in:
		| @@ -10,25 +10,18 @@ use pentium::{automaton, DocumentId, Metadata}; | ||||
|  | ||||
| #[derive(Debug, StructOpt)] | ||||
| pub struct CommandConsole { | ||||
|     /// The stop word file, each word must be separated by a newline. | ||||
|     #[structopt(long = "stop-words", parse(from_os_str))] | ||||
|     pub stop_words: PathBuf, | ||||
|  | ||||
|     /// Meta file name (e.g. relaxed-colden). | ||||
|     #[structopt(parse(from_os_str))] | ||||
|     pub meta_name: PathBuf, | ||||
| } | ||||
|  | ||||
| pub struct ConsoleSearch { | ||||
|     common_words: CommonWords, | ||||
|     metadata: Metadata, | ||||
|     db: DB, | ||||
| } | ||||
|  | ||||
| impl ConsoleSearch { | ||||
|     pub fn from_command(command: CommandConsole) -> io::Result<ConsoleSearch> { | ||||
|         let common_words = CommonWords::from_file(command.stop_words)?; | ||||
|  | ||||
|         let map_file = command.meta_name.with_extension("map"); | ||||
|         let idx_file = command.meta_name.with_extension("idx"); | ||||
|         let sst_file = command.meta_name.with_extension("sst"); | ||||
| @@ -42,7 +35,7 @@ impl ConsoleSearch { | ||||
|         drop(db); | ||||
|         let db = DB::open_for_read_only(DBOptions::default(), rocksdb, false).unwrap(); | ||||
|  | ||||
|         Ok(ConsoleSearch { common_words, metadata, db }) | ||||
|         Ok(ConsoleSearch { metadata, db }) | ||||
|     } | ||||
|  | ||||
|     pub fn serve(self) { | ||||
| @@ -52,20 +45,19 @@ impl ConsoleSearch { | ||||
|  | ||||
|             let mut query = String::new(); | ||||
|             io::stdin().read_line(&mut query).unwrap(); | ||||
|             let query = query.trim().to_lowercase(); | ||||
|  | ||||
|             if query.is_empty() { break } | ||||
|  | ||||
|             let (elapsed, _) = measure_time(|| search(&self.metadata, &self.db, &self.common_words, &query)); | ||||
|             let (elapsed, _) = measure_time(|| search(&self.metadata, &self.db, &query)); | ||||
|             println!("Finished in {}", elapsed); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn search(metadata: &Metadata, database: &DB, common_words: &CommonWords, query: &str) { | ||||
| fn search(metadata: &Metadata, database: &DB, query: &str) { | ||||
|     let mut automatons = Vec::new(); | ||||
|     for query in query.split_whitespace().filter(|q| !common_words.contains(*q)) { | ||||
|         let lev = automaton::build(query); | ||||
|     for query in query.split_whitespace().map(str::to_lowercase) { | ||||
|         let lev = automaton::build_prefix_dfa(&query); | ||||
|         automatons.push(lev); | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -19,10 +19,6 @@ pub struct CommandHttp { | ||||
|     #[structopt(short = "l", default_value = "127.0.0.1:3030")] | ||||
|     pub listen_addr: SocketAddr, | ||||
|  | ||||
|     /// The stop word file, each word must be separated by a newline. | ||||
|     #[structopt(long = "stop-words", parse(from_os_str))] | ||||
|     pub stop_words: PathBuf, | ||||
|  | ||||
|     /// Meta file name (e.g. relaxed-colden). | ||||
|     #[structopt(parse(from_os_str))] | ||||
|     pub meta_name: PathBuf, | ||||
| @@ -41,15 +37,12 @@ struct SearchQuery { q: String } | ||||
|  | ||||
| pub struct HttpServer { | ||||
|     listen_addr: SocketAddr, | ||||
|     common_words: Arc<CommonWords>, | ||||
|     metadata: Arc<Metadata>, | ||||
|     db: Arc<DB>, | ||||
| } | ||||
|  | ||||
| impl HttpServer { | ||||
|     pub fn from_command(command: CommandHttp) -> io::Result<HttpServer> { | ||||
|         let common_words = CommonWords::from_file(command.stop_words)?; | ||||
|  | ||||
|         let map_file = command.meta_name.with_extension("map"); | ||||
|         let idx_file = command.meta_name.with_extension("idx"); | ||||
|         let sst_file = command.meta_name.with_extension("sst"); | ||||
| @@ -64,19 +57,18 @@ impl HttpServer { | ||||
|  | ||||
|         Ok(HttpServer { | ||||
|             listen_addr: command.listen_addr, | ||||
|             common_words: Arc::new(common_words), | ||||
|             metadata: Arc::new(metadata), | ||||
|             db: Arc::new(db), | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     pub fn serve(self) { | ||||
|         let HttpServer { listen_addr, common_words, metadata, db } = self; | ||||
|         let HttpServer { listen_addr, metadata, db } = self; | ||||
|  | ||||
|         let routes = warp::path("search") | ||||
|             .and(warp::query()) | ||||
|             .map(move |query: SearchQuery| { | ||||
|                 let body = search(metadata.clone(), db.clone(), common_words.clone(), &query.q).unwrap(); | ||||
|                 let body = search(metadata.clone(), db.clone(), &query.q).unwrap(); | ||||
|                 body | ||||
|             }) | ||||
|             .with(warp::reply::with::header("Content-Type", "application/json")) | ||||
| @@ -86,15 +78,13 @@ impl HttpServer { | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn search<M, D, C>(metadata: M, database: D, common_words: C, query: &str) -> Result<String, Box<Error>> | ||||
| fn search<M, D>(metadata: M, database: D, query: &str) -> Result<String, Box<Error>> | ||||
| where M: AsRef<Metadata>, | ||||
|       D: AsRef<DB>, | ||||
|       C: AsRef<CommonWords>, | ||||
| { | ||||
|     let mut automatons = Vec::new(); | ||||
|     for query in query.split_whitespace().map(str::to_lowercase) { | ||||
|         if common_words.as_ref().contains(&query) { continue } | ||||
|         let lev = automaton::build(&query); | ||||
|         let lev = automaton::build_prefix_dfa(&query); | ||||
|         automatons.push(lev); | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -1,4 +1,5 @@ | ||||
| use std::ops::Deref; | ||||
|  | ||||
| use fst::Automaton; | ||||
| use levenshtein_automata::{ | ||||
|     LevenshteinAutomatonBuilder as LevBuilder, | ||||
| @@ -50,16 +51,40 @@ impl AutomatonExt for DfaExt { | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub fn build(query: &str) -> DfaExt { | ||||
| enum PrefixSetting { | ||||
|     Prefix, | ||||
|     NoPrefix, | ||||
| } | ||||
|  | ||||
| fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DfaExt { | ||||
|     use self::PrefixSetting::{Prefix, NoPrefix}; | ||||
|  | ||||
|     let dfa = match query.len() { | ||||
|         0 ..= 4 => LEVDIST0.build_prefix_dfa(query), | ||||
|         5 ..= 8 => LEVDIST1.build_prefix_dfa(query), | ||||
|         _       => LEVDIST2.build_prefix_dfa(query), | ||||
|         0 ..= 4 => match setting { | ||||
|             Prefix   => LEVDIST0.build_prefix_dfa(query), | ||||
|             NoPrefix => LEVDIST0.build_dfa(query), | ||||
|         }, | ||||
|         5 ..= 8 => match setting { | ||||
|             Prefix   => LEVDIST1.build_prefix_dfa(query), | ||||
|             NoPrefix => LEVDIST1.build_dfa(query), | ||||
|         }, | ||||
|         _ => match setting { | ||||
|             Prefix   => LEVDIST2.build_prefix_dfa(query), | ||||
|             NoPrefix => LEVDIST2.build_dfa(query), | ||||
|         }, | ||||
|     }; | ||||
|  | ||||
|     DfaExt { query_len: query.len(), automaton: dfa } | ||||
| } | ||||
|  | ||||
| pub fn build_prefix_dfa(query: &str) -> DfaExt { | ||||
|     build_dfa_with_setting(query, PrefixSetting::Prefix) | ||||
| } | ||||
|  | ||||
| pub fn build_dfa(query: &str) -> DfaExt { | ||||
|     build_dfa_with_setting(query, PrefixSetting::NoPrefix) | ||||
| } | ||||
|  | ||||
| pub trait AutomatonExt: Automaton { | ||||
|     fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance; | ||||
|     fn query_len(&self) -> usize; | ||||
|   | ||||
| @@ -57,6 +57,8 @@ impl Criterion for DocumentId { | ||||
|     } | ||||
| } | ||||
|  | ||||
| // TODO: there are too many `Box`es here; can we use | ||||
| //       static references or static closures instead? | ||||
| pub fn default() -> Vec<Box<dyn Criterion>> { | ||||
|     vec![ | ||||
|         Box::new(SumOfTypos), | ||||
|   | ||||
| @@ -21,6 +21,11 @@ impl Document { | ||||
|         unsafe { Self::from_sorted_matches(doc, vec![match_]) } | ||||
|     } | ||||
|  | ||||
|     pub fn from_matches(doc: DocumentId, mut matches: Vec<Match>) -> Self { | ||||
|         matches.sort_unstable(); | ||||
|         unsafe { Self::from_sorted_matches(doc, matches) } | ||||
|     } | ||||
|  | ||||
|     pub unsafe fn from_sorted_matches(id: DocumentId, matches: Vec<Match>) -> Self { | ||||
|         Self { id, matches } | ||||
|     } | ||||
|   | ||||
| @@ -2,7 +2,7 @@ use std::collections::HashMap; | ||||
| use std::hash::Hash; | ||||
| use std::ops::Range; | ||||
| use std::rc::Rc; | ||||
| use std::{mem, vec, cmp}; | ||||
| use std::{mem, vec}; | ||||
|  | ||||
| use fnv::FnvHashMap; | ||||
| use fst::Streamer; | ||||
| @@ -11,10 +11,17 @@ use group_by::GroupByMut; | ||||
| use crate::automaton::{DfaExt, AutomatonExt}; | ||||
| use crate::metadata::Metadata; | ||||
| use crate::metadata::ops::OpBuilder; | ||||
| use crate::rank::criterion::{self, Criterion}; | ||||
| use crate::rank::criterion::Criterion; | ||||
| use crate::rank::Document; | ||||
| use crate::{Match, DocumentId}; | ||||
|  | ||||
| fn clamp_range<T: Copy + Ord>(range: Range<T>, big: Range<T>) -> Range<T> { | ||||
|     Range { | ||||
|         start: range.start.min(big.end).max(big.start), | ||||
|         end: range.end.min(big.end).max(big.start), | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct Config<'m, C, F> { | ||||
|     pub metadata: &'m Metadata, | ||||
|     pub automatons: Vec<DfaExt>, | ||||
| @@ -67,10 +74,7 @@ impl<'m, C, F> RankedStream<'m, C, F> { | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         matches.into_iter().map(|(id, mut matches)| { | ||||
|             matches.sort_unstable(); | ||||
|             unsafe { Document::from_sorted_matches(id, matches) } | ||||
|         }).collect() | ||||
|         matches.into_iter().map(|(id, matches)| Document::from_matches(id, matches)).collect() | ||||
|     } | ||||
| } | ||||
|  | ||||
| @@ -92,10 +96,7 @@ where C: Criterion | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         let range = Range { | ||||
|             start: cmp::min(range.start, documents.len()), | ||||
|             end: cmp::min(range.end, documents.len()), | ||||
|         }; | ||||
|         let range = clamp_range(range, 0..documents.len()); | ||||
|         documents[range].to_vec() | ||||
|     } | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user