mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 05:26:27 +00:00 
			
		
		
		
	Introduce a better query and document lexer
This commit is contained in:
		
							
								
								
									
										34
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										34
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -6,6 +6,15 @@ version = "0.2.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10" | ||||
|  | ||||
| [[package]] | ||||
| name = "aho-corasick" | ||||
| version = "0.7.13" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86" | ||||
| dependencies = [ | ||||
|  "memchr", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "anyhow" | ||||
| version = "1.0.31" | ||||
| @@ -1002,6 +1011,7 @@ dependencies = [ | ||||
|  "askama_warp", | ||||
|  "astar-iter", | ||||
|  "bitpacking", | ||||
|  "bstr", | ||||
|  "byteorder", | ||||
|  "cow-utils", | ||||
|  "criterion", | ||||
| @@ -1028,6 +1038,7 @@ dependencies = [ | ||||
|  "structopt", | ||||
|  "tempfile", | ||||
|  "tokio", | ||||
|  "unicode-linebreak", | ||||
|  "warp", | ||||
| ] | ||||
|  | ||||
| @@ -1631,7 +1642,10 @@ version = "1.3.9" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6" | ||||
| dependencies = [ | ||||
|  "aho-corasick", | ||||
|  "memchr", | ||||
|  "regex-syntax", | ||||
|  "thread_local 1.0.1", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| @@ -1849,7 +1863,7 @@ dependencies = [ | ||||
|  "chrono", | ||||
|  "log 0.4.8", | ||||
|  "termcolor", | ||||
|  "thread_local", | ||||
|  "thread_local 0.3.4", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| @@ -1964,6 +1978,15 @@ dependencies = [ | ||||
|  "unreachable", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "thread_local" | ||||
| version = "1.0.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" | ||||
| dependencies = [ | ||||
|  "lazy_static 1.4.0", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "time" | ||||
| version = "0.1.43" | ||||
| @@ -2128,6 +2151,15 @@ dependencies = [ | ||||
|  "matches", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "unicode-linebreak" | ||||
| version = "0.1.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "4e30c7c3c3fa01e2c0da7008b57c2e5414b132a27fdf797e49e5ecbfe4f4b150" | ||||
| dependencies = [ | ||||
|  "regex", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "unicode-normalization" | ||||
| version = "0.1.12" | ||||
|   | ||||
| @@ -10,6 +10,7 @@ anyhow = "1.0.28" | ||||
| arc-cache = { git = "https://github.com/Kerollmops/rust-arc-cache.git", rev = "56530f2" } | ||||
| astar-iter = { git = "https://github.com/Kerollmops/astar-iter" } | ||||
| bitpacking = "0.8.2" | ||||
| bstr = "0.2.13" | ||||
| byteorder = "1.3.4" | ||||
| cow-utils = "0.1.2" | ||||
| csv = "1.1.3" | ||||
| @@ -29,6 +30,7 @@ smallstr = "0.2.0" | ||||
| smallvec = "1.4.0" | ||||
| structopt = { version = "0.3.14", default-features = false } | ||||
| tempfile = "3.1.0" | ||||
| unicode-linebreak = "0.1.0" | ||||
|  | ||||
| # logging | ||||
| log = "0.4.8" | ||||
|   | ||||
| @@ -9,6 +9,7 @@ use std::time::Instant; | ||||
|  | ||||
| use anyhow::Context; | ||||
| use arc_cache::ArcCache; | ||||
| use bstr::ByteSlice as _; | ||||
| use cow_utils::CowUtils; | ||||
| use fst::IntoStreamer; | ||||
| use heed::EnvOpenOptions; | ||||
| @@ -18,12 +19,11 @@ use memmap::Mmap; | ||||
| use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType}; | ||||
| use rayon::prelude::*; | ||||
| use roaring::RoaringBitmap; | ||||
| use slice_group_by::StrGroupBy; | ||||
| use structopt::StructOpt; | ||||
|  | ||||
| use milli::{SmallVec32, Index, DocumentId, Position, Attribute}; | ||||
| use milli::{lexer, SmallVec32, Index, DocumentId, Position, Attribute}; | ||||
|  | ||||
| const LMDB_MAX_KEY_LENGTH: usize = 512; | ||||
| const LMDB_MAX_KEY_LENGTH: usize = 511; | ||||
| const ONE_MILLION: usize = 1_000_000; | ||||
|  | ||||
| const MAX_POSITION: usize = 1000; | ||||
| @@ -39,11 +39,6 @@ const WORD_ATTRIBUTE_DOCIDS_BYTE: u8 = 3; | ||||
| #[global_allocator] | ||||
| static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; | ||||
|  | ||||
| pub fn simple_alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> { | ||||
|     let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric); | ||||
|     string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric) | ||||
| } | ||||
|  | ||||
| #[derive(Debug, StructOpt)] | ||||
| #[structopt(name = "milli-indexer", about = "The indexer binary of the milli project.")] | ||||
| struct Opt { | ||||
| @@ -345,7 +340,7 @@ where F: FnMut(&[u8], &[u8]) -> anyhow::Result<()> | ||||
|     let mut iter = merger.into_merge_iter()?; | ||||
|     while let Some(result) = iter.next() { | ||||
|         let (k, v) = result?; | ||||
|         (f)(&k, &v)?; | ||||
|         (f)(&k, &v).with_context(|| format!("writing {:?} {:?} into LMDB", k.as_bstr(), k.as_bstr()))?; | ||||
|     } | ||||
|  | ||||
|     debug!("MTBL stores merged in {:.02?}!", before.elapsed()); | ||||
| @@ -389,7 +384,7 @@ fn index_csv( | ||||
|         } | ||||
|  | ||||
|         for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { | ||||
|             for (pos, word) in simple_alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) { | ||||
|             for (pos, word) in lexer::break_string(&content).enumerate().take(MAX_POSITION) { | ||||
|                 let word = word.cow_to_lowercase(); | ||||
|                 let position = (attr * MAX_POSITION + pos) as u32; | ||||
|                 store.insert_word_position(&word, position)?; | ||||
|   | ||||
							
								
								
									
										44
									
								
								src/lexer.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								src/lexer.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,44 @@ | ||||
| use unicode_linebreak::{linebreaks, BreakClass, break_property}; | ||||
|  | ||||
| fn can_be_broken(c: char) -> bool { | ||||
|     use BreakClass::*; | ||||
|  | ||||
|     match break_property(c as u32) { | ||||
|           Ideographic | ||||
|         | Alphabetic | ||||
|         | Numeric | ||||
|         | CombiningMark | ||||
|         | WordJoiner | ||||
|         | NonBreakingGlue | ||||
|         | OpenPunctuation | ||||
|         | Symbol | ||||
|         | EmojiBase | ||||
|         | EmojiModifier | ||||
|         | HangulLJamo | ||||
|         | HangulVJamo | ||||
|         | HangulTJamo | ||||
|         | RegionalIndicator | ||||
|         | Quotation => false, | ||||
|         _ => true, | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn extract_token(s: &str) -> &str { | ||||
|     let end = s.char_indices().rev() | ||||
|         .take_while(|(_, c)| can_be_broken(*c)) | ||||
|         .last() | ||||
|         .map(|(i, _)| i) | ||||
|         .unwrap_or(s.len()); | ||||
|  | ||||
|     &s[..end] | ||||
| } | ||||
|  | ||||
| pub fn break_string(s: &str) -> impl Iterator<Item = &str> { | ||||
|     let mut prev = 0; | ||||
|     linebreaks(&s).map(move |(i, _)| { | ||||
|         let s = &s[prev..i]; | ||||
|         prev = i; | ||||
|         extract_token(s) | ||||
|     }) | ||||
|     .filter(|s| !s.is_empty()) | ||||
| } | ||||
| @@ -4,6 +4,7 @@ mod node; | ||||
| mod query_tokens; | ||||
| mod search; | ||||
| mod transitive_arc; | ||||
| pub mod lexer; | ||||
|  | ||||
| use std::collections::HashMap; | ||||
| use std::fs::{File, OpenOptions}; | ||||
|   | ||||
| @@ -1,4 +1,5 @@ | ||||
| use std::{mem, str}; | ||||
| use unicode_linebreak::{break_property, BreakClass}; | ||||
|  | ||||
| use QueryToken::{Quoted, Free}; | ||||
|  | ||||
| @@ -8,6 +9,7 @@ pub enum QueryToken<'a> { | ||||
|     Quoted(&'a str), | ||||
| } | ||||
|  | ||||
| #[derive(Debug)] | ||||
| enum State { | ||||
|     Free(usize), | ||||
|     Quoted(usize), | ||||
| @@ -67,8 +69,13 @@ impl<'a> Iterator for QueryTokens<'a> { | ||||
|                     }, | ||||
|                     State::Fused => return None, | ||||
|                 } | ||||
|             } else if break_property(c as u32) == BreakClass::Ideographic { | ||||
|                 match self.state.replace_by(State::Free(afteri)) { | ||||
|                     State::Quoted(s) => return Some(Quoted(&self.string[s..afteri])), | ||||
|                     State::Free(s) => return Some(Free(&self.string[s..afteri])), | ||||
|                     _ => self.state = State::Free(afteri), | ||||
|                 } | ||||
|             else if !self.state.is_quoted() && !c.is_alphanumeric() { | ||||
|             } else if !self.state.is_quoted() && !c.is_alphanumeric() { | ||||
|                 match self.state.replace_by(State::Free(afteri)) { | ||||
|                     State::Free(s) if i > s => return Some(Free(&self.string[s..i])), | ||||
|                     _ => self.state = State::Free(afteri), | ||||
| @@ -83,6 +90,15 @@ mod tests { | ||||
|     use super::*; | ||||
|     use QueryToken::{Quoted, Free}; | ||||
|  | ||||
|     #[test] | ||||
|     fn empty() { | ||||
|         let mut iter = QueryTokens::new(""); | ||||
|         assert_eq!(iter.next(), None); | ||||
|  | ||||
|         let mut iter = QueryTokens::new(" "); | ||||
|         assert_eq!(iter.next(), None); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn one_quoted_string() { | ||||
|         let mut iter = QueryTokens::new("\"hello\""); | ||||
| @@ -154,4 +170,14 @@ mod tests { | ||||
|         assert_eq!(iter.next(), Some(Quoted("monde est beau"))); | ||||
|         assert_eq!(iter.next(), None); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn chinese() { | ||||
|         let mut iter = QueryTokens::new("汽车男生"); | ||||
|         assert_eq!(iter.next(), Some(Free("汽"))); | ||||
|         assert_eq!(iter.next(), Some(Free("车"))); | ||||
|         assert_eq!(iter.next(), Some(Free("男"))); | ||||
|         assert_eq!(iter.next(), Some(Free("生"))); | ||||
|         assert_eq!(iter.next(), None); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -217,6 +217,10 @@ impl<'a> Search<'a> { | ||||
|             None => return Ok(Default::default()), | ||||
|         }; | ||||
|  | ||||
|         if dfas.is_empty() { | ||||
|             return Ok(Default::default()); | ||||
|         } | ||||
|  | ||||
|         let (derived_words, union_positions) = Self::fetch_words_positions(rtxn, index, &fst, dfas)?; | ||||
|         let candidates = Self::compute_candidates(rtxn, index, &derived_words)?; | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user