mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 21:16:28 +00:00 
			
		
		
		
	Introduce the query words mapping along with the query tree
This commit is contained in:
		
							
								
								
									
										10
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										10
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -799,6 +799,14 @@ dependencies = [ | |||||||
|  "serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)", |  "serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "intervaltree" | ||||||
|  | version = "0.2.4" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | dependencies = [ | ||||||
|  |  "smallvec 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "iovec" | name = "iovec" | ||||||
| version = "0.1.4" | version = "0.1.4" | ||||||
| @@ -952,6 +960,7 @@ dependencies = [ | |||||||
|  "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", |  "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  "heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", |  "heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", |  "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  |  "intervaltree 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  "itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)", |  "itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  "jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", |  "jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", |  "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
| @@ -2715,6 +2724,7 @@ dependencies = [ | |||||||
| "checksum idna 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "38f09e0f0b1fb55fdee1f17470ad800da77af5186a1a76c026b679358b7e844e" | "checksum idna 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "38f09e0f0b1fb55fdee1f17470ad800da77af5186a1a76c026b679358b7e844e" | ||||||
| "checksum idna 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9" | "checksum idna 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9" | ||||||
| "checksum indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712d7b3ea5827fcb9d4fda14bf4da5f136f0db2ae9c8f4bd4e2d1c6fde4e6db2" | "checksum indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712d7b3ea5827fcb9d4fda14bf4da5f136f0db2ae9c8f4bd4e2d1c6fde4e6db2" | ||||||
|  | "checksum intervaltree 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "af39074dd8d5eff756ddea3d8f34c7ae287d4dadb6f29fb1b67ca6b3f5036482" | ||||||
| "checksum iovec 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" | "checksum iovec 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" | ||||||
| "checksum itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484" | "checksum itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484" | ||||||
| "checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f" | "checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f" | ||||||
|   | |||||||
| @@ -17,6 +17,7 @@ env_logger = "0.7.0" | |||||||
| fst = { version = "0.3.5", default-features = false } | fst = { version = "0.3.5", default-features = false } | ||||||
| hashbrown = { version = "0.6.0", features = ["serde"] } | hashbrown = { version = "0.6.0", features = ["serde"] } | ||||||
| heed = "0.6.1" | heed = "0.6.1" | ||||||
|  | intervaltree = "0.2.4" | ||||||
| itertools = "0.8.2" # kill me please | itertools = "0.8.2" # kill me please | ||||||
| levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } | levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } | ||||||
| log = "0.4.8" | log = "0.4.8" | ||||||
|   | |||||||
| @@ -61,8 +61,9 @@ where | |||||||
|         prefix_postings_lists: prefix_postings_lists_cache_store, |         prefix_postings_lists: prefix_postings_lists_cache_store, | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     let operation = create_query_tree(reader, &context, query).unwrap(); |     let (operation, mapping) = create_query_tree(reader, &context, query).unwrap(); | ||||||
|     println!("{:?}", operation); |     println!("{:?}", operation); | ||||||
|  |     println!("{:?}", mapping); | ||||||
|  |  | ||||||
|     let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap(); |     let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap(); | ||||||
|     println!("found {} documents", docids.len()); |     println!("found {} documents", docids.len()); | ||||||
|   | |||||||
| @@ -11,6 +11,7 @@ mod levenshtein; | |||||||
| mod number; | mod number; | ||||||
| mod query_builder; | mod query_builder; | ||||||
| mod query_tree; | mod query_tree; | ||||||
|  | mod query_words_mapper; | ||||||
| mod ranked_map; | mod ranked_map; | ||||||
| mod raw_document; | mod raw_document; | ||||||
| mod reordered_attrs; | mod reordered_attrs; | ||||||
| @@ -28,6 +29,7 @@ pub use self::raw_document::RawDocument; | |||||||
| pub use self::store::Index; | pub use self::store::Index; | ||||||
| pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType}; | pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType}; | ||||||
| pub use meilisearch_types::{DocIndex, DocumentId, Highlight}; | pub use meilisearch_types::{DocIndex, DocumentId, Highlight}; | ||||||
|  | pub use query_words_mapper::QueryWordsMapper; | ||||||
|  |  | ||||||
| use compact_arena::SmallArena; | use compact_arena::SmallArena; | ||||||
| use crate::bucket_sort::{QueryWordAutomaton, PostingsListView}; | use crate::bucket_sort::{QueryWordAutomaton, PostingsListView}; | ||||||
|   | |||||||
| @@ -1,5 +1,7 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
| use std::collections::HashMap; | use std::collections::HashMap; | ||||||
|  | use std::hash::{Hash, Hasher}; | ||||||
|  | use std::ops::Range; | ||||||
| use std::time::Instant; | use std::time::Instant; | ||||||
| use std::{cmp, fmt, iter::once}; | use std::{cmp, fmt, iter::once}; | ||||||
|  |  | ||||||
| @@ -11,8 +13,9 @@ use fst::{IntoStreamer, Streamer}; | |||||||
| use crate::database::MainT; | use crate::database::MainT; | ||||||
| use crate::{store, DocumentId, DocIndex, MResult}; | use crate::{store, DocumentId, DocIndex, MResult}; | ||||||
| use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; | use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa}; | ||||||
|  | use crate::QueryWordsMapper; | ||||||
|  |  | ||||||
| #[derive(PartialEq, Eq, PartialOrd, Ord, Hash)] | #[derive(Clone, PartialEq, Eq, Hash)] | ||||||
| pub enum Operation { | pub enum Operation { | ||||||
|     And(Vec<Operation>), |     And(Vec<Operation>), | ||||||
|     Or(Vec<Operation>), |     Or(Vec<Operation>), | ||||||
| @@ -39,36 +42,49 @@ impl fmt::Debug for Operation { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | impl Operation { | ||||||
|  |     fn tolerant(id: QueryId, prefix: bool, s: &str) -> Operation { | ||||||
|  |         Operation::Query(Query { id, prefix, kind: QueryKind::Tolerant(s.to_string()) }) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn exact(id: QueryId, prefix: bool, s: &str) -> Operation { | ||||||
|  |         Operation::Query(Query { id, prefix, kind: QueryKind::Exact(s.to_string()) }) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn phrase2(id: QueryId, prefix: bool, (left, right): (&str, &str)) -> Operation { | ||||||
|  |         Operation::Query(Query { id, prefix, kind: QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]) }) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| pub type QueryId = usize; | pub type QueryId = usize; | ||||||
|  |  | ||||||
| #[derive(PartialEq, Eq, PartialOrd, Ord, Hash)] | #[derive(Clone, Eq)] | ||||||
| pub struct Query { | pub struct Query { | ||||||
|     pub id: QueryId, |     pub id: QueryId, | ||||||
|     pub prefix: bool, |     pub prefix: bool, | ||||||
|     pub kind: QueryKind, |     pub kind: QueryKind, | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(PartialEq, Eq, PartialOrd, Ord, Hash)] | impl PartialEq for Query { | ||||||
|  |     fn eq(&self, other: &Self) -> bool { | ||||||
|  |         self.prefix == other.prefix && self.kind == other.kind | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl Hash for Query { | ||||||
|  |     fn hash<H: Hasher>(&self, state: &mut H) { | ||||||
|  |         self.prefix.hash(state); | ||||||
|  |         self.kind.hash(state); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[derive(Clone, PartialEq, Eq, Hash)] | ||||||
| pub enum QueryKind { | pub enum QueryKind { | ||||||
|     Tolerant(String), |     Tolerant(String), | ||||||
|     Exact(String), |     Exact(String), | ||||||
|     Phrase(Vec<String>), |     Phrase(Vec<String>), | ||||||
| } | } | ||||||
|  |  | ||||||
| impl Query { |  | ||||||
|     fn tolerant(id: QueryId, prefix: bool, s: &str) -> Query { |  | ||||||
|         Query { id, prefix, kind: QueryKind::Tolerant(s.to_string()) } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn exact(id: QueryId, prefix: bool, s: &str) -> Query { |  | ||||||
|         Query { id, prefix, kind: QueryKind::Exact(s.to_string()) } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn phrase2(id: QueryId, prefix: bool, (left, right): (&str, &str)) -> Query { |  | ||||||
|         Query { id, prefix, kind: QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]) } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl fmt::Debug for Query { | impl fmt::Debug for Query { | ||||||
|     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||||||
|         let Query { id, prefix, kind } = self; |         let Query { id, prefix, kind } = self; | ||||||
| @@ -151,54 +167,88 @@ where I: IntoIterator<Item=Operation>, | |||||||
|  |  | ||||||
| const MAX_NGRAM: usize = 3; | const MAX_NGRAM: usize = 3; | ||||||
|  |  | ||||||
| pub fn create_query_tree(reader: &heed::RoTxn<MainT>, ctx: &Context, query: &str) -> MResult<Operation> { | pub fn create_query_tree( | ||||||
|  |     reader: &heed::RoTxn<MainT>, | ||||||
|  |     ctx: &Context, | ||||||
|  |     query: &str, | ||||||
|  | ) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)> | ||||||
|  | { | ||||||
|     let query = query.to_lowercase(); |     let query = query.to_lowercase(); | ||||||
|  |  | ||||||
|     let words = query.linear_group_by_key(char::is_whitespace).map(ToOwned::to_owned); |     let words = query.linear_group_by_key(char::is_whitespace).map(ToOwned::to_owned); | ||||||
|     let words = words.filter(|s| !s.contains(char::is_whitespace)).enumerate(); |     let words: Vec<_> = words.filter(|s| !s.contains(char::is_whitespace)).enumerate().collect(); | ||||||
|     let words: Vec<_> = words.collect(); |  | ||||||
|  |  | ||||||
|  |     let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w)); | ||||||
|     let mut ngrams = Vec::new(); |     let mut ngrams = Vec::new(); | ||||||
|     for ngram in 1..=MAX_NGRAM { |     for ngram in 1..=MAX_NGRAM { | ||||||
|  |  | ||||||
|         let ngiter = words.windows(ngram).enumerate().map(|(i, group)| { |         let ngiter = words.windows(ngram).enumerate().map(|(i, group)| { | ||||||
|             let before = words[..i].windows(1); |             let before = words[0..i].windows(1).enumerate().map(|(i, g)| (i..i+1, g)); | ||||||
|             let after = words[i + ngram..].windows(1); |             let after = words[i + ngram..].windows(1) | ||||||
|             before.chain(Some(group)).chain(after) |                 .enumerate() | ||||||
|  |                 .map(move |(j, g)| (i + j + ngram..i + j + ngram + 1, g)); | ||||||
|  |             before.chain(Some((i..i + ngram, group))).chain(after) | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         for group in ngiter { |         for group in ngiter { | ||||||
|             let mut ops = Vec::new(); |  | ||||||
|  |  | ||||||
|             for (is_last, words) in is_last(group) { |             let mut ops = Vec::new(); | ||||||
|  |             for (is_last, (range, words)) in is_last(group) { | ||||||
|  |  | ||||||
|                 let mut alts = Vec::new(); |                 let mut alts = Vec::new(); | ||||||
|                 match words { |                 match words { | ||||||
|                     [(id, word)] => { |                     [(id, word)] => { | ||||||
|                         let phrase = split_best_frequency(reader, ctx, word)? |                         let mut idgen = ((id + 1) * 100)..; | ||||||
|                             .map(|ws| Query::phrase2(*id, is_last, ws)) |  | ||||||
|                             .map(Operation::Query); |                         let phrase = split_best_frequency(reader, ctx, word)? | ||||||
|  |                             .map(|ws| { | ||||||
|  |                                 let id = idgen.next().unwrap(); | ||||||
|  |                                 idgen.next().unwrap(); | ||||||
|  |                                 mapper.declare(range.clone(), id, &[ws.0, ws.1]); | ||||||
|  |                                 Operation::phrase2(id, is_last, ws) | ||||||
|  |                             }); | ||||||
|  |  | ||||||
|  |                         let synonyms = fetch_synonyms(reader, ctx, &[word])? | ||||||
|  |                             .into_iter() | ||||||
|  |                             .map(|alts| { | ||||||
|  |                                 let id = idgen.next().unwrap(); | ||||||
|  |                                 mapper.declare(range.clone(), id, &alts); | ||||||
|  |  | ||||||
|  |                                 let mut idgen = once(id).chain(&mut idgen); | ||||||
|  |                                 let iter = alts.into_iter().map(|w| { | ||||||
|  |                                     let id = idgen.next().unwrap(); | ||||||
|  |                                     Operation::exact(id, false, &w) | ||||||
|  |                                 }); | ||||||
|  |  | ||||||
|                         let synonyms = fetch_synonyms(reader, ctx, &[word])?.into_iter().map(|alts| { |  | ||||||
|                             let iter = alts.into_iter().map(|w| Query::exact(*id, false, &w)).map(Operation::Query); |  | ||||||
|                                 create_operation(iter, Operation::And) |                                 create_operation(iter, Operation::And) | ||||||
|                             }); |                             }); | ||||||
|  |  | ||||||
|                         let query = Query::tolerant(*id, is_last, word); |                         let query = Operation::tolerant(*id, is_last, word); | ||||||
|  |  | ||||||
|                         alts.push(Operation::Query(query)); |                         alts.push(query); | ||||||
|                         alts.extend(synonyms.chain(phrase)); |                         alts.extend(synonyms.chain(phrase)); | ||||||
|                     }, |                     }, | ||||||
|                     words => { |                     words => { | ||||||
|                         let id = words[0].0; |                         let id = words[0].0; | ||||||
|  |                         let mut idgen = ((id + 1) * 100_usize.pow(ngram as u32))..; | ||||||
|  |  | ||||||
|                         let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect(); |                         let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect(); | ||||||
|  |  | ||||||
|                         for synonym in fetch_synonyms(reader, ctx, &words)? { |                         for synonym in fetch_synonyms(reader, ctx, &words)? { | ||||||
|                             let synonym = synonym.into_iter().map(|s| Operation::Query(Query::exact(id, false, &s))); |                             let id = idgen.next().unwrap(); | ||||||
|                             let synonym = create_operation(synonym, Operation::And); |                             mapper.declare(range.clone(), id, &synonym); | ||||||
|                             alts.push(synonym); |  | ||||||
|  |                             let mut idgen = once(id).chain(&mut idgen); | ||||||
|  |                             let synonym = synonym.into_iter().map(|s| { | ||||||
|  |                                 let id = idgen.next().unwrap(); | ||||||
|  |                                 Operation::exact(id, false, &s) | ||||||
|  |                             }); | ||||||
|  |                             alts.push(create_operation(synonym, Operation::And)); | ||||||
|                         } |                         } | ||||||
|  |  | ||||||
|                         let query = Query::exact(id, is_last, &words.concat()); |                         let id = idgen.next().unwrap(); | ||||||
|                         alts.push(Operation::Query(query)); |                         let concat = words.concat(); | ||||||
|  |                         alts.push(Operation::exact(id, is_last, &concat)); | ||||||
|  |                         mapper.declare(range.clone(), id, &[concat]); | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
| @@ -210,7 +260,10 @@ pub fn create_query_tree(reader: &heed::RoTxn<MainT>, ctx: &Context, query: &str | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     Ok(create_operation(ngrams, Operation::Or)) |     let mapping = mapper.mapping(); | ||||||
|  |     let operation = create_operation(ngrams, Operation::Or); | ||||||
|  |  | ||||||
|  |     Ok((operation, mapping)) | ||||||
| } | } | ||||||
|  |  | ||||||
| pub type Postings<'o, 'txn> = HashMap<(&'o Query, Vec<u8>), Cow<'txn, Set<DocIndex>>>; | pub type Postings<'o, 'txn> = HashMap<(&'o Query, Vec<u8>), Cow<'txn, Set<DocIndex>>>; | ||||||
|   | |||||||
							
								
								
									
										415
									
								
								meilisearch-core/src/query_words_mapper.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										415
									
								
								meilisearch-core/src/query_words_mapper.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,415 @@ | |||||||
|  | use std::collections::HashMap; | ||||||
|  | use std::iter::FromIterator; | ||||||
|  | use std::ops::Range; | ||||||
|  | use intervaltree::{Element, IntervalTree}; | ||||||
|  |  | ||||||
/// Identifier of a query word: either the index of an original word or an
/// id handed to [`QueryWordsMapper::declare`] for a replacement word.
pub type QueryId = usize;

/// Records which original query words each declared replacement stands for,
/// so that every query id can later be mapped to a range of word positions.
#[derive(Debug, Clone)]
pub struct QueryWordsMapper {
    /// The words of the query exactly as they were given to `new`.
    originals: Vec<String>,
    /// For each declared id: the range of original words it covers and the
    /// replacement words associated with that range.
    mappings: HashMap<QueryId, (Range<usize>, Vec<String>)>,
}
|  |  | ||||||
|  | impl QueryWordsMapper { | ||||||
|  |     pub fn new<I, A>(originals: I) -> QueryWordsMapper | ||||||
|  |     where I: IntoIterator<Item = A>, | ||||||
|  |           A: ToString, | ||||||
|  |     { | ||||||
|  |         let originals = originals.into_iter().map(|s| s.to_string()).collect(); | ||||||
|  |         QueryWordsMapper { originals, mappings: HashMap::new() } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn declare<I, A>(&mut self, range: Range<usize>, id: QueryId, replacement: I) | ||||||
|  |     where I: IntoIterator<Item = A>, | ||||||
|  |           A: ToString, | ||||||
|  |     { | ||||||
|  |         assert!(range.len() != 0); | ||||||
|  |         assert!(self.originals.get(range.clone()).is_some()); | ||||||
|  |         assert!(id >= self.originals.len()); | ||||||
|  |  | ||||||
|  |         let replacement: Vec<_> = replacement.into_iter().map(|s| s.to_string()).collect(); | ||||||
|  |  | ||||||
|  |         assert!(!replacement.is_empty()); | ||||||
|  |  | ||||||
|  |         // We detect words at the end and at the front of the | ||||||
|  |         // replacement that are common with the originals: | ||||||
|  |         // | ||||||
|  |         //     x a b c d e f g | ||||||
|  |         //       ^^^/   \^^^ | ||||||
|  |         //     a b x c d k j e f | ||||||
|  |         //     ^^^           ^^^ | ||||||
|  |         // | ||||||
|  |  | ||||||
|  |         let left = &self.originals[..range.start]; | ||||||
|  |         let right = &self.originals[range.end..]; | ||||||
|  |  | ||||||
|  |         let common_left = longest_common_prefix(left, &replacement); | ||||||
|  |         let common_right = longest_common_prefix(&replacement, right); | ||||||
|  |  | ||||||
|  |         for i in 0..common_left { | ||||||
|  |             let range = range.start - common_left + i..range.start - common_left + i + 1; | ||||||
|  |             let replacement = vec![replacement[i].clone()]; | ||||||
|  |             self.mappings.insert(id + i, (range, replacement)); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         { | ||||||
|  |             let replacement = replacement[common_left..replacement.len() - common_right].iter().cloned().collect(); | ||||||
|  |             self.mappings.insert(id + common_left, (range.clone(), replacement)); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         for i in 0..common_right { | ||||||
|  |             let id = id + replacement.len() - common_right + i; | ||||||
|  |             let range = range.end + i..range.end + i + 1; | ||||||
|  |             let replacement = vec![replacement[replacement.len() - common_right + i].clone()]; | ||||||
|  |             self.mappings.insert(id, (range, replacement)); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn mapping(self) -> HashMap<QueryId, Range<usize>> { | ||||||
|  |         let mappings = self.mappings.into_iter().map(|(i, (r, v))| (r, (i, v))); | ||||||
|  |         let intervals = IntervalTree::from_iter(mappings); | ||||||
|  |  | ||||||
|  |         let mut output = HashMap::new(); | ||||||
|  |         let mut offset = 0; | ||||||
|  |  | ||||||
|  |         // We map each original word to the biggest number of | ||||||
|  |         // associated words. | ||||||
|  |         for i in 0..self.originals.len() { | ||||||
|  |             let max = intervals.query_point(i) | ||||||
|  |                 .filter_map(|e| { | ||||||
|  |                     if e.range.end - 1 == i { | ||||||
|  |                         let len = e.value.1.iter().skip(i - e.range.start).count(); | ||||||
|  |                         if len != 0 { Some(len) } else { None } | ||||||
|  |                     } else { None } | ||||||
|  |                 }) | ||||||
|  |                 .max() | ||||||
|  |                 .unwrap_or(1); | ||||||
|  |  | ||||||
|  |             let range = i + offset..i + offset + max; | ||||||
|  |             output.insert(i, range); | ||||||
|  |             offset += max - 1; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // We retrieve the range that each original word | ||||||
|  |         // is mapped to and apply it to each of the words. | ||||||
|  |         for i in 0..self.originals.len() { | ||||||
|  |  | ||||||
|  |             let iter = intervals.query_point(i).filter(|e| e.range.end - 1 == i); | ||||||
|  |             for Element { range, value: (id, words) } in iter { | ||||||
|  |  | ||||||
|  |                 // We ask for the complete range mapped to the area we map. | ||||||
|  |                 let start = output.get(&range.start).map(|r| r.start).unwrap_or(range.start); | ||||||
|  |                 let end = output.get(&(range.end - 1)).map(|r| r.end).unwrap_or(range.end); | ||||||
|  |                 let range = start..end; | ||||||
|  |  | ||||||
|  |                 // We map each query id to one word until the last, | ||||||
|  |                 // we map it to the remainings words. | ||||||
|  |                 let add = range.len() - words.len(); | ||||||
|  |                 for (j, x) in range.take(words.len()).enumerate() { | ||||||
|  |                     let add = if j == words.len() - 1 { add } else { 0 }; // is last? | ||||||
|  |                     let range = x..x + 1 + add; | ||||||
|  |                     output.insert(id + j, range); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         output | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
/// Returns how many words at the end of `a` are also the first words of `b`.
///
/// Suffixes of `a` are tried from the shortest to the longest and the scan
/// stops as soon as a longer suffix does not match more words of `b` than the
/// previous one did. A suffix only counts when it matches `b` entirely:
/// a match that stops before the end of `a` is not an overlap between the two
/// sequences and reporting it would pair unrelated words together.
///
/// Returns `0` when `a` is empty or when no suffix of `a` starts `b`.
fn longest_common_prefix<T: Eq + std::fmt::Debug>(a: &[T], b: &[T]) -> usize {
    let mut best = 0;
    let mut previous: Option<usize> = None;

    for i in (0..a.len()).rev() {
        let suffix = &a[i..];
        let count = suffix.iter().zip(b).take_while(|(x, y)| x == y).count();

        // Going further back is pointless once the match stops growing.
        if let Some(old) = previous {
            if count <= old {
                break;
            }
        }
        previous = Some(count);

        // Keep the candidate only when the whole suffix was matched.
        if count == suffix.len() {
            best = count;
        }
    }

    best
}
|  |  | ||||||
|  | #[cfg(test)] | ||||||
|  | mod tests { | ||||||
|  |     use super::*; | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn original_unmodified() { | ||||||
|  |         let query = ["new", "york", "city", "subway"]; | ||||||
|  |         //             0       1       2        3 | ||||||
|  |         let mut builder = QueryWordsMapper::new(&query); | ||||||
|  |  | ||||||
|  |         // new york = new york city | ||||||
|  |         builder.declare(0..2, 4, &["new", "york", "city"]); | ||||||
|  |         //                    ^      4       5       6 | ||||||
|  |  | ||||||
|  |         // new = new york city | ||||||
|  |         builder.declare(0..1, 7, &["new", "york", "city"]); | ||||||
|  |         //                    ^      7       8       9 | ||||||
|  |  | ||||||
|  |         let mapping = builder.mapping(); | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&0], 0..1); // new | ||||||
|  |         assert_eq!(mapping[&1], 1..2); // york | ||||||
|  |         assert_eq!(mapping[&2], 2..3); // city | ||||||
|  |         assert_eq!(mapping[&3], 3..4); // subway | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&4], 0..1); // new | ||||||
|  |         assert_eq!(mapping[&5], 1..2); // york | ||||||
|  |         assert_eq!(mapping[&6], 2..3); // city | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&7], 0..1); // new | ||||||
|  |         assert_eq!(mapping[&8], 1..2); // york | ||||||
|  |         assert_eq!(mapping[&9], 2..3); // city | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn original_unmodified2() { | ||||||
|  |         let query = ["new", "york", "city", "subway"]; | ||||||
|  |         //             0       1       2        3 | ||||||
|  |         let mut builder = QueryWordsMapper::new(&query); | ||||||
|  |  | ||||||
|  |         // city subway = new york city underground train | ||||||
|  |         builder.declare(2..4, 4, &["new", "york", "city", "underground", "train"]); | ||||||
|  |         //                    ^      4      5       6           7           8 | ||||||
|  |  | ||||||
|  |         let mapping = builder.mapping(); | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&0], 0..1); // new | ||||||
|  |         assert_eq!(mapping[&1], 1..2); // york | ||||||
|  |         assert_eq!(mapping[&2], 2..3); // city | ||||||
|  |         assert_eq!(mapping[&3], 3..5); // subway | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&4], 0..1); // new | ||||||
|  |         assert_eq!(mapping[&5], 1..2); // york | ||||||
|  |         assert_eq!(mapping[&6], 2..3); // city | ||||||
|  |         assert_eq!(mapping[&7], 3..4); // underground | ||||||
|  |         assert_eq!(mapping[&8], 4..5); // train | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn original_unmodified3() { | ||||||
|  |         let query = ["a", "b", "x", "x", "a", "b", "c", "d", "e", "f", "g"]; | ||||||
|  |         //            0    1    2    3    4    5    6    7    8    9    10 | ||||||
|  |         let mut builder = QueryWordsMapper::new(&query); | ||||||
|  |  | ||||||
|  |         // c d = a b x c d k j e f | ||||||
|  |         builder.declare(6..8, 11, &["a", "b", "x", "c", "d", "k", "j", "e", "f"]); | ||||||
|  |         //                    ^^    11   12   13   14   15   16   17   18   19 | ||||||
|  |  | ||||||
|  |         let mapping = builder.mapping(); | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&0],  0..1); // a | ||||||
|  |         assert_eq!(mapping[&1],  1..2); // b | ||||||
|  |         assert_eq!(mapping[&2],  2..3); // x | ||||||
|  |         assert_eq!(mapping[&3],  3..4); // x | ||||||
|  |         assert_eq!(mapping[&4],  4..5); // a | ||||||
|  |         assert_eq!(mapping[&5],  5..6); // b | ||||||
|  |         assert_eq!(mapping[&6],  6..7); // c | ||||||
|  |         assert_eq!(mapping[&7],  7..11); // d | ||||||
|  |         assert_eq!(mapping[&8],  11..12); // e | ||||||
|  |         assert_eq!(mapping[&9],  12..13); // f | ||||||
|  |         assert_eq!(mapping[&10], 13..14); // g | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&11], 4..5); // a | ||||||
|  |         assert_eq!(mapping[&12], 5..6); // b | ||||||
|  |         assert_eq!(mapping[&13], 6..7); // x | ||||||
|  |         assert_eq!(mapping[&14], 7..8); // c | ||||||
|  |         assert_eq!(mapping[&15], 8..9); // d | ||||||
|  |         assert_eq!(mapping[&16], 9..10); // k | ||||||
|  |         assert_eq!(mapping[&17], 10..11); // j | ||||||
|  |         assert_eq!(mapping[&18], 11..12); // e | ||||||
|  |         assert_eq!(mapping[&19], 12..13); // f | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn simple_growing() { | ||||||
|  |         let query = ["new", "york", "subway"]; | ||||||
|  |         //             0       1        2 | ||||||
|  |         let mut builder = QueryWordsMapper::new(&query); | ||||||
|  |  | ||||||
|  |         // new york = new york city (two query words replaced by three: the query grows by one position) | ||||||
|  |         builder.declare(0..2, 3, &["new", "york", "city"]); | ||||||
|  |         //                    ^      3       4       5 | ||||||
|  |  | ||||||
|  |         let mapping = builder.mapping(); | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&0], 0..1); // new | ||||||
|  |         assert_eq!(mapping[&1], 1..3); // york, expected to span two positions so it lines up with "york city" | ||||||
|  |         assert_eq!(mapping[&2], 3..4); // subway, expected to be shifted right by one | ||||||
|  |         assert_eq!(mapping[&3], 0..1); // new (synonym word) | ||||||
|  |         assert_eq!(mapping[&4], 1..2); // york (synonym word) | ||||||
|  |         assert_eq!(mapping[&5], 2..3); // city (synonym word) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn same_place_growings() { | ||||||
|  |         let query = ["NY", "subway"]; | ||||||
|  |         //             0       1 | ||||||
|  |         let mut builder = QueryWordsMapper::new(&query); | ||||||
|  |  | ||||||
|  |         // NY = new york (first of several replacements declared on the same span 0..1) | ||||||
|  |         builder.declare(0..1, 2, &["new", "york"]); | ||||||
|  |         //                    ^      2       3 | ||||||
|  |  | ||||||
|  |         // NY = new york city (three words: the longest replacement declared for this span) | ||||||
|  |         builder.declare(0..1, 4, &["new", "york", "city"]); | ||||||
|  |         //                    ^      4       5       6 | ||||||
|  |  | ||||||
|  |         // NY = NYC (a one-word replacement on the same span) | ||||||
|  |         builder.declare(0..1, 7, &["NYC"]); | ||||||
|  |         //                    ^      7 | ||||||
|  |  | ||||||
|  |         // NY = new york city (same words as above, declared again under new ids) | ||||||
|  |         builder.declare(0..1, 8, &["new", "york", "city"]); | ||||||
|  |         //                    ^      8       9      10 | ||||||
|  |  | ||||||
|  |         // subway = underground train | ||||||
|  |         builder.declare(1..2, 11, &["underground", "train"]); | ||||||
|  |         //                    ^          11          12 | ||||||
|  |  | ||||||
|  |         let mapping = builder.mapping(); | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&0], 0..3); // NY, expected to span three positions (the longest replacement) | ||||||
|  |         assert_eq!(mapping[&1], 3..5); // subway, expected to span two positions after the grown NY | ||||||
|  |         assert_eq!(mapping[&2], 0..1); // new | ||||||
|  |         assert_eq!(mapping[&3], 1..3); // york, last word of the two-word replacement: expected to fill up to position 3 | ||||||
|  |         assert_eq!(mapping[&4], 0..1); // new | ||||||
|  |         assert_eq!(mapping[&5], 1..2); // york | ||||||
|  |         assert_eq!(mapping[&6], 2..3); // city | ||||||
|  |         assert_eq!(mapping[&7], 0..3); // NYC, single word expected to cover the whole grown span | ||||||
|  |         assert_eq!(mapping[&8], 0..1); // new | ||||||
|  |         assert_eq!(mapping[&9], 1..2); // york | ||||||
|  |         assert_eq!(mapping[&10], 2..3); // city | ||||||
|  |         assert_eq!(mapping[&11], 3..4); // underground | ||||||
|  |         assert_eq!(mapping[&12], 4..5); // train | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn bigger_growing() { | ||||||
|  |         let query = ["NYC", "subway"]; | ||||||
|  |         //             0        1 | ||||||
|  |         let mut builder = QueryWordsMapper::new(&query); | ||||||
|  |  | ||||||
|  |         // NYC = new york city (one query word replaced by three: the query grows by two positions) | ||||||
|  |         builder.declare(0..1, 2, &["new", "york", "city"]); | ||||||
|  |         //                    ^      2       3       4 | ||||||
|  |  | ||||||
|  |         let mapping = builder.mapping(); | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&0], 0..3); // NYC, expected to span all three replacement positions | ||||||
|  |         assert_eq!(mapping[&1], 3..4); // subway, expected to be shifted right by two | ||||||
|  |         assert_eq!(mapping[&2], 0..1); // new | ||||||
|  |         assert_eq!(mapping[&3], 1..2); // york | ||||||
|  |         assert_eq!(mapping[&4], 2..3); // city | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn middle_query_growing() { | ||||||
|  |         let query = ["great", "awesome", "NYC", "subway"]; | ||||||
|  |         //              0         1        2        3 | ||||||
|  |         let mut builder = QueryWordsMapper::new(&query); | ||||||
|  |  | ||||||
|  |         // NYC = new york city (the growing word sits in the middle of the query) | ||||||
|  |         builder.declare(2..3, 4, &["new", "york", "city"]); | ||||||
|  |         //                    ^      4       5       6 | ||||||
|  |  | ||||||
|  |         let mapping = builder.mapping(); | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&0], 0..1); // great, before the grown span: expected unchanged | ||||||
|  |         assert_eq!(mapping[&1], 1..2); // awesome, before the grown span: expected unchanged | ||||||
|  |         assert_eq!(mapping[&2], 2..5); // NYC, expected to span three positions | ||||||
|  |         assert_eq!(mapping[&3], 5..6); // subway, expected to be shifted right by two | ||||||
|  |         assert_eq!(mapping[&4], 2..3); // new | ||||||
|  |         assert_eq!(mapping[&5], 3..4); // york | ||||||
|  |         assert_eq!(mapping[&6], 4..5); // city | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn end_query_growing() { | ||||||
|  |         let query = ["NYC", "subway"]; | ||||||
|  |         //             0        1 | ||||||
|  |         let mut builder = QueryWordsMapper::new(&query); | ||||||
|  |  | ||||||
|  |         // subway = underground train (the growing word is the last word of the query) | ||||||
|  |         builder.declare(1..2, 2, &["underground", "train"]); | ||||||
|  |         //                    ^         2            3 | ||||||
|  |  | ||||||
|  |         let mapping = builder.mapping(); | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&0], 0..1); // NYC, before the grown span: expected unchanged | ||||||
|  |         assert_eq!(mapping[&1], 1..3); // subway, expected to span both replacement positions | ||||||
|  |         assert_eq!(mapping[&2], 1..2); // underground | ||||||
|  |         assert_eq!(mapping[&3], 2..3); // train | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn multiple_growings() { | ||||||
|  |         let query = ["great", "awesome", "NYC", "subway"]; | ||||||
|  |         //              0         1        2        3 | ||||||
|  |         let mut builder = QueryWordsMapper::new(&query); | ||||||
|  |  | ||||||
|  |         // NYC = new york city (first of two adjacent growing words) | ||||||
|  |         builder.declare(2..3, 4, &["new", "york", "city"]); | ||||||
|  |         //                    ^      4       5       6 | ||||||
|  |  | ||||||
|  |         // subway = underground train (second growing word, right after NYC) | ||||||
|  |         builder.declare(3..4, 7, &["underground", "train"]); | ||||||
|  |         //                    ^          7           8 | ||||||
|  |  | ||||||
|  |         let mapping = builder.mapping(); | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&0], 0..1); // great | ||||||
|  |         assert_eq!(mapping[&1], 1..2); // awesome | ||||||
|  |         assert_eq!(mapping[&2], 2..5); // NYC, expected to span three positions | ||||||
|  |         assert_eq!(mapping[&3], 5..7); // subway, expected to start after the grown NYC and span two positions | ||||||
|  |         assert_eq!(mapping[&4], 2..3); // new | ||||||
|  |         assert_eq!(mapping[&5], 3..4); // york | ||||||
|  |         assert_eq!(mapping[&6], 4..5); // city | ||||||
|  |         assert_eq!(mapping[&7], 5..6); // underground | ||||||
|  |         assert_eq!(mapping[&8], 6..7); // train | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn multiple_probable_growings() { | ||||||
|  |         let query = ["great", "awesome", "NYC", "subway"]; | ||||||
|  |         //              0         1        2        3 | ||||||
|  |         let mut builder = QueryWordsMapper::new(&query); | ||||||
|  |  | ||||||
|  |         // NYC = new york city (one word replaced by three) | ||||||
|  |         builder.declare(2..3, 4, &["new", "york", "city"]); | ||||||
|  |         //                    ^      4       5       6 | ||||||
|  |  | ||||||
|  |         // subway = underground train (one word replaced by two) | ||||||
|  |         builder.declare(3..4, 7, &["underground", "train"]); | ||||||
|  |         //                    ^          7           8 | ||||||
|  |  | ||||||
|  |         // great awesome = good (two words replaced by one; span does not touch a growing word) | ||||||
|  |         builder.declare(0..2, 9, &["good"]); | ||||||
|  |         //                    ^       9 | ||||||
|  |  | ||||||
|  |         // awesome NYC = NY (two words replaced by one; span overlaps the growing word NYC) | ||||||
|  |         builder.declare(1..3, 10, &["NY"]); | ||||||
|  |         //                    ^^     10 | ||||||
|  |  | ||||||
|  |         // NYC subway = metro (two words replaced by one; span overlaps both growing words) | ||||||
|  |         builder.declare(2..4, 11, &["metro"]); | ||||||
|  |         //                    ^^      11 | ||||||
|  |  | ||||||
|  |         let mapping = builder.mapping(); | ||||||
|  |  | ||||||
|  |         assert_eq!(mapping[&0], 0..1); // great | ||||||
|  |         assert_eq!(mapping[&1], 1..2); // awesome | ||||||
|  |         assert_eq!(mapping[&2], 2..5); // NYC, expected to span three positions | ||||||
|  |         assert_eq!(mapping[&3], 5..7); // subway, expected to span two positions | ||||||
|  |         assert_eq!(mapping[&4], 2..3); // new | ||||||
|  |         assert_eq!(mapping[&5], 3..4); // york | ||||||
|  |         assert_eq!(mapping[&6], 4..5); // city | ||||||
|  |         assert_eq!(mapping[&7], 5..6); // underground | ||||||
|  |         assert_eq!(mapping[&8], 6..7); // train | ||||||
|  |         assert_eq!(mapping[&9], 0..2); // good, expected to cover "great awesome" | ||||||
|  |         assert_eq!(mapping[&10], 1..5); // NY, expected to cover "awesome" plus the grown NYC | ||||||
|  |         assert_eq!(mapping[&11], 2..7); // metro, expected to cover the grown NYC plus the grown subway | ||||||
|  |     } | ||||||
|  | } | ||||||
		Reference in New Issue
	
	Block a user