mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	Merge #4535
4535: Support Negative Keywords r=ManyTheFish a=Kerollmops This PR fixes #4422 by supporting `-` before any word in the query. The minus symbol `-`, from the ASCII table, is not the only character that can be considered the negative operator. You can see the two other matching characters under the `Based on "-" (U+002D)` section on [this Unicode reference website](https://www.compart.com/en/unicode/U+002D). It's important to notice the strange behavior when a query includes and excludes the same word; only the derivatives (synonyms and splits) will be kept: - If you input `progamer -progamer`, the engine will still search for `pro gamer`. - If you have the synonym `like = love` and you input `like -like`, it will still search for `love`. ## TODO - [x] Add analytics - [x] Add support to the `-` operator - [x] Make sure to support spaces around `-` well - [x] Support phrase negation - [x] Add tests Co-authored-by: Clément Renault <clement@meilisearch.com>
This commit is contained in:
		| @@ -583,6 +583,7 @@ pub struct SearchAggregator { | |||||||
|     total_received: usize, |     total_received: usize, | ||||||
|     total_succeeded: usize, |     total_succeeded: usize, | ||||||
|     total_degraded: usize, |     total_degraded: usize, | ||||||
|  |     total_used_negative_operator: usize, | ||||||
|     time_spent: BinaryHeap<usize>, |     time_spent: BinaryHeap<usize>, | ||||||
|  |  | ||||||
|     // sort |     // sort | ||||||
| @@ -763,12 +764,16 @@ impl SearchAggregator { | |||||||
|             facet_distribution: _, |             facet_distribution: _, | ||||||
|             facet_stats: _, |             facet_stats: _, | ||||||
|             degraded, |             degraded, | ||||||
|  |             used_negative_operator, | ||||||
|         } = result; |         } = result; | ||||||
|  |  | ||||||
|         self.total_succeeded = self.total_succeeded.saturating_add(1); |         self.total_succeeded = self.total_succeeded.saturating_add(1); | ||||||
|         if *degraded { |         if *degraded { | ||||||
|             self.total_degraded = self.total_degraded.saturating_add(1); |             self.total_degraded = self.total_degraded.saturating_add(1); | ||||||
|         } |         } | ||||||
|  |         if *used_negative_operator { | ||||||
|  |             self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1); | ||||||
|  |         } | ||||||
|         self.time_spent.push(*processing_time_ms as usize); |         self.time_spent.push(*processing_time_ms as usize); | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -811,6 +816,7 @@ impl SearchAggregator { | |||||||
|             embedder, |             embedder, | ||||||
|             hybrid, |             hybrid, | ||||||
|             total_degraded, |             total_degraded, | ||||||
|  |             total_used_negative_operator, | ||||||
|         } = other; |         } = other; | ||||||
|  |  | ||||||
|         if self.timestamp.is_none() { |         if self.timestamp.is_none() { | ||||||
| @@ -826,6 +832,8 @@ impl SearchAggregator { | |||||||
|         self.total_received = self.total_received.saturating_add(total_received); |         self.total_received = self.total_received.saturating_add(total_received); | ||||||
|         self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); |         self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); | ||||||
|         self.total_degraded = self.total_degraded.saturating_add(total_degraded); |         self.total_degraded = self.total_degraded.saturating_add(total_degraded); | ||||||
|  |         self.total_used_negative_operator = | ||||||
|  |             self.total_used_negative_operator.saturating_add(total_used_negative_operator); | ||||||
|         self.time_spent.append(time_spent); |         self.time_spent.append(time_spent); | ||||||
|  |  | ||||||
|         // sort |         // sort | ||||||
| @@ -932,6 +940,7 @@ impl SearchAggregator { | |||||||
|             embedder, |             embedder, | ||||||
|             hybrid, |             hybrid, | ||||||
|             total_degraded, |             total_degraded, | ||||||
|  |             total_used_negative_operator, | ||||||
|         } = self; |         } = self; | ||||||
|  |  | ||||||
|         if total_received == 0 { |         if total_received == 0 { | ||||||
| @@ -952,6 +961,7 @@ impl SearchAggregator { | |||||||
|                     "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panic |                     "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panic | ||||||
|                     "total_received": total_received, |                     "total_received": total_received, | ||||||
|                     "total_degraded": total_degraded, |                     "total_degraded": total_degraded, | ||||||
|  |                     "total_used_negative_operator": total_used_negative_operator, | ||||||
|                 }, |                 }, | ||||||
|                 "sort": { |                 "sort": { | ||||||
|                     "with_geoPoint": sort_with_geo_point, |                     "with_geoPoint": sort_with_geo_point, | ||||||
|   | |||||||
| @@ -324,9 +324,11 @@ pub struct SearchResult { | |||||||
|     #[serde(skip_serializing_if = "Option::is_none")] |     #[serde(skip_serializing_if = "Option::is_none")] | ||||||
|     pub facet_stats: Option<BTreeMap<String, FacetStats>>, |     pub facet_stats: Option<BTreeMap<String, FacetStats>>, | ||||||
|  |  | ||||||
|     // This information is only used for analytics purposes |     // These fields are only used for analytics purposes | ||||||
|     #[serde(skip)] |     #[serde(skip)] | ||||||
|     pub degraded: bool, |     pub degraded: bool, | ||||||
|  |     #[serde(skip)] | ||||||
|  |     pub used_negative_operator: bool, | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Serialize, Debug, Clone, PartialEq)] | #[derive(Serialize, Debug, Clone, PartialEq)] | ||||||
| @@ -512,6 +514,7 @@ pub fn perform_search( | |||||||
|         candidates, |         candidates, | ||||||
|         document_scores, |         document_scores, | ||||||
|         degraded, |         degraded, | ||||||
|  |         used_negative_operator, | ||||||
|         .. |         .. | ||||||
|     } = match &query.hybrid { |     } = match &query.hybrid { | ||||||
|         Some(hybrid) => match *hybrid.semantic_ratio { |         Some(hybrid) => match *hybrid.semantic_ratio { | ||||||
| @@ -717,6 +720,7 @@ pub fn perform_search( | |||||||
|         facet_distribution, |         facet_distribution, | ||||||
|         facet_stats, |         facet_stats, | ||||||
|         degraded, |         degraded, | ||||||
|  |         used_negative_operator, | ||||||
|     }; |     }; | ||||||
|     Ok(result) |     Ok(result) | ||||||
| } | } | ||||||
|   | |||||||
| @@ -185,6 +185,110 @@ async fn phrase_search_with_stop_word() { | |||||||
|         .await; |         .await; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #[actix_rt::test] | ||||||
|  | async fn negative_phrase_search() { | ||||||
|  |     let server = Server::new().await; | ||||||
|  |     let index = server.index("test"); | ||||||
|  |  | ||||||
|  |     let documents = DOCUMENTS.clone(); | ||||||
|  |     index.add_documents(documents, None).await; | ||||||
|  |     index.wait_task(0).await; | ||||||
|  |  | ||||||
|  |     index | ||||||
|  |         .search(json!({"q": "-\"train your dragon\"" }), |response, code| { | ||||||
|  |             assert_eq!(code, 200, "{}", response); | ||||||
|  |             let hits = response["hits"].as_array().unwrap(); | ||||||
|  |             assert_eq!(hits.len(), 4); | ||||||
|  |             assert_eq!(hits[0]["id"], "287947"); | ||||||
|  |             assert_eq!(hits[1]["id"], "299537"); | ||||||
|  |             assert_eq!(hits[2]["id"], "522681"); | ||||||
|  |             assert_eq!(hits[3]["id"], "450465"); | ||||||
|  |         }) | ||||||
|  |         .await; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[actix_rt::test] | ||||||
|  | async fn negative_word_search() { | ||||||
|  |     let server = Server::new().await; | ||||||
|  |     let index = server.index("test"); | ||||||
|  |  | ||||||
|  |     let documents = DOCUMENTS.clone(); | ||||||
|  |     index.add_documents(documents, None).await; | ||||||
|  |     index.wait_task(0).await; | ||||||
|  |  | ||||||
|  |     index | ||||||
|  |         .search(json!({"q": "-escape" }), |response, code| { | ||||||
|  |             assert_eq!(code, 200, "{}", response); | ||||||
|  |             let hits = response["hits"].as_array().unwrap(); | ||||||
|  |             assert_eq!(hits.len(), 4); | ||||||
|  |             assert_eq!(hits[0]["id"], "287947"); | ||||||
|  |             assert_eq!(hits[1]["id"], "299537"); | ||||||
|  |             assert_eq!(hits[2]["id"], "166428"); | ||||||
|  |             assert_eq!(hits[3]["id"], "450465"); | ||||||
|  |         }) | ||||||
|  |         .await; | ||||||
|  |  | ||||||
|  |     // Everything that contains derivatives of escape but not escape: nothing | ||||||
|  |     index | ||||||
|  |         .search(json!({"q": "-escape escape" }), |response, code| { | ||||||
|  |             assert_eq!(code, 200, "{}", response); | ||||||
|  |             let hits = response["hits"].as_array().unwrap(); | ||||||
|  |             assert_eq!(hits.len(), 0); | ||||||
|  |         }) | ||||||
|  |         .await; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[actix_rt::test] | ||||||
|  | async fn non_negative_search() { | ||||||
|  |     let server = Server::new().await; | ||||||
|  |     let index = server.index("test"); | ||||||
|  |  | ||||||
|  |     let documents = DOCUMENTS.clone(); | ||||||
|  |     index.add_documents(documents, None).await; | ||||||
|  |     index.wait_task(0).await; | ||||||
|  |  | ||||||
|  |     index | ||||||
|  |         .search(json!({"q": "- escape" }), |response, code| { | ||||||
|  |             assert_eq!(code, 200, "{}", response); | ||||||
|  |             let hits = response["hits"].as_array().unwrap(); | ||||||
|  |             assert_eq!(hits.len(), 1); | ||||||
|  |             assert_eq!(hits[0]["id"], "522681"); | ||||||
|  |         }) | ||||||
|  |         .await; | ||||||
|  |  | ||||||
|  |     index | ||||||
|  |         .search(json!({"q": "- \"train your dragon\"" }), |response, code| { | ||||||
|  |             assert_eq!(code, 200, "{}", response); | ||||||
|  |             let hits = response["hits"].as_array().unwrap(); | ||||||
|  |             assert_eq!(hits.len(), 1); | ||||||
|  |             assert_eq!(hits[0]["id"], "166428"); | ||||||
|  |         }) | ||||||
|  |         .await; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[actix_rt::test] | ||||||
|  | async fn negative_special_cases_search() { | ||||||
|  |     let server = Server::new().await; | ||||||
|  |     let index = server.index("test"); | ||||||
|  |  | ||||||
|  |     let documents = DOCUMENTS.clone(); | ||||||
|  |     index.add_documents(documents, None).await; | ||||||
|  |     index.wait_task(0).await; | ||||||
|  |  | ||||||
|  |     index.update_settings(json!({"synonyms": { "escape": ["glass"] }})).await; | ||||||
|  |     index.wait_task(1).await; | ||||||
|  |  | ||||||
|  |     // There is a synonym for escape -> glass but we don't want "escape", only the derivatives: glass | ||||||
|  |     index | ||||||
|  |         .search(json!({"q": "-escape escape" }), |response, code| { | ||||||
|  |             assert_eq!(code, 200, "{}", response); | ||||||
|  |             let hits = response["hits"].as_array().unwrap(); | ||||||
|  |             assert_eq!(hits.len(), 1); | ||||||
|  |             assert_eq!(hits[0]["id"], "450465"); | ||||||
|  |         }) | ||||||
|  |         .await; | ||||||
|  | } | ||||||
|  |  | ||||||
| #[cfg(feature = "default")] | #[cfg(feature = "default")] | ||||||
| #[actix_rt::test] | #[actix_rt::test] | ||||||
| async fn test_kanji_language_detection() { | async fn test_kanji_language_detection() { | ||||||
|   | |||||||
| @@ -2435,6 +2435,7 @@ pub(crate) mod tests { | |||||||
|             document_scores: _, |             document_scores: _, | ||||||
|             mut documents_ids, |             mut documents_ids, | ||||||
|             degraded: _, |             degraded: _, | ||||||
|  |             used_negative_operator: _, | ||||||
|         } = search.execute().unwrap(); |         } = search.execute().unwrap(); | ||||||
|         let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap(); |         let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap(); | ||||||
|         documents_ids.sort_unstable(); |         documents_ids.sort_unstable(); | ||||||
|   | |||||||
| @@ -11,6 +11,7 @@ struct ScoreWithRatioResult { | |||||||
|     candidates: RoaringBitmap, |     candidates: RoaringBitmap, | ||||||
|     document_scores: Vec<(u32, ScoreWithRatio)>, |     document_scores: Vec<(u32, ScoreWithRatio)>, | ||||||
|     degraded: bool, |     degraded: bool, | ||||||
|  |     used_negative_operator: bool, | ||||||
| } | } | ||||||
|  |  | ||||||
| type ScoreWithRatio = (Vec<ScoreDetails>, f32); | type ScoreWithRatio = (Vec<ScoreDetails>, f32); | ||||||
| @@ -78,6 +79,7 @@ impl ScoreWithRatioResult { | |||||||
|             candidates: results.candidates, |             candidates: results.candidates, | ||||||
|             document_scores, |             document_scores, | ||||||
|             degraded: results.degraded, |             degraded: results.degraded, | ||||||
|  |             used_negative_operator: results.used_negative_operator, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -113,6 +115,7 @@ impl ScoreWithRatioResult { | |||||||
|             documents_ids, |             documents_ids, | ||||||
|             document_scores, |             document_scores, | ||||||
|             degraded: left.degraded | right.degraded, |             degraded: left.degraded | right.degraded, | ||||||
|  |             used_negative_operator: left.used_negative_operator | right.used_negative_operator, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -183,6 +183,7 @@ impl<'a> Search<'a> { | |||||||
|             documents_ids, |             documents_ids, | ||||||
|             document_scores, |             document_scores, | ||||||
|             degraded, |             degraded, | ||||||
|  |             used_negative_operator, | ||||||
|         } = match self.vector.as_ref() { |         } = match self.vector.as_ref() { | ||||||
|             Some(vector) => execute_vector_search( |             Some(vector) => execute_vector_search( | ||||||
|                 &mut ctx, |                 &mut ctx, | ||||||
| @@ -221,7 +222,14 @@ impl<'a> Search<'a> { | |||||||
|             None => MatchingWords::default(), |             None => MatchingWords::default(), | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         Ok(SearchResult { matching_words, candidates, document_scores, documents_ids, degraded }) |         Ok(SearchResult { | ||||||
|  |             matching_words, | ||||||
|  |             candidates, | ||||||
|  |             document_scores, | ||||||
|  |             documents_ids, | ||||||
|  |             degraded, | ||||||
|  |             used_negative_operator, | ||||||
|  |         }) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -272,6 +280,7 @@ pub struct SearchResult { | |||||||
|     pub documents_ids: Vec<DocumentId>, |     pub documents_ids: Vec<DocumentId>, | ||||||
|     pub document_scores: Vec<Vec<ScoreDetails>>, |     pub document_scores: Vec<Vec<ScoreDetails>>, | ||||||
|     pub degraded: bool, |     pub degraded: bool, | ||||||
|  |     pub used_negative_operator: bool, | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Copy, PartialEq, Eq)] | #[derive(Debug, Clone, Copy, PartialEq, Eq)] | ||||||
|   | |||||||
| @@ -240,6 +240,7 @@ pub(crate) mod tests { | |||||||
|     use super::super::super::located_query_terms_from_tokens; |     use super::super::super::located_query_terms_from_tokens; | ||||||
|     use super::*; |     use super::*; | ||||||
|     use crate::index::tests::TempIndex; |     use crate::index::tests::TempIndex; | ||||||
|  |     use crate::search::new::query_term::ExtractedTokens; | ||||||
|  |  | ||||||
|     pub(crate) fn temp_index_with_documents() -> TempIndex { |     pub(crate) fn temp_index_with_documents() -> TempIndex { | ||||||
|         let temp_index = TempIndex::new(); |         let temp_index = TempIndex::new(); | ||||||
| @@ -261,7 +262,8 @@ pub(crate) mod tests { | |||||||
|         let mut builder = TokenizerBuilder::default(); |         let mut builder = TokenizerBuilder::default(); | ||||||
|         let tokenizer = builder.build(); |         let tokenizer = builder.build(); | ||||||
|         let tokens = tokenizer.tokenize("split this world"); |         let tokens = tokenizer.tokenize("split this world"); | ||||||
|         let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); |         let ExtractedTokens { query_terms, .. } = | ||||||
|  |             located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); | ||||||
|         let matching_words = MatchingWords::new(ctx, query_terms); |         let matching_words = MatchingWords::new(ctx, query_terms); | ||||||
|  |  | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|   | |||||||
| @@ -33,7 +33,9 @@ use interner::{DedupInterner, Interner}; | |||||||
| pub use logger::visual::VisualSearchLogger; | pub use logger::visual::VisualSearchLogger; | ||||||
| pub use logger::{DefaultSearchLogger, SearchLogger}; | pub use logger::{DefaultSearchLogger, SearchLogger}; | ||||||
| use query_graph::{QueryGraph, QueryNode}; | use query_graph::{QueryGraph, QueryNode}; | ||||||
| use query_term::{located_query_terms_from_tokens, LocatedQueryTerm, Phrase, QueryTerm}; | use query_term::{ | ||||||
|  |     located_query_terms_from_tokens, ExtractedTokens, LocatedQueryTerm, Phrase, QueryTerm, | ||||||
|  | }; | ||||||
| use ranking_rules::{ | use ranking_rules::{ | ||||||
|     BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait, |     BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait, | ||||||
| }; | }; | ||||||
| @@ -209,6 +211,35 @@ fn resolve_universe( | |||||||
|     ) |     ) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #[tracing::instrument(level = "trace", skip_all, target = "search")] | ||||||
|  | fn resolve_negative_words( | ||||||
|  |     ctx: &mut SearchContext, | ||||||
|  |     negative_words: &[Word], | ||||||
|  | ) -> Result<RoaringBitmap> { | ||||||
|  |     let mut negative_bitmap = RoaringBitmap::new(); | ||||||
|  |     for &word in negative_words { | ||||||
|  |         if let Some(bitmap) = ctx.word_docids(word)? { | ||||||
|  |             negative_bitmap |= bitmap; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     Ok(negative_bitmap) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[tracing::instrument(level = "trace", skip_all, target = "search")] | ||||||
|  | fn resolve_negative_phrases( | ||||||
|  |     ctx: &mut SearchContext, | ||||||
|  |     negative_phrases: &[LocatedQueryTerm], | ||||||
|  | ) -> Result<RoaringBitmap> { | ||||||
|  |     let mut negative_bitmap = RoaringBitmap::new(); | ||||||
|  |     for term in negative_phrases { | ||||||
|  |         let query_term = ctx.term_interner.get(term.value); | ||||||
|  |         if let Some(phrase) = query_term.original_phrase() { | ||||||
|  |             negative_bitmap |= ctx.get_phrase_docids(phrase)?; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     Ok(negative_bitmap) | ||||||
|  | } | ||||||
|  |  | ||||||
| /// Return the list of initialised ranking rules to be used for a placeholder search. | /// Return the list of initialised ranking rules to be used for a placeholder search. | ||||||
| fn get_ranking_rules_for_placeholder_search<'ctx>( | fn get_ranking_rules_for_placeholder_search<'ctx>( | ||||||
|     ctx: &SearchContext<'ctx>, |     ctx: &SearchContext<'ctx>, | ||||||
| @@ -557,6 +588,7 @@ pub fn execute_vector_search( | |||||||
|         documents_ids: docids, |         documents_ids: docids, | ||||||
|         located_query_terms: None, |         located_query_terms: None, | ||||||
|         degraded, |         degraded, | ||||||
|  |         used_negative_operator: false, | ||||||
|     }) |     }) | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -580,6 +612,7 @@ pub fn execute_search( | |||||||
| ) -> Result<PartialSearchResult> { | ) -> Result<PartialSearchResult> { | ||||||
|     check_sort_criteria(ctx, sort_criteria.as_ref())?; |     check_sort_criteria(ctx, sort_criteria.as_ref())?; | ||||||
|  |  | ||||||
|  |     let mut used_negative_operator = false; | ||||||
|     let mut located_query_terms = None; |     let mut located_query_terms = None; | ||||||
|     let query_terms = if let Some(query) = query { |     let query_terms = if let Some(query) = query { | ||||||
|         let span = tracing::trace_span!(target: "search::tokens", "tokenizer_builder"); |         let span = tracing::trace_span!(target: "search::tokens", "tokenizer_builder"); | ||||||
| @@ -620,7 +653,16 @@ pub fn execute_search( | |||||||
|         let tokens = tokenizer.tokenize(query); |         let tokens = tokenizer.tokenize(query); | ||||||
|         drop(entered); |         drop(entered); | ||||||
|  |  | ||||||
|         let query_terms = located_query_terms_from_tokens(ctx, tokens, words_limit)?; |         let ExtractedTokens { query_terms, negative_words, negative_phrases } = | ||||||
|  |             located_query_terms_from_tokens(ctx, tokens, words_limit)?; | ||||||
|  |         used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty(); | ||||||
|  |  | ||||||
|  |         let ignored_documents = resolve_negative_words(ctx, &negative_words)?; | ||||||
|  |         let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?; | ||||||
|  |  | ||||||
|  |         universe -= ignored_documents; | ||||||
|  |         universe -= ignored_phrases; | ||||||
|  |  | ||||||
|         if query_terms.is_empty() { |         if query_terms.is_empty() { | ||||||
|             // Do a placeholder search instead |             // Do a placeholder search instead | ||||||
|             None |             None | ||||||
| @@ -630,6 +672,7 @@ pub fn execute_search( | |||||||
|     } else { |     } else { | ||||||
|         None |         None | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     let bucket_sort_output = if let Some(query_terms) = query_terms { |     let bucket_sort_output = if let Some(query_terms) = query_terms { | ||||||
|         let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?; |         let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?; | ||||||
|         located_query_terms = Some(new_located_query_terms); |         located_query_terms = Some(new_located_query_terms); | ||||||
| @@ -690,6 +733,7 @@ pub fn execute_search( | |||||||
|         documents_ids: docids, |         documents_ids: docids, | ||||||
|         located_query_terms, |         located_query_terms, | ||||||
|         degraded, |         degraded, | ||||||
|  |         used_negative_operator, | ||||||
|     }) |     }) | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -752,4 +796,5 @@ pub struct PartialSearchResult { | |||||||
|     pub document_scores: Vec<Vec<ScoreDetails>>, |     pub document_scores: Vec<Vec<ScoreDetails>>, | ||||||
|  |  | ||||||
|     pub degraded: bool, |     pub degraded: bool, | ||||||
|  |     pub used_negative_operator: bool, | ||||||
| } | } | ||||||
|   | |||||||
| @@ -9,7 +9,9 @@ use std::ops::RangeInclusive; | |||||||
|  |  | ||||||
| use either::Either; | use either::Either; | ||||||
| pub use ntypo_subset::NTypoTermSubset; | pub use ntypo_subset::NTypoTermSubset; | ||||||
| pub use parse_query::{located_query_terms_from_tokens, make_ngram, number_of_typos_allowed}; | pub use parse_query::{ | ||||||
|  |     located_query_terms_from_tokens, make_ngram, number_of_typos_allowed, ExtractedTokens, | ||||||
|  | }; | ||||||
| pub use phrase::Phrase; | pub use phrase::Phrase; | ||||||
|  |  | ||||||
| use super::interner::{DedupInterner, Interned}; | use super::interner::{DedupInterner, Interned}; | ||||||
| @@ -478,6 +480,11 @@ impl QueryTerm { | |||||||
|     pub fn original_word(&self, ctx: &SearchContext) -> String { |     pub fn original_word(&self, ctx: &SearchContext) -> String { | ||||||
|         ctx.word_interner.get(self.original).clone() |         ctx.word_interner.get(self.original).clone() | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     pub fn original_phrase(&self) -> Option<Interned<Phrase>> { | ||||||
|  |         self.zero_typo.phrase | ||||||
|  |     } | ||||||
|  |  | ||||||
|     pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) { |     pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) { | ||||||
|         let mut words = BTreeSet::new(); |         let mut words = BTreeSet::new(); | ||||||
|         let mut phrases = BTreeSet::new(); |         let mut phrases = BTreeSet::new(); | ||||||
|   | |||||||
| @@ -6,20 +6,37 @@ use charabia::{SeparatorKind, TokenKind}; | |||||||
| use super::compute_derivations::partially_initialized_term_from_word; | use super::compute_derivations::partially_initialized_term_from_word; | ||||||
| use super::{LocatedQueryTerm, ZeroTypoTerm}; | use super::{LocatedQueryTerm, ZeroTypoTerm}; | ||||||
| use crate::search::new::query_term::{Lazy, Phrase, QueryTerm}; | use crate::search::new::query_term::{Lazy, Phrase, QueryTerm}; | ||||||
|  | use crate::search::new::Word; | ||||||
| use crate::{Result, SearchContext, MAX_WORD_LENGTH}; | use crate::{Result, SearchContext, MAX_WORD_LENGTH}; | ||||||
|  |  | ||||||
|  | #[derive(Clone)] | ||||||
|  | /// Extraction of the content of a query. | ||||||
|  | pub struct ExtractedTokens { | ||||||
|  |     /// The terms to search for in the database. | ||||||
|  |     pub query_terms: Vec<LocatedQueryTerm>, | ||||||
|  |     /// The words that must not appear in the results. | ||||||
|  |     pub negative_words: Vec<Word>, | ||||||
|  |     /// The phrases that must not appear in the results. | ||||||
|  |     pub negative_phrases: Vec<LocatedQueryTerm>, | ||||||
|  | } | ||||||
|  |  | ||||||
| /// Convert the tokenised search query into a list of located query terms. | /// Convert the tokenised search query into a list of located query terms. | ||||||
| #[tracing::instrument(level = "trace", skip_all, target = "search::query")] | #[tracing::instrument(level = "trace", skip_all, target = "search::query")] | ||||||
| pub fn located_query_terms_from_tokens( | pub fn located_query_terms_from_tokens( | ||||||
|     ctx: &mut SearchContext, |     ctx: &mut SearchContext, | ||||||
|     query: NormalizedTokenIter, |     query: NormalizedTokenIter, | ||||||
|     words_limit: Option<usize>, |     words_limit: Option<usize>, | ||||||
| ) -> Result<Vec<LocatedQueryTerm>> { | ) -> Result<ExtractedTokens> { | ||||||
|     let nbr_typos = number_of_typos_allowed(ctx)?; |     let nbr_typos = number_of_typos_allowed(ctx)?; | ||||||
|  |  | ||||||
|     let mut located_terms = Vec::new(); |     let mut query_terms = Vec::new(); | ||||||
|  |  | ||||||
|  |     let mut negative_phrase = false; | ||||||
|     let mut phrase: Option<PhraseBuilder> = None; |     let mut phrase: Option<PhraseBuilder> = None; | ||||||
|  |     let mut encountered_whitespace = true; | ||||||
|  |     let mut negative_next_token = false; | ||||||
|  |     let mut negative_words = Vec::new(); | ||||||
|  |     let mut negative_phrases = Vec::new(); | ||||||
|  |  | ||||||
|     let parts_limit = words_limit.unwrap_or(usize::MAX); |     let parts_limit = words_limit.unwrap_or(usize::MAX); | ||||||
|  |  | ||||||
| @@ -31,9 +48,10 @@ pub fn located_query_terms_from_tokens( | |||||||
|         if token.lemma().is_empty() { |         if token.lemma().is_empty() { | ||||||
|             continue; |             continue; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         // early return if word limit is exceeded |         // early return if word limit is exceeded | ||||||
|         if located_terms.len() >= parts_limit { |         if query_terms.len() >= parts_limit { | ||||||
|             return Ok(located_terms); |             return Ok(ExtractedTokens { query_terms, negative_words, negative_phrases }); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         match token.kind { |         match token.kind { | ||||||
| @@ -46,6 +64,11 @@ pub fn located_query_terms_from_tokens( | |||||||
|                 // 3. if the word is the last token of the query we push it as a prefix word. |                 // 3. if the word is the last token of the query we push it as a prefix word. | ||||||
|                 if let Some(phrase) = &mut phrase { |                 if let Some(phrase) = &mut phrase { | ||||||
|                     phrase.push_word(ctx, &token, position) |                     phrase.push_word(ctx, &token, position) | ||||||
|  |                 } else if negative_next_token { | ||||||
|  |                     let word = token.lemma().to_string(); | ||||||
|  |                     let word = Word::Original(ctx.word_interner.insert(word)); | ||||||
|  |                     negative_words.push(word); | ||||||
|  |                     negative_next_token = false; | ||||||
|                 } else if peekable.peek().is_some() { |                 } else if peekable.peek().is_some() { | ||||||
|                     match token.kind { |                     match token.kind { | ||||||
|                         TokenKind::Word => { |                         TokenKind::Word => { | ||||||
| @@ -61,9 +84,9 @@ pub fn located_query_terms_from_tokens( | |||||||
|                                 value: ctx.term_interner.push(term), |                                 value: ctx.term_interner.push(term), | ||||||
|                                 positions: position..=position, |                                 positions: position..=position, | ||||||
|                             }; |                             }; | ||||||
|                             located_terms.push(located_term); |                             query_terms.push(located_term); | ||||||
|                         } |                         } | ||||||
|                         TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} |                         TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (), | ||||||
|                     } |                     } | ||||||
|                 } else { |                 } else { | ||||||
|                     let word = token.lemma(); |                     let word = token.lemma(); | ||||||
| @@ -78,7 +101,7 @@ pub fn located_query_terms_from_tokens( | |||||||
|                         value: ctx.term_interner.push(term), |                         value: ctx.term_interner.push(term), | ||||||
|                         positions: position..=position, |                         positions: position..=position, | ||||||
|                     }; |                     }; | ||||||
|                     located_terms.push(located_term); |                     query_terms.push(located_term); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|             TokenKind::Separator(separator_kind) => { |             TokenKind::Separator(separator_kind) => { | ||||||
| @@ -94,7 +117,14 @@ pub fn located_query_terms_from_tokens( | |||||||
|                     let phrase = if separator_kind == SeparatorKind::Hard { |                     let phrase = if separator_kind == SeparatorKind::Hard { | ||||||
|                         if let Some(phrase) = phrase { |                         if let Some(phrase) = phrase { | ||||||
|                             if let Some(located_query_term) = phrase.build(ctx) { |                             if let Some(located_query_term) = phrase.build(ctx) { | ||||||
|                                 located_terms.push(located_query_term) |                                 // as we are evaluating a negative operator we put the phrase | ||||||
|  |                                 // in the negative one *but* we don't reset the negative operator | ||||||
|  |                                 // as we are immediately starting a new negative phrase. | ||||||
|  |                                 if negative_phrase { | ||||||
|  |                                     negative_phrases.push(located_query_term); | ||||||
|  |                                 } else { | ||||||
|  |                                     query_terms.push(located_query_term); | ||||||
|  |                                 } | ||||||
|                             } |                             } | ||||||
|                             Some(PhraseBuilder::empty()) |                             Some(PhraseBuilder::empty()) | ||||||
|                         } else { |                         } else { | ||||||
| @@ -115,26 +145,49 @@ pub fn located_query_terms_from_tokens( | |||||||
|                         // Per the check above, quote_count > 0 |                         // Per the check above, quote_count > 0 | ||||||
|                         quote_count -= 1; |                         quote_count -= 1; | ||||||
|                         if let Some(located_query_term) = phrase.build(ctx) { |                         if let Some(located_query_term) = phrase.build(ctx) { | ||||||
|                             located_terms.push(located_query_term) |                             // we were evaluating a negative operator so we | ||||||
|  |                             // put the phrase in the negative phrases | ||||||
|  |                             if negative_phrase { | ||||||
|  |                                 negative_phrases.push(located_query_term); | ||||||
|  |                                 negative_phrase = false; | ||||||
|  |                             } else { | ||||||
|  |                                 query_terms.push(located_query_term); | ||||||
|  |                             } | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|  |  | ||||||
|                     // Start new phrase if the token ends with an opening quote |                     // Start new phrase if the token ends with an opening quote | ||||||
|                     (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) |                     if quote_count % 2 == 1 { | ||||||
|  |                         negative_phrase = negative_next_token; | ||||||
|  |                         Some(PhraseBuilder::empty()) | ||||||
|  |                     } else { | ||||||
|  |                         None | ||||||
|  |                     } | ||||||
|                 }; |                 }; | ||||||
|  |  | ||||||
|  |                 negative_next_token = | ||||||
|  |                     phrase.is_none() && token.lemma() == "-" && encountered_whitespace; | ||||||
|             } |             } | ||||||
|             _ => (), |             _ => (), | ||||||
|         } |         } | ||||||
|  |  | ||||||
|  |         encountered_whitespace = | ||||||
|  |             token.lemma().chars().last().filter(|c| c.is_whitespace()).is_some(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     // If a quote is never closed, we consider all of the end of the query as a phrase. |     // If a quote is never closed, we consider all of the end of the query as a phrase. | ||||||
|     if let Some(phrase) = phrase.take() { |     if let Some(phrase) = phrase.take() { | ||||||
|         if let Some(located_query_term) = phrase.build(ctx) { |         if let Some(located_query_term) = phrase.build(ctx) { | ||||||
|             located_terms.push(located_query_term); |             // put the phrase in the negative set if we are evaluating a negative operator. | ||||||
|  |             if negative_phrase { | ||||||
|  |                 negative_phrases.push(located_query_term); | ||||||
|  |             } else { | ||||||
|  |                 query_terms.push(located_query_term); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     Ok(located_terms) |     Ok(ExtractedTokens { query_terms, negative_words, negative_phrases }) | ||||||
| } | } | ||||||
|  |  | ||||||
| pub fn number_of_typos_allowed<'ctx>( | pub fn number_of_typos_allowed<'ctx>( | ||||||
| @@ -315,8 +368,10 @@ mod tests { | |||||||
|         let rtxn = index.read_txn()?; |         let rtxn = index.read_txn()?; | ||||||
|         let mut ctx = SearchContext::new(&index, &rtxn); |         let mut ctx = SearchContext::new(&index, &rtxn); | ||||||
|         // panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785> |         // panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785> | ||||||
|         let located_query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None)?; |         let ExtractedTokens { query_terms, .. } = | ||||||
|         assert!(located_query_terms.is_empty()); |             located_query_terms_from_tokens(&mut ctx, tokens, None)?; | ||||||
|  |         assert!(query_terms.is_empty()); | ||||||
|  |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user