mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-24 20:46:27 +00:00 
			
		
		
		
	Merge #150
150: add _formated field to search result r=MarinPostma a=MarinPostma close #75 Co-authored-by: Marin Postma <postma.marin@protonmail.com>
This commit is contained in:
		
							
								
								
									
										9
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										9
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -345,9 +345,9 @@ checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" | ||||
|  | ||||
| [[package]] | ||||
| name = "backtrace" | ||||
| version = "0.3.56" | ||||
| version = "0.3.57" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "9d117600f438b1707d4e4ae15d3595657288f8235a0eb593e80ecc98ab34e1bc" | ||||
| checksum = "78ed203b9ba68b242c62b3fb7480f589dd49829be1edb3fe8fc8b4ffda2dcb8d" | ||||
| dependencies = [ | ||||
|  "addr2line", | ||||
|  "cfg-if 1.0.0", | ||||
| @@ -1661,6 +1661,7 @@ dependencies = [ | ||||
|  "milli", | ||||
|  "mime", | ||||
|  "mockall", | ||||
|  "obkv", | ||||
|  "once_cell", | ||||
|  "oxidized-json-checker", | ||||
|  "parking_lot", | ||||
| @@ -2837,9 +2838,9 @@ dependencies = [ | ||||
|  | ||||
| [[package]] | ||||
| name = "slab" | ||||
| version = "0.4.2" | ||||
| version = "0.4.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" | ||||
| checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527" | ||||
|  | ||||
| [[package]] | ||||
| name = "slice-group-by" | ||||
|   | ||||
| @@ -63,6 +63,7 @@ tokio = { version = "1", features = ["full"] } | ||||
| uuid = "0.8.2" | ||||
| oxidized-json-checker = "0.3.2" | ||||
| walkdir = "2.3.2" | ||||
| obkv = "0.1.1" | ||||
|  | ||||
| [dependencies.sentry] | ||||
| default-features = false | ||||
|   | ||||
| @@ -83,7 +83,7 @@ impl Index { | ||||
|  | ||||
|         let fields_ids_map = self.fields_ids_map(&txn)?; | ||||
|         let fields_to_display = | ||||
|             self.fields_to_display(&txn, attributes_to_retrieve, &fields_ids_map)?; | ||||
|             self.fields_to_display(&txn, &attributes_to_retrieve, &fields_ids_map)?; | ||||
|  | ||||
|         let iter = self.documents.range(&txn, &(..))?.skip(offset).take(limit); | ||||
|  | ||||
| @@ -108,7 +108,7 @@ impl Index { | ||||
|         let fields_ids_map = self.fields_ids_map(&txn)?; | ||||
|  | ||||
|         let fields_to_display = | ||||
|             self.fields_to_display(&txn, attributes_to_retrieve, &fields_ids_map)?; | ||||
|             self.fields_to_display(&txn, &attributes_to_retrieve, &fields_ids_map)?; | ||||
|  | ||||
|         let internal_id = self | ||||
|             .external_documents_ids(&txn)? | ||||
| @@ -134,7 +134,7 @@ impl Index { | ||||
|     fn fields_to_display<S: AsRef<str>>( | ||||
|         &self, | ||||
|         txn: &heed::RoTxn, | ||||
|         attributes_to_retrieve: Option<Vec<S>>, | ||||
|         attributes_to_retrieve: &Option<Vec<S>>, | ||||
|         fields_ids_map: &milli::FieldsIdsMap, | ||||
|     ) -> anyhow::Result<Vec<u8>> { | ||||
|         let mut displayed_fields_ids = match self.displayed_fields_ids(&txn)? { | ||||
|   | ||||
| @@ -1,17 +1,21 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::{BTreeMap, HashSet}; | ||||
| use std::mem; | ||||
| use std::time::Instant; | ||||
|  | ||||
| use anyhow::bail; | ||||
| use either::Either; | ||||
| use heed::RoTxn; | ||||
| use indexmap::IndexMap; | ||||
| use itertools::Itertools; | ||||
| use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; | ||||
| use milli::{facet::FacetValue, FacetCondition, MatchingWords}; | ||||
| use milli::{facet::FacetValue, FacetCondition, FieldId, FieldsIdsMap, MatchingWords}; | ||||
| use serde::{Deserialize, Serialize}; | ||||
| use serde_json::{Map, Value}; | ||||
| use serde_json::Value; | ||||
|  | ||||
| use super::Index; | ||||
|  | ||||
| pub type Document = IndexMap<String, Value>; | ||||
|  | ||||
| pub const DEFAULT_SEARCH_LIMIT: usize = 20; | ||||
|  | ||||
| const fn default_search_limit() -> usize { | ||||
| @@ -25,8 +29,8 @@ pub struct SearchQuery { | ||||
|     pub offset: Option<usize>, | ||||
|     #[serde(default = "default_search_limit")] | ||||
|     pub limit: usize, | ||||
|     pub attributes_to_retrieve: Option<Vec<String>>, | ||||
|     pub attributes_to_crop: Option<Vec<String>>, | ||||
|     pub attributes_to_retrieve: Option<HashSet<String>>, | ||||
|     pub attributes_to_crop: Option<HashSet<String>>, | ||||
|     pub crop_length: Option<usize>, | ||||
|     pub attributes_to_highlight: Option<HashSet<String>>, | ||||
|     pub filters: Option<String>, | ||||
| @@ -35,10 +39,18 @@ pub struct SearchQuery { | ||||
|     pub facet_distributions: Option<Vec<String>>, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Serialize)] | ||||
| pub struct SearchHit { | ||||
|     #[serde(flatten)] | ||||
|     pub document: Document, | ||||
|     #[serde(rename = "_formatted", skip_serializing_if = "Document::is_empty")] | ||||
|     pub formatted: Document, | ||||
| } | ||||
|  | ||||
| #[derive(Serialize)] | ||||
| #[serde(rename_all = "camelCase")] | ||||
| pub struct SearchResult { | ||||
|     pub hits: Vec<Map<String, Value>>, | ||||
|     pub hits: Vec<SearchHit>, | ||||
|     pub nb_hits: u64, | ||||
|     pub exhaustive_nb_hits: bool, | ||||
|     pub query: String, | ||||
| @@ -78,19 +90,92 @@ impl Index { | ||||
|         let mut documents = Vec::new(); | ||||
|         let fields_ids_map = self.fields_ids_map(&rtxn).unwrap(); | ||||
|  | ||||
|         let fields_to_display = | ||||
|             self.fields_to_display(&rtxn, query.attributes_to_retrieve, &fields_ids_map)?; | ||||
|         let displayed_ids = self.displayed_fields_ids(&rtxn)? | ||||
|             .map(|fields| fields.into_iter().collect::<HashSet<_>>()) | ||||
|             .unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); | ||||
|  | ||||
|         let fids = |attrs: &HashSet<String>| { | ||||
|             let mut ids = HashSet::new(); | ||||
|             for attr in attrs { | ||||
|                 if attr == "*" { | ||||
|                     ids = displayed_ids.clone(); | ||||
|                     break; | ||||
|                 } | ||||
|  | ||||
|                 if let Some(id) = fields_ids_map.id(attr) { | ||||
|                     ids.insert(id); | ||||
|                 } | ||||
|             } | ||||
|             ids | ||||
|         }; | ||||
|  | ||||
|         let to_retrieve_ids = query | ||||
|             .attributes_to_retrieve | ||||
|             .as_ref() | ||||
|             .map(fids) | ||||
|             .unwrap_or_else(|| displayed_ids.clone()); | ||||
|  | ||||
|         let to_highlight_ids = query | ||||
|             .attributes_to_highlight | ||||
|             .as_ref() | ||||
|             .map(fids) | ||||
|             .unwrap_or_default(); | ||||
|  | ||||
|         let to_crop_ids = query | ||||
|             .attributes_to_crop | ||||
|             .as_ref() | ||||
|             .map(fids) | ||||
|             .unwrap_or_default(); | ||||
|  | ||||
|         // The attributes to retrieve are: | ||||
|         // - the ones explicitly marked as to retrieve that are also in the displayed attributes | ||||
|         let all_attributes: Vec<_> = to_retrieve_ids | ||||
|             .intersection(&displayed_ids) | ||||
|             .cloned() | ||||
|             .sorted() | ||||
|             .collect(); | ||||
|  | ||||
|         // The formatted attributes are: | ||||
|         // - the displayed attributes that are highlighted or cropped, if the query explicitly | ||||
|         //   lists attributes to retrieve | ||||
|         // - all the attributes to retrieve, if the query asks for highlighted or cropped | ||||
|         //   attributes and does not list attributes to retrieve (i.e. `attributes_to_retrieve` | ||||
|         //   is absent from the query) | ||||
|         let all_formatted = if query.attributes_to_retrieve.is_none() { | ||||
|             if query.attributes_to_highlight.is_some() || query.attributes_to_crop.is_some() { | ||||
|                 Cow::Borrowed(&all_attributes) | ||||
|             } else { | ||||
|                 Cow::Owned(Vec::new()) | ||||
|             } | ||||
|         } else { | ||||
|             let attrs = (&to_crop_ids | &to_highlight_ids) | ||||
|                 .intersection(&displayed_ids) | ||||
|                 .cloned() | ||||
|                 .collect::<Vec<_>>(); | ||||
|             Cow::Owned(attrs) | ||||
|         }; | ||||
|  | ||||
|         let stop_words = fst::Set::default(); | ||||
|         let highlighter = Highlighter::new(&stop_words); | ||||
|         let highlighter = Highlighter::new( | ||||
|             &stop_words, | ||||
|             (String::from("<mark>"), String::from("</mark>")), | ||||
|         ); | ||||
|  | ||||
|         for (_id, obkv) in self.documents(&rtxn, documents_ids)? { | ||||
|             let mut object = | ||||
|                 milli::obkv_to_json(&fields_to_display, &fields_ids_map, obkv).unwrap(); | ||||
|             if let Some(ref attributes_to_highlight) = query.attributes_to_highlight { | ||||
|                 highlighter.highlight_record(&mut object, &matching_words, attributes_to_highlight); | ||||
|             } | ||||
|             documents.push(object); | ||||
|             let document = make_document(&all_attributes, &fields_ids_map, obkv.clone())?; | ||||
|             let formatted = compute_formatted( | ||||
|                 &fields_ids_map, | ||||
|                 obkv, | ||||
|                 &highlighter, | ||||
|                 &matching_words, | ||||
|                 all_formatted.as_ref().as_slice(), | ||||
|                 &to_highlight_ids, | ||||
|             )?; | ||||
|             let hit = SearchHit { | ||||
|                 document, | ||||
|                 formatted, | ||||
|             }; | ||||
|             documents.push(hit); | ||||
|         } | ||||
|  | ||||
|         let nb_hits = candidates.len(); | ||||
| @@ -120,6 +205,79 @@ impl Index { | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn make_document( | ||||
|     attributes_to_retrieve: &[FieldId], | ||||
|     field_ids_map: &FieldsIdsMap, | ||||
|     obkv: obkv::KvReader, | ||||
| ) -> anyhow::Result<Document> { | ||||
|     let mut document = Document::new(); | ||||
|     for attr in attributes_to_retrieve { | ||||
|         if let Some(value) = obkv.get(*attr) { | ||||
|             let value = serde_json::from_slice(value)?; | ||||
|  | ||||
|             // This unwrap must be safe since we got the ids from the fields_ids_map just | ||||
|             // before. | ||||
|             let key = field_ids_map | ||||
|                 .name(*attr) | ||||
|                 .expect("Missing field name") | ||||
|                 .to_string(); | ||||
|  | ||||
|             document.insert(key, value); | ||||
|         } | ||||
|     } | ||||
|     Ok(document) | ||||
| } | ||||
|  | ||||
| fn compute_formatted<A: AsRef<[u8]>>( | ||||
|     field_ids_map: &FieldsIdsMap, | ||||
|     obkv: obkv::KvReader, | ||||
|     highlighter: &Highlighter<A>, | ||||
|     matching_words: &impl Matcher, | ||||
|     all_formatted: &[FieldId], | ||||
|     to_highlight_ids: &HashSet<FieldId>, | ||||
| ) -> anyhow::Result<Document> { | ||||
|     let mut document = Document::new(); | ||||
|  | ||||
|     for field in all_formatted { | ||||
|         if let Some(value) = obkv.get(*field) { | ||||
|             let mut value: Value = serde_json::from_slice(value)?; | ||||
|  | ||||
|             if to_highlight_ids.contains(field) { | ||||
|                 value = highlighter.highlight_value(value, matching_words); | ||||
|             } | ||||
|  | ||||
|             // This unwrap must be safe since we got the ids from the fields_ids_map just | ||||
|             // before. | ||||
|             let key = field_ids_map | ||||
|                 .name(*field) | ||||
|                 .expect("Missing field name") | ||||
|                 .to_string(); | ||||
|  | ||||
|             document.insert(key, value); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(document) | ||||
| } | ||||
|  | ||||
| /// Trait to allow unit testing of `compute_formatted`. | ||||
| trait Matcher { | ||||
|     fn matches(&self, w: &str) -> bool; | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| impl Matcher for HashSet<String> { | ||||
|     fn matches(&self, w: &str) -> bool { | ||||
|         self.contains(w) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Matcher for MatchingWords { | ||||
|     fn matches(&self, w: &str) -> bool { | ||||
|         self.matches(w) | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn parse_facets_array( | ||||
|     txn: &RoTxn, | ||||
|     index: &Index, | ||||
| @@ -149,21 +307,22 @@ fn parse_facets_array( | ||||
|     FacetCondition::from_array(txn, &index.0, ands) | ||||
| } | ||||
|  | ||||
| pub struct Highlighter<'a, A> { | ||||
| struct Highlighter<'a, A> { | ||||
|     analyzer: Analyzer<'a, A>, | ||||
|     marks: (String, String), | ||||
| } | ||||
|  | ||||
| impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { | ||||
|     pub fn new(stop_words: &'a fst::Set<A>) -> Self { | ||||
|     pub fn new(stop_words: &'a fst::Set<A>, marks: (String, String)) -> Self { | ||||
|         let mut config = AnalyzerConfig::default(); | ||||
|         config.stop_words(stop_words); | ||||
|  | ||||
|         let analyzer = Analyzer::new(config); | ||||
|  | ||||
|         Self { analyzer } | ||||
|         Self { analyzer, marks } | ||||
|     } | ||||
|  | ||||
|     pub fn highlight_value(&self, value: Value, words_to_highlight: &MatchingWords) -> Value { | ||||
|     fn highlight_value(&self, value: Value, words_to_highlight: &impl Matcher) -> Value { | ||||
|         match value { | ||||
|             Value::Null => Value::Null, | ||||
|             Value::Bool(boolean) => Value::Bool(boolean), | ||||
| @@ -175,11 +334,11 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { | ||||
|                     if token.is_word() { | ||||
|                         let to_highlight = words_to_highlight.matches(token.text()); | ||||
|                         if to_highlight { | ||||
|                             string.push_str("<mark>") | ||||
|                             string.push_str(&self.marks.0) | ||||
|                         } | ||||
|                         string.push_str(word); | ||||
|                         if to_highlight { | ||||
|                             string.push_str("</mark>") | ||||
|                             string.push_str(&self.marks.1) | ||||
|                         } | ||||
|                     } else { | ||||
|                         string.push_str(word); | ||||
| @@ -201,21 +360,6 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { | ||||
|             ), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn highlight_record( | ||||
|         &self, | ||||
|         object: &mut Map<String, Value>, | ||||
|         words_to_highlight: &MatchingWords, | ||||
|         attributes_to_highlight: &HashSet<String>, | ||||
|     ) { | ||||
|         // TODO do we need to create a string for element that are not and needs to be highlight? | ||||
|         for (key, value) in object.iter_mut() { | ||||
|             if attributes_to_highlight.contains(key) { | ||||
|                 let old_value = mem::take(value); | ||||
|                 *value = self.highlight_value(old_value, words_to_highlight); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn parse_facets( | ||||
| @@ -230,3 +374,115 @@ fn parse_facets( | ||||
|         v => bail!("Invalid facet expression, expected Array, found: {:?}", v), | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod test { | ||||
|     use std::iter::FromIterator; | ||||
|  | ||||
|     use super::*; | ||||
|  | ||||
|     #[test] | ||||
|     fn no_formatted() { | ||||
|         let stop_words = fst::Set::default(); | ||||
|         let highlighter = Highlighter::new( | ||||
|             &stop_words, | ||||
|             (String::from("<mark>"), String::from("</mark>")), | ||||
|         ); | ||||
|  | ||||
|         let mut fields = FieldsIdsMap::new(); | ||||
|         let id = fields.insert("test").unwrap(); | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         let mut obkv = obkv::KvWriter::new(&mut buf); | ||||
|         obkv.insert(id, Value::String("hello".into()).to_string().as_bytes()).unwrap(); | ||||
|         obkv.finish().unwrap(); | ||||
|  | ||||
|         let obkv = obkv::KvReader::new(&buf); | ||||
|  | ||||
|         let all_formatted = Vec::new(); | ||||
|         let to_highlight_ids = HashSet::new(); | ||||
|  | ||||
|         let matching_words = MatchingWords::default(); | ||||
|  | ||||
|         let value = compute_formatted( | ||||
|             &fields, | ||||
|             obkv, | ||||
|             &highlighter, | ||||
|             &matching_words, | ||||
|             &all_formatted, | ||||
|             &to_highlight_ids | ||||
|         ).unwrap(); | ||||
|  | ||||
|         assert!(value.is_empty()); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn formatted_no_highlight() { | ||||
|         let stop_words = fst::Set::default(); | ||||
|         let highlighter = Highlighter::new( | ||||
|             &stop_words, | ||||
|             (String::from("<mark>"), String::from("</mark>")), | ||||
|         ); | ||||
|  | ||||
|         let mut fields = FieldsIdsMap::new(); | ||||
|         let id = fields.insert("test").unwrap(); | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         let mut obkv = obkv::KvWriter::new(&mut buf); | ||||
|         obkv.insert(id, Value::String("hello".into()).to_string().as_bytes()).unwrap(); | ||||
|         obkv.finish().unwrap(); | ||||
|  | ||||
|         let obkv = obkv::KvReader::new(&buf); | ||||
|  | ||||
|         let all_formatted = vec![id]; | ||||
|         let to_highlight_ids = HashSet::new(); | ||||
|  | ||||
|         let matching_words = MatchingWords::default(); | ||||
|  | ||||
|         let value = compute_formatted( | ||||
|             &fields, | ||||
|             obkv, | ||||
|             &highlighter, | ||||
|             &matching_words, | ||||
|             &all_formatted, | ||||
|             &to_highlight_ids | ||||
|         ).unwrap(); | ||||
|  | ||||
|         assert_eq!(value["test"], "hello"); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn formatted_with_highlight() { | ||||
|         let stop_words = fst::Set::default(); | ||||
|         let highlighter = Highlighter::new( | ||||
|             &stop_words, | ||||
|             (String::from("<mark>"), String::from("</mark>")), | ||||
|         ); | ||||
|  | ||||
|         let mut fields = FieldsIdsMap::new(); | ||||
|         let id = fields.insert("test").unwrap(); | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         let mut obkv = obkv::KvWriter::new(&mut buf); | ||||
|         obkv.insert(id, Value::String("hello".into()).to_string().as_bytes()).unwrap(); | ||||
|         obkv.finish().unwrap(); | ||||
|  | ||||
|         let obkv = obkv::KvReader::new(&buf); | ||||
|  | ||||
|         let all_formatted = vec![id]; | ||||
|         let to_highlight_ids = HashSet::from_iter(Some(id)); | ||||
|  | ||||
|         let matching_words = HashSet::from_iter(Some(String::from("hello"))); | ||||
|  | ||||
|         let value = compute_formatted( | ||||
|             &fields, | ||||
|             obkv, | ||||
|             &highlighter, | ||||
|             &matching_words, | ||||
|             &all_formatted, | ||||
|             &to_highlight_ids | ||||
|         ).unwrap(); | ||||
|  | ||||
|         assert_eq!(value["test"], "<mark>hello</mark>"); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,7 +1,6 @@ | ||||
| use actix_web::{delete, get, post, put}; | ||||
| use actix_web::{web, HttpResponse}; | ||||
| use chrono::{DateTime, Utc}; | ||||
| use serde::{Deserialize, Serialize}; | ||||
| use serde::Deserialize; | ||||
|  | ||||
| use crate::error::ResponseError; | ||||
| use crate::helpers::Authentication; | ||||
| @@ -69,16 +68,6 @@ struct UpdateIndexRequest { | ||||
|     primary_key: Option<String>, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Serialize)] | ||||
| #[serde(rename_all = "camelCase")] | ||||
| struct UpdateIndexResponse { | ||||
|     name: String, | ||||
|     uid: String, | ||||
|     created_at: DateTime<Utc>, | ||||
|     updated_at: DateTime<Utc>, | ||||
|     primary_key: Option<String>, | ||||
| } | ||||
|  | ||||
| #[put("/indexes/{index_uid}", wrap = "Authentication::Private")] | ||||
| async fn update_index( | ||||
|     data: web::Data<Data>, | ||||
|   | ||||
| @@ -36,11 +36,11 @@ impl TryFrom<SearchQueryGet> for SearchQuery { | ||||
|     fn try_from(other: SearchQueryGet) -> anyhow::Result<Self> { | ||||
|         let attributes_to_retrieve = other | ||||
|             .attributes_to_retrieve | ||||
|             .map(|attrs| attrs.split(',').map(String::from).collect::<Vec<_>>()); | ||||
|             .map(|attrs| attrs.split(',').map(String::from).collect::<HashSet<_>>()); | ||||
|  | ||||
|         let attributes_to_crop = other | ||||
|             .attributes_to_crop | ||||
|             .map(|attrs| attrs.split(',').map(String::from).collect::<Vec<_>>()); | ||||
|             .map(|attrs| attrs.split(',').map(String::from).collect::<HashSet<_>>()); | ||||
|  | ||||
|         let attributes_to_highlight = other | ||||
|             .attributes_to_highlight | ||||
|   | ||||
		Reference in New Issue
	
	Block a user