mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	Merge pull request #494 from meilisearch/flatten-what-is-needed
Only flatten the required objects
This commit is contained in:
		| @@ -1,6 +1,6 @@ | |||||||
| [workspace] | [workspace] | ||||||
| resolver = "2" | resolver = "2" | ||||||
| members = ["milli", "filter-parser", "flatten-serde-json", "http-ui", "benchmarks", "infos", "helpers", "cli"] | members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", "http-ui", "benchmarks", "infos", "helpers", "cli"] | ||||||
| default-members = ["milli"] | default-members = ["milli"] | ||||||
|  |  | ||||||
| [profile.dev] | [profile.dev] | ||||||
|   | |||||||
							
								
								
									
										16
									
								
								json-depth-checker/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								json-depth-checker/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | |||||||
|  | [package] | ||||||
|  | name = "json-depth-checker" | ||||||
|  | version = "0.1.0" | ||||||
|  | edition = "2021" | ||||||
|  |  | ||||||
|  | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | ||||||
|  |  | ||||||
|  | [dependencies] | ||||||
|  | serde_json = "1.0" | ||||||
|  |  | ||||||
|  | [dev-dependencies] | ||||||
|  | criterion = "0.3" | ||||||
|  |  | ||||||
|  | [[bench]] | ||||||
|  | name = "depth" | ||||||
|  | harness = false | ||||||
							
								
								
									
										59
									
								
								json-depth-checker/benches/depth.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										59
									
								
								json-depth-checker/benches/depth.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,59 @@ | |||||||
|  | use criterion::{criterion_group, criterion_main, Criterion}; | ||||||
|  | use json_depth_checker::should_flatten_from_unchecked_slice; | ||||||
|  | use serde_json::json; | ||||||
|  |  | ||||||
|  | // Serializes a set of representative JSON values with `serde_json::to_vec` and | ||||||
|  | // benchmarks `should_flatten_from_unchecked_slice` on each of the resulting buffers. | ||||||
|  | fn criterion_benchmark(c: &mut Criterion) { | ||||||
|  |     let null = serde_json::to_vec(&json!(null)).unwrap(); | ||||||
|  |     let bool_true = serde_json::to_vec(&json!(true)).unwrap(); | ||||||
|  |     let bool_false = serde_json::to_vec(&json!(false)).unwrap(); | ||||||
|  |     let integer = serde_json::to_vec(&json!(42)).unwrap(); | ||||||
|  |     let float = serde_json::to_vec(&json!(1456.258)).unwrap(); | ||||||
|  |     let string = serde_json::to_vec(&json!("hello world")).unwrap(); | ||||||
|  |     let object = serde_json::to_vec(&json!({ "hello": "world",})).unwrap(); | ||||||
|  |     let complex_object = serde_json::to_vec(&json!({ | ||||||
|  |         "doggos": [ | ||||||
|  |             { "bernard": true }, | ||||||
|  |             { "michel": 42 }, | ||||||
|  |             false, | ||||||
|  |         ], | ||||||
|  |         "bouvier": true, | ||||||
|  |         "caniche": null, | ||||||
|  |     })) | ||||||
|  |     .unwrap(); | ||||||
|  |     let simple_array = serde_json::to_vec(&json!([ | ||||||
|  |         1, | ||||||
|  |         2, | ||||||
|  |         3, | ||||||
|  |         "viva", | ||||||
|  |         "l\"algeria", | ||||||
|  |         true, | ||||||
|  |         "[array]", | ||||||
|  |         "escaped string \"" | ||||||
|  |     ])) | ||||||
|  |     .unwrap(); | ||||||
|  |     let array_of_array = serde_json::to_vec(&json!([1, [2, [3]]])).unwrap(); | ||||||
|  |     let array_of_object = serde_json::to_vec(&json!([1, [2, [3]], {}])).unwrap(); | ||||||
|  |  | ||||||
|  |     c.bench_function("null", |b| b.iter(|| should_flatten_from_unchecked_slice(&null))); | ||||||
|  |     c.bench_function("true", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_true))); | ||||||
|  |     c.bench_function("false", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_false))); | ||||||
|  |     c.bench_function("integer", |b| b.iter(|| should_flatten_from_unchecked_slice(&integer))); | ||||||
|  |     c.bench_function("float", |b| b.iter(|| should_flatten_from_unchecked_slice(&float))); | ||||||
|  |     c.bench_function("string", |b| b.iter(|| should_flatten_from_unchecked_slice(&string))); | ||||||
|  |     c.bench_function("object", |b| b.iter(|| should_flatten_from_unchecked_slice(&object))); | ||||||
|  |     c.bench_function("complex object", |b| { | ||||||
|  |         b.iter(|| should_flatten_from_unchecked_slice(&complex_object)) | ||||||
|  |     }); | ||||||
|  |     c.bench_function("simple array", |b| { | ||||||
|  |         b.iter(|| should_flatten_from_unchecked_slice(&simple_array)) | ||||||
|  |     }); | ||||||
|  |     c.bench_function("array of array", |b| { | ||||||
|  |         b.iter(|| should_flatten_from_unchecked_slice(&array_of_array)) | ||||||
|  |     }); | ||||||
|  |     c.bench_function("array of object", |b| { | ||||||
|  |         b.iter(|| should_flatten_from_unchecked_slice(&array_of_object)) | ||||||
|  |     }); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | criterion_group!(benches, criterion_benchmark); | ||||||
|  | criterion_main!(benches); | ||||||
							
								
								
									
										27
									
								
								json-depth-checker/fuzz/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								json-depth-checker/fuzz/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | |||||||
|  | [package] | ||||||
|  | name = "json-depth-checker" | ||||||
|  | version = "0.0.0" | ||||||
|  | authors = ["Automatically generated"] | ||||||
|  | publish = false | ||||||
|  | edition = "2018" | ||||||
|  |  | ||||||
|  | [package.metadata] | ||||||
|  | cargo-fuzz = true | ||||||
|  |  | ||||||
|  | [dependencies] | ||||||
|  | libfuzzer-sys = "0.4" | ||||||
|  | arbitrary-json = "0.1.1" | ||||||
|  | serde_json = "1.0.79" | ||||||
|  |  | ||||||
|  | [dependencies.json-depth-checker] | ||||||
|  | path = ".." | ||||||
|  |  | ||||||
|  | # Prevent this from interfering with workspaces | ||||||
|  | [workspace] | ||||||
|  | members = ["."] | ||||||
|  |  | ||||||
|  | [[bin]] | ||||||
|  | name = "depth" | ||||||
|  | path = "fuzz_targets/depth.rs" | ||||||
|  | test = false | ||||||
|  | doc = false | ||||||
							
								
								
									
										13
									
								
								json-depth-checker/fuzz/fuzz_targets/depth.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								json-depth-checker/fuzz/fuzz_targets/depth.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,13 @@ | |||||||
|  | #![no_main] | ||||||
|  | use arbitrary_json::ArbitraryValue; | ||||||
|  | use json_depth_checker::*; | ||||||
|  | use libfuzzer_sys::fuzz_target; | ||||||
|  |  | ||||||
|  | // Differential check: for any generated JSON value, the byte-scanning | ||||||
|  | // `should_flatten_from_unchecked_slice` must give the same answer as the | ||||||
|  | // `Value`-based `should_flatten_from_value`. | ||||||
|  | fuzz_target!(|value: ArbitraryValue| { | ||||||
|  |     let value = serde_json::Value::from(value); | ||||||
|  |     let left = should_flatten_from_value(&value); | ||||||
|  |     let value = serde_json::to_vec(&value).unwrap(); | ||||||
|  |     let right = should_flatten_from_unchecked_slice(&value); | ||||||
|  |  | ||||||
|  |     assert_eq!(left, right); | ||||||
|  | }); | ||||||
							
								
								
									
										114
									
								
								json-depth-checker/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										114
									
								
								json-depth-checker/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,114 @@ | |||||||
|  | use serde_json::Value; | ||||||
|  |  | ||||||
|  | /// Your json MUST BE valid and generated by `serde_json::to_vec` before being | ||||||
|  | /// sent in this function. This function is DUMB and FAST but makes a lot of | ||||||
|  | /// assumptions about the way `serde_json` will generate its input. | ||||||
|  | /// | ||||||
|  | /// Will return `true` if the JSON contains an object, an array of array | ||||||
|  | /// or an array containing an object. Returns `false` for everything else. | ||||||
|  | pub fn should_flatten_from_unchecked_slice(json: &[u8]) -> bool { | ||||||
|  |     if json.is_empty() { | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // since the json we receive has been generated by serde_json we know | ||||||
|  |     // it doesn't contain any whitespace at the beginning thus we can check | ||||||
|  |     // directly if we're looking at an object. | ||||||
|  |     if json[0] == b'{' { | ||||||
|  |         return true; | ||||||
|  |     } else if json[0] != b'[' { | ||||||
|  |         // if the json isn't an object or an array it means it's a simple value. | ||||||
|  |         return false; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // The array case is a little bit more complex. We are looking for a second | ||||||
|  |     // `[` but we need to ensure that it doesn't appear inside of a string. Thus | ||||||
|  |     // we need to keep track of whether we're in a string or not. | ||||||
|  |  | ||||||
|  |     // will be used when we meet a `\` to skip the next character. | ||||||
|  |     let mut skip_next = false; | ||||||
|  |     let mut in_string = false; | ||||||
|  |  | ||||||
|  |     for byte in json.iter().skip(1) { | ||||||
|  |         match byte { | ||||||
|  |             // handle the backslash. | ||||||
|  |             _ if skip_next => skip_next = false, | ||||||
|  |             b'\\' => skip_next = true, | ||||||
|  |  | ||||||
|  |             // handle the strings. | ||||||
|  |             byte if in_string => { | ||||||
|  |                 if *byte == b'"' { | ||||||
|  |                     in_string = false; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             b'"' => in_string = true, | ||||||
|  |  | ||||||
|  |             // handle the arrays. | ||||||
|  |             b'[' => return true, | ||||||
|  |             // since we know the json is valid we don't need to ensure the | ||||||
|  |             // array is correctly closed | ||||||
|  |  | ||||||
|  |             // handle the objects. | ||||||
|  |             b'{' => return true, | ||||||
|  |  | ||||||
|  |             // ignore everything else | ||||||
|  |             _ => (), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     false | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Consider using [`should_flatten_from_unchecked_slice`] when you can. | ||||||
|  | /// Returns `true` if the json is an object, or an array that directly | ||||||
|  | /// contains an array or an object. | ||||||
|  | /// Returns `false` for everything else. | ||||||
|  | /// This function has been written to test the [`should_flatten_from_unchecked_slice`]. | ||||||
|  | pub fn should_flatten_from_value(json: &Value) -> bool { | ||||||
|  |     match json { | ||||||
|  |         Value::Object(..) => true, | ||||||
|  |         Value::Array(array) => array.iter().any(|value| value.is_array() || value.is_object()), | ||||||
|  |         _ => false, | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Every case is checked against both implementations: the `Value`-based one and | ||||||
|  | // the slice-based one fed with the output of `serde_json::to_vec`. | ||||||
|  | #[cfg(test)] | ||||||
|  | mod tests { | ||||||
|  |     use serde_json::*; | ||||||
|  |  | ||||||
|  |     use super::*; | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn test_shouldnt_flatten() { | ||||||
|  |         let shouldnt_flatten = vec![ | ||||||
|  |             json!(null), | ||||||
|  |             json!(true), | ||||||
|  |             json!(false), | ||||||
|  |             json!("a superb string"), | ||||||
|  |             json!("a string escaping other \"string\""), | ||||||
|  |             json!([null, true, false]), | ||||||
|  |             json!(["hello", "world", "!"]), | ||||||
|  |             json!(["a \"string\" escaping 'an other'", "\"[\"", "\"{\""]), | ||||||
|  |         ]; | ||||||
|  |         for value in shouldnt_flatten { | ||||||
|  |             assert!(!should_flatten_from_value(&value)); | ||||||
|  |             let value = serde_json::to_vec(&value).unwrap(); | ||||||
|  |             assert!(!should_flatten_from_unchecked_slice(&value)); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn test_should_flatten() { | ||||||
|  |         let should_flatten = vec![ | ||||||
|  |             json!({}), | ||||||
|  |             json!({ "hello": "world" }), | ||||||
|  |             json!(["hello", ["world"]]), | ||||||
|  |             json!([true, true, true, true, true, true, true, true, true, {}]), | ||||||
|  |         ]; | ||||||
|  |         for value in should_flatten { | ||||||
|  |             assert!(should_flatten_from_value(&value)); | ||||||
|  |             let value = serde_json::to_vec(&value).unwrap(); | ||||||
|  |             assert!(should_flatten_from_unchecked_slice(&value)); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -18,6 +18,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } | |||||||
| grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } | grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } | ||||||
| geoutils = "0.4.1" | geoutils = "0.4.1" | ||||||
| heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | ||||||
|  | json-depth-checker = { path = "../json-depth-checker" } | ||||||
| levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } | levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } | ||||||
| meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" } | meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" } | ||||||
| memmap2 = "0.5.3" | memmap2 = "0.5.3" | ||||||
|   | |||||||
| @@ -286,9 +286,10 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|                     })?; |                     })?; | ||||||
|  |  | ||||||
|                 self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; |                 self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; | ||||||
|                 let buffer = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))?; |                 match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? { | ||||||
|  |                     Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, | ||||||
|                 self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; |                     None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?, | ||||||
|  |                 } | ||||||
|             } else { |             } else { | ||||||
|                 self.new_documents_ids.insert(docid); |                 self.new_documents_ids.insert(docid); | ||||||
|             } |             } | ||||||
| @@ -300,8 +301,12 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             if let Some(flatten) = flattened_document { |             if let Some(flatten) = flattened_document { | ||||||
|                 self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; |                 self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; | ||||||
|             } else { |             } else { | ||||||
|                 let buffer = self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))?; |                 match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { | ||||||
|                 self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; |                     Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, | ||||||
|  |                     None => { | ||||||
|  |                         self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())? | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             progress_callback(UpdateIndexingStep::RemapDocumentAddition { |             progress_callback(UpdateIndexingStep::RemapDocumentAddition { | ||||||
| @@ -326,8 +331,15 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     // Flatten a document from the fields ids map contained in self and insert the new |     // Flatten a document from the fields ids map contained in self and insert the new | ||||||
|     // created fields. |     // created fields. Returns `None` if the document doesn't need to be flattened. | ||||||
|     fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Vec<u8>> { |     fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> { | ||||||
|  |         if obkv | ||||||
|  |             .iter() | ||||||
|  |             .all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value)) | ||||||
|  |         { | ||||||
|  |             return Ok(None); | ||||||
|  |         } | ||||||
|  |  | ||||||
|         let mut doc = serde_json::Map::new(); |         let mut doc = serde_json::Map::new(); | ||||||
|  |  | ||||||
|         for (k, v) in obkv.iter() { |         for (k, v) in obkv.iter() { | ||||||
| @@ -357,7 +369,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             writer.insert(fid, &value)?; |             writer.insert(fid, &value)?; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         Ok(buffer) |         Ok(Some(buffer)) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     // Flatten a document from a field mapping generated by [create_fields_mapping] |     // Flatten a document from a field mapping generated by [create_fields_mapping] | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user