mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-24 20:46:27 +00:00 
			
		
		
		
	Merge pull request #494 from meilisearch/flatten-what-is-needed
Only flatten the required objects
This commit is contained in:
		| @@ -1,6 +1,6 @@ | ||||
| [workspace] | ||||
| resolver = "2" | ||||
| members = ["milli", "filter-parser", "flatten-serde-json", "http-ui", "benchmarks", "infos", "helpers", "cli"] | ||||
| members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", "http-ui", "benchmarks", "infos", "helpers", "cli"] | ||||
| default-members = ["milli"] | ||||
|  | ||||
| [profile.dev] | ||||
|   | ||||
							
								
								
									
										16
									
								
								json-depth-checker/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								json-depth-checker/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | ||||
| [package] | ||||
| name = "json-depth-checker" | ||||
| version = "0.1.0" | ||||
| edition = "2021" | ||||
|  | ||||
| # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | ||||
|  | ||||
| [dependencies] | ||||
| serde_json = "1.0" | ||||
|  | ||||
| [dev-dependencies] | ||||
| criterion = "0.3" | ||||
|  | ||||
| [[bench]] | ||||
| name = "depth" | ||||
| harness = false | ||||
							
								
								
									
										59
									
								
								json-depth-checker/benches/depth.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										59
									
								
								json-depth-checker/benches/depth.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,59 @@ | ||||
| use criterion::{criterion_group, criterion_main, Criterion}; | ||||
| use json_depth_checker::should_flatten_from_unchecked_slice; | ||||
| use serde_json::json; | ||||
|  | ||||
| fn criterion_benchmark(c: &mut Criterion) { | ||||
|     let null = serde_json::to_vec(&json!(null)).unwrap(); | ||||
|     let bool_true = serde_json::to_vec(&json!(true)).unwrap(); | ||||
|     let bool_false = serde_json::to_vec(&json!(false)).unwrap(); | ||||
|     let integer = serde_json::to_vec(&json!(42)).unwrap(); | ||||
|     let float = serde_json::to_vec(&json!(1456.258)).unwrap(); | ||||
|     let string = serde_json::to_vec(&json!("hello world")).unwrap(); | ||||
|     let object = serde_json::to_vec(&json!({ "hello": "world",})).unwrap(); | ||||
|     let complex_object = serde_json::to_vec(&json!({ | ||||
|         "doggos": [ | ||||
|             { "bernard": true }, | ||||
|             { "michel": 42 }, | ||||
|             false, | ||||
|         ], | ||||
|         "bouvier": true, | ||||
|         "caniche": null, | ||||
|     })) | ||||
|     .unwrap(); | ||||
|     let simple_array = serde_json::to_vec(&json!([ | ||||
|         1, | ||||
|         2, | ||||
|         3, | ||||
|         "viva", | ||||
|         "l\"algeria", | ||||
|         true, | ||||
|         "[array]", | ||||
|         "escaped string \"" | ||||
|     ])) | ||||
|     .unwrap(); | ||||
|     let array_of_array = serde_json::to_vec(&json!([1, [2, [3]]])).unwrap(); | ||||
|     let array_of_object = serde_json::to_vec(&json!([1, [2, [3]], {}])).unwrap(); | ||||
|  | ||||
|     c.bench_function("null", |b| b.iter(|| should_flatten_from_unchecked_slice(&null))); | ||||
|     c.bench_function("true", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_true))); | ||||
|     c.bench_function("false", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_false))); | ||||
|     c.bench_function("integer", |b| b.iter(|| should_flatten_from_unchecked_slice(&integer))); | ||||
|     c.bench_function("float", |b| b.iter(|| should_flatten_from_unchecked_slice(&float))); | ||||
|     c.bench_function("string", |b| b.iter(|| should_flatten_from_unchecked_slice(&string))); | ||||
|     c.bench_function("object", |b| b.iter(|| should_flatten_from_unchecked_slice(&object))); | ||||
|     c.bench_function("complex object", |b| { | ||||
|         b.iter(|| should_flatten_from_unchecked_slice(&complex_object)) | ||||
|     }); | ||||
|     c.bench_function("simple array", |b| { | ||||
|         b.iter(|| should_flatten_from_unchecked_slice(&simple_array)) | ||||
|     }); | ||||
|     c.bench_function("array of array", |b| { | ||||
|         b.iter(|| should_flatten_from_unchecked_slice(&array_of_array)) | ||||
|     }); | ||||
|     c.bench_function("array of object", |b| { | ||||
|         b.iter(|| should_flatten_from_unchecked_slice(&array_of_object)) | ||||
|     }); | ||||
| } | ||||
|  | ||||
| criterion_group!(benches, criterion_benchmark); | ||||
| criterion_main!(benches); | ||||
							
								
								
									
										27
									
								
								json-depth-checker/fuzz/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								json-depth-checker/fuzz/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | ||||
| [package] | ||||
| name = "json-depth-checker" | ||||
| version = "0.0.0" | ||||
| authors = ["Automatically generated"] | ||||
| publish = false | ||||
| edition = "2018" | ||||
|  | ||||
| [package.metadata] | ||||
| cargo-fuzz = true | ||||
|  | ||||
| [dependencies] | ||||
| libfuzzer-sys = "0.4" | ||||
| arbitrary-json = "0.1.1" | ||||
| serde_json = "1.0.79" | ||||
|  | ||||
| [dependencies.json-depth-checker] | ||||
| path = ".." | ||||
|  | ||||
| # Prevent this from interfering with workspaces | ||||
| [workspace] | ||||
| members = ["."] | ||||
|  | ||||
| [[bin]] | ||||
| name = "depth" | ||||
| path = "fuzz_targets/depth.rs" | ||||
| test = false | ||||
| doc = false | ||||
							
								
								
									
										13
									
								
								json-depth-checker/fuzz/fuzz_targets/depth.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								json-depth-checker/fuzz/fuzz_targets/depth.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,13 @@ | ||||
| #![no_main] | ||||
| use arbitrary_json::ArbitraryValue; | ||||
| use json_depth_checker::*; | ||||
| use libfuzzer_sys::fuzz_target; | ||||
|  | ||||
| fuzz_target!(|value: ArbitraryValue| { | ||||
|     let value = serde_json::Value::from(value); | ||||
|     let left = should_flatten_from_value(&value); | ||||
|     let value = serde_json::to_vec(&value).unwrap(); | ||||
|     let right = should_flatten_from_unchecked_slice(&value); | ||||
|  | ||||
|     assert_eq!(left, right); | ||||
| }); | ||||
							
								
								
									
										114
									
								
								json-depth-checker/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										114
									
								
								json-depth-checker/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,114 @@ | ||||
| use serde_json::Value; | ||||
|  | ||||
| /// Your json MUST BE valid and generated by `serde_json::to_vec` before being | ||||
| /// sent in this function. This function is DUMB and FAST but makes a lot of | ||||
| /// asumption about the way `serde_json` will generate its input. | ||||
| /// | ||||
| /// Will return `true` if the JSON contains an object, an array of array | ||||
| /// or an array containing an object. Returns `false` for everything else. | ||||
| pub fn should_flatten_from_unchecked_slice(json: &[u8]) -> bool { | ||||
|     if json.is_empty() { | ||||
|         return false; | ||||
|     } | ||||
|  | ||||
|     // since the json we receive has been generated by serde_json we know | ||||
|     // it doesn't contains any whitespace at the beginning thus we can check | ||||
|     // directly if we're looking at an object. | ||||
|     if json[0] == b'{' { | ||||
|         return true; | ||||
|     } else if json[0] != b'[' { | ||||
|         // if the json isn't an object or an array it means it's a simple value. | ||||
|         return false; | ||||
|     } | ||||
|  | ||||
|     // The array case is a little bit more complex. We are looking for a second | ||||
|     // `[` but we need to ensure that it doesn't appear inside of a string. Thus | ||||
|     // we need to keep track of if we're in a string or not. | ||||
|  | ||||
|     // will be used when we met a `\` to skip the next character. | ||||
|     let mut skip_next = false; | ||||
|     let mut in_string = false; | ||||
|  | ||||
|     for byte in json.iter().skip(1) { | ||||
|         match byte { | ||||
|             // handle the backlash. | ||||
|             _ if skip_next => skip_next = false, | ||||
|             b'\\' => skip_next = true, | ||||
|  | ||||
|             // handle the strings. | ||||
|             byte if in_string => { | ||||
|                 if *byte == b'"' { | ||||
|                     in_string = false; | ||||
|                 } | ||||
|             } | ||||
|             b'"' => in_string = true, | ||||
|  | ||||
|             // handle the arrays. | ||||
|             b'[' => return true, | ||||
|             // since we know the json is valid we don't need to ensure the | ||||
|             // array is correctly closed | ||||
|  | ||||
|             // handle the objects. | ||||
|             b'{' => return true, | ||||
|  | ||||
|             // ignore everything else | ||||
|             _ => (), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     false | ||||
| } | ||||
|  | ||||
| /// Consider using [`should_flatten_from_unchecked_slice`] when you can. | ||||
| /// Will returns `true` if the json contains an object, an array of array | ||||
| /// or an array containing an object. | ||||
| /// Returns `false` for everything else. | ||||
| /// This function has been written to test the [`should_flatten_from_unchecked_slice`]. | ||||
| pub fn should_flatten_from_value(json: &Value) -> bool { | ||||
|     match json { | ||||
|         Value::Object(..) => true, | ||||
|         Value::Array(array) => array.iter().any(|value| value.is_array() || value.is_object()), | ||||
|         _ => false, | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use serde_json::*; | ||||
|  | ||||
|     use super::*; | ||||
|  | ||||
|     #[test] | ||||
|     fn test_shouldnt_flatten() { | ||||
|         let shouldnt_flatten = vec![ | ||||
|             json!(null), | ||||
|             json!(true), | ||||
|             json!(false), | ||||
|             json!("a superb string"), | ||||
|             json!("a string escaping other \"string\""), | ||||
|             json!([null, true, false]), | ||||
|             json!(["hello", "world", "!"]), | ||||
|             json!(["a \"string\" escaping 'an other'", "\"[\"", "\"{\""]), | ||||
|         ]; | ||||
|         for value in shouldnt_flatten { | ||||
|             assert!(!should_flatten_from_value(&value)); | ||||
|             let value = serde_json::to_vec(&value).unwrap(); | ||||
|             assert!(!should_flatten_from_unchecked_slice(&value)); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn test_should_flatten() { | ||||
|         let should_flatten = vec![ | ||||
|             json!({}), | ||||
|             json!({ "hello": "world" }), | ||||
|             json!(["hello", ["world"]]), | ||||
|             json!([true, true, true, true, true, true, true, true, true, {}]), | ||||
|         ]; | ||||
|         for value in should_flatten { | ||||
|             assert!(should_flatten_from_value(&value)); | ||||
|             let value = serde_json::to_vec(&value).unwrap(); | ||||
|             assert!(should_flatten_from_unchecked_slice(&value)); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| @@ -18,6 +18,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } | ||||
| grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } | ||||
| geoutils = "0.4.1" | ||||
| heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | ||||
| json-depth-checker = { path = "../json-depth-checker" } | ||||
| levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } | ||||
| meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" } | ||||
| memmap2 = "0.5.3" | ||||
|   | ||||
| @@ -286,9 +286,10 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|                     })?; | ||||
|  | ||||
|                 self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; | ||||
|                 let buffer = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))?; | ||||
|  | ||||
|                 self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; | ||||
|                 match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? { | ||||
|                     Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, | ||||
|                     None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?, | ||||
|                 } | ||||
|             } else { | ||||
|                 self.new_documents_ids.insert(docid); | ||||
|             } | ||||
| @@ -300,8 +301,12 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             if let Some(flatten) = flattened_document { | ||||
|                 self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; | ||||
|             } else { | ||||
|                 let buffer = self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))?; | ||||
|                 self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; | ||||
|                 match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { | ||||
|                     Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, | ||||
|                     None => { | ||||
|                         self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())? | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             progress_callback(UpdateIndexingStep::RemapDocumentAddition { | ||||
| @@ -326,8 +331,15 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|     } | ||||
|  | ||||
|     // Flatten a document from the fields ids map contained in self and insert the new | ||||
|     // created fields. | ||||
|     fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Vec<u8>> { | ||||
|     // created fields. Returns `None` if the document doesn't need to be flattened. | ||||
|     fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> { | ||||
|         if obkv | ||||
|             .iter() | ||||
|             .all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value)) | ||||
|         { | ||||
|             return Ok(None); | ||||
|         } | ||||
|  | ||||
|         let mut doc = serde_json::Map::new(); | ||||
|  | ||||
|         for (k, v) in obkv.iter() { | ||||
| @@ -357,7 +369,7 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             writer.insert(fid, &value)?; | ||||
|         } | ||||
|  | ||||
|         Ok(buffer) | ||||
|         Ok(Some(buffer)) | ||||
|     } | ||||
|  | ||||
|     // Flatten a document from a field mapping generated by [create_fields_mapping] | ||||
|   | ||||
		Reference in New Issue
	
	Block a user