mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 13:36:27 +00:00 
			
		
		
		
	optimize document transform
fix error types bump milli
This commit is contained in:
		
							
								
								
									
										14
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										14
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -1763,13 +1763,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" | |||||||
| checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" | checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "memmap" | name = "memmap2" | ||||||
| version = "0.7.0" | version = "0.5.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" | checksum = "4647a11b578fead29cdbb34d4adef8dd3dc35b876c9c6d5240d83f205abfe96e" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "libc", |  "libc", | ||||||
|  "winapi", |  | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| @@ -1783,8 +1782,8 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "milli" | name = "milli" | ||||||
| version = "0.17.2" | version = "0.19.0" | ||||||
| source = "git+https://github.com/meilisearch/milli.git?tag=v0.17.3#1e8acaa20b323a198229ad8ede96d045072e45c8" | source = "git+https://github.com/meilisearch/milli.git?tag=v0.19.0#d7943fe22553b8205b86c32a0f2656d9e42de351" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "bimap", |  "bimap", | ||||||
|  "bincode", |  "bincode", | ||||||
| @@ -1793,6 +1792,7 @@ dependencies = [ | |||||||
|  "chrono", |  "chrono", | ||||||
|  "concat-arrays", |  "concat-arrays", | ||||||
|  "crossbeam-channel", |  "crossbeam-channel", | ||||||
|  |  "csv", | ||||||
|  "either", |  "either", | ||||||
|  "flate2", |  "flate2", | ||||||
|  "fst", |  "fst", | ||||||
| @@ -1807,7 +1807,7 @@ dependencies = [ | |||||||
|  "log", |  "log", | ||||||
|  "logging_timer", |  "logging_timer", | ||||||
|  "meilisearch-tokenizer", |  "meilisearch-tokenizer", | ||||||
|  "memmap", |  "memmap2", | ||||||
|  "obkv", |  "obkv", | ||||||
|  "once_cell", |  "once_cell", | ||||||
|  "ordered-float", |  "ordered-float", | ||||||
|   | |||||||
| @@ -30,7 +30,7 @@ lazy_static = "1.4.0" | |||||||
| log = "0.4.14" | log = "0.4.14" | ||||||
| meilisearch-error = { path = "../meilisearch-error" } | meilisearch-error = { path = "../meilisearch-error" } | ||||||
| meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.5" } | meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.5" } | ||||||
| milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.17.3" } | milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.19.0" } | ||||||
| mime = "0.3.16" | mime = "0.3.16" | ||||||
| num_cpus = "1.13.0" | num_cpus = "1.13.0" | ||||||
| once_cell = "1.8.0" | once_cell = "1.8.0" | ||||||
|   | |||||||
| @@ -1,10 +1,8 @@ | |||||||
| use std::fmt; | use std::fmt; | ||||||
| use std::io::{self, Read, Result as IoResult, Seek, Write}; | use std::io::{self, BufRead, BufReader, BufWriter, Cursor, Read, Seek, Write}; | ||||||
|  |  | ||||||
| use csv::{Reader as CsvReader, StringRecordsIntoIter}; |  | ||||||
| use meilisearch_error::{Code, ErrorCode}; | use meilisearch_error::{Code, ErrorCode}; | ||||||
| use milli::documents::DocumentBatchBuilder; | use milli::documents::DocumentBatchBuilder; | ||||||
| use serde_json::{Deserializer, Map, Value}; |  | ||||||
|  |  | ||||||
| type Result<T> = std::result::Result<T, DocumentFormatError>; | type Result<T> = std::result::Result<T, DocumentFormatError>; | ||||||
|  |  | ||||||
| @@ -36,6 +34,15 @@ pub enum DocumentFormatError { | |||||||
|     ), |     ), | ||||||
| } | } | ||||||
|  |  | ||||||
|  | impl From<(PayloadType, milli::documents::Error)> for DocumentFormatError { | ||||||
|  |     fn from((ty, error): (PayloadType, milli::documents::Error)) -> Self { | ||||||
|  |         match error { | ||||||
|  |             milli::documents::Error::Io(e) => Self::Internal(Box::new(e)), | ||||||
|  |             e => Self::MalformedPayload(Box::new(e), ty), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| impl ErrorCode for DocumentFormatError { | impl ErrorCode for DocumentFormatError { | ||||||
|     fn error_code(&self) -> Code { |     fn error_code(&self) -> Code { | ||||||
|         match self { |         match self { | ||||||
| @@ -45,330 +52,47 @@ impl ErrorCode for DocumentFormatError { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| internal_error!(DocumentFormatError: milli::documents::Error, io::Error); | internal_error!(DocumentFormatError: io::Error); | ||||||
|  |  | ||||||
| macro_rules! malformed { |  | ||||||
|     ($type:path, $e:expr) => { |  | ||||||
|         $e.map_err(|e| DocumentFormatError::MalformedPayload(Box::new(e), $type)) |  | ||||||
|     }; |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  | /// reads csv from input and writes an obkv batch to writer. | ||||||
| pub fn read_csv(input: impl Read, writer: impl Write + Seek) -> Result<()> { | pub fn read_csv(input: impl Read, writer: impl Write + Seek) -> Result<()> { | ||||||
|     let mut builder = DocumentBatchBuilder::new(writer).unwrap(); |     let writer = BufWriter::new(writer); | ||||||
|  |     DocumentBatchBuilder::from_csv(input, writer) | ||||||
|     let iter = CsvDocumentIter::from_reader(input)?; |         .map_err(|e| (PayloadType::Csv, e))? | ||||||
|     for doc in iter { |         .finish() | ||||||
|         let doc = doc?; |         .map_err(|e| (PayloadType::Csv, e))?; | ||||||
|         builder.add_documents(doc).unwrap(); |  | ||||||
|     } |  | ||||||
|     builder.finish().unwrap(); |  | ||||||
|  |  | ||||||
|     Ok(()) |     Ok(()) | ||||||
| } | } | ||||||
|  |  | ||||||
| /// read jsonl from input and write an obkv batch to writer. | /// reads jsonl from input and writes an obkv batch to writer. | ||||||
| pub fn read_ndjson(input: impl Read, writer: impl Write + Seek) -> Result<()> { | pub fn read_ndjson(input: impl Read, writer: impl Write + Seek) -> Result<()> { | ||||||
|     let mut builder = DocumentBatchBuilder::new(writer)?; |     let mut reader = BufReader::new(input); | ||||||
|     let stream = Deserializer::from_reader(input).into_iter::<Map<String, Value>>(); |     let writer = BufWriter::new(writer); | ||||||
|  |  | ||||||
|     for value in stream { |     let mut builder = DocumentBatchBuilder::new(writer).map_err(|e| (PayloadType::Ndjson, e))?; | ||||||
|         let value = malformed!(PayloadType::Ndjson, value)?; |     let mut buf = String::new(); | ||||||
|         builder.add_documents(&value)?; |  | ||||||
|  |     while reader.read_line(&mut buf)? > 0 { | ||||||
|  |         builder | ||||||
|  |             .extend_from_json(Cursor::new(&buf.as_bytes())) | ||||||
|  |             .map_err(|e| (PayloadType::Ndjson, e))?; | ||||||
|  |         buf.clear(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     builder.finish()?; |     builder.finish().map_err(|e| (PayloadType::Ndjson, e))?; | ||||||
|  |  | ||||||
|     Ok(()) |     Ok(()) | ||||||
| } | } | ||||||
|  |  | ||||||
| /// read json from input and write an obkv batch to writer. | /// reads json from input and writes an obkv batch to writer. | ||||||
| pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<()> { | pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<()> { | ||||||
|     let mut builder = DocumentBatchBuilder::new(writer).unwrap(); |     let writer = BufWriter::new(writer); | ||||||
|  |     let mut builder = DocumentBatchBuilder::new(writer).map_err(|e| (PayloadType::Json, e))?; | ||||||
|     let documents: Vec<Map<String, Value>> = |     builder | ||||||
|         malformed!(PayloadType::Json, serde_json::from_reader(input))?; |         .extend_from_json(input) | ||||||
|     builder.add_documents(documents).unwrap(); |         .map_err(|e| (PayloadType::Json, e))?; | ||||||
|     builder.finish().unwrap(); |     builder.finish().map_err(|e| (PayloadType::Json, e))?; | ||||||
|  |  | ||||||
|     Ok(()) |     Ok(()) | ||||||
| } | } | ||||||
|  |  | ||||||
| enum AllowedType { |  | ||||||
|     String, |  | ||||||
|     Number, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| fn parse_csv_header(header: &str) -> (String, AllowedType) { |  | ||||||
|     // if there are several separators we only split on the last one. |  | ||||||
|     match header.rsplit_once(':') { |  | ||||||
|         Some((field_name, field_type)) => match field_type { |  | ||||||
|             "string" => (field_name.to_string(), AllowedType::String), |  | ||||||
|             "number" => (field_name.to_string(), AllowedType::Number), |  | ||||||
|             // if the pattern isn't recognized, we keep the whole field. |  | ||||||
|             _otherwise => (header.to_string(), AllowedType::String), |  | ||||||
|         }, |  | ||||||
|         None => (header.to_string(), AllowedType::String), |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct CsvDocumentIter<R> |  | ||||||
| where |  | ||||||
|     R: Read, |  | ||||||
| { |  | ||||||
|     documents: StringRecordsIntoIter<R>, |  | ||||||
|     headers: Vec<(String, AllowedType)>, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<R: Read> CsvDocumentIter<R> { |  | ||||||
|     pub fn from_reader(reader: R) -> IoResult<Self> { |  | ||||||
|         let mut records = CsvReader::from_reader(reader); |  | ||||||
|  |  | ||||||
|         let headers = records |  | ||||||
|             .headers()? |  | ||||||
|             .into_iter() |  | ||||||
|             .map(parse_csv_header) |  | ||||||
|             .collect(); |  | ||||||
|  |  | ||||||
|         Ok(Self { |  | ||||||
|             documents: records.into_records(), |  | ||||||
|             headers, |  | ||||||
|         }) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<R: Read> Iterator for CsvDocumentIter<R> { |  | ||||||
|     type Item = Result<Map<String, Value>>; |  | ||||||
|  |  | ||||||
|     fn next(&mut self) -> Option<Self::Item> { |  | ||||||
|         let csv_document = self.documents.next()?; |  | ||||||
|  |  | ||||||
|         match csv_document { |  | ||||||
|             Ok(csv_document) => { |  | ||||||
|                 let mut document = Map::new(); |  | ||||||
|  |  | ||||||
|                 for ((field_name, field_type), value) in |  | ||||||
|                     self.headers.iter().zip(csv_document.into_iter()) |  | ||||||
|                 { |  | ||||||
|                     let parsed_value = match field_type { |  | ||||||
|                         AllowedType::Number => { |  | ||||||
|                             malformed!(PayloadType::Csv, value.parse::<f64>().map(Value::from)) |  | ||||||
|                         } |  | ||||||
|                         AllowedType::String => Ok(Value::String(value.to_string())), |  | ||||||
|                     }; |  | ||||||
|  |  | ||||||
|                     match parsed_value { |  | ||||||
|                         Ok(value) => drop(document.insert(field_name.to_string(), value)), |  | ||||||
|                         Err(e) => return Some(Err(e)), |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|  |  | ||||||
|                 Some(Ok(document)) |  | ||||||
|             } |  | ||||||
|             Err(e) => Some(Err(DocumentFormatError::MalformedPayload( |  | ||||||
|                 Box::new(e), |  | ||||||
|                 PayloadType::Csv, |  | ||||||
|             ))), |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[cfg(test)] |  | ||||||
| mod test { |  | ||||||
|     use serde_json::json; |  | ||||||
|  |  | ||||||
|     use super::*; |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn simple_csv_document() { |  | ||||||
|         let documents = r#"city,country,pop |  | ||||||
| "Boston","United States","4628910""#; |  | ||||||
|  |  | ||||||
|         let mut csv_iter = CsvDocumentIter::from_reader(documents.as_bytes()).unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!( |  | ||||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), |  | ||||||
|             json!({ |  | ||||||
|                 "city": "Boston", |  | ||||||
|                 "country": "United States", |  | ||||||
|                 "pop": "4628910", |  | ||||||
|             }) |  | ||||||
|         ); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn coma_in_field() { |  | ||||||
|         let documents = r#"city,country,pop |  | ||||||
| "Boston","United, States","4628910""#; |  | ||||||
|  |  | ||||||
|         let mut csv_iter = CsvDocumentIter::from_reader(documents.as_bytes()).unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!( |  | ||||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), |  | ||||||
|             json!({ |  | ||||||
|                 "city": "Boston", |  | ||||||
|                 "country": "United, States", |  | ||||||
|                 "pop": "4628910", |  | ||||||
|             }) |  | ||||||
|         ); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn quote_in_field() { |  | ||||||
|         let documents = r#"city,country,pop |  | ||||||
| "Boston","United"" States","4628910""#; |  | ||||||
|  |  | ||||||
|         let mut csv_iter = CsvDocumentIter::from_reader(documents.as_bytes()).unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!( |  | ||||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), |  | ||||||
|             json!({ |  | ||||||
|                 "city": "Boston", |  | ||||||
|                 "country": "United\" States", |  | ||||||
|                 "pop": "4628910", |  | ||||||
|             }) |  | ||||||
|         ); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn integer_in_field() { |  | ||||||
|         let documents = r#"city,country,pop:number |  | ||||||
| "Boston","United States","4628910""#; |  | ||||||
|  |  | ||||||
|         let mut csv_iter = CsvDocumentIter::from_reader(documents.as_bytes()).unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!( |  | ||||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), |  | ||||||
|             json!({ |  | ||||||
|                 "city": "Boston", |  | ||||||
|                 "country": "United States", |  | ||||||
|                 "pop": 4628910.0, |  | ||||||
|             }) |  | ||||||
|         ); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn float_in_field() { |  | ||||||
|         let documents = r#"city,country,pop:number |  | ||||||
| "Boston","United States","4628910.01""#; |  | ||||||
|  |  | ||||||
|         let mut csv_iter = CsvDocumentIter::from_reader(documents.as_bytes()).unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!( |  | ||||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), |  | ||||||
|             json!({ |  | ||||||
|                 "city": "Boston", |  | ||||||
|                 "country": "United States", |  | ||||||
|                 "pop": 4628910.01, |  | ||||||
|             }) |  | ||||||
|         ); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn several_colon_in_header() { |  | ||||||
|         let documents = r#"city:love:string,country:state,pop |  | ||||||
| "Boston","United States","4628910""#; |  | ||||||
|  |  | ||||||
|         let mut csv_iter = CsvDocumentIter::from_reader(documents.as_bytes()).unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!( |  | ||||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), |  | ||||||
|             json!({ |  | ||||||
|                 "city:love": "Boston", |  | ||||||
|                 "country:state": "United States", |  | ||||||
|                 "pop": "4628910", |  | ||||||
|             }) |  | ||||||
|         ); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn ending_by_colon_in_header() { |  | ||||||
|         let documents = r#"city:,country,pop |  | ||||||
| "Boston","United States","4628910""#; |  | ||||||
|  |  | ||||||
|         let mut csv_iter = CsvDocumentIter::from_reader(documents.as_bytes()).unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!( |  | ||||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), |  | ||||||
|             json!({ |  | ||||||
|                 "city:": "Boston", |  | ||||||
|                 "country": "United States", |  | ||||||
|                 "pop": "4628910", |  | ||||||
|             }) |  | ||||||
|         ); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn starting_by_colon_in_header() { |  | ||||||
|         let documents = r#":city,country,pop |  | ||||||
| "Boston","United States","4628910""#; |  | ||||||
|  |  | ||||||
|         let mut csv_iter = CsvDocumentIter::from_reader(documents.as_bytes()).unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!( |  | ||||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), |  | ||||||
|             json!({ |  | ||||||
|                 ":city": "Boston", |  | ||||||
|                 "country": "United States", |  | ||||||
|                 "pop": "4628910", |  | ||||||
|             }) |  | ||||||
|         ); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[ignore] |  | ||||||
|     #[test] |  | ||||||
|     fn starting_by_colon_in_header2() { |  | ||||||
|         let documents = r#":string,country,pop |  | ||||||
| "Boston","United States","4628910""#; |  | ||||||
|  |  | ||||||
|         let mut csv_iter = CsvDocumentIter::from_reader(documents.as_bytes()).unwrap(); |  | ||||||
|  |  | ||||||
|         assert!(csv_iter.next().unwrap().is_err()); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn double_colon_in_header() { |  | ||||||
|         let documents = r#"city::string,country,pop |  | ||||||
| "Boston","United States","4628910""#; |  | ||||||
|  |  | ||||||
|         let mut csv_iter = CsvDocumentIter::from_reader(documents.as_bytes()).unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!( |  | ||||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), |  | ||||||
|             json!({ |  | ||||||
|                 "city:": "Boston", |  | ||||||
|                 "country": "United States", |  | ||||||
|                 "pop": "4628910", |  | ||||||
|             }) |  | ||||||
|         ); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn bad_type_in_header() { |  | ||||||
|         let documents = r#"city,country:number,pop |  | ||||||
| "Boston","United States","4628910""#; |  | ||||||
|  |  | ||||||
|         let mut csv_iter = CsvDocumentIter::from_reader(documents.as_bytes()).unwrap(); |  | ||||||
|  |  | ||||||
|         assert!(csv_iter.next().unwrap().is_err()); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn bad_column_count1() { |  | ||||||
|         let documents = r#"city,country,pop |  | ||||||
| "Boston","United States","4628910", "too much""#; |  | ||||||
|  |  | ||||||
|         let mut csv_iter = CsvDocumentIter::from_reader(documents.as_bytes()).unwrap(); |  | ||||||
|  |  | ||||||
|         assert!(csv_iter.next().unwrap().is_err()); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn bad_column_count2() { |  | ||||||
|         let documents = r#"city,country,pop |  | ||||||
| "Boston","United States""#; |  | ||||||
|  |  | ||||||
|         let mut csv_iter = CsvDocumentIter::from_reader(documents.as_bytes()).unwrap(); |  | ||||||
|  |  | ||||||
|         assert!(csv_iter.next().unwrap().is_err()); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|   | |||||||
| @@ -149,7 +149,7 @@ impl UpdateFileStore { | |||||||
|         // for jsonl for example...) |         // for jsonl for example...) | ||||||
|         while let Some((index, document)) = document_reader.next_document_with_index()? { |         while let Some((index, document)) = document_reader.next_document_with_index()? { | ||||||
|             for (field_id, content) in document.iter() { |             for (field_id, content) in document.iter() { | ||||||
|                 if let Some(field_name) = index.get_by_left(&field_id) { |                 if let Some(field_name) = index.name(field_id) { | ||||||
|                     let content = serde_json::from_slice(content)?; |                     let content = serde_json::from_slice(content)?; | ||||||
|                     document_buffer.insert(field_name.to_string(), content); |                     document_buffer.insert(field_name.to_string(), content); | ||||||
|                 } |                 } | ||||||
|   | |||||||
| @@ -3,15 +3,13 @@ mod message; | |||||||
| pub mod status; | pub mod status; | ||||||
| pub mod store; | pub mod store; | ||||||
|  |  | ||||||
| use std::io::{self, BufRead, BufReader}; | use std::io::Cursor; | ||||||
| use std::path::{Path, PathBuf}; | use std::path::{Path, PathBuf}; | ||||||
| use std::sync::atomic::AtomicBool; | use std::sync::atomic::AtomicBool; | ||||||
| use std::sync::Arc; | use std::sync::Arc; | ||||||
|  |  | ||||||
| use actix_web::error::PayloadError; |  | ||||||
| use async_stream::stream; | use async_stream::stream; | ||||||
| use bytes::Bytes; | use futures::StreamExt; | ||||||
| use futures::{Stream, StreamExt}; |  | ||||||
| use log::trace; | use log::trace; | ||||||
| use milli::update::IndexDocumentsMethod; | use milli::update::IndexDocumentsMethod; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
| @@ -51,48 +49,6 @@ where | |||||||
|     Ok(sender) |     Ok(sender) | ||||||
| } | } | ||||||
|  |  | ||||||
| /// A wrapper type to implement read on a `Stream<Result<Bytes, Error>>`. |  | ||||||
| struct StreamReader<S> { |  | ||||||
|     stream: S, |  | ||||||
|     current: Option<Bytes>, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<S> StreamReader<S> { |  | ||||||
|     fn new(stream: S) -> Self { |  | ||||||
|         Self { |  | ||||||
|             stream, |  | ||||||
|             current: None, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<S: Stream<Item = std::result::Result<Bytes, PayloadError>> + Unpin> io::Read |  | ||||||
|     for StreamReader<S> |  | ||||||
| { |  | ||||||
|     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { |  | ||||||
|         // TODO: optimize buf filling |  | ||||||
|         match self.current.take() { |  | ||||||
|             Some(mut bytes) => { |  | ||||||
|                 let split_at = bytes.len().min(buf.len()); |  | ||||||
|                 let copied = bytes.split_to(split_at); |  | ||||||
|                 buf[..split_at].copy_from_slice(&copied); |  | ||||||
|                 if !bytes.is_empty() { |  | ||||||
|                     self.current.replace(bytes); |  | ||||||
|                 } |  | ||||||
|                 Ok(copied.len()) |  | ||||||
|             } |  | ||||||
|             None => match tokio::runtime::Handle::current().block_on(self.stream.next()) { |  | ||||||
|                 Some(Ok(bytes)) => { |  | ||||||
|                     self.current.replace(bytes); |  | ||||||
|                     self.read(buf) |  | ||||||
|                 } |  | ||||||
|                 Some(Err(e)) => Err(io::Error::new(io::ErrorKind::BrokenPipe, e)), |  | ||||||
|                 None => Ok(0), |  | ||||||
|             }, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct UpdateLoop { | pub struct UpdateLoop { | ||||||
|     store: Arc<UpdateStore>, |     store: Arc<UpdateStore>, | ||||||
|     inbox: Option<mpsc::Receiver<UpdateMsg>>, |     inbox: Option<mpsc::Receiver<UpdateMsg>>, | ||||||
| @@ -196,20 +152,28 @@ impl UpdateLoop { | |||||||
|     async fn handle_update(&self, index_uuid: Uuid, update: Update) -> Result<UpdateStatus> { |     async fn handle_update(&self, index_uuid: Uuid, update: Update) -> Result<UpdateStatus> { | ||||||
|         let registration = match update { |         let registration = match update { | ||||||
|             Update::DocumentAddition { |             Update::DocumentAddition { | ||||||
|                 payload, |                 mut payload, | ||||||
|                 primary_key, |                 primary_key, | ||||||
|                 method, |                 method, | ||||||
|                 format, |                 format, | ||||||
|             } => { |             } => { | ||||||
|                 let mut reader = BufReader::new(StreamReader::new(payload)); |                 let mut buffer = Vec::new(); | ||||||
|  |                 while let Some(bytes) = payload.next().await { | ||||||
|  |                     match bytes { | ||||||
|  |                         Ok(bytes) => { | ||||||
|  |                             buffer.extend_from_slice(&bytes); | ||||||
|  |                         } | ||||||
|  |                         Err(e) => return Err(e.into()), | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|                 let (content_uuid, mut update_file) = self.update_file_store.new_update()?; |                 let (content_uuid, mut update_file) = self.update_file_store.new_update()?; | ||||||
|                 tokio::task::spawn_blocking(move || -> Result<_> { |                 tokio::task::spawn_blocking(move || -> Result<_> { | ||||||
|                     // check if the payload is empty, and return an error |                     // check if the payload is empty, and return an error | ||||||
|                     reader.fill_buf()?; |                     if buffer.is_empty() { | ||||||
|                     if reader.buffer().is_empty() { |  | ||||||
|                         return Err(UpdateLoopError::MissingPayload(format)); |                         return Err(UpdateLoopError::MissingPayload(format)); | ||||||
|                     } |                     } | ||||||
|  |  | ||||||
|  |                     let reader = Cursor::new(buffer); | ||||||
|                     match format { |                     match format { | ||||||
|                         DocumentAdditionFormat::Json => read_json(reader, &mut *update_file)?, |                         DocumentAdditionFormat::Json => read_json(reader, &mut *update_file)?, | ||||||
|                         DocumentAdditionFormat::Csv => read_csv(reader, &mut *update_file)?, |                         DocumentAdditionFormat::Csv => read_csv(reader, &mut *update_file)?, | ||||||
|   | |||||||
| @@ -11,7 +11,7 @@ pub use index_controller::MeiliSearch; | |||||||
| pub use milli; | pub use milli; | ||||||
|  |  | ||||||
| mod compression; | mod compression; | ||||||
| mod document_formats; | pub mod document_formats; | ||||||
|  |  | ||||||
| use walkdir::WalkDir; | use walkdir::WalkDir; | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user