mirror of https://github.com/meilisearch/meilisearch.git

merge with main
		| @@ -1,7 +1,7 @@ | ||||
| #![allow(dead_code)] | ||||
|  | ||||
| use std::fs::{create_dir_all, remove_dir_all, File}; | ||||
| use std::io::{self, Cursor, Read, Seek}; | ||||
| use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; | ||||
| use std::num::ParseFloatError; | ||||
| use std::path::Path; | ||||
|  | ||||
| @@ -146,44 +146,34 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<imp | ||||
|     DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap() | ||||
| } | ||||
|  | ||||
| fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> { | ||||
| fn documents_from_jsonl(reader: impl Read) -> anyhow::Result<Vec<u8>> { | ||||
|     let mut writer = Cursor::new(Vec::new()); | ||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; | ||||
|  | ||||
|     let values = serde_json::Deserializer::from_reader(reader) | ||||
|         .into_iter::<serde_json::Map<String, serde_json::Value>>(); | ||||
|     for document in values { | ||||
|         let document = document?; | ||||
|         documents.add_documents(document)?; | ||||
|     let mut buf = String::new(); | ||||
|     let mut reader = BufReader::new(reader); | ||||
|  | ||||
|     while reader.read_line(&mut buf)? > 0 { | ||||
|         documents.extend_from_json(&mut buf.as_bytes())?; | ||||
|         // clear the buffer: otherwise every already-processed line stays in | ||||
|         // `buf` and gets parsed again on the next iteration | ||||
|         buf.clear(); | ||||
|     } | ||||
|     documents.finish()?; | ||||
|  | ||||
|     Ok(writer.into_inner()) | ||||
| } | ||||
|  | ||||
| fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> { | ||||
| fn documents_from_json(reader: impl Read) -> anyhow::Result<Vec<u8>> { | ||||
|     let mut writer = Cursor::new(Vec::new()); | ||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; | ||||
|  | ||||
|     let json: serde_json::Value = serde_json::from_reader(reader)?; | ||||
|     documents.add_documents(json)?; | ||||
|     documents.extend_from_json(reader)?; | ||||
|     documents.finish()?; | ||||
|  | ||||
|     Ok(writer.into_inner()) | ||||
| } | ||||
|  | ||||
| fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> { | ||||
| fn documents_from_csv(reader: impl Read) -> anyhow::Result<Vec<u8>> { | ||||
|     let mut writer = Cursor::new(Vec::new()); | ||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; | ||||
|  | ||||
|     let iter = CSVDocumentDeserializer::from_reader(reader)?; | ||||
|  | ||||
|     for doc in iter { | ||||
|         let doc = doc?; | ||||
|         documents.add_documents(doc)?; | ||||
|     } | ||||
|  | ||||
|     documents.finish()?; | ||||
|     milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; | ||||
|  | ||||
|     Ok(writer.into_inner()) | ||||
| } | ||||
|   | ||||
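For context, the rewritten JSONL path above streams one document per line into the builder instead of going through intermediate serde_json values. A minimal self-contained sketch of that pattern (the helper name jsonl_to_batch is hypothetical; only the DocumentBatchBuilder calls come from this diff):

    fn jsonl_to_batch(input: &str) -> anyhow::Result<Vec<u8>> {
        use std::io::{BufRead, BufReader, Cursor};

        let mut writer = Cursor::new(Vec::new());
        let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
        let mut reader = BufReader::new(input.as_bytes());
        let mut buf = String::new();
        while reader.read_line(&mut buf)? > 0 {
            builder.extend_from_json(buf.as_bytes())?; // one JSON object per line
            buf.clear(); // drop the consumed line before reading the next one
        }
        builder.finish()?;
        Ok(writer.into_inner())
    }

    // jsonl_to_batch("{\"id\":1}\n{\"id\":2}\n") produces a batch of two documents.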
| @@ -1,6 +1,6 @@ | ||||
| [package] | ||||
| name = "cli" | ||||
| version = "0.1.0" | ||||
| version = "0.20.2" | ||||
| edition = "2018" | ||||
| description = "A CLI to interact with a milli index" | ||||
|  | ||||
|   | ||||
| @@ -1,5 +1,5 @@ | ||||
| use std::fs::File; | ||||
| use std::io::{stdin, Cursor, Read}; | ||||
| use std::io::{stdin, BufRead, BufReader, Cursor, Read}; | ||||
| use std::path::PathBuf; | ||||
| use std::str::FromStr; | ||||
|  | ||||
| @@ -9,7 +9,6 @@ use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; | ||||
| use milli::update::UpdateIndexingStep::{ | ||||
|     ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, | ||||
| }; | ||||
| use serde_json::{Map, Value}; | ||||
| use structopt::StructOpt; | ||||
|  | ||||
| #[cfg(target_os = "linux")] | ||||
| @@ -202,11 +201,11 @@ fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> { | ||||
|     let mut writer = Cursor::new(Vec::new()); | ||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; | ||||
|  | ||||
|     let values = serde_json::Deserializer::from_reader(reader) | ||||
|         .into_iter::<serde_json::Map<String, serde_json::Value>>(); | ||||
|     for document in values { | ||||
|         let document = document?; | ||||
|         documents.add_documents(document)?; | ||||
|     let mut buf = String::new(); | ||||
|     let mut reader = BufReader::new(reader); | ||||
|  | ||||
|     while reader.read_line(&mut buf)? > 0 { | ||||
|         documents.extend_from_json(&mut buf.as_bytes())?; | ||||
|         // clear the buffer: otherwise every already-processed line stays in | ||||
|         // `buf` and gets parsed again on the next iteration | ||||
|         buf.clear(); | ||||
|     } | ||||
|     documents.finish()?; | ||||
|  | ||||
| @@ -217,8 +216,7 @@ fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> { | ||||
|     let mut writer = Cursor::new(Vec::new()); | ||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; | ||||
|  | ||||
|     let json: serde_json::Value = serde_json::from_reader(reader)?; | ||||
|     documents.add_documents(json)?; | ||||
|     documents.extend_from_json(reader)?; | ||||
|     documents.finish()?; | ||||
|  | ||||
|     Ok(writer.into_inner()) | ||||
| @@ -226,17 +224,7 @@ fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> { | ||||
|  | ||||
| fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> { | ||||
|     let mut writer = Cursor::new(Vec::new()); | ||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; | ||||
|  | ||||
|     let mut records = csv::Reader::from_reader(reader); | ||||
|     let iter = records.deserialize::<Map<String, Value>>(); | ||||
|  | ||||
|     for doc in iter { | ||||
|         let doc = doc?; | ||||
|         documents.add_documents(doc)?; | ||||
|     } | ||||
|  | ||||
|     documents.finish()?; | ||||
|     milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; | ||||
|  | ||||
|     Ok(writer.into_inner()) | ||||
| } | ||||
|   | ||||
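One detail worth noting in the JSON path above: extend_from_json accepts either a single JSON map or a top-level sequence of maps, so the whole-file case needs no separate array handling. A hedged sketch, using only the API visible in this diff:

    let mut writer = Cursor::new(Vec::new());
    let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
    builder.extend_from_json(r#"{"id": 1}"#.as_bytes())?;              // a single map
    builder.extend_from_json(r#"[{"id": 2}, {"id": 3}]"#.as_bytes())?; // a sequence of maps
    assert_eq!(builder.len(), 3); // len() counts the documents written so far
    builder.finish()?;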
| @@ -1,6 +1,6 @@ | ||||
| [package] | ||||
| name = "helpers" | ||||
| version = "0.18.0" | ||||
| version = "0.20.2" | ||||
| authors = ["Clément Renault <clement@meilisearch.com>"] | ||||
| edition = "2018" | ||||
|  | ||||
|   | ||||
| @@ -1,7 +1,7 @@ | ||||
| [package] | ||||
| name = "http-ui" | ||||
| description = "The HTTP user interface of the milli search engine" | ||||
| version = "0.18.0" | ||||
| version = "0.20.2" | ||||
| authors = ["Clément Renault <clement@meilisearch.com>"] | ||||
| edition = "2018" | ||||
|  | ||||
|   | ||||
| @@ -1,285 +0,0 @@ | ||||
| use std::io::{Read, Result as IoResult}; | ||||
| use std::num::ParseFloatError; | ||||
|  | ||||
| use serde_json::{Map, Value}; | ||||
|  | ||||
| enum AllowedType { | ||||
|     String, | ||||
|     Number, | ||||
| } | ||||
|  | ||||
| fn parse_csv_header(header: &str) -> (String, AllowedType) { | ||||
|     // if there are several separators we only split on the last one. | ||||
|     match header.rsplit_once(':') { | ||||
|         Some((field_name, field_type)) => match field_type { | ||||
|             "string" => (field_name.to_string(), AllowedType::String), | ||||
|             "number" => (field_name.to_string(), AllowedType::Number), | ||||
|             // we may return an error in this case. | ||||
|             _otherwise => (header.to_string(), AllowedType::String), | ||||
|         }, | ||||
|         None => (header.to_string(), AllowedType::String), | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct CSVDocumentDeserializer<R> | ||||
| where | ||||
|     R: Read, | ||||
| { | ||||
|     documents: csv::StringRecordsIntoIter<R>, | ||||
|     headers: Vec<(String, AllowedType)>, | ||||
| } | ||||
|  | ||||
| impl<R: Read> CSVDocumentDeserializer<R> { | ||||
|     pub fn from_reader(reader: R) -> IoResult<Self> { | ||||
|         let mut records = csv::Reader::from_reader(reader); | ||||
|  | ||||
|         let headers = records.headers()?.into_iter().map(parse_csv_header).collect(); | ||||
|  | ||||
|         Ok(Self { documents: records.into_records(), headers }) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<R: Read> Iterator for CSVDocumentDeserializer<R> { | ||||
|     type Item = anyhow::Result<Map<String, Value>>; | ||||
|  | ||||
|     fn next(&mut self) -> Option<Self::Item> { | ||||
|         let csv_document = self.documents.next()?; | ||||
|  | ||||
|         match csv_document { | ||||
|             Ok(csv_document) => { | ||||
|                 let mut document = Map::new(); | ||||
|  | ||||
|                 for ((field_name, field_type), value) in | ||||
|                     self.headers.iter().zip(csv_document.into_iter()) | ||||
|                 { | ||||
|                     let parsed_value: Result<Value, ParseFloatError> = match field_type { | ||||
|                         AllowedType::Number => { | ||||
|                             value.parse::<f64>().map(Value::from).map_err(Into::into) | ||||
|                         } | ||||
|                         AllowedType::String => Ok(Value::String(value.to_string())), | ||||
|                     }; | ||||
|  | ||||
|                     match parsed_value { | ||||
|                         Ok(value) => drop(document.insert(field_name.to_string(), value)), | ||||
|                         Err(_e) => { | ||||
|                             return Some(Err(anyhow::anyhow!( | ||||
|                                 "Value '{}' is not a valid number", | ||||
|                                 value | ||||
|                             ))) | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 Some(Ok(document)) | ||||
|             } | ||||
|             Err(e) => Some(Err(anyhow::anyhow!("Error parsing csv document: {}", e))), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod test { | ||||
|     use serde_json::json; | ||||
|  | ||||
|     use super::*; | ||||
|  | ||||
|     #[test] | ||||
|     fn simple_csv_document() { | ||||
|         let documents = r#"city,country,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); | ||||
|  | ||||
|         assert_eq!( | ||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), | ||||
|             json!({ | ||||
|                 "city": "Boston", | ||||
|                 "country": "United States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn coma_in_field() { | ||||
|         let documents = r#"city,country,pop | ||||
| "Boston","United, States","4628910""#; | ||||
|  | ||||
|         let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); | ||||
|  | ||||
|         assert_eq!( | ||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), | ||||
|             json!({ | ||||
|                 "city": "Boston", | ||||
|                 "country": "United, States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn quote_in_field() { | ||||
|         let documents = r#"city,country,pop | ||||
| "Boston","United"" States","4628910""#; | ||||
|  | ||||
|         let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); | ||||
|  | ||||
|         assert_eq!( | ||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), | ||||
|             json!({ | ||||
|                 "city": "Boston", | ||||
|                 "country": "United\" States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn integer_in_field() { | ||||
|         let documents = r#"city,country,pop:number | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); | ||||
|  | ||||
|         assert_eq!( | ||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), | ||||
|             json!({ | ||||
|                 "city": "Boston", | ||||
|                 "country": "United States", | ||||
|                 "pop": 4628910.0, | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn float_in_field() { | ||||
|         let documents = r#"city,country,pop:number | ||||
| "Boston","United States","4628910.01""#; | ||||
|  | ||||
|         let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); | ||||
|  | ||||
|         assert_eq!( | ||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), | ||||
|             json!({ | ||||
|                 "city": "Boston", | ||||
|                 "country": "United States", | ||||
|                 "pop": 4628910.01, | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn several_double_dot_in_header() { | ||||
|         let documents = r#"city:love:string,country:state,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); | ||||
|  | ||||
|         assert_eq!( | ||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), | ||||
|             json!({ | ||||
|                 "city:love": "Boston", | ||||
|                 "country:state": "United States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn ending_by_double_dot_in_header() { | ||||
|         let documents = r#"city:,country,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); | ||||
|  | ||||
|         assert_eq!( | ||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), | ||||
|             json!({ | ||||
|                 "city:": "Boston", | ||||
|                 "country": "United States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn starting_by_double_dot_in_header() { | ||||
|         let documents = r#":city,country,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); | ||||
|  | ||||
|         assert_eq!( | ||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), | ||||
|             json!({ | ||||
|                 ":city": "Boston", | ||||
|                 "country": "United States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn starting_by_double_dot_in_header2() { | ||||
|         let documents = r#":string,country,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); | ||||
|  | ||||
|         assert_eq!( | ||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), | ||||
|             json!({ | ||||
|                 "": "Boston", | ||||
|                 "country": "United States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn double_double_dot_in_header() { | ||||
|         let documents = r#"city::string,country,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); | ||||
|  | ||||
|         assert_eq!( | ||||
|             Value::Object(csv_iter.next().unwrap().unwrap()), | ||||
|             json!({ | ||||
|                 "city:": "Boston", | ||||
|                 "country": "United States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn bad_type_in_header() { | ||||
|         let documents = r#"city,country:number,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); | ||||
|  | ||||
|         assert!(csv_iter.next().unwrap().is_err()); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn bad_column_count1() { | ||||
|         let documents = r#"city,country,pop | ||||
| "Boston","United States","4628910", "too much""#; | ||||
|  | ||||
|         let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); | ||||
|  | ||||
|         assert!(csv_iter.next().unwrap().is_err()); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn bad_column_count2() { | ||||
|         let documents = r#"city,country,pop | ||||
| "Boston","United States""#; | ||||
|  | ||||
|         let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); | ||||
|  | ||||
|         assert!(csv_iter.next().unwrap().is_err()); | ||||
|     } | ||||
| } | ||||
| @@ -1,10 +1,9 @@ | ||||
| mod documents_from_csv; | ||||
| mod update_store; | ||||
|  | ||||
| use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; | ||||
| use std::fmt::Display; | ||||
| use std::fs::{create_dir_all, File}; | ||||
| use std::io::Cursor; | ||||
| use std::io::{BufRead, BufReader, Cursor}; | ||||
| use std::net::SocketAddr; | ||||
| use std::num::{NonZeroU32, NonZeroUsize}; | ||||
| use std::path::PathBuf; | ||||
| @@ -40,7 +39,6 @@ use warp::http::Response; | ||||
| use warp::Filter; | ||||
|  | ||||
| use self::update_store::UpdateStore; | ||||
| use crate::documents_from_csv::CSVDocumentDeserializer; | ||||
|  | ||||
| #[cfg(target_os = "linux")] | ||||
| #[global_allocator] | ||||
| @@ -1048,11 +1046,11 @@ fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> { | ||||
|     let mut writer = Cursor::new(Vec::new()); | ||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; | ||||
|  | ||||
|     let values = serde_json::Deserializer::from_reader(reader) | ||||
|         .into_iter::<serde_json::Map<String, serde_json::Value>>(); | ||||
|     for document in values { | ||||
|         let document = document?; | ||||
|         documents.add_documents(document)?; | ||||
|     let mut buf = String::new(); | ||||
|     let mut reader = BufReader::new(reader); | ||||
|  | ||||
|     while reader.read_line(&mut buf)? > 0 { | ||||
|         documents.extend_from_json(&mut buf.as_bytes())?; | ||||
|         // clear the buffer: otherwise every already-processed line stays in | ||||
|         // `buf` and gets parsed again on the next iteration | ||||
|         buf.clear(); | ||||
|     } | ||||
|     documents.finish()?; | ||||
|  | ||||
| @@ -1063,8 +1061,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> { | ||||
|     let mut writer = Cursor::new(Vec::new()); | ||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; | ||||
|  | ||||
|     let json: serde_json::Value = serde_json::from_reader(reader)?; | ||||
|     documents.add_documents(json)?; | ||||
|     documents.extend_from_json(reader)?; | ||||
|     documents.finish()?; | ||||
|  | ||||
|     Ok(writer.into_inner()) | ||||
| @@ -1072,16 +1069,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> { | ||||
|  | ||||
| fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> { | ||||
|     let mut writer = Cursor::new(Vec::new()); | ||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; | ||||
|  | ||||
|     let iter = CSVDocumentDeserializer::from_reader(reader)?; | ||||
|  | ||||
|     for doc in iter { | ||||
|         let doc = doc?; | ||||
|         documents.add_documents(doc)?; | ||||
|     } | ||||
|  | ||||
|     documents.finish()?; | ||||
|     milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; | ||||
|  | ||||
|     Ok(writer.into_inner()) | ||||
| } | ||||
|   | ||||
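With from_csv the whole CSV path collapses into one call, and finish() now reports how many documents were written. A minimal sketch with a typed header, assuming the signatures shown in this diff:

    let csv = "id:number,title\n1,hello\n2,world";
    let mut writer = Cursor::new(Vec::new());
    let count =
        milli::documents::DocumentBatchBuilder::from_csv(csv.as_bytes(), &mut writer)?.finish()?;
    assert_eq!(count, 2);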
| @@ -1,6 +1,6 @@ | ||||
| [package] | ||||
| name = "infos" | ||||
| version = "0.18.0" | ||||
| version = "0.20.2" | ||||
| authors = ["Clément Renault <clement@meilisearch.com>"] | ||||
| edition = "2018" | ||||
|  | ||||
|   | ||||
| @@ -1,6 +1,6 @@ | ||||
| [package] | ||||
| name = "milli" | ||||
| version = "0.18.0" | ||||
| version = "0.20.2" | ||||
| authors = ["Kerollmops <clement@meilisearch.com>"] | ||||
| edition = "2018" | ||||
|  | ||||
| @@ -46,6 +46,7 @@ itertools = "0.10.0" | ||||
| # logging | ||||
| log = "0.4.14" | ||||
| logging_timer = "1.0.0" | ||||
| csv = "1.1.6" | ||||
|  | ||||
| [dev-dependencies] | ||||
| big_s = "1.0.2" | ||||
|   | ||||
| @@ -28,12 +28,12 @@ impl fmt::Display for AscDescError { | ||||
|                 write!(f, "Longitude must be contained between -180 and 180 degrees.",) | ||||
|             } | ||||
|             Self::InvalidSyntax { name } => { | ||||
|                 write!(f, "invalid asc/desc syntax for {}.", name) | ||||
|                 write!(f, "Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name) | ||||
|             } | ||||
|             Self::ReservedKeyword { name } => { | ||||
|                 write!( | ||||
|                     f, | ||||
|                     "{} is a reserved keyword and thus can't be used as a asc/desc rule.", | ||||
|                     "`{}` is a reserved keyword and thus can't be used as an asc/desc rule.", | ||||
|                     name | ||||
|                 ) | ||||
|             } | ||||
| @@ -192,18 +192,18 @@ impl fmt::Display for SortError { | ||||
|             Self::BadGeoPointUsage { name } => { | ||||
|                 write!( | ||||
|                     f, | ||||
|                     "invalid syntax for the `_geoPoint` parameter: `{}`. \ | ||||
|                     Usage: `_geoPoint(latitude, longitude):asc`.", | ||||
|                     "Invalid syntax for the geo parameter: expected expression formatted like \ | ||||
|                     `_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{}`.", | ||||
|                     name | ||||
|                 ) | ||||
|             } | ||||
|             Self::InvalidName { name } => { | ||||
|                 write!(f, "invalid syntax for the sort parameter `{}`.", name) | ||||
|                 write!(f, "Invalid syntax for the sort parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name) | ||||
|             } | ||||
|             Self::ReservedName { name } => { | ||||
|                 write!( | ||||
|                     f, | ||||
|                     "{} is a reserved keyword and thus can't be used as a sort expression.", | ||||
|                     "`{}` is a reserved keyword and thus can't be used as a sort expression.", | ||||
|                     name | ||||
|                 ) | ||||
|             } | ||||
| @@ -211,7 +211,7 @@ impl fmt::Display for SortError { | ||||
|                 write!( | ||||
|                     f, | ||||
|                     "`{}` is a reserved keyword and thus can't be used as a sort expression. \ | ||||
|                     Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates.", | ||||
|                     Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.", | ||||
|                     name, | ||||
|                 ) | ||||
|             } | ||||
| @@ -318,50 +318,4 @@ mod tests { | ||||
|             ); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn sort_error_message() { | ||||
|         let errors = [ | ||||
|             ( | ||||
|                 AscDescError::InvalidSyntax { name: S("truc:machin") }, | ||||
|                 S("invalid syntax for the sort parameter `truc:machin`."), | ||||
|             ), | ||||
|             ( | ||||
|                 AscDescError::InvalidSyntax { name: S("hello:world") }, | ||||
|                 S("invalid syntax for the sort parameter `hello:world`."), | ||||
|             ), | ||||
|             ( | ||||
|                 AscDescError::ReservedKeyword { name: S("_geo") }, | ||||
|                 S("`_geo` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."), | ||||
|             ), | ||||
|             ( | ||||
|                 AscDescError::ReservedKeyword { name: S("_geoDistance") }, | ||||
|                 S("_geoDistance is a reserved keyword and thus can't be used as a sort expression.") | ||||
|             ), | ||||
|             ( | ||||
|                 AscDescError::ReservedKeyword { name: S("_geoRadius(12, 13)") }, | ||||
|                 S("`_geoRadius` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."), | ||||
|             ), | ||||
|             ( | ||||
|                 AscDescError::InvalidLatitude, | ||||
|                 S("Latitude must be contained between -90 and 90 degrees."), | ||||
|             ), | ||||
|             ( | ||||
|                 AscDescError::InvalidLongitude, | ||||
|                 S("Longitude must be contained between -180 and 180 degrees."), | ||||
|             ), | ||||
|         ]; | ||||
|  | ||||
|         for (asc_desc_error, expected_message) in errors { | ||||
|             let sort_error = SortError::from(asc_desc_error); | ||||
|             assert_eq!( | ||||
|                 sort_error.to_string(), | ||||
|                 expected_message, | ||||
|                 "was expecting {} for the error {:?} but instead got {}", | ||||
|                 expected_message, | ||||
|                 sort_error, | ||||
|                 sort_error.to_string() | ||||
|             ); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -17,23 +17,27 @@ pub enum CriterionError { | ||||
| impl fmt::Display for CriterionError { | ||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||
|         match self { | ||||
|             Self::InvalidName { name } => write!(f, "invalid ranking rule {}", name), | ||||
|             Self::InvalidName { name } => write!(f, "`{}` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules.", name), | ||||
|             Self::ReservedName { name } => { | ||||
|                 write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name) | ||||
|                 write!( | ||||
|                     f, | ||||
|                     "`{}` is a reserved keyword and thus can't be used as a ranking rule", | ||||
|                     name | ||||
|                 ) | ||||
|             } | ||||
|             Self::ReservedNameForSort { name } => { | ||||
|                 write!( | ||||
|                     f, | ||||
|                     "{} is a reserved keyword and thus can't be used as a ranking rule. \ | ||||
| {} can only be used for sorting at search time", | ||||
|                     "`{}` is a reserved keyword and thus can't be used as a ranking rule. \ | ||||
| `{}` can only be used for sorting at search time", | ||||
|                     name, name | ||||
|                 ) | ||||
|             } | ||||
|             Self::ReservedNameForFilter { name } => { | ||||
|                 write!( | ||||
|                     f, | ||||
|                     "{} is a reserved keyword and thus can't be used as a ranking rule. \ | ||||
| {} can only be used for filtering at search time", | ||||
|                     "`{}` is a reserved keyword and thus can't be used as a ranking rule. \ | ||||
| `{}` can only be used for filtering at search time", | ||||
|                     name, name | ||||
|                 ) | ||||
|             } | ||||
|   | ||||
| @@ -1,16 +1,20 @@ | ||||
| use std::collections::BTreeMap; | ||||
| use std::io; | ||||
| use std::io::{Cursor, Write}; | ||||
|  | ||||
| use byteorder::{BigEndian, WriteBytesExt}; | ||||
| use serde::ser::Serialize; | ||||
| use serde::Deserializer; | ||||
| use serde_json::Value; | ||||
|  | ||||
| use super::serde::DocumentSerializer; | ||||
| use super::serde::DocumentVisitor; | ||||
| use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; | ||||
| use crate::FieldId; | ||||
|  | ||||
| /// The `DocumentBatchBuilder` provides a way to build a documents batch in the intermediary | ||||
| /// format used by milli. | ||||
| /// | ||||
| /// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to | ||||
| /// iterate other the documents. | ||||
| /// iterate over the documents. | ||||
| /// | ||||
| /// ## example: | ||||
| /// ``` | ||||
| @@ -18,43 +22,48 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; | ||||
| /// use serde_json::json; | ||||
| /// use std::io::Cursor; | ||||
| /// | ||||
| /// let json = r##"{"id": 1, "name": "foo"}"##; | ||||
| /// let mut writer = Cursor::new(Vec::new()); | ||||
| /// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap(); | ||||
| /// builder.add_documents(json!({"id": 1, "name": "foo"})).unwrap(); | ||||
| /// builder.extend_from_json(&mut json.as_bytes()).unwrap(); | ||||
| /// builder.finish().unwrap(); | ||||
| /// ``` | ||||
| pub struct DocumentBatchBuilder<W> { | ||||
|     serializer: DocumentSerializer<W>, | ||||
|     inner: ByteCounter<W>, | ||||
|     index: DocumentsBatchIndex, | ||||
|     obkv_buffer: Vec<u8>, | ||||
|     value_buffer: Vec<u8>, | ||||
|     values: BTreeMap<FieldId, Value>, | ||||
|     count: usize, | ||||
| } | ||||
|  | ||||
| impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> { | ||||
|     pub fn new(writer: W) -> Result<Self, Error> { | ||||
|         let index = DocumentsBatchIndex::new(); | ||||
|         let index = DocumentsBatchIndex::default(); | ||||
|         let mut writer = ByteCounter::new(writer); | ||||
|         // add space to write the offset of the metadata at the end of the writer | ||||
|         writer.write_u64::<BigEndian>(0)?; | ||||
|  | ||||
|         let serializer = | ||||
|             DocumentSerializer { writer, buffer: Vec::new(), index, count: 0, allow_seq: true }; | ||||
|  | ||||
|         Ok(Self { serializer }) | ||||
|         Ok(Self { | ||||
|             inner: writer, | ||||
|             index, | ||||
|             obkv_buffer: Vec::new(), | ||||
|             value_buffer: Vec::new(), | ||||
|             values: BTreeMap::new(), | ||||
|             count: 0, | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     /// Returns the number of documents that have been written to the builder. | ||||
|     pub fn len(&self) -> usize { | ||||
|         self.serializer.count | ||||
|         self.count | ||||
|     } | ||||
|  | ||||
|     /// This method must be called after the document addition is terminated. It will put the | ||||
|     /// metadata at the end of the file, and write the metadata offset at the beginning on the | ||||
|     /// file. | ||||
|     pub fn finish(self) -> Result<(), Error> { | ||||
|         let DocumentSerializer { | ||||
|             writer: ByteCounter { mut writer, count: offset }, | ||||
|             index, | ||||
|             count, | ||||
|             .. | ||||
|         } = self.serializer; | ||||
|     pub fn finish(self) -> Result<usize, Error> { | ||||
|         let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self; | ||||
|  | ||||
|         let meta = DocumentsMetadata { count, index }; | ||||
|  | ||||
| @@ -65,16 +74,481 @@ impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> { | ||||
|  | ||||
|         writer.flush()?; | ||||
|  | ||||
|         Ok(()) | ||||
|         Ok(count) | ||||
|     } | ||||
|  | ||||
|     /// Adds documents to the builder. | ||||
|     /// Extends the builder with json documents from a reader. | ||||
|     pub fn extend_from_json<R: io::Read>(&mut self, reader: R) -> Result<(), Error> { | ||||
|         let mut de = serde_json::Deserializer::from_reader(reader); | ||||
|  | ||||
|         let mut visitor = DocumentVisitor { | ||||
|             inner: &mut self.inner, | ||||
|             index: &mut self.index, | ||||
|             obkv_buffer: &mut self.obkv_buffer, | ||||
|             value_buffer: &mut self.value_buffer, | ||||
|             values: &mut self.values, | ||||
|             count: &mut self.count, | ||||
|         }; | ||||
|  | ||||
|         de.deserialize_any(&mut visitor).map_err(Error::JsonError)? | ||||
|     } | ||||
|  | ||||
|     /// Creates a builder from a reader of CSV documents. | ||||
|     /// | ||||
|     /// The internal index is updated with the fields found | ||||
|     /// in the documents. Document must either be a map or a sequence of maps, anything else will | ||||
|     /// fail. | ||||
|     pub fn add_documents<T: Serialize>(&mut self, document: T) -> Result<(), Error> { | ||||
|         document.serialize(&mut self.serializer)?; | ||||
|         Ok(()) | ||||
|     /// Since all fields in a csv document are guaranteed to be ordered, we are able to perform | ||||
|     /// optimisations, and extending from another CSV is not allowed. | ||||
|     pub fn from_csv<R: io::Read>(reader: R, writer: W) -> Result<Self, Error> { | ||||
|         let mut this = Self::new(writer)?; | ||||
|         // Ensure that this is the first and only addition made with this builder | ||||
|         debug_assert!(this.index.is_empty()); | ||||
|  | ||||
|         let mut records = csv::Reader::from_reader(reader); | ||||
|  | ||||
|         let headers = records | ||||
|             .headers()? | ||||
|             .into_iter() | ||||
|             .map(parse_csv_header) | ||||
|             .map(|(k, t)| (this.index.insert(&k), t)) | ||||
|             .collect::<BTreeMap<_, _>>(); | ||||
|  | ||||
|         for (i, record) in records.into_records().enumerate() { | ||||
|             let record = record?; | ||||
|             this.obkv_buffer.clear(); | ||||
|             let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer); | ||||
|             for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { | ||||
|                 let value = match ty { | ||||
|                     AllowedType::Number => { | ||||
|                         value.parse::<f64>().map(Value::from).map_err(|error| { | ||||
|                             Error::ParseFloat { | ||||
|                                 error, | ||||
|                                 // +1 for the header offset. | ||||
|                                 line: i + 1, | ||||
|                                 value: value.to_string(), | ||||
|                             } | ||||
|                         })? | ||||
|                     } | ||||
|                     AllowedType::String => Value::String(value.to_string()), | ||||
|                 }; | ||||
|  | ||||
|                 this.value_buffer.clear(); | ||||
|                 serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; | ||||
|                 writer.insert(*fid, &this.value_buffer)?; | ||||
|             } | ||||
|  | ||||
|             this.inner.write_u32::<BigEndian>(this.obkv_buffer.len() as u32)?; | ||||
|             this.inner.write_all(&this.obkv_buffer)?; | ||||
|  | ||||
|             this.count += 1; | ||||
|         } | ||||
|  | ||||
|         Ok(this) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug)] | ||||
| enum AllowedType { | ||||
|     String, | ||||
|     Number, | ||||
| } | ||||
|  | ||||
| fn parse_csv_header(header: &str) -> (String, AllowedType) { | ||||
|     // if there are several separators we only split on the last one. | ||||
|     match header.rsplit_once(':') { | ||||
|         Some((field_name, field_type)) => match field_type { | ||||
|             "string" => (field_name.to_string(), AllowedType::String), | ||||
|             "number" => (field_name.to_string(), AllowedType::Number), | ||||
|             // if the pattern isn't recognized, we keep the whole field. | ||||
|             _otherwise => (header.to_string(), AllowedType::String), | ||||
|         }, | ||||
|         None => (header.to_string(), AllowedType::String), | ||||
|     } | ||||
| } | ||||
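To make the header convention concrete, these are the expected parses of parse_csv_header (a sketch; matches! guards are used because AllowedType only derives Debug):

    assert!(matches!(parse_csv_header("pop:number"), (n, AllowedType::Number) if n == "pop"));
    assert!(matches!(parse_csv_header("city:love:string"), (n, AllowedType::String) if n == "city:love")); // split on the last ':'
    assert!(matches!(parse_csv_header("city:"), (n, AllowedType::String) if n == "city:")); // unknown type: keep the whole header
    assert!(matches!(parse_csv_header("pop"), (n, AllowedType::String) if n == "pop")); // no annotation defaults to String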
|  | ||||
| #[cfg(test)] | ||||
| mod test { | ||||
|     use std::io::Cursor; | ||||
|  | ||||
|     use serde_json::{json, Map}; | ||||
|  | ||||
|     use super::*; | ||||
|     use crate::documents::DocumentBatchReader; | ||||
|  | ||||
|     fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value { | ||||
|         let mut map = Map::new(); | ||||
|  | ||||
|         for (fid, value) in obkv.iter() { | ||||
|             let field_name = index.name(fid).unwrap().clone(); | ||||
|             let value: Value = serde_json::from_slice(value).unwrap(); | ||||
|  | ||||
|             map.insert(field_name, value); | ||||
|         } | ||||
|  | ||||
|         Value::Object(map) | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn add_single_documents_json() { | ||||
|         let mut cursor = Cursor::new(Vec::new()); | ||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); | ||||
|  | ||||
|         let json = serde_json::json!({ | ||||
|             "id": 1, | ||||
|             "field": "hello!", | ||||
|         }); | ||||
|  | ||||
|         builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); | ||||
|  | ||||
|         let json = serde_json::json!({ | ||||
|             "blabla": false, | ||||
|             "field": "hello!", | ||||
|             "id": 1, | ||||
|         }); | ||||
|  | ||||
|         builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); | ||||
|  | ||||
|         assert_eq!(builder.len(), 2); | ||||
|  | ||||
|         builder.finish().unwrap(); | ||||
|  | ||||
|         cursor.set_position(0); | ||||
|  | ||||
|         let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); | ||||
|  | ||||
|         let (index, document) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         assert_eq!(index.len(), 3); | ||||
|         assert_eq!(document.iter().count(), 2); | ||||
|  | ||||
|         let (index, document) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         assert_eq!(index.len(), 3); | ||||
|         assert_eq!(document.iter().count(), 3); | ||||
|  | ||||
|         assert!(reader.next_document_with_index().unwrap().is_none()); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn add_documents_seq_json() { | ||||
|         let mut cursor = Cursor::new(Vec::new()); | ||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); | ||||
|  | ||||
|         let json = serde_json::json!([{ | ||||
|             "id": 1, | ||||
|             "field": "hello!", | ||||
|         },{ | ||||
|             "blabla": false, | ||||
|             "field": "hello!", | ||||
|             "id": 1, | ||||
|         } | ||||
|         ]); | ||||
|  | ||||
|         builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); | ||||
|  | ||||
|         assert_eq!(builder.len(), 2); | ||||
|  | ||||
|         builder.finish().unwrap(); | ||||
|  | ||||
|         cursor.set_position(0); | ||||
|  | ||||
|         let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); | ||||
|  | ||||
|         let (index, document) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         assert_eq!(index.len(), 3); | ||||
|         assert_eq!(document.iter().count(), 2); | ||||
|  | ||||
|         let (index, document) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         assert_eq!(index.len(), 3); | ||||
|         assert_eq!(document.iter().count(), 3); | ||||
|  | ||||
|         assert!(reader.next_document_with_index().unwrap().is_none()); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn add_documents_csv() { | ||||
|         let mut cursor = Cursor::new(Vec::new()); | ||||
|  | ||||
|         let csv = "id:number,field:string\n1,hello!\n2,blabla"; | ||||
|  | ||||
|         let builder = | ||||
|             DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); | ||||
|         builder.finish().unwrap(); | ||||
|  | ||||
|         cursor.set_position(0); | ||||
|  | ||||
|         let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); | ||||
|  | ||||
|         let (index, document) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         assert_eq!(index.len(), 2); | ||||
|         assert_eq!(document.iter().count(), 2); | ||||
|  | ||||
|         let (_index, document) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         assert_eq!(document.iter().count(), 2); | ||||
|  | ||||
|         assert!(reader.next_document_with_index().unwrap().is_none()); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn simple_csv_document() { | ||||
|         let documents = r#"city,country,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) | ||||
|             .unwrap() | ||||
|             .finish() | ||||
|             .unwrap(); | ||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); | ||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         let val = obkv_to_value(&doc, index); | ||||
|  | ||||
|         assert_eq!( | ||||
|             val, | ||||
|             json!({ | ||||
|                 "city": "Boston", | ||||
|                 "country": "United States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|  | ||||
|         assert!(reader.next_document_with_index().unwrap().is_none()); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn coma_in_field() { | ||||
|         let documents = r#"city,country,pop | ||||
| "Boston","United, States","4628910""#; | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) | ||||
|             .unwrap() | ||||
|             .finish() | ||||
|             .unwrap(); | ||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); | ||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         let val = obkv_to_value(&doc, index); | ||||
|  | ||||
|         assert_eq!( | ||||
|             val, | ||||
|             json!({ | ||||
|                 "city": "Boston", | ||||
|                 "country": "United, States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn quote_in_field() { | ||||
|         let documents = r#"city,country,pop | ||||
| "Boston","United"" States","4628910""#; | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) | ||||
|             .unwrap() | ||||
|             .finish() | ||||
|             .unwrap(); | ||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); | ||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         let val = obkv_to_value(&doc, index); | ||||
|  | ||||
|         assert_eq!( | ||||
|             val, | ||||
|             json!({ | ||||
|                 "city": "Boston", | ||||
|                 "country": "United\" States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn integer_in_field() { | ||||
|         let documents = r#"city,country,pop:number | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) | ||||
|             .unwrap() | ||||
|             .finish() | ||||
|             .unwrap(); | ||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); | ||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         let val = obkv_to_value(&doc, index); | ||||
|  | ||||
|         assert_eq!( | ||||
|             val, | ||||
|             json!({ | ||||
|                 "city": "Boston", | ||||
|                 "country": "United States", | ||||
|                 "pop": 4628910.0, | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn float_in_field() { | ||||
|         let documents = r#"city,country,pop:number | ||||
| "Boston","United States","4628910.01""#; | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) | ||||
|             .unwrap() | ||||
|             .finish() | ||||
|             .unwrap(); | ||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); | ||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         let val = obkv_to_value(&doc, index); | ||||
|  | ||||
|         assert_eq!( | ||||
|             val, | ||||
|             json!({ | ||||
|                 "city": "Boston", | ||||
|                 "country": "United States", | ||||
|                 "pop": 4628910.01, | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn several_colon_in_header() { | ||||
|         let documents = r#"city:love:string,country:state,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) | ||||
|             .unwrap() | ||||
|             .finish() | ||||
|             .unwrap(); | ||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); | ||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         let val = obkv_to_value(&doc, index); | ||||
|  | ||||
|         assert_eq!( | ||||
|             val, | ||||
|             json!({ | ||||
|                 "city:love": "Boston", | ||||
|                 "country:state": "United States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn ending_by_colon_in_header() { | ||||
|         let documents = r#"city:,country,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) | ||||
|             .unwrap() | ||||
|             .finish() | ||||
|             .unwrap(); | ||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); | ||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         let val = obkv_to_value(&doc, index); | ||||
|  | ||||
|         assert_eq!( | ||||
|             val, | ||||
|             json!({ | ||||
|                 "city:": "Boston", | ||||
|                 "country": "United States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn starting_by_colon_in_header() { | ||||
|         let documents = r#":city,country,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) | ||||
|             .unwrap() | ||||
|             .finish() | ||||
|             .unwrap(); | ||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); | ||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         let val = obkv_to_value(&doc, index); | ||||
|  | ||||
|         assert_eq!( | ||||
|             val, | ||||
|             json!({ | ||||
|                 ":city": "Boston", | ||||
|                 "country": "United States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[ignore] | ||||
|     #[test] | ||||
|     fn starting_by_colon_in_header2() { | ||||
|         let documents = r#":string,country,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) | ||||
|             .unwrap() | ||||
|             .finish() | ||||
|             .unwrap(); | ||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); | ||||
|  | ||||
|         assert!(reader.next_document_with_index().is_err()); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn double_colon_in_header() { | ||||
|         let documents = r#"city::string,country,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) | ||||
|             .unwrap() | ||||
|             .finish() | ||||
|             .unwrap(); | ||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); | ||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); | ||||
|         let val = obkv_to_value(&doc, index); | ||||
|  | ||||
|         assert_eq!( | ||||
|             val, | ||||
|             json!({ | ||||
|                 "city:": "Boston", | ||||
|                 "country": "United States", | ||||
|                 "pop": "4628910", | ||||
|             }) | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn bad_type_in_header() { | ||||
|         let documents = r#"city,country:number,pop | ||||
| "Boston","United States","4628910""#; | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         assert!( | ||||
|             DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn bad_column_count1() { | ||||
|         let documents = r#"city,country,pop | ||||
| "Boston","United States","4628910", "too much""#; | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         assert!( | ||||
|             DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn bad_column_count2() { | ||||
|         let documents = r#"city,country,pop | ||||
| "Boston","United States""#; | ||||
|  | ||||
|         let mut buf = Vec::new(); | ||||
|         assert!( | ||||
|             DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() | ||||
|         ); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -7,7 +7,8 @@ mod builder; | ||||
| mod reader; | ||||
| mod serde; | ||||
|  | ||||
| use std::{fmt, io}; | ||||
| use std::fmt::{self, Debug}; | ||||
| use std::io; | ||||
|  | ||||
| use ::serde::{Deserialize, Serialize}; | ||||
| use bimap::BiHashMap; | ||||
| @@ -17,7 +18,38 @@ pub use reader::DocumentBatchReader; | ||||
| use crate::FieldId; | ||||
|  | ||||
| /// A bidirectional map that links field ids to their name in a document batch. | ||||
| pub type DocumentsBatchIndex = BiHashMap<FieldId, String>; | ||||
| #[derive(Default, Debug, Serialize, Deserialize)] | ||||
| pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>); | ||||
|  | ||||
| impl DocumentsBatchIndex { | ||||
|     /// Insert the field in the map, or return its field id if it already exists. | ||||
|     pub fn insert(&mut self, field: &str) -> FieldId { | ||||
|         match self.0.get_by_right(field) { | ||||
|             Some(field_id) => *field_id, | ||||
|             None => { | ||||
|                 let field_id = self.0.len() as FieldId; | ||||
|                 self.0.insert(field_id, field.to_string()); | ||||
|                 field_id | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn is_empty(&self) -> bool { | ||||
|         self.0.is_empty() | ||||
|     } | ||||
|  | ||||
|     pub fn len(&self) -> usize { | ||||
|         self.0.len() | ||||
|     } | ||||
|  | ||||
|     pub fn iter(&self) -> bimap::hash::Iter<FieldId, String> { | ||||
|         self.0.iter() | ||||
|     } | ||||
|  | ||||
|     pub fn name(&self, id: FieldId) -> Option<&String> { | ||||
|         self.0.get_by_left(&id) | ||||
|     } | ||||
| } | ||||
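A short sketch of the DocumentsBatchIndex contract, using only the methods defined above: insert is idempotent and hands out ids in insertion order.

    let mut index = DocumentsBatchIndex::default();
    let id = index.insert("title");
    assert_eq!(index.insert("title"), id);    // same field, same id
    assert_eq!(index.insert("body"), id + 1); // a new field gets the next id
    assert_eq!(index.name(id), Some(&"title".to_string()));
    assert_eq!(index.len(), 2);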
|  | ||||
| #[derive(Debug, Serialize, Deserialize)] | ||||
| struct DocumentsMetadata { | ||||
| @@ -50,14 +82,22 @@ impl<W: io::Write> io::Write for ByteCounter<W> { | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub enum Error { | ||||
|     ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, | ||||
|     InvalidDocumentFormat, | ||||
|     Custom(String), | ||||
|     JsonError(serde_json::Error), | ||||
|     CsvError(csv::Error), | ||||
|     Serialize(bincode::Error), | ||||
|     Io(io::Error), | ||||
|     DocumentTooLarge, | ||||
| } | ||||
|  | ||||
| impl From<csv::Error> for Error { | ||||
|     fn from(e: csv::Error) -> Self { | ||||
|         Self::CsvError(e) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl From<io::Error> for Error { | ||||
|     fn from(other: io::Error) -> Self { | ||||
|         Self::Io(other) | ||||
| @@ -70,15 +110,25 @@ impl From<bincode::Error> for Error { | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl From<serde_json::Error> for Error { | ||||
|     fn from(other: serde_json::Error) -> Self { | ||||
|         Self::JsonError(other) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl fmt::Display for Error { | ||||
|     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||||
|         match self { | ||||
|             Error::ParseFloat { error, line, value } => { | ||||
|                 write!(f, "Error parsing number {:?} at line {}: {}", value, line, error) | ||||
|             } | ||||
|             Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), | ||||
|             Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), | ||||
|             Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), | ||||
|             Error::Io(e) => e.fmt(f), | ||||
|             Error::Io(e) => write!(f, "{}", e), | ||||
|             Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"), | ||||
|             Error::Serialize(e) => e.fmt(f), | ||||
|             Error::Serialize(e) => write!(f, "{}", e), | ||||
|             Error::CsvError(e) => write!(f, "{}", e), | ||||
|         } | ||||
|     } | ||||
| } | ||||
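For illustration, the new ParseFloat variant renders like this (a sketch; the trailing text comes from std's ParseFloatError, which is stable in practice but not guaranteed):

    let error = "abc".parse::<f64>().unwrap_err();
    let error = Error::ParseFloat { error, line: 2, value: "abc".to_string() };
    assert_eq!(error.to_string(), r#"Error parsing number "abc" at line 2: invalid float literal"#);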
| @@ -92,7 +142,8 @@ macro_rules! documents { | ||||
|         let documents = serde_json::json!($data); | ||||
|         let mut writer = std::io::Cursor::new(Vec::new()); | ||||
|         let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); | ||||
|         builder.add_documents(documents).unwrap(); | ||||
|         let documents = serde_json::to_vec(&documents).unwrap(); | ||||
|         builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); | ||||
|         builder.finish().unwrap(); | ||||
|  | ||||
|         writer.set_position(0); | ||||
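Call sites of the macro are unchanged by this diff; its tail is truncated in this hunk, but in the source it hands back a DocumentBatchReader over the batch. A hedged usage example:

    let reader = documents!([
        { "id": 0, "title": "hello" },
        { "id": 1, "title": "world" },
    ]);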
| @@ -103,6 +154,8 @@ macro_rules! documents { | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod test { | ||||
|     use std::io::Cursor; | ||||
|  | ||||
|     use serde_json::{json, Value}; | ||||
|  | ||||
|     use super::*; | ||||
| @@ -119,12 +172,14 @@ mod test { | ||||
|             "bool": true | ||||
|         }); | ||||
|  | ||||
|         let json = serde_json::to_vec(&json).unwrap(); | ||||
|  | ||||
|         let mut v = Vec::new(); | ||||
|         let mut cursor = io::Cursor::new(&mut v); | ||||
|  | ||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); | ||||
|  | ||||
|         builder.add_documents(json).unwrap(); | ||||
|         builder.extend_from_json(Cursor::new(json)).unwrap(); | ||||
|  | ||||
|         builder.finish().unwrap(); | ||||
|  | ||||
| @@ -148,13 +203,16 @@ mod test { | ||||
|             "toto": false, | ||||
|         }); | ||||
|  | ||||
|         let doc1 = serde_json::to_vec(&doc1).unwrap(); | ||||
|         let doc2 = serde_json::to_vec(&doc2).unwrap(); | ||||
|  | ||||
|         let mut v = Vec::new(); | ||||
|         let mut cursor = io::Cursor::new(&mut v); | ||||
|  | ||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); | ||||
|  | ||||
|         builder.add_documents(doc1).unwrap(); | ||||
|         builder.add_documents(doc2).unwrap(); | ||||
|         builder.extend_from_json(Cursor::new(doc1)).unwrap(); | ||||
|         builder.extend_from_json(Cursor::new(doc2)).unwrap(); | ||||
|  | ||||
|         builder.finish().unwrap(); | ||||
|  | ||||
| @@ -177,12 +235,14 @@ mod test { | ||||
|             { "tata": "hello" }, | ||||
|         ]); | ||||
|  | ||||
|         let docs = serde_json::to_vec(&docs).unwrap(); | ||||
|  | ||||
|         let mut v = Vec::new(); | ||||
|         let mut cursor = io::Cursor::new(&mut v); | ||||
|  | ||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); | ||||
|  | ||||
|         builder.add_documents(docs).unwrap(); | ||||
|         builder.extend_from_json(Cursor::new(docs)).unwrap(); | ||||
|  | ||||
|         builder.finish().unwrap(); | ||||
|  | ||||
| @@ -210,11 +270,13 @@ mod test { | ||||
|             { "tata": "hello" }, | ||||
|         ]]); | ||||
|  | ||||
|         assert!(builder.add_documents(docs).is_err()); | ||||
|         let docs = serde_json::to_vec(&docs).unwrap(); | ||||
|         assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); | ||||
|  | ||||
|         let docs = json!("hello"); | ||||
|         let docs = serde_json::to_vec(&docs).unwrap(); | ||||
|  | ||||
|         assert!(builder.add_documents(docs).is_err()); | ||||
|         assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|   | ||||
| @@ -1,474 +1,134 @@ | ||||
| use std::collections::BTreeMap; | ||||
| use std::convert::TryInto; | ||||
| use std::io::Cursor; | ||||
| use std::{fmt, io}; | ||||
| use std::fmt; | ||||
| use std::io::{Cursor, Write}; | ||||
|  | ||||
| use byteorder::{BigEndian, WriteBytesExt}; | ||||
| use obkv::KvWriter; | ||||
| use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer}; | ||||
| use byteorder::WriteBytesExt; | ||||
| use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor}; | ||||
| use serde::Deserialize; | ||||
| use serde_json::Value; | ||||
|  | ||||
| use super::{ByteCounter, DocumentsBatchIndex, Error}; | ||||
| use crate::FieldId; | ||||
|  | ||||
| pub struct DocumentSerializer<W> { | ||||
|     pub writer: ByteCounter<W>, | ||||
|     pub buffer: Vec<u8>, | ||||
|     pub index: DocumentsBatchIndex, | ||||
|     pub count: usize, | ||||
|     pub allow_seq: bool, | ||||
| } | ||||
|  | ||||
| impl<'a, W: io::Write> Serializer for &'a mut DocumentSerializer<W> { | ||||
|     type Ok = (); | ||||
|  | ||||
|     type Error = Error; | ||||
|  | ||||
|     type SerializeSeq = SeqSerializer<'a, W>; | ||||
|     type SerializeTuple = Impossible<(), Self::Error>; | ||||
|     type SerializeTupleStruct = Impossible<(), Self::Error>; | ||||
|     type SerializeTupleVariant = Impossible<(), Self::Error>; | ||||
|     type SerializeMap = MapSerializer<'a, &'a mut ByteCounter<W>>; | ||||
|     type SerializeStruct = Impossible<(), Self::Error>; | ||||
|     type SerializeStructVariant = Impossible<(), Self::Error>; | ||||
|     fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> { | ||||
|         self.buffer.clear(); | ||||
|         let cursor = io::Cursor::new(&mut self.buffer); | ||||
|         self.count += 1; | ||||
|         let map_serializer = MapSerializer { | ||||
|             map: KvWriter::new(cursor), | ||||
|             index: &mut self.index, | ||||
|             writer: &mut self.writer, | ||||
|             mapped_documents: BTreeMap::new(), | ||||
|         }; | ||||
|  | ||||
|         Ok(map_serializer) | ||||
|     } | ||||
|  | ||||
|     fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> { | ||||
|         if self.allow_seq { | ||||
|             // Only allow sequence of documents of depth 1. | ||||
|             self.allow_seq = false; | ||||
|             Ok(SeqSerializer { serializer: self }) | ||||
|         } else { | ||||
|             Err(Error::InvalidDocumentFormat) | ||||
| macro_rules! tri { | ||||
|     ($e:expr) => { | ||||
|         match $e { | ||||
|             Ok(r) => r, | ||||
|             Err(e) => return Ok(Err(e.into())), | ||||
|         } | ||||
|     } | ||||
|     }; | ||||
| } | ||||
|  | ||||
|     fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
| struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex); | ||||
|  | ||||
|     fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
| impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { | ||||
|     type Value = FieldId; | ||||
|  | ||||
|     fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_none(self) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error> | ||||
|     fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error> | ||||
|     where | ||||
|         T: Serialize, | ||||
|         D: serde::Deserializer<'de>, | ||||
|     { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_unit(self) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_unit_variant( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _variant_index: u32, | ||||
|         _variant: &'static str, | ||||
|     ) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_newtype_struct<T: ?Sized>( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _value: &T, | ||||
|     ) -> Result<Self::Ok, Self::Error> | ||||
|     where | ||||
|         T: Serialize, | ||||
|     { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_newtype_variant<T: ?Sized>( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _variant_index: u32, | ||||
|         _variant: &'static str, | ||||
|         _value: &T, | ||||
|     ) -> Result<Self::Ok, Self::Error> | ||||
|     where | ||||
|         T: Serialize, | ||||
|     { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_tuple_struct( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _len: usize, | ||||
|     ) -> Result<Self::SerializeTupleStruct, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_tuple_variant( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _variant_index: u32, | ||||
|         _variant: &'static str, | ||||
|         _len: usize, | ||||
|     ) -> Result<Self::SerializeTupleVariant, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_struct( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _len: usize, | ||||
|     ) -> Result<Self::SerializeStruct, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_struct_variant( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _variant_index: u32, | ||||
|         _variant: &'static str, | ||||
|         _len: usize, | ||||
|     ) -> Result<Self::SerializeStructVariant, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|         deserializer.deserialize_str(self) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct SeqSerializer<'a, W> { | ||||
|     serializer: &'a mut DocumentSerializer<W>, | ||||
| } | ||||
| impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { | ||||
|     type Value = FieldId; | ||||
|  | ||||
| impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> { | ||||
|     type Ok = (); | ||||
|     type Error = Error; | ||||
|  | ||||
|     fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error> | ||||
|     fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> | ||||
|     where | ||||
|         T: Serialize, | ||||
|         E: serde::de::Error, | ||||
|     { | ||||
|         value.serialize(&mut *self.serializer)?; | ||||
|         Ok(()) | ||||
|         Ok(self.0.insert(v)) | ||||
|     } | ||||
|  | ||||
|     fn end(self) -> Result<Self::Ok, Self::Error> { | ||||
|         Ok(()) | ||||
|     fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||
|         write!(f, "a string") | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct MapSerializer<'a, W> { | ||||
|     map: KvWriter<io::Cursor<&'a mut Vec<u8>>, FieldId>, | ||||
|     index: &'a mut DocumentsBatchIndex, | ||||
|     writer: W, | ||||
|     mapped_documents: BTreeMap<FieldId, Value>, | ||||
| struct ValueDeserializer; | ||||
|  | ||||
| impl<'de> DeserializeSeed<'de> for ValueDeserializer { | ||||
|     type Value = serde_json::Value; | ||||
|  | ||||
|     fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error> | ||||
|     where | ||||
|         D: serde::Deserializer<'de>, | ||||
|     { | ||||
|         serde_json::Value::deserialize(deserializer) | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// This implementation of SerializeMap uses serialize_entry instead of serialize_key and | ||||
| /// serialize_value, therefore these two methods remain unimplemented. | ||||
| impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> { | ||||
|     type Ok = (); | ||||
|     type Error = Error; | ||||
| pub struct DocumentVisitor<'a, W> { | ||||
|     pub inner: &'a mut ByteCounter<W>, | ||||
|     pub index: &'a mut DocumentsBatchIndex, | ||||
|     pub obkv_buffer: &'a mut Vec<u8>, | ||||
|     pub value_buffer: &'a mut Vec<u8>, | ||||
|     pub values: &'a mut BTreeMap<FieldId, Value>, | ||||
|     pub count: &'a mut usize, | ||||
| } | ||||
|  | ||||
|     fn serialize_key<T: ?Sized + Serialize>(&mut self, _key: &T) -> Result<(), Self::Error> { | ||||
|         unreachable!() | ||||
|     } | ||||
| impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { | ||||
|     /// This Visitor value is nothing, since it writes the values directly to the underlying writer. | ||||
|     type Value = Result<(), Error>; | ||||
|  | ||||
|     fn serialize_value<T: ?Sized>(&mut self, _value: &T) -> Result<(), Self::Error> { | ||||
|         unreachable!() | ||||
|     } | ||||
|  | ||||
|     fn end(mut self) -> Result<Self::Ok, Self::Error> { | ||||
|         let mut buf = Vec::new(); | ||||
|         for (key, value) in self.mapped_documents { | ||||
|             buf.clear(); | ||||
|             let mut cursor = Cursor::new(&mut buf); | ||||
|             serde_json::to_writer(&mut cursor, &value).map_err(Error::JsonError)?; | ||||
|             self.map.insert(key, cursor.into_inner()).map_err(Error::Io)?; | ||||
|     fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error> | ||||
|     where | ||||
|         A: SeqAccess<'de>, | ||||
|     { | ||||
|         while let Some(v) = seq.next_element_seed(&mut *self)? { | ||||
|             tri!(v) | ||||
|         } | ||||
|  | ||||
|         let data = self.map.into_inner().map_err(Error::Io)?.into_inner(); | ||||
|         let data_len: u32 = data.len().try_into().map_err(|_| Error::DocumentTooLarge)?; | ||||
|  | ||||
|         self.writer.write_u32::<BigEndian>(data_len).map_err(Error::Io)?; | ||||
|         self.writer.write_all(&data).map_err(Error::Io)?; | ||||
|  | ||||
|         Ok(()) | ||||
|         Ok(Ok(())) | ||||
|     } | ||||
|  | ||||
|     fn serialize_entry<K: ?Sized, V: ?Sized>( | ||||
|         &mut self, | ||||
|         key: &K, | ||||
|         value: &V, | ||||
|     ) -> Result<(), Self::Error> | ||||
|     fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error> | ||||
|     where | ||||
|         K: Serialize, | ||||
|         V: Serialize, | ||||
|         A: MapAccess<'de>, | ||||
|     { | ||||
|         let field_serializer = FieldSerializer { index: &mut self.index }; | ||||
|         let field_id: FieldId = key.serialize(field_serializer)?; | ||||
|         while let Some((key, value)) = | ||||
|             map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? | ||||
|         { | ||||
|             self.values.insert(key, value); | ||||
|         } | ||||
|  | ||||
|         let value = serde_json::to_value(value).map_err(Error::JsonError)?; | ||||
|         self.obkv_buffer.clear(); | ||||
|         let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer)); | ||||
|         for (key, value) in self.values.iter() { | ||||
|             self.value_buffer.clear(); | ||||
|             // Serializing a serde_json::Value into an in-memory buffer cannot fail. | ||||
|             tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value)); | ||||
|             tri!(obkv.insert(*key, &self.value_buffer)); | ||||
|         } | ||||
|  | ||||
|         self.mapped_documents.insert(field_id, value); | ||||
|         let reader = tri!(obkv.into_inner()).into_inner(); | ||||
|  | ||||
|         Ok(()) | ||||
|         tri!(self.inner.write_u32::<byteorder::BigEndian>(reader.len() as u32)); | ||||
|         tri!(self.inner.write_all(reader)); | ||||
|  | ||||
|         *self.count += 1; | ||||
|         self.values.clear(); | ||||
|  | ||||
|         Ok(Ok(())) | ||||
|     } | ||||
|  | ||||
|     fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||
|         write!(f, "a document, or a sequence of documents.") | ||||
|     } | ||||
| } | ||||
|  | ||||
| struct FieldSerializer<'a> { | ||||
|     index: &'a mut DocumentsBatchIndex, | ||||
| } | ||||
| impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> | ||||
| where | ||||
|     W: Write, | ||||
| { | ||||
|     type Value = Result<(), Error>; | ||||
|  | ||||
| impl<'a> serde::Serializer for FieldSerializer<'a> { | ||||
|     type Ok = FieldId; | ||||
|  | ||||
|     type Error = Error; | ||||
|  | ||||
|     type SerializeSeq = Impossible<FieldId, Self::Error>; | ||||
|     type SerializeTuple = Impossible<FieldId, Self::Error>; | ||||
|     type SerializeTupleStruct = Impossible<FieldId, Self::Error>; | ||||
|     type SerializeTupleVariant = Impossible<FieldId, Self::Error>; | ||||
|     type SerializeMap = Impossible<FieldId, Self::Error>; | ||||
|     type SerializeStruct = Impossible<FieldId, Self::Error>; | ||||
|     type SerializeStructVariant = Impossible<FieldId, Self::Error>; | ||||
|  | ||||
|     fn serialize_str(self, ws: &str) -> Result<Self::Ok, Self::Error> { | ||||
|         let field_id = match self.index.get_by_right(ws) { | ||||
|             Some(field_id) => *field_id, | ||||
|             None => { | ||||
|                 let field_id = self.index.len() as FieldId; | ||||
|                 self.index.insert(field_id, ws.to_string()); | ||||
|                 field_id | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|         Ok(field_id) | ||||
|     } | ||||
|  | ||||
|     fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_none(self) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error> | ||||
|     fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error> | ||||
|     where | ||||
|         T: Serialize, | ||||
|         D: serde::Deserializer<'de>, | ||||
|     { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_unit(self) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_unit_variant( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _variant_index: u32, | ||||
|         _variant: &'static str, | ||||
|     ) -> Result<Self::Ok, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_newtype_struct<T: ?Sized>( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _value: &T, | ||||
|     ) -> Result<Self::Ok, Self::Error> | ||||
|     where | ||||
|         T: Serialize, | ||||
|     { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_newtype_variant<T: ?Sized>( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _variant_index: u32, | ||||
|         _variant: &'static str, | ||||
|         _value: &T, | ||||
|     ) -> Result<Self::Ok, Self::Error> | ||||
|     where | ||||
|         T: Serialize, | ||||
|     { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_tuple_struct( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _len: usize, | ||||
|     ) -> Result<Self::SerializeTupleStruct, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_tuple_variant( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _variant_index: u32, | ||||
|         _variant: &'static str, | ||||
|         _len: usize, | ||||
|     ) -> Result<Self::SerializeTupleVariant, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_struct( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _len: usize, | ||||
|     ) -> Result<Self::SerializeStruct, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
|  | ||||
|     fn serialize_struct_variant( | ||||
|         self, | ||||
|         _name: &'static str, | ||||
|         _variant_index: u32, | ||||
|         _variant: &'static str, | ||||
|         _len: usize, | ||||
|     ) -> Result<Self::SerializeStructVariant, Self::Error> { | ||||
|         Err(Error::InvalidDocumentFormat) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl serde::ser::Error for Error { | ||||
|     fn custom<T: fmt::Display>(msg: T) -> Self { | ||||
|         Error::Custom(msg.to_string()) | ||||
|         deserializer.deserialize_map(self) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
| use std::collections::HashSet; | ||||
| use std::collections::BTreeSet; | ||||
| use std::convert::Infallible; | ||||
| use std::error::Error as StdError; | ||||
| use std::{fmt, io, str}; | ||||
| @@ -57,19 +57,17 @@ pub enum UserError { | ||||
|     CriterionError(CriterionError), | ||||
|     DocumentLimitReached, | ||||
|     InvalidDocumentId { document_id: Value }, | ||||
|     InvalidFacetsDistribution { invalid_facets_name: HashSet<String> }, | ||||
|     InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> }, | ||||
|     InvalidGeoField { document_id: Value, object: Value }, | ||||
|     InvalidFilter(String), | ||||
|     InvalidSortName { name: String }, | ||||
|     InvalidSortableAttribute { field: String, valid_fields: HashSet<String> }, | ||||
|     InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> }, | ||||
|     SortRankingRuleMissing, | ||||
|     InvalidStoreFile, | ||||
|     MaxDatabaseSizeReached, | ||||
|     MissingDocumentId { document: Object }, | ||||
|     MissingDocumentId { primary_key: String, document: Object }, | ||||
|     MissingPrimaryKey, | ||||
|     NoSpaceLeftOnDevice, | ||||
|     PrimaryKeyCannotBeChanged, | ||||
|     PrimaryKeyCannotBeReset, | ||||
|     PrimaryKeyCannotBeChanged(String), | ||||
|     SerdeJson(serde_json::Error), | ||||
|     SortError(SortError), | ||||
|     UnknownInternalDocumentId { document_id: DocumentId }, | ||||
| @@ -168,7 +166,7 @@ impl From<SerializationError> for Error { | ||||
| impl fmt::Display for Error { | ||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||
|         match self { | ||||
|             Self::InternalError(error) => write!(f, "internal: {}", error), | ||||
|             Self::InternalError(error) => write!(f, "internal: {}.", error), | ||||
|             Self::IoError(error) => error.fmt(f), | ||||
|             Self::UserError(error) => error.fmt(f), | ||||
|         } | ||||
| @@ -181,15 +179,15 @@ impl fmt::Display for InternalError { | ||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||
|         match self { | ||||
|             Self::DatabaseMissingEntry { db_name, key } => { | ||||
|                 write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name) | ||||
|                 write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name) | ||||
|             } | ||||
|             Self::FieldIdMapMissingEntry(error) => error.fmt(f), | ||||
|             Self::Fst(error) => error.fmt(f), | ||||
|             Self::GrenadInvalidCompressionType => { | ||||
|                 f.write_str("invalid compression type have been specified to grenad") | ||||
|                 f.write_str("Invalid compression type has been specified to grenad.") | ||||
|             } | ||||
|             Self::IndexingMergingKeys { process } => { | ||||
|                 write!(f, "invalid merge while processing {}", process) | ||||
|                 write!(f, "Invalid merge while processing {}.", process) | ||||
|             } | ||||
|             Self::Serialization(error) => error.fmt(f), | ||||
|             Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f), | ||||
| @@ -207,69 +205,75 @@ impl StdError for InternalError {} | ||||
| impl fmt::Display for UserError { | ||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||
|         match self { | ||||
|             Self::InvalidFilter(input) => write!(f, "{}", input), | ||||
|             Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), | ||||
|             Self::InvalidFilter(error) => f.write_str(error), | ||||
|             Self::AttributeLimitReached => f.write_str("A document cannot contain more than 65,535 fields."), | ||||
|             Self::CriterionError(error) => write!(f, "{}", error), | ||||
|             Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), | ||||
|             Self::DocumentLimitReached => f.write_str("Maximum number of documents reached."), | ||||
|             Self::InvalidFacetsDistribution { invalid_facets_name } => { | ||||
|                 let name_list = | ||||
|                     invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", "); | ||||
|                 write!( | ||||
|                     f, | ||||
|                     "invalid facet distribution, the fields {} are not set as filterable", | ||||
|                     "Invalid facet distribution, the fields `{}` are not set as filterable.", | ||||
|                     name_list | ||||
|                 ) | ||||
|             } | ||||
|             Self::InvalidGeoField { document_id, object } => write!( | ||||
|                 f, | ||||
|                 "the document with the id: {} contains an invalid _geo field: {}", | ||||
|                 document_id, object | ||||
|             ), | ||||
|             Self::InvalidDocumentId { document_id } => { | ||||
|                 let json = serde_json::to_string(document_id).unwrap(); | ||||
|             Self::InvalidGeoField { document_id, object } => { | ||||
|                 let document_id = match document_id { | ||||
|                     Value::String(id) => id.clone(), | ||||
|                     _ => document_id.to_string(), | ||||
|                 }; | ||||
|                 let object = match object { | ||||
|                     Value::String(id) => id.clone(), | ||||
|                     _ => object.to_string(), | ||||
|                 }; | ||||
|                 write!( | ||||
|                     f, | ||||
|                     "document identifier is invalid {}, \ | ||||
| a document id can be of type integer or string \ | ||||
| only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)", | ||||
|                     json | ||||
|                     "The document with the id: `{}` contains an invalid _geo field: `{}`.", | ||||
|                     document_id, object | ||||
|                 ) | ||||
|             }, | ||||
|             Self::InvalidDocumentId { document_id } => { | ||||
|                 let document_id = match document_id { | ||||
|                     Value::String(id) => id.clone(), | ||||
|                     _ => document_id.to_string(), | ||||
|                 }; | ||||
|                 write!( | ||||
|                     f, | ||||
|                     "Document identifier `{}` is invalid. \ | ||||
| A document identifier can be of type integer or string, \ | ||||
| only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).", | ||||
|                     document_id | ||||
|                 ) | ||||
|             } | ||||
|             Self::InvalidSortName { name } => { | ||||
|                 write!(f, "Invalid syntax for the sort parameter: {}", name) | ||||
|             } | ||||
|             Self::InvalidSortableAttribute { field, valid_fields } => { | ||||
|                 let valid_names = | ||||
|                     valid_fields.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", "); | ||||
|                 write!( | ||||
|                     f, | ||||
|                     "Attribute {} is not sortable, available sortable attributes are: {}", | ||||
|                     "Attribute `{}` is not sortable. Available sortable attributes are: `{}`.", | ||||
|                     field, valid_names | ||||
|                 ) | ||||
|             } | ||||
|             Self::SortRankingRuleMissing => f.write_str( | ||||
|                 "You must specify where \"sort\" is listed in the \ | ||||
| rankingRules setting to use the sort parameter at search time", | ||||
|                 "The sort ranking rule must be specified in the \ | ||||
| ranking rules settings to use the sort parameter at search time.", | ||||
|             ), | ||||
|             Self::MissingDocumentId { document } => { | ||||
|             Self::MissingDocumentId { primary_key, document } => { | ||||
|                 let json = serde_json::to_string(document).unwrap(); | ||||
|                 write!(f, "document doesn't have an identifier {}", json) | ||||
|                 write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json) | ||||
|             } | ||||
|             Self::MissingPrimaryKey => f.write_str("missing primary key"), | ||||
|             Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"), | ||||
|             // TODO where can we find it instead of writing the text ourselves? | ||||
|             Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"), | ||||
|             Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), | ||||
|             Self::PrimaryKeyCannotBeChanged => { | ||||
|                 f.write_str("primary key cannot be changed if the database contains documents") | ||||
|             } | ||||
|             Self::PrimaryKeyCannotBeReset => { | ||||
|                 f.write_str("primary key cannot be reset if the database contains documents") | ||||
|             Self::MissingPrimaryKey => f.write_str("The primary key inference process failed because the engine did not find any field containing the `id` substring in its name. If your document identifier does not contain the `id` substring, you can set the primary key of the index."), | ||||
|             Self::MaxDatabaseSizeReached => f.write_str("Maximum database size has been reached."), | ||||
|             Self::NoSpaceLeftOnDevice => f.write_str("There is no more space left on the device. Consider increasing the size of the disk/partition."), | ||||
|             Self::InvalidStoreFile => f.write_str("The database file is in an invalid state."), | ||||
|             Self::PrimaryKeyCannotBeChanged(primary_key) => { | ||||
|                 write!(f, "Index already has a primary key: `{}`.", primary_key) | ||||
|             } | ||||
|             Self::SerdeJson(error) => error.fmt(f), | ||||
|             Self::SortError(error) => write!(f, "{}", error), | ||||
|             Self::UnknownInternalDocumentId { document_id } => { | ||||
|                 write!(f, "an unknown internal document id have been used ({})", document_id) | ||||
|                 write!(f, "An unknown internal document id has been used: `{}`.", document_id) | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|   | ||||
| @@ -953,6 +953,7 @@ pub(crate) mod tests { | ||||
|             { "id": 1, "name": "kevin", "has_dog": true }, | ||||
|             { "id": 2, "name": "bob" } | ||||
|         ]); | ||||
|  | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let builder = IndexDocuments::new(&mut wtxn, &index, 0); | ||||
|         builder.execute(content, |_, _| ()).unwrap(); | ||||
|   | ||||
| @@ -68,7 +68,9 @@ mod test { | ||||
|                 "txts": sample_txts[..(rng.gen_range(0..3))], | ||||
|                 "cat-ints": sample_ints[..(rng.gen_range(0..3))], | ||||
|             }); | ||||
|             builder.add_documents(doc).unwrap(); | ||||
|  | ||||
|             let doc = Cursor::new(serde_json::to_vec(&doc).unwrap()); | ||||
|             builder.extend_from_json(doc).unwrap(); | ||||
|         } | ||||
|  | ||||
|         builder.finish().unwrap(); | ||||
|   | ||||
milli/src/search/facet/grammar.pest (new file, 33 lines)
							| @@ -0,0 +1,33 @@ | ||||
| key = _{reserved | quoted | word } | ||||
| value = _{quoted | word } | ||||
| quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP  } | ||||
| string = {char*} | ||||
| word = ${(LETTER | NUMBER | "_" | "-" | ".")+} | ||||
|  | ||||
| char =  _{ !(PEEK | "\\") ~ ANY | ||||
|     | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t") | ||||
|     | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})} | ||||
|  | ||||
| reserved = { "_geoDistance" | ("_geoPoint" ~ parameters) | "_geo" } | ||||
| // we deliberately choose to allow empty parameters so we can generate a more specific error message later | ||||
| parameters = {("(" ~ (value ~ ",")* ~ value? ~ ")") | ""} | ||||
| condition = _{between | eq | greater | less | geq | leq | neq} | ||||
| between = {key ~ value ~ "TO" ~ value} | ||||
| geq = {key ~ ">=" ~ value} | ||||
| leq = {key ~ "<=" ~ value} | ||||
| neq = {key ~ "!=" ~ value} | ||||
| eq = {key ~ "=" ~ value} | ||||
| greater = {key ~ ">" ~ value} | ||||
| less = {key ~ "<" ~ value} | ||||
| geo_radius = {"_geoRadius" ~ parameters } | ||||
|  | ||||
| prgm = {SOI ~ expr ~ EOI} | ||||
| expr = _{ ( term ~ (operation ~ term)* ) } | ||||
| term = { ("(" ~ expr ~ ")") | condition | not | geo_radius } | ||||
| operation = _{ and | or } | ||||
| and = {"AND"} | ||||
| or = {"OR"} | ||||
|  | ||||
| not = {"NOT" ~ term} | ||||
|  | ||||
| WHITESPACE = _{ " " } | ||||
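
For reference, a few strings this grammar accepts (hypothetical examples, not taken from the repository): a simple comparison such as `price >= 10`, a range through the `between` rule such as `release_date 1970 TO 1980`, a quoted value as in `genre = "science fiction"`, a composed term like `(genre = horror OR genre = comedy) AND NOT price < 5`, and the geo predicate `_geoRadius(48.85, 2.35, 2000)`.
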
| @@ -151,13 +151,13 @@ impl<'a> Search<'a> { | ||||
|                     Member::Field(ref field) if !sortable_fields.contains(field) => { | ||||
|                         return Err(UserError::InvalidSortableAttribute { | ||||
|                             field: field.to_string(), | ||||
|                             valid_fields: sortable_fields, | ||||
|                             valid_fields: sortable_fields.into_iter().collect(), | ||||
|                         })? | ||||
|                     } | ||||
|                     Member::Geo(_) if !sortable_fields.contains("_geo") => { | ||||
|                         return Err(UserError::InvalidSortableAttribute { | ||||
|                             field: "_geo".to_string(), | ||||
|                             valid_fields: sortable_fields, | ||||
|                             valid_fields: sortable_fields.into_iter().collect(), | ||||
|                         })? | ||||
|                     } | ||||
|                     _ => (), | ||||
|   | ||||
| @@ -877,7 +877,8 @@ mod tests { | ||||
|         let mut cursor = Cursor::new(Vec::new()); | ||||
|  | ||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); | ||||
|         builder.add_documents(big_object).unwrap(); | ||||
|         let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap()); | ||||
|         builder.extend_from_json(big_object).unwrap(); | ||||
|         builder.finish().unwrap(); | ||||
|         cursor.set_position(0); | ||||
|         let content = DocumentBatchReader::from_reader(cursor).unwrap(); | ||||
| @@ -905,8 +906,9 @@ mod tests { | ||||
|  | ||||
|         let mut cursor = Cursor::new(Vec::new()); | ||||
|  | ||||
|         let big_object = serde_json::to_string(&big_object).unwrap(); | ||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); | ||||
|         builder.add_documents(big_object).unwrap(); | ||||
|         builder.extend_from_json(&mut big_object.as_bytes()).unwrap(); | ||||
|         builder.finish().unwrap(); | ||||
|         cursor.set_position(0); | ||||
|         let content = DocumentBatchReader::from_reader(cursor).unwrap(); | ||||
|   | ||||
| @@ -75,7 +75,7 @@ fn create_fields_mapping( | ||||
|         .collect() | ||||
| } | ||||
|  | ||||
| fn find_primary_key(index: &bimap::BiHashMap<u16, String>) -> Option<&str> { | ||||
| fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { | ||||
|     index | ||||
|         .iter() | ||||
|         .sorted_by_key(|(k, _)| *k) | ||||
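
The hunk cuts the body off, but given the new `DocumentsBatchIndex` parameter and the `MissingPrimaryKey` message earlier in this diff, the inference plausibly returns the first field, in field-id order, whose name contains "id". A hedged sketch under that assumption (the predicate is not shown here; `sorted_by_key` is the itertools adapter visible above, and the `bimap` map is the one used in the tests below):

    use bimap::BiHashMap;
    use itertools::Itertools;

    // Hedged sketch: pick the first field, ordered by field id, whose
    // name contains "id". The exact predicate is an assumption.
    fn find_primary_key(index: &BiHashMap<u16, String>) -> Option<&str> {
        index
            .iter()
            .sorted_by_key(|(id, _)| *id)
            .map(|(_, name)| name.as_str())
            .find(|name| name.to_lowercase().contains("id"))
    }

    fn main() {
        let mut map = BiHashMap::new();
        map.insert(4u16, "fakeId".to_string());
        map.insert(0u16, "realId".to_string());
        assert_eq!(find_primary_key(&map), Some("realId"));
    }
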
| @@ -179,7 +179,7 @@ impl Transform<'_, '_> { | ||||
|                         if !self.autogenerate_docids { | ||||
|                             let mut json = Map::new(); | ||||
|                             for (key, value) in document.iter() { | ||||
|                                 let key = addition_index.get_by_left(&key).cloned(); | ||||
|                                 let key = addition_index.name(key).cloned(); | ||||
|                                 let value = serde_json::from_slice::<Value>(&value).ok(); | ||||
|  | ||||
|                                 if let Some((k, v)) = key.zip(value) { | ||||
| @@ -187,7 +187,11 @@ impl Transform<'_, '_> { | ||||
|                                 } | ||||
|                             } | ||||
|  | ||||
|                             return Err(UserError::MissingDocumentId { document: json }.into()); | ||||
|                             return Err(UserError::MissingDocumentId { | ||||
|                                 primary_key: primary_key_name, | ||||
|                                 document: json, | ||||
|                             } | ||||
|                             .into()); | ||||
|                         } | ||||
|  | ||||
|                         let uuid = | ||||
| @@ -544,6 +548,7 @@ mod test { | ||||
|     mod primary_key_inference { | ||||
|         use bimap::BiHashMap; | ||||
|  | ||||
|         use crate::documents::DocumentsBatchIndex; | ||||
|         use crate::update::index_documents::transform::find_primary_key; | ||||
|  | ||||
|         #[test] | ||||
| @@ -557,7 +562,7 @@ mod test { | ||||
|                 map.insert(4, "fakeId".to_string()); | ||||
|                 map.insert(0, "realId".to_string()); | ||||
|  | ||||
|                 assert_eq!(find_primary_key(&map), Some("realId")); | ||||
|                 assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|   | ||||
| @@ -465,7 +465,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|                     self.index.put_primary_key(self.wtxn, primary_key)?; | ||||
|                     Ok(()) | ||||
|                 } else { | ||||
|                     Err(UserError::PrimaryKeyCannotBeChanged.into()) | ||||
|                     let primary_key = self.index.primary_key(self.wtxn)?.unwrap(); | ||||
|                     Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into()) | ||||
|                 } | ||||
|             } | ||||
|             Setting::Reset => { | ||||
| @@ -473,7 +474,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|                     self.index.delete_primary_key(self.wtxn)?; | ||||
|                     Ok(()) | ||||
|                 } else { | ||||
|                     Err(UserError::PrimaryKeyCannotBeReset.into()) | ||||
|                     let primary_key = self.index.primary_key(self.wtxn)?.unwrap(); | ||||
|                     Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into()) | ||||
|                 } | ||||
|             } | ||||
|             Setting::NotSet => Ok(()), | ||||
| @@ -1106,7 +1108,7 @@ mod tests { | ||||
|         builder.reset_primary_key(); | ||||
|  | ||||
|         let err = builder.execute(|_, _| ()).unwrap_err(); | ||||
|         assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeReset))); | ||||
|         assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_)))); | ||||
|         wtxn.abort().unwrap(); | ||||
|  | ||||
|         // But if we clear the database... | ||||
|   | ||||
| @@ -61,9 +61,12 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { | ||||
|     let mut cursor = Cursor::new(Vec::new()); | ||||
|     let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); | ||||
|     let reader = Cursor::new(CONTENT.as_bytes()); | ||||
|  | ||||
|     for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() { | ||||
|         documents_builder.add_documents(doc.unwrap()).unwrap(); | ||||
|         let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); | ||||
|         documents_builder.extend_from_json(doc).unwrap(); | ||||
|     } | ||||
|  | ||||
|     documents_builder.finish().unwrap(); | ||||
|  | ||||
|     cursor.set_position(0); | ||||
|   | ||||
| @@ -409,7 +409,8 @@ fn criteria_ascdesc() { | ||||
|             "age": age, | ||||
|         }); | ||||
|  | ||||
|         batch_builder.add_documents(json).unwrap(); | ||||
|         let json = Cursor::new(serde_json::to_vec(&json).unwrap()); | ||||
|         batch_builder.extend_from_json(json).unwrap(); | ||||
|     }); | ||||
|  | ||||
|     batch_builder.finish().unwrap(); | ||||
|   | ||||
| @@ -1,18 +0,0 @@ | ||||
| [package] | ||||
| name = "search" | ||||
| version = "0.18.0" | ||||
| authors = ["Clément Renault <clement@meilisearch.com>"] | ||||
| edition = "2018" | ||||
|  | ||||
| [dependencies] | ||||
| anyhow = "1.0.38" | ||||
| byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } | ||||
| heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } | ||||
| log = "0.4.14" | ||||
| milli = { path = "../milli" } | ||||
| serde_json = "1.0.62" | ||||
| stderrlog = "0.5.1" | ||||
| structopt = { version = "0.3.21", default-features = false } | ||||
|  | ||||
| [target.'cfg(target_os = "linux")'.dependencies] | ||||
| jemallocator = "0.3.2" | ||||