mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-11-04 09:56:28 +00:00 
			
		
		
		
	implement review suggestions
This commit is contained in:
		@@ -1,16 +1,14 @@
 | 
			
		||||
use std::collections::BTreeMap;
 | 
			
		||||
use std::io;
 | 
			
		||||
use std::io::Cursor;
 | 
			
		||||
use std::io::Write;
 | 
			
		||||
use std::io::{Cursor, Write};
 | 
			
		||||
 | 
			
		||||
use byteorder::{BigEndian, WriteBytesExt};
 | 
			
		||||
use serde::Deserializer;
 | 
			
		||||
use serde_json::Value;
 | 
			
		||||
 | 
			
		||||
use crate::FieldId;
 | 
			
		||||
 | 
			
		||||
use super::serde::DocumentVisitor;
 | 
			
		||||
use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
 | 
			
		||||
use crate::FieldId;
 | 
			
		||||
 | 
			
		||||
/// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
 | 
			
		||||
/// format used by milli.
 | 
			
		||||
@@ -27,7 +25,7 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
 | 
			
		||||
/// let json = r##"{"id": 1, "name": "foo"}"##;
 | 
			
		||||
/// let mut writer = Cursor::new(Vec::new());
 | 
			
		||||
/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
 | 
			
		||||
/// builder.extend_from_json(Cursor::new(json.as_bytes())).unwrap();
 | 
			
		||||
/// builder.extend_from_json(&mut json.as_bytes()).unwrap();
 | 
			
		||||
/// builder.finish().unwrap();
 | 
			
		||||
/// ```
 | 
			
		||||
pub struct DocumentBatchBuilder<W> {
 | 
			
		||||
@@ -46,16 +44,14 @@ impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
 | 
			
		||||
        // add space to write the offset of the metadata at the end of the writer
 | 
			
		||||
        writer.write_u64::<BigEndian>(0)?;
 | 
			
		||||
 | 
			
		||||
        let this = Self {
 | 
			
		||||
        Ok(Self {
 | 
			
		||||
            inner: writer,
 | 
			
		||||
            index,
 | 
			
		||||
            obkv_buffer: Vec::new(),
 | 
			
		||||
            value_buffer: Vec::new(),
 | 
			
		||||
            values: BTreeMap::new(),
 | 
			
		||||
            count: 0,
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        Ok(this)
 | 
			
		||||
        })
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// Returns the number of documents that have been written to the builder.
 | 
			
		||||
@@ -117,27 +113,31 @@ impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
 | 
			
		||||
 | 
			
		||||
        for (i, record) in records.into_records().enumerate() {
 | 
			
		||||
            let record = record?;
 | 
			
		||||
            let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer));
 | 
			
		||||
            this.obkv_buffer.clear();
 | 
			
		||||
            let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer);
 | 
			
		||||
            for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) {
 | 
			
		||||
                let value = match ty {
 | 
			
		||||
                    AllowedType::Number => value.parse::<f64>().map(Value::from).map_err(|error| Error::ParseFloat {
 | 
			
		||||
                        error,
 | 
			
		||||
                        // +1 for the header offset.
 | 
			
		||||
                        line: i + 1,
 | 
			
		||||
                        value: value.to_string(),
 | 
			
		||||
                    })?,
 | 
			
		||||
                    AllowedType::Number => {
 | 
			
		||||
                        value.parse::<f64>().map(Value::from).map_err(|error| {
 | 
			
		||||
                            Error::ParseFloat {
 | 
			
		||||
                                error,
 | 
			
		||||
                                // +1 for the header offset.
 | 
			
		||||
                                line: i + 1,
 | 
			
		||||
                                value: value.to_string(),
 | 
			
		||||
                            }
 | 
			
		||||
                        })?
 | 
			
		||||
                    }
 | 
			
		||||
                    AllowedType::String => Value::String(value.to_string()),
 | 
			
		||||
                };
 | 
			
		||||
 | 
			
		||||
                this.value_buffer.clear();
 | 
			
		||||
                serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?;
 | 
			
		||||
                writer.insert(*fid, &this.value_buffer)?;
 | 
			
		||||
                this.value_buffer.clear();
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            this.inner.write_u32::<BigEndian>(this.obkv_buffer.len() as u32)?;
 | 
			
		||||
            this.inner.write_all(&this.obkv_buffer)?;
 | 
			
		||||
 | 
			
		||||
            this.obkv_buffer.clear();
 | 
			
		||||
            this.count += 1;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
@@ -156,7 +156,8 @@ fn parse_csv_header(header: &str) -> (String, AllowedType) {
 | 
			
		||||
    match header.rsplit_once(':') {
 | 
			
		||||
        Some((field_name, field_type)) => match field_type {
 | 
			
		||||
            "string" => (field_name.to_string(), AllowedType::String),
 | 
			
		||||
            "number" => (field_name.to_string(), AllowedType::Number), // if the pattern isn't reconized, we keep the whole field.
 | 
			
		||||
            "number" => (field_name.to_string(), AllowedType::Number),
 | 
			
		||||
            // if the pattern isn't reconized, we keep the whole field.
 | 
			
		||||
            _otherwise => (header.to_string(), AllowedType::String),
 | 
			
		||||
        },
 | 
			
		||||
        None => (header.to_string(), AllowedType::String),
 | 
			
		||||
@@ -169,9 +170,8 @@ mod test {
 | 
			
		||||
 | 
			
		||||
    use serde_json::{json, Map};
 | 
			
		||||
 | 
			
		||||
    use crate::documents::DocumentBatchReader;
 | 
			
		||||
 | 
			
		||||
    use super::*;
 | 
			
		||||
    use crate::documents::DocumentBatchReader;
 | 
			
		||||
 | 
			
		||||
    fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value {
 | 
			
		||||
        let mut map = Map::new();
 | 
			
		||||
@@ -525,7 +525,9 @@ mod test {
 | 
			
		||||
"Boston","United States","4628910""#;
 | 
			
		||||
 | 
			
		||||
        let mut buf = Vec::new();
 | 
			
		||||
        assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err());
 | 
			
		||||
        assert!(
 | 
			
		||||
            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
 | 
			
		||||
        );
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    #[test]
 | 
			
		||||
@@ -534,7 +536,9 @@ mod test {
 | 
			
		||||
"Boston","United States","4628910", "too much""#;
 | 
			
		||||
 | 
			
		||||
        let mut buf = Vec::new();
 | 
			
		||||
        assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err());
 | 
			
		||||
        assert!(
 | 
			
		||||
            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
 | 
			
		||||
        );
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    #[test]
 | 
			
		||||
@@ -543,6 +547,8 @@ mod test {
 | 
			
		||||
"Boston","United States""#;
 | 
			
		||||
 | 
			
		||||
        let mut buf = Vec::new();
 | 
			
		||||
        assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err());
 | 
			
		||||
        assert!(
 | 
			
		||||
            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
 | 
			
		||||
        );
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -7,9 +7,8 @@ mod builder;
 | 
			
		||||
mod reader;
 | 
			
		||||
mod serde;
 | 
			
		||||
 | 
			
		||||
use std::num::ParseFloatError;
 | 
			
		||||
use std::io;
 | 
			
		||||
use std::fmt::{self, Debug};
 | 
			
		||||
use std::io;
 | 
			
		||||
 | 
			
		||||
use ::serde::{Deserialize, Serialize};
 | 
			
		||||
use bimap::BiHashMap;
 | 
			
		||||
@@ -24,7 +23,7 @@ pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>);
 | 
			
		||||
 | 
			
		||||
impl DocumentsBatchIndex {
 | 
			
		||||
    /// Insert the field in the map, or return it's field id if it doesn't already exists.
 | 
			
		||||
    pub fn insert(&mut self, field:  &str) -> FieldId {
 | 
			
		||||
    pub fn insert(&mut self, field: &str) -> FieldId {
 | 
			
		||||
        match self.0.get_by_right(field) {
 | 
			
		||||
            Some(field_id) => *field_id,
 | 
			
		||||
            None => {
 | 
			
		||||
@@ -43,7 +42,7 @@ impl DocumentsBatchIndex {
 | 
			
		||||
        self.0.len()
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    pub fn iter(&self) -> impl Iterator<Item=(&FieldId, &String)> {
 | 
			
		||||
    pub fn iter(&self) -> bimap::hash::Iter<FieldId, String> {
 | 
			
		||||
        self.0.iter()
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -83,11 +82,7 @@ impl<W: io::Write> io::Write for ByteCounter<W> {
 | 
			
		||||
 | 
			
		||||
#[derive(Debug)]
 | 
			
		||||
pub enum Error {
 | 
			
		||||
    ParseFloat {
 | 
			
		||||
        error: std::num::ParseFloatError,
 | 
			
		||||
        line: usize,
 | 
			
		||||
        value: String,
 | 
			
		||||
    },
 | 
			
		||||
    ParseFloat { error: std::num::ParseFloatError, line: usize, value: String },
 | 
			
		||||
    InvalidDocumentFormat,
 | 
			
		||||
    Custom(String),
 | 
			
		||||
    JsonError(serde_json::Error),
 | 
			
		||||
@@ -124,7 +119,9 @@ impl From<serde_json::Error> for Error {
 | 
			
		||||
impl fmt::Display for Error {
 | 
			
		||||
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 | 
			
		||||
        match self {
 | 
			
		||||
            Error::ParseFloat { error, line, value} => write!(f, "Error parsing number {:?} at line {}: {}", value, line, error),
 | 
			
		||||
            Error::ParseFloat { error, line, value } => {
 | 
			
		||||
                write!(f, "Error parsing number {:?} at line {}: {}", value, line, error)
 | 
			
		||||
            }
 | 
			
		||||
            Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s),
 | 
			
		||||
            Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."),
 | 
			
		||||
            Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err),
 | 
			
		||||
 
 | 
			
		||||
@@ -1,18 +1,13 @@
 | 
			
		||||
use std::collections::BTreeMap;
 | 
			
		||||
use std::io::Cursor;
 | 
			
		||||
use std::io::Write;
 | 
			
		||||
use std::fmt;
 | 
			
		||||
use std::io::{Cursor, Write};
 | 
			
		||||
 | 
			
		||||
use byteorder::WriteBytesExt;
 | 
			
		||||
use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
 | 
			
		||||
use serde::Deserialize;
 | 
			
		||||
use serde::de::DeserializeSeed;
 | 
			
		||||
use serde::de::MapAccess;
 | 
			
		||||
use serde::de::SeqAccess;
 | 
			
		||||
use serde::de::Visitor;
 | 
			
		||||
use serde_json::Value;
 | 
			
		||||
 | 
			
		||||
use super::Error;
 | 
			
		||||
use super::{ByteCounter, DocumentsBatchIndex};
 | 
			
		||||
use super::{ByteCounter, DocumentsBatchIndex, Error};
 | 
			
		||||
use crate::FieldId;
 | 
			
		||||
 | 
			
		||||
macro_rules! tri {
 | 
			
		||||
@@ -31,8 +26,9 @@ impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> {
 | 
			
		||||
 | 
			
		||||
    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
 | 
			
		||||
    where
 | 
			
		||||
        D: serde::Deserializer<'de> {
 | 
			
		||||
            deserializer.deserialize_str(self)
 | 
			
		||||
        D: serde::Deserializer<'de>,
 | 
			
		||||
    {
 | 
			
		||||
        deserializer.deserialize_str(self)
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -43,7 +39,7 @@ impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> {
 | 
			
		||||
    where
 | 
			
		||||
        E: serde::de::Error,
 | 
			
		||||
    {
 | 
			
		||||
            Ok(self.0.insert(v))
 | 
			
		||||
        Ok(self.0.insert(v))
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
 | 
			
		||||
@@ -58,8 +54,9 @@ impl<'de> DeserializeSeed<'de> for ValueDeserializer {
 | 
			
		||||
 | 
			
		||||
    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
 | 
			
		||||
    where
 | 
			
		||||
        D: serde::Deserializer<'de> {
 | 
			
		||||
            serde_json::Value::deserialize(deserializer)
 | 
			
		||||
        D: serde::Deserializer<'de>,
 | 
			
		||||
    {
 | 
			
		||||
        serde_json::Value::deserialize(deserializer)
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -80,7 +77,9 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
 | 
			
		||||
    where
 | 
			
		||||
        A: SeqAccess<'de>,
 | 
			
		||||
    {
 | 
			
		||||
        while let Some(v) = seq.next_element_seed(&mut *self)? { tri!(v) }
 | 
			
		||||
        while let Some(v) = seq.next_element_seed(&mut *self)? {
 | 
			
		||||
            tri!(v)
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        Ok(Ok(()))
 | 
			
		||||
    }
 | 
			
		||||
@@ -89,7 +88,9 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
 | 
			
		||||
    where
 | 
			
		||||
        A: MapAccess<'de>,
 | 
			
		||||
    {
 | 
			
		||||
        while let Some((key, value)) = map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? {
 | 
			
		||||
        while let Some((key, value)) =
 | 
			
		||||
            map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)?
 | 
			
		||||
        {
 | 
			
		||||
            self.values.insert(key, value);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
@@ -119,13 +120,15 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W>
 | 
			
		||||
where W: Write,
 | 
			
		||||
where
 | 
			
		||||
    W: Write,
 | 
			
		||||
{
 | 
			
		||||
    type Value = Result<(), Error>;
 | 
			
		||||
 | 
			
		||||
    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
 | 
			
		||||
    where
 | 
			
		||||
        D: serde::Deserializer<'de> {
 | 
			
		||||
            deserializer.deserialize_map(self)
 | 
			
		||||
        D: serde::Deserializer<'de>,
 | 
			
		||||
    {
 | 
			
		||||
        deserializer.deserialize_map(self)
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -906,8 +906,9 @@ mod tests {
 | 
			
		||||
 | 
			
		||||
        let mut cursor = Cursor::new(Vec::new());
 | 
			
		||||
 | 
			
		||||
        let big_object = serde_json::to_string(&big_object).unwrap();
 | 
			
		||||
        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
 | 
			
		||||
        builder.add_documents(big_object).unwrap();
 | 
			
		||||
        builder.extend_from_json(&mut big_object.as_bytes()).unwrap();
 | 
			
		||||
        builder.finish().unwrap();
 | 
			
		||||
        cursor.set_position(0);
 | 
			
		||||
        let content = DocumentBatchReader::from_reader(cursor).unwrap();
 | 
			
		||||
 
 | 
			
		||||
@@ -544,7 +544,8 @@ mod test {
 | 
			
		||||
    mod primary_key_inference {
 | 
			
		||||
        use bimap::BiHashMap;
 | 
			
		||||
 | 
			
		||||
        use crate::{documents::DocumentsBatchIndex, update::index_documents::transform::find_primary_key};
 | 
			
		||||
        use crate::documents::DocumentsBatchIndex;
 | 
			
		||||
        use crate::update::index_documents::transform::find_primary_key;
 | 
			
		||||
 | 
			
		||||
        #[test]
 | 
			
		||||
        fn primary_key_infered_on_first_field() {
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user