mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-09-12 15:56:34 +00:00
implement csv serialization
This commit is contained in:
@ -1,5 +1,8 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use std::io::Cursor;
|
||||
use std::io::Write;
|
||||
|
||||
use byteorder::{BigEndian, WriteBytesExt};
|
||||
use serde::Deserializer;
|
||||
@ -38,7 +41,7 @@ pub struct DocumentBatchBuilder<W> {
|
||||
|
||||
impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
|
||||
pub fn new(writer: W) -> Result<Self, Error> {
|
||||
let index = DocumentsBatchIndex::new();
|
||||
let index = DocumentsBatchIndex::default();
|
||||
let mut writer = ByteCounter::new(writer);
|
||||
// add space to write the offset of the metadata at the end of the writer
|
||||
writer.write_u64::<BigEndian>(0)?;
|
||||
@ -101,6 +104,79 @@ impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Extends the builder with json documents from a reader.
|
||||
///
|
||||
/// This method can be only called once and is mutually exclusive with extend from json. This
|
||||
/// is because the fields in a csv are always guaranteed to come in order, and permits some
|
||||
/// optimizations.
|
||||
///
|
||||
/// From csv takes care to call finish in the end.
|
||||
pub fn from_csv<R: io::Read>(mut self, reader: R) -> Result<(), Error> {
|
||||
|
||||
// Ensure that this is the first and only addition made with this builder
|
||||
debug_assert!(self.index.is_empty());
|
||||
|
||||
let mut records = csv::Reader::from_reader(reader);
|
||||
|
||||
let headers = records
|
||||
.headers()
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(parse_csv_header)
|
||||
.map(|(k, t)| (self.index.insert(&k), t))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let records = records.into_records();
|
||||
|
||||
dbg!(&headers);
|
||||
for record in records {
|
||||
match record {
|
||||
Ok(record) => {
|
||||
let mut writer = obkv::KvWriter::new(Cursor::new(&mut self.obkv_buffer));
|
||||
for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) {
|
||||
let value = match ty {
|
||||
AllowedType::Number => value.parse::<f64>().map(Value::from).unwrap(),
|
||||
AllowedType::String => Value::String(value.to_string()),
|
||||
};
|
||||
|
||||
serde_json::to_writer(Cursor::new(&mut self.value_buffer), dbg!(&value)).unwrap();
|
||||
writer.insert(*fid, &self.value_buffer)?;
|
||||
self.value_buffer.clear();
|
||||
}
|
||||
|
||||
self.inner.write_u32::<BigEndian>(self.obkv_buffer.len() as u32)?;
|
||||
self.inner.write_all(&self.obkv_buffer)?;
|
||||
|
||||
self.obkv_buffer.clear();
|
||||
self.count += 1;
|
||||
},
|
||||
Err(_) => panic!(),
|
||||
}
|
||||
}
|
||||
|
||||
self.finish()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum AllowedType {
|
||||
String,
|
||||
Number,
|
||||
}
|
||||
|
||||
fn parse_csv_header(header: &str) -> (String, AllowedType) {
|
||||
// if there are several separators we only split on the last one.
|
||||
match header.rsplit_once(':') {
|
||||
Some((field_name, field_type)) => match field_type {
|
||||
"string" => (field_name.to_string(), AllowedType::String),
|
||||
"number" => (field_name.to_string(), AllowedType::Number), // if the pattern isn't reconized, we keep the whole field.
|
||||
_otherwise => (header.to_string(), AllowedType::String),
|
||||
},
|
||||
None => (header.to_string(), AllowedType::String),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@ -185,4 +261,29 @@ mod test {
|
||||
|
||||
assert!(reader.next_document_with_index().unwrap().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_documents_csv() {
|
||||
let mut cursor = Cursor::new(Vec::new());
|
||||
let builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||
|
||||
let csv = "id:number,field:string\n1,hello!\n2,blabla";
|
||||
|
||||
builder.from_csv(Cursor::new(csv.as_bytes())).unwrap();
|
||||
|
||||
cursor.set_position(0);
|
||||
|
||||
let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
|
||||
|
||||
dbg!(reader.len());
|
||||
|
||||
let (index, document) = reader.next_document_with_index().unwrap().unwrap();
|
||||
assert_eq!(index.len(), 2);
|
||||
assert_eq!(document.iter().count(), 2);
|
||||
|
||||
let (_index, document) = reader.next_document_with_index().unwrap().unwrap();
|
||||
assert_eq!(document.iter().count(), 2);
|
||||
|
||||
assert!(reader.next_document_with_index().unwrap().is_none());
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user