Mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-10-26 05:26:27 +00:00

Make the engine return csv string records as documents and headers
		
							
								
								
									
Cargo.lock (generated file, 35 lines changed)

@@ -629,7 +629,7 @@ dependencies = [
  "futures-util",
  "http",
  "indexmap",
- "log 0.4.8",
+ "log 0.4.11",
  "slab",
  "tokio",
  "tokio-util",
@@ -771,7 +771,7 @@ dependencies = [
  "http-body",
  "httparse",
  "itoa",
- "log 0.4.8",
+ "log 0.4.11",
  "pin-project",
  "socket2",
  "time",
@@ -933,14 +933,14 @@ version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b"
 dependencies = [
- "log 0.4.8",
+ "log 0.4.11",
 ]

 [[package]]
 name = "log"
-version = "0.4.8"
+version = "0.4.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7"
+checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b"
 dependencies = [
  "cfg-if",
 ]
@@ -1005,7 +1005,7 @@ dependencies = [
  "itertools",
  "jemallocator",
  "levenshtein_automata",
- "log 0.4.8",
+ "log 0.4.11",
  "memmap",
  "once_cell",
  "oxidized-mtbl",
@@ -1081,7 +1081,7 @@ dependencies = [
  "iovec",
  "kernel32-sys",
  "libc",
- "log 0.4.8",
+ "log 0.4.11",
  "miow 0.2.1",
  "net2",
  "slab",
@@ -1094,7 +1094,7 @@ version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f5e374eff525ce1c5b7687c4cef63943e7686524a387933ad27ca7ec43779cb3"
 dependencies = [
- "log 0.4.8",
+ "log 0.4.11",
  "mio",
  "miow 0.3.4",
  "winapi 0.3.8",
@@ -1141,7 +1141,7 @@ checksum = "136eed74cadb9edd2651ffba732b19a450316b680e4f48d6c79e905799e19d01"
 dependencies = [
  "buf_redux",
  "httparse",
- "log 0.4.8",
+ "log 0.4.11",
  "mime 0.2.6",
  "mime_guess 1.8.8",
  "quick-error",
@@ -1228,11 +1228,12 @@ checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c"
 [[package]]
 name = "oxidized-mtbl"
 version = "0.1.0"
-source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=4ca66e5#4ca66e50115da760f602e878943af59f06c53af1"
+source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=5426182#5426182d9ad8b74a9ebb386f03d33ce073cef0e0"
 dependencies = [
  "byteorder",
  "crc32c",
  "flate2",
+ "log 0.4.11",
  "memmap",
  "snap",
  "tempfile",
@@ -1836,7 +1837,7 @@ checksum = "32e5ee9b90a5452c570a0b0ac1c99ae9498db7e56e33d74366de7f2a7add7f25"
 dependencies = [
  "atty",
  "chrono",
- "log 0.4.8",
+ "log 0.4.11",
  "termcolor",
  "thread_local",
 ]
@@ -2015,7 +2016,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b8b8fe88007ebc363512449868d7da4389c9400072a3f666f212c7280082882a"
 dependencies = [
  "futures",
- "log 0.4.8",
+ "log 0.4.11",
  "pin-project",
  "tokio",
  "tungstenite",
@@ -2030,7 +2031,7 @@ dependencies = [
  "bytes",
  "futures-core",
  "futures-sink",
- "log 0.4.8",
+ "log 0.4.11",
  "pin-project-lite",
  "tokio",
 ]
@@ -2068,7 +2069,7 @@ dependencies = [
  "http",
  "httparse",
  "input_buffer",
- "log 0.4.8",
+ "log 0.4.11",
  "rand 0.7.3",
  "sha-1",
  "url",
@@ -2211,7 +2212,7 @@ version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0"
 dependencies = [
- "log 0.4.8",
+ "log 0.4.11",
  "try-lock",
 ]

@@ -2226,7 +2227,7 @@ dependencies = [
  "headers",
  "http",
  "hyper",
- "log 0.4.8",
+ "log 0.4.11",
  "mime 0.3.16",
  "mime_guess 2.0.3",
  "multipart",
@@ -2265,7 +2266,7 @@ checksum = "ded84f06e0ed21499f6184df0e0cb3494727b0c5da89534e0fcc55c51d812101"
 dependencies = [
  "bumpalo",
  "lazy_static 1.4.0",
- "log 0.4.8",
+ "log 0.4.11",
  "proc-macro2",
  "quote",
  "syn",

@@ -22,7 +22,7 @@ jemallocator = "0.3.2"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 memmap = "0.7.0"
 once_cell = "1.4.0"
-oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "4ca66e5" }
+oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "5426182" }
 rayon = "1.3.1"
 ringtail = "0.3.0"
 roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "mem-usage" }
@@ -33,7 +33,7 @@ structopt = { version = "0.3.14", default-features = false }
 tempfile = "3.1.0"

 # logging
-log = "0.4.8"
+log = "0.4.11"
 stderrlog = "0.4.3"

 # best proximity

@@ -10,8 +10,10 @@ use anyhow::Context;
 use arc_cache::ArcCache;
 use bstr::ByteSlice as _;
 use cow_utils::CowUtils;
+use csv::StringRecord;
 use flate2::read::GzDecoder;
 use fst::IntoStreamer;
+use heed::BytesEncode;
 use heed::EnvOpenOptions;
 use heed::types::*;
 use log::{debug, info};
@@ -21,8 +23,9 @@ use rayon::prelude::*;
 use roaring::RoaringBitmap;
 use structopt::StructOpt;

-use milli::{SmallVec32, Index, DocumentId, Position, Attribute, BEU32};
+use milli::heed_codec::CsvStringRecordCodec;
 use milli::tokenizer::{simple_tokenizer, only_words};
+use milli::{SmallVec32, Index, DocumentId, Position, Attribute, BEU32};

 const LMDB_MAX_KEY_LENGTH: usize = 511;
 const ONE_MILLION: usize = 1_000_000;
@@ -205,13 +208,17 @@ impl Store {
         Self::write_word_attribute_docids(&mut self.sorter, lrus)
     }

-    pub fn write_headers(&mut self, headers: &[u8]) -> anyhow::Result<()> {
+    pub fn write_headers(&mut self, headers: &StringRecord) -> anyhow::Result<()> {
+        let headers = CsvStringRecordCodec::bytes_encode(headers)
+            .with_context(|| format!("could not encode csv record"))?;
         Ok(self.sorter.insert(HEADERS_KEY, headers)?)
     }

-    pub fn write_document(&mut self, id: DocumentId, content: &[u8]) -> anyhow::Result<()> {
+    pub fn write_document(&mut self, id: DocumentId, record: &StringRecord) -> anyhow::Result<()> {
+        let record = CsvStringRecordCodec::bytes_encode(record)
+            .with_context(|| format!("could not encode csv record"))?;
         self.documents_ids.insert(id);
-        Ok(self.documents_sorter.insert(id.to_be_bytes(), content)?)
+        Ok(self.documents_sorter.insert(id.to_be_bytes(), record)?)
     }

     fn write_word_positions<I>(sorter: &mut Sorter<MergeFn>, iter: I) -> anyhow::Result<()>
@@ -487,9 +494,6 @@ fn index_csv(

     // Write the headers into a Vec of bytes and then into the store.
     let headers = rdr.headers()?;
-    let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
-    writer.write_byte_record(headers.as_byte_record())?;
-    let headers = writer.into_inner()?;
     store.write_headers(&headers)?;

     let mut before = Instant::now();
@@ -500,7 +504,7 @@ fn index_csv(
         // We skip documents that must not be indexed by this thread.
         if document_id % num_threads == thread_index {
             if document_id % ONE_MILLION == 0 {
-                debug!("We have seen {}m documents so far ({:.02?}).",
+                info!("We have seen {}m documents so far ({:.02?}).",
                     document_id / ONE_MILLION, before.elapsed());
                 before = Instant::now();
             }
@@ -515,9 +519,6 @@ fn index_csv(
             }

             // We write the document in the database.
-            let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
-            writer.write_byte_record(document.as_byte_record())?;
-            let document = writer.into_inner()?;
             store.write_document(document_id, &document)?;
         }

@@ -1,4 +1,4 @@
-use std::io::{self, Write, BufRead};
+use std::io::{self, BufRead};
 use std::iter::once;
 use std::path::PathBuf;
 use std::time::Instant;
@@ -70,12 +70,12 @@ fn main() -> anyhow::Result<()> {
         };
         let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?;

-        let mut stdout = io::stdout();
-        stdout.write_all(&headers)?;
-
-        for (_id, content) in documents {
-            stdout.write_all(&content)?;
+        let mut wtr = csv::Writer::from_writer(io::stdout());
+        wtr.write_record(&headers)?;
+        for (_id, record) in documents {
+            wtr.write_record(&record)?;
         }
+        wtr.flush()?;

         debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len());
     }

@@ -1,4 +1,3 @@
-use std::borrow::Cow;
 use std::collections::HashSet;
 use std::fs::File;
 use std::net::SocketAddr;
@@ -45,20 +44,25 @@ struct Opt {
     http_listen_addr: String,
 }

-fn highlight_string(string: &str, words: &HashSet<String>) -> String {
-    let mut output = String::new();
-    for (token_type, token) in simple_tokenizer(string) {
-        if token_type == TokenType::Word {
-            let lowercase_token = token.to_lowercase();
-            let to_highlight = words.contains(&lowercase_token);
-            if to_highlight { output.push_str("<mark>") }
-            output.push_str(token);
-            if to_highlight { output.push_str("</mark>") }
-        } else {
-            output.push_str(token);
+fn highlight_record(record: &csv::StringRecord, words: &HashSet<String>) -> csv::StringRecord {
+    let mut output_record = csv::StringRecord::new();
+    let mut buffer = String::new();
+    for field in record {
+        buffer.clear();
+        for (token_type, token) in simple_tokenizer(field) {
+            if token_type == TokenType::Word {
+                let lowercase_token = token.to_lowercase();
+                let to_highlight = words.contains(&lowercase_token);
+                if to_highlight { buffer.push_str("<mark>") }
+                buffer.push_str(token);
+                if to_highlight { buffer.push_str("</mark>") }
+            } else {
+                buffer.push_str(token);
+            }
         }
+        output_record.push_field(&buffer);
     }
-    output
+    output_record
 }

 #[derive(Template)]
@@ -186,23 +190,27 @@ async fn main() -> anyhow::Result<()> {
                 .execute()
                 .unwrap();

-            let mut body = Vec::new();
-            if let Some(headers) = index.headers(&rtxn).unwrap() {
-                // We write the headers
-                body.extend_from_slice(headers);
-                let documents = index.documents(&rtxn, documents_ids).unwrap();
+            let body = match index.headers(&rtxn).unwrap() {
+                Some(headers) => {
+                    let mut wtr = csv::Writer::from_writer(Vec::new());

-                for (_id, content) in documents {
-                    let content = std::str::from_utf8(content.as_ref()).unwrap();
-                    let content = if disable_highlighting {
-                        Cow::from(content)
-                    } else {
-                        Cow::from(highlight_string(content, &found_words))
-                    };
+                    // We write the headers
+                    wtr.write_record(&headers).unwrap();

-                    body.extend_from_slice(content.as_bytes());
-                }
-            }
+                    let documents = index.documents(&rtxn, documents_ids).unwrap();
+                    for (_id, record) in documents {
+                        let record = if disable_highlighting {
+                            record
+                        } else {
+                            highlight_record(&record, &found_words)
+                        };
+                        wtr.write_record(&record).unwrap();
+                    }
+
+                    wtr.into_inner().unwrap()
+                },
+                None => Vec::new(),
+            };

             Response::builder()
                 .header("Content-Type", "text/csv")

							
								
								
									
src/heed_codec/csv_string_record_codec.rs (new file, 26 lines)

@@ -0,0 +1,26 @@
+use std::borrow::Cow;
+use csv::{StringRecord, Writer, ReaderBuilder};
+
+pub struct CsvStringRecordCodec;
+
+impl heed::BytesDecode<'_> for CsvStringRecordCodec {
+    type DItem = StringRecord;
+
+    fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
+        let mut reader = ReaderBuilder::new()
+            .has_headers(false)
+            .buffer_capacity(bytes.len()) // we will just read this record
+            .from_reader(bytes);
+        reader.records().next()?.ok() // it return an Option of Result
+    }
+}
+
+impl heed::BytesEncode<'_> for CsvStringRecordCodec {
+    type EItem = StringRecord;
+
+    fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
+        let mut writer = Writer::from_writer(Vec::new());
+        writer.write_record(item).ok()?;
+        writer.into_inner().ok().map(Cow::Owned)
+    }
+}
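For context, a minimal round-trip sketch of the codec defined above, assuming the heed BytesEncode/BytesDecode traits and the milli::heed_codec::CsvStringRecordCodec path that the rest of this diff imports; the roundtrip helper and its sample field values are illustrative, not part of the commit.

use csv::StringRecord;
use heed::{BytesDecode, BytesEncode};
use milli::heed_codec::CsvStringRecordCodec;

// Hypothetical helper: encode a record into the bytes that get stored in LMDB,
// then decode those bytes back and check that the fields survive the trip.
fn roundtrip() -> Option<()> {
    let record = StringRecord::from(vec!["1", "kevin", "paris"]);

    // bytes_encode serializes the record as a single CSV line.
    let bytes = CsvStringRecordCodec::bytes_encode(&record)?;

    // bytes_decode parses that line back into a StringRecord (headers disabled).
    let decoded = CsvStringRecordCodec::bytes_decode(&bytes)?;
    assert!(record.iter().eq(decoded.iter()));

    Some(())
}
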
@@ -1,5 +1,7 @@
+mod csv_string_record_codec;
 mod roaring_bitmap_codec;
 mod str_beu32_codec;

+pub use self::csv_string_record_codec::CsvStringRecordCodec;
 pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
 pub use self::str_beu32_codec::StrBEU32Codec;

							
								
								
									
src/lib.rs (41 lines changed)

@@ -9,13 +9,14 @@ use std::collections::HashMap;
 use std::hash::BuildHasherDefault;

 use anyhow::Context;
+use csv::StringRecord;
 use fxhash::{FxHasher32, FxHasher64};
 use heed::types::*;
 use heed::{PolyDatabase, Database};

 pub use self::search::{Search, SearchResult};
 pub use self::criterion::{Criterion, default_criteria};
-use self::heed_codec::{RoaringBitmapCodec, StrBEU32Codec};
+use self::heed_codec::{RoaringBitmapCodec, StrBEU32Codec, CsvStringRecordCodec};

 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
 pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
@@ -59,21 +60,17 @@ impl Index {
         })
     }

-    pub fn put_headers(&self, wtxn: &mut heed::RwTxn, headers: &[u8]) -> anyhow::Result<()> {
-        Ok(self.main.put::<_, Str, ByteSlice>(wtxn, HEADERS_KEY, headers)?)
+    pub fn put_headers(&self, wtxn: &mut heed::RwTxn, headers: &StringRecord) -> heed::Result<()> {
+        self.main.put::<_, Str, CsvStringRecordCodec>(wtxn, HEADERS_KEY, headers)
     }

-    pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result<Option<&'t [u8]>> {
-        self.main.get::<_, Str, ByteSlice>(rtxn, HEADERS_KEY)
+    pub fn headers(&self, rtxn: &heed::RoTxn) -> heed::Result<Option<StringRecord>> {
+        self.main.get::<_, Str, CsvStringRecordCodec>(rtxn, HEADERS_KEY)
     }

-    pub fn number_of_attributes<'t>(&self, rtxn: &'t heed::RoTxn) -> anyhow::Result<Option<usize>> {
+    pub fn number_of_attributes(&self, rtxn: &heed::RoTxn) -> anyhow::Result<Option<usize>> {
         match self.headers(rtxn)? {
-            Some(headers) => {
-                let mut rdr = csv::Reader::from_reader(headers);
-                let headers = rdr.headers()?;
-                Ok(Some(headers.len()))
-            }
+            Some(headers) => Ok(Some(headers.len())),
             None => Ok(None),
         }
     }
@@ -94,13 +91,25 @@ impl Index {
         &self,
         rtxn: &'t heed::RoTxn,
         iter: impl IntoIterator<Item=DocumentId>,
-    ) -> anyhow::Result<Vec<(DocumentId, Vec<u8>)>>
+    ) -> anyhow::Result<Vec<(DocumentId, StringRecord)>>
     {
-        iter.into_iter().map(|id| {
-            let content = self.documents.get(rtxn, &BEU32::new(id))?
+        let ids: Vec<_> = iter.into_iter().collect();
+        let mut content = Vec::new();
+
+        for id in ids.iter().cloned() {
+            let document_content = self.documents.get(rtxn, &BEU32::new(id))?
                 .with_context(|| format!("Could not find document {}", id))?;
-            Ok((id, content.to_vec()))
-        }).collect()
+            content.extend_from_slice(document_content);
+        }
+
+        let mut rdr = csv::ReaderBuilder::new().has_headers(false).from_reader(&content[..]);
+
+        let mut documents = Vec::with_capacity(ids.len());
+        for (id, result) in ids.into_iter().zip(rdr.records()) {
+            documents.push((id, result?));
+        }
+
+        Ok(documents)
     }

     /// Returns the number of documents indexed in the database.

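Taken together, the read path now hands callers csv::StringRecord values, which can be serialized with a csv::Writer the same way the search binary above does. A minimal sketch, assuming the milli::Index API as changed in this diff; the dump_documents helper and its signature are illustrative, not part of the commit.

use milli::{DocumentId, Index};

// Hypothetical helper mirroring the search binary above: write the stored headers
// and the requested documents as CSV to any io::Write sink, now that both come
// back from the index as csv::StringRecord values.
fn dump_documents<W: std::io::Write>(
    index: &Index,
    rtxn: &heed::RoTxn,
    ids: Vec<DocumentId>,
    sink: W,
) -> anyhow::Result<()> {
    let mut wtr = csv::Writer::from_writer(sink);
    if let Some(headers) = index.headers(rtxn)? {
        wtr.write_record(&headers)?;
    }
    for (_id, record) in index.documents(rtxn, ids)? {
        wtr.write_record(&record)?;
    }
    wtr.flush()?;
    Ok(())
}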