Finally bump grenad to v0.4.1

This commit is contained in:
Clément Renault
2022-02-16 15:28:48 +01:00
parent fb79c32430
commit f367cc2e75
18 changed files with 130 additions and 94 deletions

View File

@ -18,8 +18,8 @@ use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_A
/// Returns the generated internal documents ids and a grenad reader
/// with the list of extracted words from the given chunk of documents.
#[logging_timer::time]
pub fn extract_docid_word_positions<R: io::Read>(
mut obkv_documents: grenad::Reader<R>,
pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
searchable_fields: &Option<HashSet<FieldId>>,
stop_words: Option<&fst::Set<&[u8]>>,
@ -46,7 +46,8 @@ pub fn extract_docid_word_positions<R: io::Read>(
}
let analyzer = Analyzer::<Vec<u8>>::new(AnalyzerConfig::default());
while let Some((key, value)) = obkv_documents.next()? {
let mut cursor = obkv_documents.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let document_id = key
.try_into()
.map(u32::from_be_bytes)

View File

@ -14,8 +14,8 @@ use crate::Result;
/// Returns a grenad reader with the list of extracted facet numbers and
/// documents ids from the given chunk of docid facet number positions.
#[logging_timer::time]
pub fn extract_facet_number_docids<R: io::Read>(
mut docid_fid_facet_number: grenad::Reader<R>,
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
docid_fid_facet_number: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
let max_memory = indexer.max_memory_by_thread();
@ -28,7 +28,8 @@ pub fn extract_facet_number_docids<R: io::Read>(
max_memory,
);
while let Some((key_bytes, _)) = docid_fid_facet_number.next()? {
let mut cursor = docid_fid_facet_number.into_cursor()?;
while let Some((key_bytes, _)) = cursor.move_on_next()? {
let (field_id, document_id, number) =
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();

View File

@ -16,8 +16,8 @@ use crate::{FieldId, Result};
/// Returns a grenad reader with the list of extracted facet strings and
/// documents ids from the given chunk of docid facet string positions.
#[logging_timer::time]
pub fn extract_facet_string_docids<R: io::Read>(
mut docid_fid_facet_string: grenad::Reader<R>,
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
let max_memory = indexer.max_memory_by_thread();
@ -32,7 +32,8 @@ pub fn extract_facet_string_docids<R: io::Read>(
let mut key_buffer = Vec::new();
let mut value_buffer = Vec::new();
while let Some((key, original_value_bytes)) = docid_fid_facet_string.next()? {
let mut cursor = docid_fid_facet_string.into_cursor()?;
while let Some((key, original_value_bytes)) = cursor.move_on_next()? {
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes);
let (document_id_bytes, normalized_value_bytes) = try_split_array_at(bytes).unwrap();

View File

@ -16,8 +16,8 @@ use crate::{DocumentId, FieldId, Result};
/// Returns the generated grenad reader containing the docid, the fid and the original value as key
/// and the normalized value as value extracted from the given chunk of documents.
#[logging_timer::time]
pub fn extract_fid_docid_facet_values<R: io::Read>(
mut obkv_documents: grenad::Reader<R>,
pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
faceted_fields: &HashSet<FieldId>,
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
@ -40,7 +40,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read>(
);
let mut key_buffer = Vec::new();
while let Some((docid_bytes, value)) = obkv_documents.next()? {
let mut cursor = obkv_documents.into_cursor()?;
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
let obkv = obkv::KvReader::new(value);
for (field_id, field_bytes) in obkv.iter() {

View File

@ -18,8 +18,8 @@ use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
/// Returns a grenad reader with the list of extracted field id word counts
/// and documents ids from the given chunk of docid word positions.
#[logging_timer::time]
pub fn extract_fid_word_count_docids<R: io::Read>(
mut docid_word_positions: grenad::Reader<R>,
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
let max_memory = indexer.max_memory_by_thread();
@ -36,7 +36,8 @@ pub fn extract_fid_word_count_docids<R: io::Read>(
let mut document_fid_wordcount = HashMap::new();
let mut current_document_id = None;
while let Some((key, value)) = docid_word_positions.next()? {
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, _word_bytes) = try_split_array_at(key)
.ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);

View File

@ -10,17 +10,20 @@ use crate::{FieldId, InternalError, Result, UserError};
/// Extracts the geographical coordinates contained in each document under the `_geo` field.
///
/// Returns the generated grenad reader containing the docid as key associated with the (latitude, longitude).
pub fn extract_geo_points<R: io::Read>(
mut obkv_documents: grenad::Reader<R>,
pub fn extract_geo_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
primary_key_id: FieldId,
geo_field_id: FieldId,
) -> Result<grenad::Reader<File>> {
let mut writer = tempfile::tempfile().and_then(|file| {
create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file)
})?;
let mut writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
while let Some((docid_bytes, value)) = obkv_documents.next()? {
let mut cursor = obkv_documents.into_cursor()?;
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
let obkv = obkv::KvReader::new(value);
let point: Value = match obkv.get(geo_field_id) {
Some(point) => serde_json::from_slice(point).map_err(InternalError::SerdeJson)?,

View File

@ -17,8 +17,8 @@ use crate::Result;
/// Returns a grenad reader with the list of extracted words and
/// documents ids from the given chunk of docid word positions.
#[logging_timer::time]
pub fn extract_word_docids<R: io::Read>(
mut docid_word_positions: grenad::Reader<R>,
pub fn extract_word_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
let max_memory = indexer.max_memory_by_thread();
@ -32,7 +32,8 @@ pub fn extract_word_docids<R: io::Read>(
);
let mut value_buffer = Vec::new();
while let Some((key, _value)) = docid_word_positions.next()? {
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, _value)) = cursor.move_on_next()? {
let (document_id_bytes, word_bytes) = try_split_array_at(key)
.ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);

View File

@ -17,8 +17,8 @@ use crate::{DocumentId, Result};
/// Returns a grenad reader with the list of extracted word pairs proximities and
/// documents ids from the given chunk of docid word positions.
#[logging_timer::time]
pub fn extract_word_pair_proximity_docids<R: io::Read>(
mut docid_word_positions: grenad::Reader<R>,
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
let max_memory = indexer.max_memory_by_thread();
@ -35,7 +35,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read>(
let mut document_word_positions_heap = BinaryHeap::new();
let mut current_document_id = None;
while let Some((key, value)) = docid_word_positions.next()? {
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, word_bytes) = try_split_array_at(key)
.ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);

View File

@ -14,8 +14,8 @@ use crate::{DocumentId, Result};
/// Returns a grenad reader with the list of extracted words at positions and
/// documents ids from the given chunk of docid word positions.
#[logging_timer::time]
pub fn extract_word_position_docids<R: io::Read>(
mut docid_word_positions: grenad::Reader<R>,
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
let max_memory = indexer.max_memory_by_thread();
@ -29,7 +29,8 @@ pub fn extract_word_position_docids<R: io::Read>(
);
let mut key_buffer = Vec::new();
while let Some((key, value)) = docid_word_positions.next()? {
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, word_bytes) = try_split_array_at(key)
.ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = DocumentId::from_be_bytes(document_id_bytes);