Parse every attribute and filter before tokenization

This commit is contained in:
ManyTheFish
2024-11-20 15:08:27 +01:00
parent ff9c92c409
commit 4d616f8794
3 changed files with 33 additions and 50 deletions

View File

@@ -75,12 +75,12 @@ pub trait SearchableExtractor: Sized + Sync {
let dictionary = indexing_context.index.dictionary(&rtxn)?;
let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
let builder = tokenizer_builder(
let mut builder = tokenizer_builder(
stop_words.as_ref(),
allowed_separators.as_deref(),
dictionary.as_deref(),
);
let tokenizer = builder.into_tokenizer();
let tokenizer = builder.build();
let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;

View File

@@ -40,6 +40,12 @@ impl<'a> DocumentTokenizer<'a> {
return Err(UserError::AttributeLimitReached.into());
};
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip)
!= Selection::Select
{
return Ok(());
}
let position = field_position
.entry(field_id)
.and_modify(|counter| *counter += MAX_DISTANCE)
@@ -87,30 +93,25 @@ impl<'a> DocumentTokenizer<'a> {
Ok(())
};
// if the current field is searchable or contains a searchable attribute
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip)
!= Selection::Skip
{
// parse json.
match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
Value::Object(object) => seek_leaf_values_in_object(
&object,
self.attribute_to_extract,
self.attribute_to_skip,
field_name,
Depth::OnBaseKey,
&mut tokenize_field,
)?,
Value::Array(array) => seek_leaf_values_in_array(
&array,
self.attribute_to_extract,
self.attribute_to_skip,
field_name,
Depth::OnBaseKey,
&mut tokenize_field,
)?,
value => tokenize_field(field_name, Depth::OnBaseKey, &value)?,
}
// parse json.
match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
Value::Object(object) => seek_leaf_values_in_object(
&object,
None,
&[],
field_name,
Depth::OnBaseKey,
&mut tokenize_field,
)?,
Value::Array(array) => seek_leaf_values_in_array(
&array,
None,
&[],
field_name,
Depth::OnBaseKey,
&mut tokenize_field,
)?,
value => tokenize_field(field_name, Depth::OnBaseKey, &value)?,
}
}