Make the search and the indexing work with the index's allowed separators and dictionary

This commit is contained in:
ManyTheFish
2023-07-24 18:35:20 +02:00
parent d8d12d5979
commit 9c485f8563
7 changed files with 197 additions and 4 deletions

View File

@ -1094,10 +1094,7 @@ impl Index {
/* separators easing method */
pub(crate) fn allowed_separators<'t>(
&self,
rtxn: &'t RoTxn,
) -> Result<Option<BTreeSet<String>>> {
pub fn allowed_separators<'t>(&self, rtxn: &'t RoTxn) -> Result<Option<BTreeSet<String>>> {
let default_separators =
charabia::separators::DEFAULT_SEPARATORS.iter().map(|s| s.to_string());
let mut separators: Option<BTreeSet<_>> = None;

View File

@ -479,6 +479,20 @@ pub fn execute_search(
tokbuilder.stop_words(stop_words);
}
let separators = ctx.index.allowed_separators(ctx.txn)?;
let separators: Option<Vec<_>> =
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
if let Some(ref separators) = separators {
tokbuilder.separators(separators);
}
let dictionary = ctx.index.dictionary(ctx.txn)?;
let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
if let Some(ref dictionary) = dictionary {
tokbuilder.words_dict(dictionary);
}
let script_lang_map = ctx.index.script_language(ctx.txn)?;
if !script_lang_map.is_empty() {
tokbuilder.allow_list(&script_lang_map);

View File

@ -28,6 +28,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
indexer: GrenadParameters,
searchable_fields: &Option<HashSet<FieldId>>,
stop_words: Option<&fst::Set<&[u8]>>,
allowed_separators: Option<&Vec<&str>>,
dictionary: Option<&Vec<&str>>,
max_positions_per_attributes: Option<u32>,
) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
puffin::profile_function!();
@ -52,6 +54,14 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
if let Some(stop_words) = stop_words {
tokenizer_builder.stop_words(stop_words);
}
if let Some(dictionary) = dictionary {
// `dictionary` already holds `&str` entries, so no `String::as_str` conversion is needed before passing it as a slice.
tokenizer_builder.words_dict(dictionary.as_slice());
}
if let Some(separators) = allowed_separators {
// `allowed_separators` already holds `&str` entries, so no `String::as_str` conversion is needed before passing it as a slice.
tokenizer_builder.separators(separators.as_slice());
}
let tokenizer = tokenizer_builder.build();
let mut cursor = obkv_documents.into_cursor()?;

View File

@ -49,6 +49,8 @@ pub(crate) fn data_from_obkv_documents(
geo_fields_ids: Option<(FieldId, FieldId)>,
vectors_field_id: Option<FieldId>,
stop_words: Option<fst::Set<&[u8]>>,
allowed_separators: Option<Vec<&str>>,
dictionary: Option<Vec<&str>>,
max_positions_per_attributes: Option<u32>,
exact_attributes: HashSet<FieldId>,
) -> Result<()> {
@ -76,6 +78,8 @@ pub(crate) fn data_from_obkv_documents(
geo_fields_ids,
vectors_field_id,
&stop_words,
&allowed_separators,
&dictionary,
max_positions_per_attributes,
)
})
@ -289,6 +293,8 @@ fn send_and_extract_flattened_documents_data(
geo_fields_ids: Option<(FieldId, FieldId)>,
vectors_field_id: Option<FieldId>,
stop_words: &Option<fst::Set<&[u8]>>,
allowed_separators: &Option<Vec<&str>>,
dictionary: &Option<Vec<&str>>,
max_positions_per_attributes: Option<u32>,
) -> Result<(
grenad::Reader<CursorClonableMmap>,
@ -344,6 +350,8 @@ fn send_and_extract_flattened_documents_data(
indexer,
searchable_fields,
stop_words.as_ref(),
allowed_separators.as_ref(),
dictionary.as_ref(),
max_positions_per_attributes,
)?;

View File

@ -316,6 +316,12 @@ where
let vectors_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vectors");
let stop_words = self.index.stop_words(self.wtxn)?;
let separators = self.index.allowed_separators(self.wtxn)?;
let separators: Option<Vec<_>> =
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
let dictionary = self.index.dictionary(self.wtxn)?;
let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
let pool_params = GrenadParameters {
@ -353,6 +359,8 @@ where
geo_fields_ids,
vectors_field_id,
stop_words,
separators,
dictionary,
max_positions_per_attributes,
exact_attributes,
)