Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-07-31 10:50:03 +00:00)
Rewrite cheapest path algorithm and empty path cache
Both are now much simpler and perform much better.
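For context on the title: the ranking-rule graphs this branch works with (proximity, typo) attach a cost to each edge, and bucketing repeatedly needs the cheapest paths through such a graph. The sketch below only illustrates that general idea with a Dijkstra-style uniform-cost search over an adjacency list of non-negative edge costs; the Graph type, its fields, and cheapest_path are hypothetical names chosen for illustration and are not the implementation introduced by this commit. The hunks shown further down touch only the search entry point (execute_search) and its tests.

use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// A directed graph stored as an adjacency list: `edges[node]` holds
/// `(successor, cost)` pairs, with non-negative costs.
struct Graph {
    edges: Vec<Vec<(usize, u64)>>,
}

impl Graph {
    /// Returns the cheapest path from `start` to `end` as
    /// `(total_cost, nodes on the path)`, or `None` if `end` is unreachable.
    fn cheapest_path(&self, start: usize, end: usize) -> Option<(u64, Vec<usize>)> {
        let node_count = self.edges.len();
        let mut best_cost = vec![u64::MAX; node_count];
        let mut predecessor = vec![usize::MAX; node_count];
        let mut queue = BinaryHeap::new();

        best_cost[start] = 0;
        queue.push(Reverse((0u64, start)));

        while let Some(Reverse((cost, node))) = queue.pop() {
            if node == end {
                // Walk the predecessor chain backwards to rebuild the path.
                let mut path = vec![end];
                let mut current = end;
                while current != start {
                    current = predecessor[current];
                    path.push(current);
                }
                path.reverse();
                return Some((cost, path));
            }
            if cost > best_cost[node] {
                continue; // Stale heap entry: a cheaper route to `node` was already settled.
            }
            for &(next, edge_cost) in &self.edges[node] {
                let next_cost = cost + edge_cost;
                if next_cost < best_cost[next] {
                    best_cost[next] = next_cost;
                    predecessor[next] = node;
                    queue.push(Reverse((next_cost, next)));
                }
            }
        }
        None
    }
}

fn main() {
    // 0 -> 1 -> 3 costs 1 + 1 = 2, cheaper than the direct 0 -> 3 edge of cost 5.
    let graph = Graph {
        edges: vec![
            vec![(1, 1), (3, 5)], // node 0
            vec![(2, 4), (3, 1)], // node 1
            vec![(3, 1)],         // node 2
            vec![],               // node 3
        ],
    };
    assert_eq!(graph.cheapest_path(0, 3), Some((2, vec![0, 1, 3])));
}

The "empty path cache" mentioned in the title would, in a real implementation, additionally let the search skip paths already known to yield empty document sets; that part is not sketched here.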
@@ -1,3 +1,5 @@
+use std::time::Instant;
+
 use heed::RoTxn;
 use roaring::RoaringBitmap;

@@ -9,7 +11,7 @@ use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
 use crate::new::ranking_rule_graph::proximity::ProximityGraph;
 use crate::new::ranking_rule_graph::typo::TypoGraph;
 use crate::new::words::Words;
-use crate::search::new::sort::Sort;
+// use crate::search::new::sort::Sort;
 use crate::{Filter, Index, Result, TermsMatchingStrategy};

 pub trait RankingRuleOutputIter<'transaction, Query> {
@@ -123,13 +125,14 @@ pub fn execute_search<'transaction>(
 length: usize,
 logger: &mut dyn SearchLogger<QueryGraph>,
 ) -> Result<Vec<u32>> {
+logger.initial_query(query_graph, Instant::now());
 let words = &mut Words::new(TermsMatchingStrategy::Last);
-let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?;
+// let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?;
 let proximity = &mut GraphBasedRankingRule::<ProximityGraph>::new("proximity".to_owned());
 let typo = &mut GraphBasedRankingRule::<TypoGraph>::new("typo".to_owned());
 // TODO: ranking rules given as argument
 let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> =
-vec![words, typo, proximity, sort];
+vec![words, typo, proximity /*sort*/];

 logger.ranking_rules(&ranking_rules);

@@ -144,7 +147,13 @@ pub fn execute_search<'transaction>(
 }

 let ranking_rules_len = ranking_rules.len();
-logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, &universe);
+logger.start_iteration_ranking_rule(
+0,
+ranking_rules[0],
+query_graph,
+&universe,
+Instant::now(),
+);
 ranking_rules[0].start_iteration(index, txn, db_cache, logger, &universe, query_graph)?;

 let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
@@ -154,11 +163,12 @@ pub fn execute_search<'transaction>(

 macro_rules! back {
 () => {
-// assert!(candidates[cur_ranking_rule_index].is_empty());
+assert!(candidates[cur_ranking_rule_index].is_empty());
 logger.end_iteration_ranking_rule(
 cur_ranking_rule_index,
 ranking_rules[cur_ranking_rule_index],
 &candidates[cur_ranking_rule_index],
+Instant::now(),
 );
 candidates[cur_ranking_rule_index].clear();
 ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger);
@@ -187,6 +197,7 @@ pub fn execute_search<'transaction>(
 cur_ranking_rule_index,
 ranking_rules[cur_ranking_rule_index],
 &candidates,
+Instant::now(),
 );
 } else {
 let all_candidates = candidates.iter().collect::<Vec<_>>();
@@ -196,6 +207,7 @@ pub fn execute_search<'transaction>(
 cur_ranking_rule_index,
 ranking_rules[cur_ranking_rule_index],
 &skipped_candidates.into_iter().collect(),
+Instant::now(),
 );
 let candidates = candidates
 .iter()
@@ -219,24 +231,26 @@ pub fn execute_search<'transaction>(
 // The universe for this bucket is zero or one element, so we don't need to sort
 // anything, just extend the results and go back to the parent ranking rule.
 if candidates[cur_ranking_rule_index].len() <= 1 {
-candidates[cur_ranking_rule_index].clear();
 maybe_add_to_results!(&candidates[cur_ranking_rule_index]);
+candidates[cur_ranking_rule_index].clear();
 back!();
 continue;
 }

-logger.next_bucket_ranking_rule(
-cur_ranking_rule_index,
-ranking_rules[cur_ranking_rule_index],
-&candidates[cur_ranking_rule_index],
-);
-
 let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else {
 // TODO: add remaining candidates automatically here?
 back!();
 continue;
 };

+logger.next_bucket_ranking_rule(
+cur_ranking_rule_index,
+ranking_rules[cur_ranking_rule_index],
+&candidates[cur_ranking_rule_index],
+&next_bucket.candidates,
+Instant::now(),
+);
+
 assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates));
 candidates[cur_ranking_rule_index] -= &next_bucket.candidates;

@@ -255,6 +269,7 @@ pub fn execute_search<'transaction>(
 ranking_rules[cur_ranking_rule_index],
 &next_bucket.query,
 &candidates[cur_ranking_rule_index],
+Instant::now(),
 );
 ranking_rules[cur_ranking_rule_index].start_iteration(
 index,
@@ -271,17 +286,18 @@ pub fn execute_search<'transaction>(

 #[cfg(test)]
 mod tests {
-use std::fs::File;
-use std::io::{BufRead, BufReader, Cursor, Seek};
-use std::time::Instant;
-
-use heed::EnvOpenOptions;
-
 use super::execute_search;
+// use crate::allocator::ALLOC;
 use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use crate::index::tests::TempIndex;
 use crate::new::db_cache::DatabaseCache;
-use crate::new::logger::detailed::DetailedSearchLogger;
+use big_s::S;
+use heed::EnvOpenOptions;
+use maplit::hashset;
+use std::fs::File;
+use std::io::{BufRead, BufReader, Cursor, Seek};
+use std::time::Instant;
+// use crate::new::logger::detailed::DetailedSearchLogger;
 use crate::new::logger::{DefaultSearchLogger, SearchLogger};
 use crate::new::make_query_graph;
 use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
@@ -323,16 +339,119 @@ mod tests {
 let mut db_cache = DatabaseCache::default();

 let query_graph =
-make_query_graph(&index, &txn, &mut db_cache, "b b b b b b b b b b").unwrap();
-println!("{}", query_graph.graphviz());
-logger.initial_query(&query_graph);
+make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government")
+.unwrap();
+logger.initial_query(&query_graph, Instant::now());

 let results =
-execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 20, &mut logger)
+execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 50, &mut logger)
 .unwrap();
 println!("{results:?}")
 }

+#[test]
+fn search_wiki_new() {
+let mut options = EnvOpenOptions::new();
+options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+
+let index = Index::new(options, "data_wiki").unwrap();
+let txn = index.read_txn().unwrap();
+
+println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len());
+
+let primary_key = index.primary_key(&txn).unwrap().unwrap();
+let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
+// loop {
+let start = Instant::now();
+
+let mut db_cache = DatabaseCache::default();
+
+let query_graph = make_query_graph(
+&index,
+&txn,
+&mut db_cache,
+"which a the releases from poison by the government",
+)
+.unwrap();
+
+// let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
+
+let results = execute_search(
+&index,
+&txn,
+&mut db_cache,
+&query_graph,
+None,
+0,
+20,
+&mut DefaultSearchLogger,
+// &mut logger,
+)
+.unwrap();
+
+// logger.write_d2_description();
+
+let elapsed = start.elapsed();
+
+let ids = index
+.documents(&txn, results.iter().copied())
+.unwrap()
+.into_iter()
+.map(|x| {
+let obkv = &x.1;
+let id = obkv.get(primary_key).unwrap();
+let id: serde_json::Value = serde_json::from_slice(id).unwrap();
+id.as_str().unwrap().to_owned()
+})
+.collect::<Vec<_>>();
+
+println!("{}us: {results:?}", elapsed.as_micros());
+println!("external ids: {ids:?}");
+// println!("max_resident: {}", ALLOC.max_resident.load(std::sync::atomic::Ordering::SeqCst));
+// println!("allocated: {}", ALLOC.allocated.load(std::sync::atomic::Ordering::SeqCst));
+// }
+}
+
+#[test]
+fn search_wiki_old() {
+let mut options = EnvOpenOptions::new();
+options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+
+let index = Index::new(options, "data_wiki").unwrap();
+
+let txn = index.read_txn().unwrap();
+
+let rr = index.criteria(&txn).unwrap();
+println!("{rr:?}");
+
+let primary_key = index.primary_key(&txn).unwrap().unwrap();
+let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
+
+let start = Instant::now();
+
+let mut s = Search::new(&txn, &index);
+s.query("releases from poison by the government");
+s.terms_matching_strategy(TermsMatchingStrategy::Last);
+s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
+let docs = s.execute().unwrap();
+
+let elapsed = start.elapsed();
+
+let ids = index
+.documents(&txn, docs.documents_ids.iter().copied())
+.unwrap()
+.into_iter()
+.map(|x| {
+let obkv = &x.1;
+let id = obkv.get(primary_key).unwrap();
+let id: serde_json::Value = serde_json::from_slice(id).unwrap();
+id.as_str().unwrap().to_owned()
+})
+.collect::<Vec<_>>();
+
+println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
+println!("external ids: {ids:?}");
+}
 #[test]
 fn search_movies_new() {
 let mut options = EnvOpenOptions::new();
@@ -343,7 +462,7 @@ mod tests {

 let primary_key = index.primary_key(&txn).unwrap().unwrap();
 let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
-
+// loop {
 let start = Instant::now();

 let mut db_cache = DatabaseCache::default();
@@ -352,7 +471,7 @@ mod tests {
 make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government")
 .unwrap();

-let mut logger = DetailedSearchLogger::new("log");
+let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");

 let results = execute_search(
 &index,
@@ -360,9 +479,10 @@ mod tests {
 &mut db_cache,
 &query_graph,
 None,
-5,
+0,
 20,
-&mut logger, //&mut DefaultSearchLogger,
+// &mut DefaultSearchLogger,
+&mut logger,
 )
 .unwrap();

@@ -384,6 +504,7 @@ mod tests {

 println!("{}us: {results:?}", elapsed.as_micros());
 println!("external ids: {ids:?}");
+// }
 }

 #[test]
@@ -392,19 +513,39 @@ mod tests {
 options.map_size(100 * 1024 * 1024 * 1024); // 100 GB

 let index = Index::new(options, "data_movies").unwrap();

 let txn = index.read_txn().unwrap();
+
+let rr = index.criteria(&txn).unwrap();
+println!("{rr:?}");
+
+let primary_key = index.primary_key(&txn).unwrap().unwrap();
+let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
+
 let start = Instant::now();

 let mut s = Search::new(&txn, &index);
-s.query("b b b b b b b b b b");
+s.query("releases from poison by the government");
 s.terms_matching_strategy(TermsMatchingStrategy::Last);
 s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
 let docs = s.execute().unwrap();

 let elapsed = start.elapsed();

+let ids = index
+.documents(&txn, docs.documents_ids.iter().copied())
+.unwrap()
+.into_iter()
+.map(|x| {
+let obkv = &x.1;
+let id = obkv.get(primary_key).unwrap();
+let id: serde_json::Value = serde_json::from_slice(id).unwrap();
+id.as_str().unwrap().to_owned()
+})
+.collect::<Vec<_>>();
+
 println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
+println!("external ids: {ids:?}");
 }

 #[test]
@@ -420,10 +561,16 @@ mod tests {

 builder.set_min_word_len_one_typo(5);
 builder.set_min_word_len_two_typos(100);
-
-builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
+builder.set_sortable_fields(hashset! { S("release_date") });
+builder.set_criteria(vec![
+Criterion::Words,
+Criterion::Typo,
+Criterion::Proximity,
+Criterion::Asc("release_date".to_owned()),
+]);
+
 builder.execute(|_| (), || false).unwrap();
 wtxn.commit().unwrap();
 }

 #[test]
@@ -445,6 +592,7 @@ mod tests {
 builder.set_searchable_fields(searchable_fields);
 let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
 builder.set_filterable_fields(filterable_fields);
+
 builder.set_min_word_len_one_typo(5);
 builder.set_min_word_len_two_typos(100);
 builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
@@ -467,6 +615,48 @@ mod tests {

 index.prepare_for_closing().wait();
 }
+#[test]
+fn _index_wiki() {
+let mut options = EnvOpenOptions::new();
+options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+
+let index = Index::new(options, "data_wiki").unwrap();
+let mut wtxn = index.write_txn().unwrap();
+
+// let primary_key = "id";
+let searchable_fields = vec!["body", "title", "url"];
+// let filterable_fields = vec![];
+let config = IndexerConfig::default();
+let mut builder = Settings::new(&mut wtxn, &index, &config);
+// builder.set_primary_key(primary_key.to_owned());
+let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
+builder.set_searchable_fields(searchable_fields);
+// let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
+// builder.set_filterable_fields(filterable_fields);
+
+// builder.set_min_word_len_one_typo(5);
+// builder.set_min_word_len_two_typos(100);
+builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
+builder.execute(|_| (), || false).unwrap();
+
+let config = IndexerConfig::default();
+let indexing_config =
+IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
+let builder =
+IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
+.unwrap();
+
+let documents = documents_from(
+"/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv",
+"csv",
+);
+let (builder, user_error) = builder.add_documents(documents).unwrap();
+user_error.unwrap();
+builder.execute().unwrap();
+wtxn.commit().unwrap();
+
+index.prepare_for_closing().wait();
+}

 fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
 let reader = File::open(filename)