Rewrite cheapest path algorithm and empty path cache

It is now much simpler and has much better performance.
This commit is contained in:
Loïc Lecrenier
2023-03-02 21:27:42 +01:00
parent caa1e1b923
commit c27ea2677f
14 changed files with 782 additions and 530 deletions

View File

@ -1,3 +1,5 @@
use std::time::Instant;
use heed::RoTxn;
use roaring::RoaringBitmap;
@ -9,7 +11,7 @@ use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
use crate::new::ranking_rule_graph::proximity::ProximityGraph;
use crate::new::ranking_rule_graph::typo::TypoGraph;
use crate::new::words::Words;
use crate::search::new::sort::Sort;
// use crate::search::new::sort::Sort;
use crate::{Filter, Index, Result, TermsMatchingStrategy};
pub trait RankingRuleOutputIter<'transaction, Query> {
@ -123,13 +125,14 @@ pub fn execute_search<'transaction>(
length: usize,
logger: &mut dyn SearchLogger<QueryGraph>,
) -> Result<Vec<u32>> {
logger.initial_query(query_graph, Instant::now());
let words = &mut Words::new(TermsMatchingStrategy::Last);
let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?;
// let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?;
let proximity = &mut GraphBasedRankingRule::<ProximityGraph>::new("proximity".to_owned());
let typo = &mut GraphBasedRankingRule::<TypoGraph>::new("typo".to_owned());
// TODO: ranking rules given as argument
let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> =
vec![words, typo, proximity, sort];
vec![words, typo, proximity /*sort*/];
logger.ranking_rules(&ranking_rules);
@ -144,7 +147,13 @@ pub fn execute_search<'transaction>(
}
let ranking_rules_len = ranking_rules.len();
logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, &universe);
logger.start_iteration_ranking_rule(
0,
ranking_rules[0],
query_graph,
&universe,
Instant::now(),
);
ranking_rules[0].start_iteration(index, txn, db_cache, logger, &universe, query_graph)?;
let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
@ -154,11 +163,12 @@ pub fn execute_search<'transaction>(
macro_rules! back {
() => {
// assert!(candidates[cur_ranking_rule_index].is_empty());
assert!(candidates[cur_ranking_rule_index].is_empty());
logger.end_iteration_ranking_rule(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index],
&candidates[cur_ranking_rule_index],
Instant::now(),
);
candidates[cur_ranking_rule_index].clear();
ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger);
@ -187,6 +197,7 @@ pub fn execute_search<'transaction>(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index],
&candidates,
Instant::now(),
);
} else {
let all_candidates = candidates.iter().collect::<Vec<_>>();
@ -196,6 +207,7 @@ pub fn execute_search<'transaction>(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index],
&skipped_candidates.into_iter().collect(),
Instant::now(),
);
let candidates = candidates
.iter()
@ -219,24 +231,26 @@ pub fn execute_search<'transaction>(
// The universe for this bucket is zero or one element, so we don't need to sort
// anything, just extend the results and go back to the parent ranking rule.
if candidates[cur_ranking_rule_index].len() <= 1 {
candidates[cur_ranking_rule_index].clear();
maybe_add_to_results!(&candidates[cur_ranking_rule_index]);
candidates[cur_ranking_rule_index].clear();
back!();
continue;
}
logger.next_bucket_ranking_rule(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index],
&candidates[cur_ranking_rule_index],
);
let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else {
// TODO: add remaining candidates automatically here?
back!();
continue;
};
logger.next_bucket_ranking_rule(
cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index],
&candidates[cur_ranking_rule_index],
&next_bucket.candidates,
Instant::now(),
);
assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates));
candidates[cur_ranking_rule_index] -= &next_bucket.candidates;
@ -255,6 +269,7 @@ pub fn execute_search<'transaction>(
ranking_rules[cur_ranking_rule_index],
&next_bucket.query,
&candidates[cur_ranking_rule_index],
Instant::now(),
);
ranking_rules[cur_ranking_rule_index].start_iteration(
index,
@ -271,17 +286,18 @@ pub fn execute_search<'transaction>(
#[cfg(test)]
mod tests {
use std::fs::File;
use std::io::{BufRead, BufReader, Cursor, Seek};
use std::time::Instant;
use heed::EnvOpenOptions;
use super::execute_search;
// use crate::allocator::ALLOC;
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use crate::index::tests::TempIndex;
use crate::new::db_cache::DatabaseCache;
use crate::new::logger::detailed::DetailedSearchLogger;
use big_s::S;
use heed::EnvOpenOptions;
use maplit::hashset;
use std::fs::File;
use std::io::{BufRead, BufReader, Cursor, Seek};
use std::time::Instant;
// use crate::new::logger::detailed::DetailedSearchLogger;
use crate::new::logger::{DefaultSearchLogger, SearchLogger};
use crate::new::make_query_graph;
use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
@ -323,16 +339,119 @@ mod tests {
let mut db_cache = DatabaseCache::default();
let query_graph =
make_query_graph(&index, &txn, &mut db_cache, "b b b b b b b b b b").unwrap();
println!("{}", query_graph.graphviz());
logger.initial_query(&query_graph);
make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government")
.unwrap();
logger.initial_query(&query_graph, Instant::now());
let results =
execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 20, &mut logger)
execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 50, &mut logger)
.unwrap();
println!("{results:?}")
}
/// Manual benchmark for the new search pipeline on the `data_wiki` index.
///
/// Opens the index at `data_wiki`, runs `execute_search` for a fixed query and
/// prints the elapsed time, the internal document ids and the external ids.
/// Nothing is asserted. The index must already hold documents with a primary
/// key, because every lookup is unwrapped.
#[test]
fn search_wiki_new() {
    // 100 GB memory map for the environment.
    let mut env_options = EnvOpenOptions::new();
    env_options.map_size(100 * 1024 * 1024 * 1024);
    let index = Index::new(env_options, "data_wiki").unwrap();
    let txn = index.read_txn().unwrap();

    let docids_count = index.documents_ids(&txn).unwrap().len();
    println!("nbr docids: {}", docids_count);

    // Resolve the primary key name to its field id so it can be read from each document.
    let primary_key_name = index.primary_key(&txn).unwrap().unwrap();
    let primary_key_fid = index.fields_ids_map(&txn).unwrap().id(primary_key_name).unwrap();

    // The timed section covers the cache creation, the query graph construction
    // and the search itself.
    let start = Instant::now();
    let mut db_cache = DatabaseCache::default();
    let query_graph = make_query_graph(
        &index,
        &txn,
        &mut db_cache,
        "which a the releases from poison by the government",
    )
    .unwrap();

    // The no-op logger is used here; a detailed logger
    // (`crate::new::logger::detailed::DetailedSearchLogger`) can be passed instead
    // to trace the search.
    let mut logger = DefaultSearchLogger;
    let results =
        execute_search(&index, &txn, &mut db_cache, &query_graph, None, 0, 20, &mut logger)
            .unwrap();
    let elapsed = start.elapsed();

    // Map the internal ids back to external ids (outside of the timed section).
    let documents = index.documents(&txn, results.iter().copied()).unwrap();
    let mut ids: Vec<String> = Vec::with_capacity(documents.len());
    for (_docid, obkv) in documents {
        let raw_id = obkv.get(primary_key_fid).unwrap();
        let json_id: serde_json::Value = serde_json::from_slice(raw_id).unwrap();
        ids.push(json_id.as_str().unwrap().to_owned());
    }

    println!("{}us: {results:?}", elapsed.as_micros());
    println!("external ids: {ids:?}");
}
/// Manual benchmark for the legacy `Search` implementation on the `data_wiki` index.
///
/// Prints the configured criteria, then the elapsed time, the internal document
/// ids and the external ids of the results. Nothing is asserted. The index must
/// already hold documents with a primary key, because every lookup is unwrapped.
#[test]
fn search_wiki_old() {
    // 100 GB memory map for the environment.
    let mut env_options = EnvOpenOptions::new();
    env_options.map_size(100 * 1024 * 1024 * 1024);
    let index = Index::new(env_options, "data_wiki").unwrap();
    let txn = index.read_txn().unwrap();

    let rr = index.criteria(&txn).unwrap();
    println!("{rr:?}");

    // Resolve the primary key name to its field id so it can be read from each document.
    let primary_key_name = index.primary_key(&txn).unwrap().unwrap();
    let primary_key_fid = index.fields_ids_map(&txn).unwrap().id(primary_key_name).unwrap();

    // The timed section covers building, configuring and executing the search.
    let start = Instant::now();
    let mut search = Search::new(&txn, &index);
    search.query("releases from poison by the government");
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
    let outcome = search.execute().unwrap();
    let elapsed = start.elapsed();

    // Map the internal ids back to external ids (outside of the timed section).
    let documents = index.documents(&txn, outcome.documents_ids.iter().copied()).unwrap();
    let mut ids: Vec<String> = Vec::with_capacity(documents.len());
    for (_docid, obkv) in documents {
        let raw_id = obkv.get(primary_key_fid).unwrap();
        let json_id: serde_json::Value = serde_json::from_slice(raw_id).unwrap();
        ids.push(json_id.as_str().unwrap().to_owned());
    }

    println!("{}us: {:?}", elapsed.as_micros(), outcome.documents_ids);
    println!("external ids: {ids:?}");
}
#[test]
fn search_movies_new() {
let mut options = EnvOpenOptions::new();
@ -343,7 +462,7 @@ mod tests {
let primary_key = index.primary_key(&txn).unwrap().unwrap();
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
// loop {
let start = Instant::now();
let mut db_cache = DatabaseCache::default();
@ -352,7 +471,7 @@ mod tests {
make_query_graph(&index, &txn, &mut db_cache, "releases from poison by the government")
.unwrap();
let mut logger = DetailedSearchLogger::new("log");
let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
let results = execute_search(
&index,
@ -360,9 +479,10 @@ mod tests {
&mut db_cache,
&query_graph,
None,
5,
0,
20,
&mut logger, //&mut DefaultSearchLogger,
// &mut DefaultSearchLogger,
&mut logger,
)
.unwrap();
@ -384,6 +504,7 @@ mod tests {
println!("{}us: {results:?}", elapsed.as_micros());
println!("external ids: {ids:?}");
// }
}
#[test]
@ -392,19 +513,39 @@ mod tests {
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
let index = Index::new(options, "data_movies").unwrap();
let txn = index.read_txn().unwrap();
let rr = index.criteria(&txn).unwrap();
println!("{rr:?}");
let primary_key = index.primary_key(&txn).unwrap().unwrap();
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
let start = Instant::now();
let mut s = Search::new(&txn, &index);
s.query("b b b b b b b b b b");
s.query("releases from poison by the government");
s.terms_matching_strategy(TermsMatchingStrategy::Last);
s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
let docs = s.execute().unwrap();
let elapsed = start.elapsed();
let ids = index
.documents(&txn, docs.documents_ids.iter().copied())
.unwrap()
.into_iter()
.map(|x| {
let obkv = &x.1;
let id = obkv.get(primary_key).unwrap();
let id: serde_json::Value = serde_json::from_slice(id).unwrap();
id.as_str().unwrap().to_owned()
})
.collect::<Vec<_>>();
println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
println!("external ids: {ids:?}");
}
#[test]
@ -420,10 +561,16 @@ mod tests {
builder.set_min_word_len_one_typo(5);
builder.set_min_word_len_two_typos(100);
builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
builder.set_sortable_fields(hashset! { S("release_date") });
builder.set_criteria(vec![
Criterion::Words,
Criterion::Typo,
Criterion::Proximity,
Criterion::Asc("release_date".to_owned()),
]);
builder.execute(|_| (), || false).unwrap();
wtxn.commit().unwrap();
}
#[test]
@ -445,6 +592,7 @@ mod tests {
builder.set_searchable_fields(searchable_fields);
let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
builder.set_filterable_fields(filterable_fields);
builder.set_min_word_len_one_typo(5);
builder.set_min_word_len_two_typos(100);
builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
@ -467,6 +615,48 @@ mod tests {
index.prepare_for_closing().wait();
}
/// Builds the `data_wiki` index used by the wiki search benchmarks.
///
/// Applies the settings (searchable fields and ranking criteria), then indexes
/// the CSV dataset with autogenerated document ids. Both steps share one write
/// transaction, which is committed at the end.
#[test]
fn _index_wiki() {
    // 100 GB memory map for the environment.
    let mut env_options = EnvOpenOptions::new();
    env_options.map_size(100 * 1024 * 1024 * 1024);
    let index = Index::new(env_options, "data_wiki").unwrap();
    let mut wtxn = index.write_txn().unwrap();

    // Step 1: settings. The primary key, filterable fields and typo thresholds
    // are deliberately not set here.
    let settings_config = IndexerConfig::default();
    let mut settings = Settings::new(&mut wtxn, &index, &settings_config);
    let searchable_fields: Vec<String> =
        ["body", "title", "url"].iter().map(|field| field.to_string()).collect();
    settings.set_searchable_fields(searchable_fields);
    settings.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
    settings.execute(|_| (), || false).unwrap();

    // Step 2: documents, with document ids generated automatically.
    let indexer_config = IndexerConfig::default();
    let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
    let indexer = IndexDocuments::new(
        &mut wtxn,
        &index,
        &indexer_config,
        indexing_config,
        |_| (),
        || false,
    )
    .unwrap();
    let documents = documents_from(
        "/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv",
        "csv",
    );
    let (indexer, user_error) = indexer.add_documents(documents).unwrap();
    user_error.unwrap();
    indexer.execute().unwrap();

    wtxn.commit().unwrap();
    index.prepare_for_closing().wait();
}
fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
let reader = File::open(filename)