move the benchmarks to another crate so we can download the datasets automatically without adding overhead to the build of milli

This commit is contained in:
tamo
2021-05-25 17:09:14 +02:00
committed by Tamo
parent 3c84075d2d
commit 06c414a753
10 changed files with 154 additions and 55 deletions

View File

@ -53,17 +53,8 @@ tinytemplate = "=1.1.0"
[dev-dependencies]
big_s = "1.0.2"
criterion = "0.3.4"
maplit = "1.0.2"
rand = "0.8.3"
[features]
default = []
[[bench]]
name = "songs"
harness = false
[[bench]]
name = "wiki"
harness = false

View File

@ -1,27 +0,0 @@
Benchmarks
==========
For our benchmarks we are using a small subset of the `songs.csv` dataset, generated with this command:
```
xsv sample --seed 42 songs.csv -o smol-songs.csv
```
You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz), and the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
You need to put this file at `milli/milli/benches/smol-songs.csv.gz`, which you can do by running the following command from the root of this git repository:
```
wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz
```
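The benchmarks expect an uncompressed csv (see below), so you will probably want to decompress the archive afterwards, for example with `gunzip`:
```
gunzip milli/benches/smol-songs.csv.gz
```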
- To run all the benchmarks we recommend using `cargo bench`; this should take around 4h
- You can also run the benchmarks on the `songs` dataset only with `cargo bench --bench songs`; it should take around 1h
- And on the `wiki` dataset with `cargo bench --bench wiki`; it should take around 3h
By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the `MILLI_BENCH_DATASETS_PATH` environment variable, like this:
```
MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs
```
Our benchmarking suite uses criterion, which allows a lot of configuration; see the documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html).
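For example, criterion's standard command-line options (passed after `--`) let you reduce the sample size or compare a run against a saved baseline; the flags and the `before` baseline name below are only an illustration, check criterion's documentation for the exact options supported by your version:
```
# run the songs benchmarks with fewer samples and save the results as a baseline
cargo bench --bench songs -- --sample-size 10 --save-baseline before
# after making changes, compare the new results against that baseline
cargo bench --bench songs -- --sample-size 10 --baseline before
```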

View File

@ -1,209 +0,0 @@
mod utils;
use criterion::{criterion_group, criterion_main};
use milli::update::Settings;
use utils::Conf;
fn base_conf(builder: &mut Settings) {
let displayed_fields = [
"id", "title", "album", "artist", "genre", "country", "released", "duration",
]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_displayed_fields(displayed_fields);
let searchable_fields = ["title", "album", "artist"]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_searchable_fields(searchable_fields);
let faceted_fields = [
("released-timestamp", "number"),
("duration-float", "number"),
("genre", "string"),
("country", "string"),
("artist", "string"),
]
.iter()
.map(|(a, b)| (a.to_string(), b.to_string()))
.collect();
builder.set_faceted_fields(faceted_fields);
}
const BASE_CONF: Conf = Conf {
dataset: "smol-songs.csv",
queries: &[
"john ", // 9097
"david ", // 4794
"charles ", // 1957
"david bowie ", // 1200
"michael jackson ", // 600
"thelonious monk ", // 303
"charles mingus ", // 142
"marcus miller ", // 60
"tamo ", // 13
"Notstandskomitee ", // 4
],
configure: base_conf,
..Conf::BASE
};
fn bench_songs(c: &mut criterion::Criterion) {
let default_criterion: Vec<String> = milli::default_criteria()
.iter()
.map(|criteria| criteria.to_string())
.collect();
let default_criterion = default_criterion.iter().map(|s| s.as_str());
let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)")
.chain(default_criterion.clone())
.collect();
let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)")
.chain(default_criterion.clone())
.collect();
let basic_with_quote: Vec<String> = BASE_CONF
.queries
.iter()
.map(|s| {
s.trim()
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
})
.collect();
let basic_with_quote: &[&str] = &basic_with_quote
.iter()
.map(|s| s.as_str())
.collect::<Vec<&str>>();
let confs = &[
/* first we bench each criterion alone */
utils::Conf {
group_name: "proximity",
queries: &[
"black saint sinner lady ",
"les dangeureuses 1960 ",
"The Disneyland Sing-Along Chorus ",
"Under Great Northern Lights ",
"7000 Danses Un Jour Dans Notre Vie ",
],
criterion: Some(&["proximity"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "typo",
queries: &[
"mongus ",
"thelonius monk ",
"Disnaylande ",
"the white striper ",
"indochie ",
"indochien ",
"klub des loopers ",
"fear of the duck ",
"michel depech ",
"stromal ",
"dire straights ",
"Arethla Franklin ",
],
criterion: Some(&["typo"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "words",
queries: &[
"the black saint and the sinner lady and the good doggo ", // four words to pop
"les liaisons dangeureuses 1793 ", // one word to pop
"The Disneyland Children's Sing-Alone song ", // two words to pop
"seven nation mummy ", // one word to pop
"7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop
"Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop
"whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13
],
criterion: Some(&["words"]),
..BASE_CONF
},
utils::Conf {
group_name: "asc",
criterion: Some(&["asc(released-timestamp)"]),
..BASE_CONF
},
utils::Conf {
group_name: "desc",
criterion: Some(&["desc(released-timestamp)"]),
..BASE_CONF
},
/* then we bench the asc and desc criteria on top of the default criteria */
utils::Conf {
group_name: "asc + default",
criterion: Some(&asc_default[..]),
..BASE_CONF
},
utils::Conf {
group_name: "desc + default",
criterion: Some(&desc_default[..]),
..BASE_CONF
},
/* we bench the filters with the default request */
utils::Conf {
group_name: "basic filter: <=",
facet_condition: Some("released-timestamp <= 946728000"), // year 2000
..BASE_CONF
},
utils::Conf {
group_name: "basic filter: TO",
facet_condition: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010
..BASE_CONF
},
utils::Conf {
group_name: "big filter",
facet_condition: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"),
..BASE_CONF
},
/* then we bench some global / normal search with all the default criteria in the default
* order */
utils::Conf {
group_name: "basic placeholder",
queries: &[""],
..BASE_CONF
},
utils::Conf {
group_name: "basic without quote",
queries: &BASE_CONF
.queries
.iter()
.map(|s| s.trim()) // we remove the space at the end of each request
.collect::<Vec<&str>>(),
..BASE_CONF
},
utils::Conf {
group_name: "basic with quote",
queries: basic_with_quote,
..BASE_CONF
},
utils::Conf {
group_name: "prefix search",
queries: &[
"s", // 500k+ results
"a", //
"b", //
"i", //
"x", // only 7k results
],
..BASE_CONF
},
];
utils::run_benches(c, confs);
}
criterion_group!(benches, bench_songs);
criterion_main!(benches);

View File

@ -1,125 +0,0 @@
use std::fs::{create_dir_all, remove_dir_all, File};
use criterion::BenchmarkId;
use heed::EnvOpenOptions;
use milli::{
update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat},
FacetCondition, Index,
};
/// The name of the environment variable used to select the path
/// of the directory containing the datasets
const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
/// The default path for the datasets if nothing is specified
/// By default we chose `milli/benches` because any cargo command run in `milli/milli/**` will be
/// executed with a pwd of `milli/milli`
const DEFAULT_DATASETS_PATH: &str = "milli/benches";
pub struct Conf<'a> {
/// where we are going to create our database.mmdb directory
/// each benchmark will first try to delete it and then recreate it
pub database_name: &'a str,
/// the dataset to be used, it must be an uncompressed csv
pub dataset: &'a str,
pub group_name: &'a str,
pub queries: &'a [&'a str],
/// here you can change which criteria are used and in which order.
/// - if you specify something, all the base configuration will be thrown out
/// - if you don't specify anything (`None`), the default configuration will be kept
pub criterion: Option<&'a [&'a str]>,
/// the last chance to configure your database as you want
pub configure: fn(&mut Settings),
pub facet_condition: Option<&'a str>,
/// enable or disable the optional words on the query
pub optional_words: bool,
}
impl Conf<'_> {
fn nop(_builder: &mut Settings) {}
pub const BASE: Self = Conf {
database_name: "benches.mmdb",
dataset: "",
group_name: "",
queries: &[],
criterion: None,
configure: Self::nop,
facet_condition: None,
optional_words: true,
};
}
pub fn base_setup(conf: &Conf) -> Index {
match remove_dir_all(&conf.database_name) {
Ok(_) => (),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
Err(e) => panic!("{}", e),
}
create_dir_all(&conf.database_name).unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
options.max_readers(10);
let index = Index::new(options, conf.database_name).unwrap();
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.settings(&mut wtxn, &index);
if let Some(criterion) = conf.criterion {
builder.reset_faceted_fields();
builder.reset_criteria();
builder.reset_stop_words();
let criterion = criterion.iter().map(|s| s.to_string()).collect();
builder.set_criteria(criterion);
}
(conf.configure)(&mut builder);
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
// when called from cargo, the current directory is supposed to be milli/milli
let base_dataset_path = std::env::vars()
.find(|var| var.0 == BASE_DATASETS_PATH_KEY)
.map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value);
let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset);
let reader = File::open(&dataset_path)
.expect(&format!("could not find the dataset in: {}", &dataset_path));
builder.execute(reader, |_, _| ()).unwrap();
wtxn.commit().unwrap();
index
}
pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
for conf in confs {
let index = base_setup(conf);
let mut group = c.benchmark_group(&format!("{}: {}", conf.dataset, conf.group_name));
for &query in conf.queries {
group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| {
b.iter(|| {
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
search.query(query).optional_words(conf.optional_words);
if let Some(facet_condition) = conf.facet_condition {
let facet_condition =
FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap();
search.facet_condition(facet_condition);
}
let _ids = search.execute().unwrap();
});
});
}
group.finish();
}
}

View File

@ -1,132 +0,0 @@
mod utils;
use criterion::{criterion_group, criterion_main};
use milli::update::Settings;
use utils::Conf;
fn base_conf(builder: &mut Settings) {
let displayed_fields = ["title", "body", "url"]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_displayed_fields(displayed_fields);
let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
builder.set_searchable_fields(searchable_fields);
}
const BASE_CONF: Conf = Conf {
dataset: "smol-wiki-articles.csv",
queries: &[
"mingus ", // 46 candidates
"miles davis ", // 159
"rock and roll ", // 1007
"machine ", // 3448
"spain ", // 7002
"japan ", // 10.593
"france ", // 17.616
"film ", // 24.959
],
configure: base_conf,
..Conf::BASE
};
fn bench_songs(c: &mut criterion::Criterion) {
let basic_with_quote: Vec<String> = BASE_CONF
.queries
.iter()
.map(|s| {
s.trim()
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
})
.collect();
let basic_with_quote: &[&str] = &basic_with_quote
.iter()
.map(|s| s.as_str())
.collect::<Vec<&str>>();
let confs = &[
/* first we bench each criterion alone */
utils::Conf {
group_name: "proximity",
queries: &[
"herald sings ",
"april paris ",
"tea two ",
"diesel engine ",
],
criterion: Some(&["proximity"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "typo",
queries: &[
"migrosoft ",
"linax ",
"Disnaylande ",
"phytogropher ",
"nympalidea ",
"aritmetric ",
"the fronce ",
"sisan ",
],
criterion: Some(&["typo"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "words",
queries: &[
"the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results
"Kameya Tokujirō mingus monk ", // two words to pop, 55
"Ulrich Hensel meilisearch milli ", // two words to pop, 306
"Idaho Bellevue pizza ", // one word to pop, 800
"Abraham machin ", // one word to pop, 1141
],
criterion: Some(&["words"]),
..BASE_CONF
},
/* then we bench some global / normal search with all the default criteria in the default
* order */
utils::Conf {
group_name: "basic placeholder",
queries: &[""],
..BASE_CONF
},
utils::Conf {
group_name: "basic without quote",
queries: &BASE_CONF
.queries
.iter()
.map(|s| s.trim()) // we remove the space at the end of each request
.collect::<Vec<&str>>(),
..BASE_CONF
},
utils::Conf {
group_name: "basic with quote",
queries: basic_with_quote,
..BASE_CONF
},
utils::Conf {
group_name: "prefix search",
queries: &[
"t", // 453k results
"c", // 405k
"g", // 318k
"j", // 227k
"q", // 71k
"x", // 17k
],
..BASE_CONF
},
];
utils::run_benches(c, confs);
}
criterion_group!(benches, bench_songs);
criterion_main!(benches);