move the benchmarks to another crate so we can download the datasets automatically without adding overhead to the build of milli

This commit is contained in:
tamo
2021-05-25 17:09:14 +02:00
committed by Tamo
parent 3c84075d2d
commit 06c414a753
10 changed files with 154 additions and 55 deletions

View File

@ -53,17 +53,8 @@ tinytemplate = "=1.1.0"
[dev-dependencies]
big_s = "1.0.2"
criterion = "0.3.4"
maplit = "1.0.2"
rand = "0.8.3"
[features]
default = []
[[bench]]
name = "songs"
harness = false
[[bench]]
name = "wiki"
harness = false

View File

@ -1,27 +0,0 @@
Benchmarks
==========
For our benchmarks we are using a small subset of the `songs.csv` dataset, generated with this command:
```
xsv sample --seed 42 songs.csv -o smol-songs.csv
```
You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz), and the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
You need to put this file at `milli/milli/benches/smol-songs.csv.gz`, which you can do by running the following command from the root of this git repository:
```
wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz
```
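The benchmarks expect an uncompressed csv (see below), so you will probably want to decompress the archive afterwards, for example with `gunzip`:
```
gunzip milli/benches/smol-songs.csv.gz
```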
- To run all the benchmarks we recommend using `cargo bench`; this should take around 4h
- You can also run the benchmarks on the `songs` dataset only with `cargo bench --bench songs`; it should take around 1h
- And on the `wiki` dataset with `cargo bench --bench wiki`; it should take around 3h
By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the `MILLI_BENCH_DATASETS_PATH` environment variable, like this:
```
MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs
```
Our benchmarking suite uses criterion, which allows a lot of configuration; see the documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html).
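For example, criterion's standard command-line options (passed after `--`) let you reduce the sample size or compare a run against a saved baseline; the flags and the `before` baseline name below are only an illustration, check criterion's documentation for the exact options supported by your version:
```
# run the songs benchmarks with fewer samples and save the results as a baseline
cargo bench --bench songs -- --sample-size 10 --save-baseline before
# after making changes, compare the new results against that baseline
cargo bench --bench songs -- --sample-size 10 --baseline before
```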

View File

@ -1,209 +0,0 @@
mod utils;
use criterion::{criterion_group, criterion_main};
use milli::update::Settings;
use utils::Conf;
fn base_conf(builder: &mut Settings) {
let displayed_fields = [
"id", "title", "album", "artist", "genre", "country", "released", "duration",
]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_displayed_fields(displayed_fields);
let searchable_fields = ["title", "album", "artist"]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_searchable_fields(searchable_fields);
let faceted_fields = [
("released-timestamp", "number"),
("duration-float", "number"),
("genre", "string"),
("country", "string"),
("artist", "string"),
]
.iter()
.map(|(a, b)| (a.to_string(), b.to_string()))
.collect();
builder.set_faceted_fields(faceted_fields);
}
const BASE_CONF: Conf = Conf {
dataset: "smol-songs.csv",
queries: &[
"john ", // 9097
"david ", // 4794
"charles ", // 1957
"david bowie ", // 1200
"michael jackson ", // 600
"thelonious monk ", // 303
"charles mingus ", // 142
"marcus miller ", // 60
"tamo ", // 13
"Notstandskomitee ", // 4
],
configure: base_conf,
..Conf::BASE
};
fn bench_songs(c: &mut criterion::Criterion) {
let default_criterion: Vec<String> = milli::default_criteria()
.iter()
.map(|criteria| criteria.to_string())
.collect();
let default_criterion = default_criterion.iter().map(|s| s.as_str());
let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)")
.chain(default_criterion.clone())
.collect();
let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)")
.chain(default_criterion.clone())
.collect();
let basic_with_quote: Vec<String> = BASE_CONF
.queries
.iter()
.map(|s| {
s.trim()
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
})
.collect();
let basic_with_quote: &[&str] = &basic_with_quote
.iter()
.map(|s| s.as_str())
.collect::<Vec<&str>>();
let confs = &[
/* first we bench each criterion alone */
utils::Conf {
group_name: "proximity",
queries: &[
"black saint sinner lady ",
"les dangeureuses 1960 ",
"The Disneyland Sing-Along Chorus ",
"Under Great Northern Lights ",
"7000 Danses Un Jour Dans Notre Vie ",
],
criterion: Some(&["proximity"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "typo",
queries: &[
"mongus ",
"thelonius monk ",
"Disnaylande ",
"the white striper ",
"indochie ",
"indochien ",
"klub des loopers ",
"fear of the duck ",
"michel depech ",
"stromal ",
"dire straights ",
"Arethla Franklin ",
],
criterion: Some(&["typo"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "words",
queries: &[
"the black saint and the sinner lady and the good doggo ", // four words to pop
"les liaisons dangeureuses 1793 ", // one word to pop
"The Disneyland Children's Sing-Alone song ", // two words to pop
"seven nation mummy ", // one word to pop
"7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop
"Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop
"whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13
],
criterion: Some(&["words"]),
..BASE_CONF
},
utils::Conf {
group_name: "asc",
criterion: Some(&["asc(released-timestamp)"]),
..BASE_CONF
},
utils::Conf {
group_name: "desc",
criterion: Some(&["desc(released-timestamp)"]),
..BASE_CONF
},
/* then we bench the asc and desc criteria on top of the default criteria */
utils::Conf {
group_name: "asc + default",
criterion: Some(&asc_default[..]),
..BASE_CONF
},
utils::Conf {
group_name: "desc + default",
criterion: Some(&desc_default[..]),
..BASE_CONF
},
/* we bench the filters with the default request */
utils::Conf {
group_name: "basic filter: <=",
facet_condition: Some("released-timestamp <= 946728000"), // year 2000
..BASE_CONF
},
utils::Conf {
group_name: "basic filter: TO",
facet_condition: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010
..BASE_CONF
},
utils::Conf {
group_name: "big filter",
facet_condition: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"),
..BASE_CONF
},
/* then we bench some global / normal search with all the default criteria in the default
* order */
utils::Conf {
group_name: "basic placeholder",
queries: &[""],
..BASE_CONF
},
utils::Conf {
group_name: "basic without quote",
queries: &BASE_CONF
.queries
.iter()
.map(|s| s.trim()) // we remove the space at the end of each request
.collect::<Vec<&str>>(),
..BASE_CONF
},
utils::Conf {
group_name: "basic with quote",
queries: basic_with_quote,
..BASE_CONF
},
utils::Conf {
group_name: "prefix search",
queries: &[
"s", // 500k+ results
"a", //
"b", //
"i", //
"x", // only 7k results
],
..BASE_CONF
},
];
utils::run_benches(c, confs);
}
criterion_group!(benches, bench_songs);
criterion_main!(benches);

View File

@ -1,125 +0,0 @@
use std::fs::{create_dir_all, remove_dir_all, File};
use criterion::BenchmarkId;
use heed::EnvOpenOptions;
use milli::{
update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat},
FacetCondition, Index,
};
/// The name of the environment variable used to select the path
/// of the directory containing the datasets
const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
/// The default path for the datasets if nothing is specified
/// By default we chose `milli/benches` because any cargo command run in `milli/milli/**` will be
/// executed with a pwd of `milli/milli`
const DEFAULT_DATASETS_PATH: &str = "milli/benches";
pub struct Conf<'a> {
/// where we are going to create our database.mmdb directory
/// each benchmark will first try to delete it and then recreate it
pub database_name: &'a str,
/// the dataset to be used, it must be an uncompressed csv
pub dataset: &'a str,
pub group_name: &'a str,
pub queries: &'a [&'a str],
/// here you can change which criteria are used and in which order.
/// - if you specify something, all the base configuration will be thrown out
/// - if you don't specify anything (`None`), the default configuration will be kept
pub criterion: Option<&'a [&'a str]>,
/// the last chance to configure your database as you want
pub configure: fn(&mut Settings),
pub facet_condition: Option<&'a str>,
/// enable or disable the optional words on the query
pub optional_words: bool,
}
impl Conf<'_> {
fn nop(_builder: &mut Settings) {}
pub const BASE: Self = Conf {
database_name: "benches.mmdb",
dataset: "",
group_name: "",
queries: &[],
criterion: None,
configure: Self::nop,
facet_condition: None,
optional_words: true,
};
}
pub fn base_setup(conf: &Conf) -> Index {
match remove_dir_all(&conf.database_name) {
Ok(_) => (),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
Err(e) => panic!("{}", e),
}
create_dir_all(&conf.database_name).unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
options.max_readers(10);
let index = Index::new(options, conf.database_name).unwrap();
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.settings(&mut wtxn, &index);
if let Some(criterion) = conf.criterion {
builder.reset_faceted_fields();
builder.reset_criteria();
builder.reset_stop_words();
let criterion = criterion.iter().map(|s| s.to_string()).collect();
builder.set_criteria(criterion);
}
(conf.configure)(&mut builder);
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.index_documents(&mut wtxn, &index);
builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
// when called from cargo, the current directory is supposed to be milli/milli
let base_dataset_path = std::env::vars()
.find(|var| var.0 == BASE_DATASETS_PATH_KEY)
.map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value);
let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset);
let reader = File::open(&dataset_path)
.expect(&format!("could not find the dataset in: {}", &dataset_path));
builder.execute(reader, |_, _| ()).unwrap();
wtxn.commit().unwrap();
index
}
pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
for conf in confs {
let index = base_setup(conf);
let mut group = c.benchmark_group(&format!("{}: {}", conf.dataset, conf.group_name));
for &query in conf.queries {
group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| {
b.iter(|| {
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
search.query(query).optional_words(conf.optional_words);
if let Some(facet_condition) = conf.facet_condition {
let facet_condition =
FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap();
search.facet_condition(facet_condition);
}
let _ids = search.execute().unwrap();
});
});
}
group.finish();
}
}

View File

@ -1,132 +0,0 @@
mod utils;
use criterion::{criterion_group, criterion_main};
use milli::update::Settings;
use utils::Conf;
fn base_conf(builder: &mut Settings) {
let displayed_fields = ["title", "body", "url"]
.iter()
.map(|s| s.to_string())
.collect();
builder.set_displayed_fields(displayed_fields);
let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
builder.set_searchable_fields(searchable_fields);
}
const BASE_CONF: Conf = Conf {
dataset: "smol-wiki-articles.csv",
queries: &[
"mingus ", // 46 candidates
"miles davis ", // 159
"rock and roll ", // 1007
"machine ", // 3448
"spain ", // 7002
"japan ", // 10.593
"france ", // 17.616
"film ", // 24.959
],
configure: base_conf,
..Conf::BASE
};
fn bench_songs(c: &mut criterion::Criterion) {
let basic_with_quote: Vec<String> = BASE_CONF
.queries
.iter()
.map(|s| {
s.trim()
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
})
.collect();
let basic_with_quote: &[&str] = &basic_with_quote
.iter()
.map(|s| s.as_str())
.collect::<Vec<&str>>();
let confs = &[
/* first we bench each criterion alone */
utils::Conf {
group_name: "proximity",
queries: &[
"herald sings ",
"april paris ",
"tea two ",
"diesel engine ",
],
criterion: Some(&["proximity"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "typo",
queries: &[
"migrosoft ",
"linax ",
"Disnaylande ",
"phytogropher ",
"nympalidea ",
"aritmetric ",
"the fronce ",
"sisan ",
],
criterion: Some(&["typo"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "words",
queries: &[
"the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results
"Kameya Tokujirō mingus monk ", // two words to pop, 55
"Ulrich Hensel meilisearch milli ", // two words to pop, 306
"Idaho Bellevue pizza ", // one word to pop, 800
"Abraham machin ", // one word to pop, 1141
],
criterion: Some(&["words"]),
..BASE_CONF
},
/* then we bench some global / normal search with all the default criteria in the default
* order */
utils::Conf {
group_name: "basic placeholder",
queries: &[""],
..BASE_CONF
},
utils::Conf {
group_name: "basic without quote",
queries: &BASE_CONF
.queries
.iter()
.map(|s| s.trim()) // we remove the space at the end of each request
.collect::<Vec<&str>>(),
..BASE_CONF
},
utils::Conf {
group_name: "basic with quote",
queries: basic_with_quote,
..BASE_CONF
},
utils::Conf {
group_name: "prefix search",
queries: &[
"t", // 453k results
"c", // 405k
"g", // 318k
"j", // 227k
"q", // 71k
"x", // 17k
],
..BASE_CONF
},
];
utils::run_benches(c, confs);
}
criterion_group!(benches, bench_songs);
criterion_main!(benches);