move the benchmarks to another crate so we can download the datasets automatically without adding overhead to the build of milli

2025-10-31 16:06:31 +00:00 · 2021-05-25 17:09:14 +02:00
parent 3c84075d2d
commit 06c414a753
10 changed files with 154 additions and 55 deletions
--- a/benchmarks/benches/songs.rs
+++ b/benchmarks/benches/songs.rs
@@ -0,0 +1,210 @@
+mod datasets_paths;
+mod utils;
+
+use criterion::{criterion_group, criterion_main};
+use milli::update::Settings;
+use utils::Conf;
+
+/// Settings hook shared by every songs benchmark: declares the displayed,
+/// searchable and faceted fields on the given settings builder.
+fn base_conf(builder: &mut Settings) {
+    let displayed = [
+        "id", "title", "album", "artist", "genre", "country", "released", "duration",
+    ];
+    builder.set_displayed_fields(displayed.iter().copied().map(String::from).collect());
+
+    let searchable = ["title", "album", "artist"];
+    builder.set_searchable_fields(searchable.iter().copied().map(String::from).collect());
+
+    // pairs of (field name, kind string) handed to `set_faceted_fields`
+    let faceted = [
+        ("released-timestamp", "number"),
+        ("duration-float", "number"),
+        ("genre", "string"),
+        ("country", "string"),
+        ("artist", "string"),
+    ];
+    builder.set_faceted_fields(
+        faceted
+            .iter()
+            .map(|&(field, kind)| (field.to_owned(), kind.to_owned()))
+            .collect(),
+    );
+}
+
+/// Base configuration of the songs benchmarks: the `SMOL_SONGS` dataset, a
+/// default list of queries and the `base_conf` settings hook. Every field not
+/// listed here is taken from `Conf::BASE`.
+///
+/// Each query ends with a space; the "basic without quote" group trims it.
+// NOTE(review): the trailing numbers are presumably result counts -- confirm.
+const BASE_CONF: Conf = Conf {
+    dataset: datasets_paths::SMOL_SONGS,
+    queries: &[
+        "john ",             // 9097
+        "david ",            // 4794
+        "charles ",          // 1957
+        "david bowie ",      // 1200
+        "michael jackson ",  // 600
+        "thelonious monk ",  // 303
+        "charles mingus ",   // 142
+        "marcus miller ",    // 60
+        "tamo ",             // 13
+        "Notstandskomitee ", // 4
+    ],
+    configure: base_conf,
+    ..Conf::BASE
+};
+
+/// Wraps every space-separated word of `query` in double quotes after trimming
+/// surrounding whitespace: `david bowie ` becomes `"david" "bowie"`.
+fn quote_each_word(query: &str) -> String {
+    query
+        .trim()
+        .split(' ')
+        .map(|word| format!(r#""{}""#, word))
+        .collect::<Vec<String>>()
+        .join(" ")
+}
+
+/// Declares every benchmark group of the songs dataset and hands them to
+/// `utils::run_benches`.
+///
+/// Each group is a `Conf` derived from `BASE_CONF` that overrides only the
+/// fields it cares about (queries, criterion, facet condition, ...).
+fn bench_songs(c: &mut criterion::Criterion) {
+    let default_criterion: Vec<String> = milli::default_criteria()
+        .iter()
+        .map(|criteria| criteria.to_string())
+        .collect();
+    let default_criterion = default_criterion.iter().map(|s| s.as_str());
+    let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)")
+        .chain(default_criterion.clone())
+        .collect();
+    // last use of the iterator: it is moved instead of being cloned once more
+    let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)")
+        .chain(default_criterion)
+        .collect();
+
+    // the base queries with every word put between double quotes
+    let basic_with_quote: Vec<String> = BASE_CONF
+        .queries
+        .iter()
+        .map(|query| quote_each_word(query))
+        .collect();
+    let basic_with_quote: Vec<&str> = basic_with_quote.iter().map(|s| s.as_str()).collect();
+
+    // the base queries without the space at the end of each request
+    let basic_without_quote: Vec<&str> = BASE_CONF.queries.iter().map(|s| s.trim()).collect();
+
+    let confs = &[
+        /* first we bench each criterion alone */
+        utils::Conf {
+            group_name: "proximity",
+            queries: &[
+                "black saint sinner lady ",
+                "les dangeureuses 1960 ",
+                "The Disneyland Sing-Along Chorus ",
+                "Under Great Northern Lights ",
+                "7000 Danses Un Jour Dans Notre Vie ",
+            ],
+            criterion: Some(&["proximity"]),
+            optional_words: false,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "typo",
+            queries: &[
+                "mongus ",
+                "thelonius monk ",
+                "Disnaylande ",
+                "the white striper ",
+                "indochie ",
+                "indochien ",
+                "klub des loopers ",
+                "fear of the duck ",
+                "michel depech ",
+                "stromal ",
+                "dire straights ",
+                "Arethla Franklin ",
+            ],
+            criterion: Some(&["typo"]),
+            optional_words: false,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "words",
+            queries: &[
+                "the black saint and the sinner lady and the good doggo ", // four words to pop
+                "les liaisons dangeureuses 1793 ",                         // one word to pop
+                "The Disneyland Children's Sing-Alone song ",              // two words to pop
+                "seven nation mummy ",                                     // one word to pop
+                "7000 Danses / Le Baiser / je me trompe de mots ",         // four words to pop
+                "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop
+                "whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13
+            ],
+            criterion: Some(&["words"]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "asc",
+            criterion: Some(&["asc(released-timestamp)"]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "desc",
+            criterion: Some(&["desc(released-timestamp)"]),
+            ..BASE_CONF
+        },
+
+        /* then we bench the asc and desc criterion on top of the default criterion */
+        utils::Conf {
+            group_name: "asc + default",
+            criterion: Some(&asc_default[..]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "desc + default",
+            criterion: Some(&desc_default[..]),
+            ..BASE_CONF
+        },
+
+        /* we bench the filters with the default request */
+        utils::Conf {
+            group_name: "basic filter: <=",
+            facet_condition: Some("released-timestamp <= 946728000"), // year 2000
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic filter: TO",
+            facet_condition: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "big filter",
+            facet_condition: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"),
+            ..BASE_CONF
+        },
+
+        /* then we bench some global / normal search with all the default criterion in the
+         * default order */
+        utils::Conf {
+            group_name: "basic placeholder",
+            queries: &[""],
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic without quote",
+            queries: &basic_without_quote,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic with quote",
+            queries: &basic_with_quote,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "prefix search",
+            queries: &[
+                "s", // 500k+ results
+                "a", //
+                "b", //
+                "i", //
+                "x", // only 7k results
+            ],
+            ..BASE_CONF
+        },
+    ];
+
+    utils::run_benches(c, confs);
+}
+
+criterion_group!(benches, bench_songs);
+criterion_main!(benches);
--- a/benchmarks/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@@ -0,0 +1,114 @@
+use std::fs::{create_dir_all, remove_dir_all, File};
+
+use criterion::BenchmarkId;
+use heed::EnvOpenOptions;
+use milli::{
+    update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat},
+    FacetCondition, Index,
+};
+
+/// Description of one benchmark group: which dataset to index, how to
+/// configure the index and which queries to run against it.
+pub struct Conf<'a> {
+    /// where we are going to create our database.mmdb directory
+    /// each benchmark will first try to delete it and then recreate it
+    pub database_name: &'a str,
+    /// path of the dataset to be used, it must be an uncompressed csv
+    pub dataset: &'a str,
+    /// name of the group, shown in the criterion group name after the dataset path
+    pub group_name: &'a str,
+    /// the queries to bench, each one becomes its own benchmark inside the group
+    pub queries: &'a [&'a str],
+    /// here you can change which criterion are used and in which order.
+    /// - if you specify something, the faceted fields, criteria and stop words are reset
+    ///   before your criteria are set (`configure` still runs afterwards)
+    /// - if you don't specify anything (None) the default configuration will be kept
+    pub criterion: Option<&'a [&'a str]>,
+    /// the last chance to configure your database as you want
+    pub configure: fn(&mut Settings),
+    /// optional facet condition, parsed with `FacetCondition::from_str` and applied to
+    /// every search of the group
+    pub facet_condition: Option<&'a str>,
+    /// enable or disable the optional words on the query
+    pub optional_words: bool,
+    /// primary key, if it is None we'll auto-generate docids for every document
+    // NOTE(review): nothing in this file reads this field yet -- confirm it is wired up.
+    pub primary_key: Option<&'a str>,
+}
+
+impl Conf<'_> {
+    /// Default `configure` hook: leaves the settings builder untouched.
+    fn nop(_settings: &mut Settings) {}
+
+    /// Neutral starting point for a configuration, meant to be completed with
+    /// the struct update syntax (`Conf { group_name: "...", ..Conf::BASE }`).
+    ///
+    /// It has no dataset, no query, no criterion override and no facet
+    /// condition; optional words are enabled and `configure` does nothing.
+    pub const BASE: Self = Self {
+        database_name: "benches.mmdb",
+        dataset: "",
+        group_name: "",
+        queries: &[],
+        criterion: None,
+        facet_condition: None,
+        primary_key: None,
+        optional_words: true,
+        configure: Self::nop,
+    };
+}
+
+/// Creates a fresh index in `conf.database_name`, applies the settings
+/// described by `conf` and indexes the whole csv dataset into it.
+///
+/// # Panics
+///
+/// Panics if the database directory cannot be removed or recreated, if the
+/// dataset cannot be opened, or if any update on the index fails.
+// NOTE(review): `conf.primary_key` is not applied here -- confirm whether it should be.
+pub fn base_setup(conf: &Conf) -> Index {
+    // start from an empty directory; a missing directory is not an error
+    match remove_dir_all(conf.database_name) {
+        Ok(_) => (),
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
+        Err(e) => panic!("{}", e),
+    }
+    create_dir_all(conf.database_name).unwrap();
+
+    let mut options = EnvOpenOptions::new();
+    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+    options.max_readers(10);
+    let index = Index::new(options, conf.database_name).unwrap();
+
+    // first update: the settings
+    let update_builder = UpdateBuilder::new(0);
+    let mut wtxn = index.write_txn().unwrap();
+    let mut builder = update_builder.settings(&mut wtxn, &index);
+
+    if let Some(criterion) = conf.criterion {
+        builder.reset_faceted_fields();
+        builder.reset_criteria();
+        builder.reset_stop_words();
+
+        let criterion = criterion.iter().map(|s| s.to_string()).collect();
+        builder.set_criteria(criterion);
+    }
+
+    // the hook runs last so it can override anything set above
+    (conf.configure)(&mut builder);
+
+    builder.execute(|_, _| ()).unwrap();
+    wtxn.commit().unwrap();
+
+    // second update: the documents of the dataset
+    let update_builder = UpdateBuilder::new(0);
+    let mut wtxn = index.write_txn().unwrap();
+    let mut builder = update_builder.index_documents(&mut wtxn, &index);
+    builder.update_format(UpdateFormat::Csv);
+    builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
+    // the panic message is only built when opening the file actually fails
+    let reader = File::open(conf.dataset).unwrap_or_else(|e| {
+        panic!("could not find the dataset in: {}: {:?}", conf.dataset, e)
+    });
+    builder.execute(reader, |_, _| ()).unwrap();
+    wtxn.commit().unwrap();
+
+    index
+}
+
+/// Runs one criterion benchmark group per configuration in `confs`.
+///
+/// For each configuration the index is rebuilt from scratch with
+/// `base_setup`, then every query of the configuration is benchmarked under
+/// the group named `"<dataset>: <group_name>"`.
+///
+/// The timed closure opens a read transaction, builds the search, parses the
+/// facet condition (when there is one) and executes the search, so all of
+/// these steps are part of the measurement.
+pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
+    for conf in confs {
+        let index = base_setup(conf);
+
+        // the owned String is handed over directly, no borrow-then-copy
+        let mut group = c.benchmark_group(format!("{}: {}", conf.dataset, conf.group_name));
+
+        for &query in conf.queries {
+            group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| {
+                b.iter(|| {
+                    let rtxn = index.read_txn().unwrap();
+                    let mut search = index.search(&rtxn);
+                    search.query(query).optional_words(conf.optional_words);
+                    if let Some(facet_condition) = conf.facet_condition {
+                        let facet_condition =
+                            FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap();
+                        search.facet_condition(facet_condition);
+                    }
+                    let _ids = search.execute().unwrap();
+                });
+            });
+        }
+        group.finish();
+    }
+}
--- a/benchmarks/benches/wiki.rs
+++ b/benchmarks/benches/wiki.rs
@@ -0,0 +1,133 @@
+mod datasets_paths;
+mod utils;
+
+use criterion::{criterion_group, criterion_main};
+use milli::update::Settings;
+use utils::Conf;
+
+/// Settings hook shared by every wiki benchmark: declares the displayed and
+/// the searchable fields on the given settings builder.
+fn base_conf(builder: &mut Settings) {
+    let displayed = ["title", "body", "url"];
+    builder.set_displayed_fields(displayed.iter().copied().map(String::from).collect());
+
+    let searchable = ["title", "body"];
+    builder.set_searchable_fields(searchable.iter().copied().map(String::from).collect());
+}
+
+/// Base configuration of the wiki benchmarks: the `SMOL_WIKI_ARTICLES`
+/// dataset, a default list of queries and the `base_conf` settings hook.
+/// Every field not listed here is taken from `Conf::BASE`.
+///
+/// Each query ends with a space; the "basic without quote" group trims it.
+// NOTE(review): the trailing numbers are presumably candidate counts -- confirm.
+const BASE_CONF: Conf = Conf {
+    dataset: datasets_paths::SMOL_WIKI_ARTICLES,
+    queries: &[
+        "mingus ",        // 46 candidates
+        "miles davis ",   // 159
+        "rock and roll ", // 1007
+        "machine ",       // 3448
+        "spain ",         // 7002
+        "japan ",         // 10.593
+        "france ",        // 17.616
+        "film ",          // 24.959
+    ],
+    configure: base_conf,
+    ..Conf::BASE
+};
+
+/// Wraps every space-separated word of `query` in double quotes after trimming
+/// surrounding whitespace: `miles davis ` becomes `"miles" "davis"`.
+fn quote_each_word(query: &str) -> String {
+    query
+        .trim()
+        .split(' ')
+        .map(|word| format!(r#""{}""#, word))
+        .collect::<Vec<String>>()
+        .join(" ")
+}
+
+/// Declares every benchmark group of the wiki dataset and hands them to
+/// `utils::run_benches`.
+///
+/// Each group is a `Conf` derived from `BASE_CONF` that overrides only the
+/// fields it cares about (queries, criterion, ...).
+// NOTE(review): the name was carried over from the songs benchmark; renaming it
+// also requires updating the `criterion_group!` invocation below.
+fn bench_songs(c: &mut criterion::Criterion) {
+    // the base queries with every word put between double quotes
+    let basic_with_quote: Vec<String> = BASE_CONF
+        .queries
+        .iter()
+        .map(|query| quote_each_word(query))
+        .collect();
+    let basic_with_quote: Vec<&str> = basic_with_quote.iter().map(|s| s.as_str()).collect();
+
+    // the base queries without the space at the end of each request
+    let basic_without_quote: Vec<&str> = BASE_CONF.queries.iter().map(|s| s.trim()).collect();
+
+    let confs = &[
+        /* first we bench each criterion alone */
+        utils::Conf {
+            group_name: "proximity",
+            queries: &[
+                "herald sings ",
+                "april paris ",
+                "tea two ",
+                "diesel engine ",
+            ],
+            criterion: Some(&["proximity"]),
+            optional_words: false,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "typo",
+            queries: &[
+                "migrosoft ",
+                "linax ",
+                "Disnaylande ",
+                "phytogropher ",
+                "nympalidea ",
+                "aritmetric ",
+                "the fronce ",
+                "sisan ",
+            ],
+            criterion: Some(&["typo"]),
+            optional_words: false,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "words",
+            queries: &[
+                "the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results
+                "Kameya Tokujirō mingus monk ",                           // two words to pop, 55
+                "Ulrich Hensel meilisearch milli ",                        // two words to pop, 306
+                "Idaho Bellevue pizza ",                                   // one word to pop, 800
+                "Abraham machin ",                                         // one word to pop, 1141
+            ],
+            criterion: Some(&["words"]),
+            ..BASE_CONF
+        },
+        /* then we bench some global / normal search with all the default criterion in the
+         * default order */
+        utils::Conf {
+            group_name: "basic placeholder",
+            queries: &[""],
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic without quote",
+            queries: &basic_without_quote,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic with quote",
+            queries: &basic_with_quote,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "prefix search",
+            queries: &[
+                "t", // 453k results
+                "c", // 405k
+                "g", // 318k
+                "j", // 227k
+                "q", // 71k
+                "x", // 17k
+            ],
+            ..BASE_CONF
+        },
+    ];
+
+    utils::run_benches(c, confs);
+}
+
+criterion_group!(benches, bench_songs);
+criterion_main!(benches);