mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 16:06:31 +00:00 
			
		
		
		
	move the benchmarks to another crate so we can download the datasets automatically without adding overhead to the build of milli
This commit is contained in:
		| @@ -53,17 +53,8 @@ tinytemplate = "=1.1.0" | ||||
|  | ||||
| [dev-dependencies] | ||||
| big_s = "1.0.2" | ||||
| criterion = "0.3.4" | ||||
| maplit = "1.0.2" | ||||
| rand = "0.8.3" | ||||
|  | ||||
| [features] | ||||
| default = [] | ||||
|  | ||||
| [[bench]] | ||||
| name = "songs" | ||||
| harness = false | ||||
|  | ||||
| [[bench]] | ||||
| name = "wiki" | ||||
| harness = false | ||||
|   | ||||
| @@ -1,27 +0,0 @@ | ||||
| Benchmarks | ||||
| ========== | ||||
|  | ||||
| For our benchmark we are using a small subset of the dataset songs.csv. It was generated with this command: | ||||
| ``` | ||||
| xsv sample --seed 42 song.csv -o smol-songs.csv | ||||
| ``` | ||||
| You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz) | ||||
| And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). | ||||
|  | ||||
| You need to put this file in the current directory: `milli/milli/benches/smol-songs.csv.gz` | ||||
| You can run the following command from the root of this git repository | ||||
| ``` | ||||
| wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz | ||||
| ``` | ||||
|  | ||||
| - To run all the benchmarks we recommend using `cargo bench`; this should take around 4h | ||||
| - You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`; it should take around 1h | ||||
| - And on the `wiki` dataset with `cargo bench --bench wiki`; it should take around 3h | ||||
|  | ||||
| By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the environment variable `MILLI_BENCH_DATASETS_PATH` like this: | ||||
| ``` | ||||
| MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs | ||||
| ``` | ||||
|  | ||||
| Our benchmarking suite uses criterion, which allows you to do a lot of configuration; see the documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html) | ||||
|  | ||||
| @@ -1,209 +0,0 @@ | ||||
| mod utils; | ||||
|  | ||||
| use criterion::{criterion_group, criterion_main}; | ||||
| use milli::update::Settings; | ||||
| use utils::Conf; | ||||
|  | ||||
| fn base_conf(builder: &mut Settings) { | ||||
|     let displayed_fields = [ | ||||
|         "id", "title", "album", "artist", "genre", "country", "released", "duration", | ||||
|     ] | ||||
|     .iter() | ||||
|     .map(|s| s.to_string()) | ||||
|     .collect(); | ||||
|     builder.set_displayed_fields(displayed_fields); | ||||
|  | ||||
|     let searchable_fields = ["title", "album", "artist"] | ||||
|         .iter() | ||||
|         .map(|s| s.to_string()) | ||||
|         .collect(); | ||||
|     builder.set_searchable_fields(searchable_fields); | ||||
|  | ||||
|     let faceted_fields = [ | ||||
|         ("released-timestamp", "number"), | ||||
|         ("duration-float", "number"), | ||||
|         ("genre", "string"), | ||||
|         ("country", "string"), | ||||
|         ("artist", "string"), | ||||
|     ] | ||||
|     .iter() | ||||
|     .map(|(a, b)| (a.to_string(), b.to_string())) | ||||
|     .collect(); | ||||
|     builder.set_faceted_fields(faceted_fields); | ||||
| } | ||||
|  | ||||
| const BASE_CONF: Conf = Conf { | ||||
|     dataset: "smol-songs.csv", | ||||
|     queries: &[ | ||||
|         "john ",             // 9097 | ||||
|         "david ",            // 4794 | ||||
|         "charles ",          // 1957 | ||||
|         "david bowie ",      // 1200 | ||||
|         "michael jackson ",  // 600 | ||||
|         "thelonious monk ",  // 303 | ||||
|         "charles mingus ",   // 142 | ||||
|         "marcus miller ",    // 60 | ||||
|         "tamo ",             // 13 | ||||
|         "Notstandskomitee ", // 4 | ||||
|     ], | ||||
|     configure: base_conf, | ||||
|     ..Conf::BASE | ||||
| }; | ||||
|  | ||||
| fn bench_songs(c: &mut criterion::Criterion) { | ||||
|     let default_criterion: Vec<String> = milli::default_criteria() | ||||
|         .iter() | ||||
|         .map(|criteria| criteria.to_string()) | ||||
|         .collect(); | ||||
|     let default_criterion = default_criterion.iter().map(|s| s.as_str()); | ||||
|     let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)") | ||||
|         .chain(default_criterion.clone()) | ||||
|         .collect(); | ||||
|     let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)") | ||||
|         .chain(default_criterion.clone()) | ||||
|         .collect(); | ||||
|  | ||||
|     let basic_with_quote: Vec<String> = BASE_CONF | ||||
|         .queries | ||||
|         .iter() | ||||
|         .map(|s| { | ||||
|             s.trim() | ||||
|                 .split(' ') | ||||
|                 .map(|s| format!(r#""{}""#, s)) | ||||
|                 .collect::<Vec<String>>() | ||||
|                 .join(" ") | ||||
|         }) | ||||
|         .collect(); | ||||
|     let basic_with_quote: &[&str] = &basic_with_quote | ||||
|         .iter() | ||||
|         .map(|s| s.as_str()) | ||||
|         .collect::<Vec<&str>>(); | ||||
|  | ||||
|     let confs = &[ | ||||
|         /* first we bench each criterion alone */ | ||||
|         utils::Conf { | ||||
|             group_name: "proximity", | ||||
|             queries: &[ | ||||
|                 "black saint sinner lady ", | ||||
|                 "les dangeureuses 1960 ", | ||||
|                 "The Disneyland Sing-Along Chorus ", | ||||
|                 "Under Great Northern Lights ", | ||||
|                 "7000 Danses Un Jour Dans Notre Vie ", | ||||
|             ], | ||||
|             criterion: Some(&["proximity"]), | ||||
|             optional_words: false, | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "typo", | ||||
|             queries: &[ | ||||
|                 "mongus ", | ||||
|                 "thelonius monk ", | ||||
|                 "Disnaylande ", | ||||
|                 "the white striper ", | ||||
|                 "indochie ", | ||||
|                 "indochien ", | ||||
|                 "klub des loopers ", | ||||
|                 "fear of the duck ", | ||||
|                 "michel depech ", | ||||
|                 "stromal ", | ||||
|                 "dire straights ", | ||||
|                 "Arethla Franklin ", | ||||
|             ], | ||||
|             criterion: Some(&["typo"]), | ||||
|             optional_words: false, | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "words", | ||||
|             queries: &[ | ||||
|                 "the black saint and the sinner lady and the good doggo ", // four words to pop | ||||
|                 "les liaisons dangeureuses 1793 ",                         // one word to pop | ||||
|                 "The Disneyland Children's Sing-Alone song ",              // two words to pop | ||||
|                 "seven nation mummy ",                                     // one word to pop | ||||
|                 "7000 Danses / Le Baiser / je me trompe de mots ",         // four words to pop | ||||
|                 "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop | ||||
|                 "whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13 | ||||
|             ], | ||||
|             criterion: Some(&["words"]), | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "asc", | ||||
|             criterion: Some(&["asc(released-timestamp)"]), | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "desc", | ||||
|             criterion: Some(&["desc(released-timestamp)"]), | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|  | ||||
|         /* then we bench the asc and desc criterion on top of the default criterion */ | ||||
|         utils::Conf { | ||||
|             group_name: "asc + default", | ||||
|             criterion: Some(&asc_default[..]), | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "desc + default", | ||||
|             criterion: Some(&desc_default[..]), | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|  | ||||
|         /* we bench the filters with the default request */ | ||||
|         utils::Conf { | ||||
|             group_name: "basic filter: <=", | ||||
|             facet_condition: Some("released-timestamp <= 946728000"), // year 2000 | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "basic filter: TO", | ||||
|             facet_condition: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010 | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "big filter", | ||||
|             facet_condition: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"), | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|  | ||||
|         /* the we bench some global / normal search with all the default criterion in the default | ||||
|          * order */ | ||||
|         utils::Conf { | ||||
|             group_name: "basic placeholder", | ||||
|             queries: &[""], | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "basic without quote", | ||||
|             queries: &BASE_CONF | ||||
|                 .queries | ||||
|                 .iter() | ||||
|                 .map(|s| s.trim()) // we remove the space at the end of each request | ||||
|                 .collect::<Vec<&str>>(), | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "basic with quote", | ||||
|             queries: basic_with_quote, | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "prefix search", | ||||
|             queries: &[ | ||||
|                 "s", // 500k+ results | ||||
|                 "a", // | ||||
|                 "b", // | ||||
|                 "i", // | ||||
|                 "x", // only 7k results | ||||
|             ], | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|     ]; | ||||
|  | ||||
|     utils::run_benches(c, confs); | ||||
| } | ||||
|  | ||||
| criterion_group!(benches, bench_songs); | ||||
| criterion_main!(benches); | ||||
| @@ -1,125 +0,0 @@ | ||||
| use std::fs::{create_dir_all, remove_dir_all, File}; | ||||
|  | ||||
| use criterion::BenchmarkId; | ||||
| use heed::EnvOpenOptions; | ||||
| use milli::{ | ||||
|     update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}, | ||||
|     FacetCondition, Index, | ||||
| }; | ||||
|  | ||||
/// Environment variable that, when set, gives the directory in which the
/// datasets are looked up.
const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";

/// Directory in which the datasets are looked up when
/// `BASE_DATASETS_PATH_KEY` is not set.
///
/// This is a relative path, so it is resolved against the working directory
/// of the benchmark process.
/// NOTE(review): the original comment states that cargo commands ran in
/// `milli/milli/**` are executed with a pwd of `milli/milli` — confirm that
/// this relative path points to the intended directory from there.
const DEFAULT_DATASETS_PATH: &str = "milli/benches";
|  | ||||
| pub struct Conf<'a> { | ||||
|     /// where we are going to create our database.mmdb directory | ||||
|     /// each benchmark will first try to delete it and then recreate it | ||||
|     pub database_name: &'a str, | ||||
|     /// the dataset to be used, it must be an uncompressed csv | ||||
|     pub dataset: &'a str, | ||||
|     pub group_name: &'a str, | ||||
|     pub queries: &'a [&'a str], | ||||
|     /// here you can change which criterion are used and in which order. | ||||
|     /// - if you specify something all the base configuration will be thrown out | ||||
|     /// - if you don't specify anything (None) the default configuration will be kept | ||||
|     pub criterion: Option<&'a [&'a str]>, | ||||
|     /// the last chance to configure your database as you want | ||||
|     pub configure: fn(&mut Settings), | ||||
|     pub facet_condition: Option<&'a str>, | ||||
|     /// enable or disable the optional words on the query | ||||
|     pub optional_words: bool, | ||||
| } | ||||
|  | ||||
| impl Conf<'_> { | ||||
|     fn nop(_builder: &mut Settings) {} | ||||
|  | ||||
|     pub const BASE: Self = Conf { | ||||
|         database_name: "benches.mmdb", | ||||
|         dataset: "", | ||||
|         group_name: "", | ||||
|         queries: &[], | ||||
|         criterion: None, | ||||
|         configure: Self::nop, | ||||
|         facet_condition: None, | ||||
|         optional_words: true, | ||||
|     }; | ||||
| } | ||||
|  | ||||
| pub fn base_setup(conf: &Conf) -> Index { | ||||
|     match remove_dir_all(&conf.database_name) { | ||||
|         Ok(_) => (), | ||||
|         Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), | ||||
|         Err(e) => panic!("{}", e), | ||||
|     } | ||||
|     create_dir_all(&conf.database_name).unwrap(); | ||||
|  | ||||
|     let mut options = EnvOpenOptions::new(); | ||||
|     options.map_size(100 * 1024 * 1024 * 1024); // 100 GB | ||||
|     options.max_readers(10); | ||||
|     let index = Index::new(options, conf.database_name).unwrap(); | ||||
|  | ||||
|     let update_builder = UpdateBuilder::new(0); | ||||
|     let mut wtxn = index.write_txn().unwrap(); | ||||
|     let mut builder = update_builder.settings(&mut wtxn, &index); | ||||
|  | ||||
|     if let Some(criterion) = conf.criterion { | ||||
|         builder.reset_faceted_fields(); | ||||
|         builder.reset_criteria(); | ||||
|         builder.reset_stop_words(); | ||||
|  | ||||
|         let criterion = criterion.iter().map(|s| s.to_string()).collect(); | ||||
|         builder.set_criteria(criterion); | ||||
|     } | ||||
|  | ||||
|     (conf.configure)(&mut builder); | ||||
|  | ||||
|     builder.execute(|_, _| ()).unwrap(); | ||||
|     wtxn.commit().unwrap(); | ||||
|  | ||||
|     let update_builder = UpdateBuilder::new(0); | ||||
|     let mut wtxn = index.write_txn().unwrap(); | ||||
|     let mut builder = update_builder.index_documents(&mut wtxn, &index); | ||||
|     builder.update_format(UpdateFormat::Csv); | ||||
|     builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); | ||||
|     // we called from cargo the current directory is supposed to be milli/milli | ||||
|     let base_dataset_path = std::env::vars() | ||||
|         .find(|var| var.0 == BASE_DATASETS_PATH_KEY) | ||||
|         .map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value); | ||||
|     let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset); | ||||
|     let reader = File::open(&dataset_path) | ||||
|         .expect(&format!("could not find the dataset in: {}", &dataset_path)); | ||||
|     builder.execute(reader, |_, _| ()).unwrap(); | ||||
|     wtxn.commit().unwrap(); | ||||
|  | ||||
|     index | ||||
| } | ||||
|  | ||||
| pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { | ||||
|     for conf in confs { | ||||
|         let index = base_setup(conf); | ||||
|  | ||||
|         let mut group = c.benchmark_group(&format!("{}: {}", conf.dataset, conf.group_name)); | ||||
|  | ||||
|         for &query in conf.queries { | ||||
|             group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { | ||||
|                 b.iter(|| { | ||||
|                     let rtxn = index.read_txn().unwrap(); | ||||
|                     let mut search = index.search(&rtxn); | ||||
|                     search.query(query).optional_words(conf.optional_words); | ||||
|                     if let Some(facet_condition) = conf.facet_condition { | ||||
|                         let facet_condition = | ||||
|                             FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap(); | ||||
|                         search.facet_condition(facet_condition); | ||||
|                     } | ||||
|                     let _ids = search.execute().unwrap(); | ||||
|                 }); | ||||
|             }); | ||||
|         } | ||||
|         group.finish(); | ||||
|     } | ||||
| } | ||||
| @@ -1,132 +0,0 @@ | ||||
| mod utils; | ||||
|  | ||||
| use criterion::{criterion_group, criterion_main}; | ||||
| use milli::update::Settings; | ||||
| use utils::Conf; | ||||
|  | ||||
| fn base_conf(builder: &mut Settings) { | ||||
|     let displayed_fields = ["title", "body", "url"] | ||||
|         .iter() | ||||
|         .map(|s| s.to_string()) | ||||
|         .collect(); | ||||
|     builder.set_displayed_fields(displayed_fields); | ||||
|  | ||||
|     let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); | ||||
|     builder.set_searchable_fields(searchable_fields); | ||||
| } | ||||
|  | ||||
| const BASE_CONF: Conf = Conf { | ||||
|     dataset: "smol-wiki-articles.csv", | ||||
|     queries: &[ | ||||
|         "mingus ",        // 46 candidates | ||||
|         "miles davis ",   // 159 | ||||
|         "rock and roll ", // 1007 | ||||
|         "machine ",       // 3448 | ||||
|         "spain ",         // 7002 | ||||
|         "japan ",         // 10.593 | ||||
|         "france ",        // 17.616 | ||||
|         "film ",          // 24.959 | ||||
|     ], | ||||
|     configure: base_conf, | ||||
|     ..Conf::BASE | ||||
| }; | ||||
|  | ||||
| fn bench_songs(c: &mut criterion::Criterion) { | ||||
|     let basic_with_quote: Vec<String> = BASE_CONF | ||||
|         .queries | ||||
|         .iter() | ||||
|         .map(|s| { | ||||
|             s.trim() | ||||
|                 .split(' ') | ||||
|                 .map(|s| format!(r#""{}""#, s)) | ||||
|                 .collect::<Vec<String>>() | ||||
|                 .join(" ") | ||||
|         }) | ||||
|         .collect(); | ||||
|     let basic_with_quote: &[&str] = &basic_with_quote | ||||
|         .iter() | ||||
|         .map(|s| s.as_str()) | ||||
|         .collect::<Vec<&str>>(); | ||||
|  | ||||
|     let confs = &[ | ||||
|         /* first we bench each criterion alone */ | ||||
|         utils::Conf { | ||||
|             group_name: "proximity", | ||||
|             queries: &[ | ||||
|                 "herald sings ", | ||||
|                 "april paris ", | ||||
|                 "tea two ", | ||||
|                 "diesel engine ", | ||||
|             ], | ||||
|             criterion: Some(&["proximity"]), | ||||
|             optional_words: false, | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "typo", | ||||
|             queries: &[ | ||||
|                 "migrosoft ", | ||||
|                 "linax ", | ||||
|                 "Disnaylande ", | ||||
|                 "phytogropher ", | ||||
|                 "nympalidea ", | ||||
|                 "aritmetric ", | ||||
|                 "the fronce ", | ||||
|                 "sisan ", | ||||
|             ], | ||||
|             criterion: Some(&["typo"]), | ||||
|             optional_words: false, | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "words", | ||||
|             queries: &[ | ||||
|                 "the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results | ||||
|                 "Kameya Tokujirō mingus monk ",                           // two words to pop, 55 | ||||
|                 "Ulrich Hensel meilisearch milli ",                        // two words to pop, 306 | ||||
|                 "Idaho Bellevue pizza ",                                   // one word to pop, 800 | ||||
|                 "Abraham machin ",                                         // one word to pop, 1141 | ||||
|             ], | ||||
|             criterion: Some(&["words"]), | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         /* the we bench some global / normal search with all the default criterion in the default | ||||
|          * order */ | ||||
|         utils::Conf { | ||||
|             group_name: "basic placeholder", | ||||
|             queries: &[""], | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "basic without quote", | ||||
|             queries: &BASE_CONF | ||||
|                 .queries | ||||
|                 .iter() | ||||
|                 .map(|s| s.trim()) // we remove the space at the end of each request | ||||
|                 .collect::<Vec<&str>>(), | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "basic with quote", | ||||
|             queries: basic_with_quote, | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|         utils::Conf { | ||||
|             group_name: "prefix search", | ||||
|             queries: &[ | ||||
|                 "t", // 453k results | ||||
|                 "c", // 405k | ||||
|                 "g", // 318k | ||||
|                 "j", // 227k | ||||
|                 "q", // 71k | ||||
|                 "x", // 17k | ||||
|             ], | ||||
|             ..BASE_CONF | ||||
|         }, | ||||
|     ]; | ||||
|  | ||||
|     utils::run_benches(c, confs); | ||||
| } | ||||
|  | ||||
| criterion_group!(benches, bench_songs); | ||||
| criterion_main!(benches); | ||||
		Reference in New Issue
	
	Block a user