207: Benchmarks r=Kerollmops a=irevoire



Co-authored-by: tamo <tamo@meilisearch.com>
Co-authored-by: Clémentine Urquizar <clementine@meilisearch.com>
Co-authored-by: Tamo <irevoire@hotmail.fr>
Co-authored-by: Irevoire <tamo@meilisearch.com>
This commit is contained in:
bors[bot]
2021-06-02 15:29:09 +00:00
committed by GitHub
15 changed files with 1161 additions and 58 deletions

71
.github/workflows/benchmarks.yml vendored Normal file
View File

@@ -0,0 +1,71 @@
name: Benchmarks
on:
workflow_dispatch:
inputs:
dataset_name:
description: 'The name of the dataset used to benchmark (songs or wiki)'
required: false
default: 'songs'
jobs:
benchmarks:
name: Run and upload benchmarks
runs-on: self-hosted
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
# Set variables
- name: Set current branch name
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})"
id: current_branch
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
shell: bash
run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')"
id: normalized_current_branch
- name: Set shorter commit SHA
shell: bash
run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)"
id: commit_sha
- name: Set file basename with format "dataset_branch_commitSHA"
shell: bash
run: echo "##[set-output name=basename;]$(echo ${{ github.event.inputs.dataset_name }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})"
id: file
# Run benchmarks
- name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
run: |
cd benchmarks
cargo bench --bench ${{ github.event.inputs.dataset_name }} -- --save-baseline ${{ steps.file.outputs.basename }}
# Generate critcmp files
- name: Install critcmp
run: cargo install critcmp
- name: Export critcmp file
run: |
critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
# Upload benchmarks
- name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
uses: BetaHuhn/do-spaces-action@v2
with:
access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
source: ${{ steps.file.outputs.basename }}.json
out_dir: critcmp_results
# Helper
- name: 'README: compare with another benchmark'
run: |
echo "${{ steps.file.outputs.basename }}.json has just been pushed."
echo 'How to compare this benchmark with another one?'
echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
echo " - Run the following command: ./benchmarks/scripts/compare.sh ${{ steps.file.outputs.basename }}.json <file-to-compare-with>"

346
Cargo.lock generated
View File

@@ -122,6 +122,20 @@ version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd"
[[package]]
name = "benchmarks"
version = "0.1.0"
dependencies = [
"anyhow",
"bytes 1.0.1",
"convert_case",
"criterion",
"flate2",
"heed",
"milli",
"reqwest",
]
[[package]]
name = "big_s"
version = "1.0.2"
@@ -327,6 +341,12 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "convert_case"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
[[package]]
name = "cow-utils"
version = "0.1.2"
@@ -506,6 +526,15 @@ version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
[[package]]
name = "encoding_rs"
version = "0.8.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065"
dependencies = [
"cfg-if 1.0.0",
]
[[package]]
name = "fake-simd"
version = "0.1.2"
@@ -750,12 +779,31 @@ dependencies = [
"http",
"indexmap",
"slab",
"tokio",
"tokio-util",
"tokio 0.2.25",
"tokio-util 0.3.1",
"tracing",
"tracing-futures",
]
[[package]]
name = "h2"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726"
dependencies = [
"bytes 1.0.1",
"fnv",
"futures-core",
"futures-sink",
"futures-util",
"http",
"indexmap",
"slab",
"tokio 1.6.0",
"tokio-util 0.6.7",
"tracing",
]
[[package]]
name = "half"
version = "1.7.1"
@@ -893,6 +941,17 @@ dependencies = [
"http",
]
[[package]]
name = "http-body"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9"
dependencies = [
"bytes 1.0.1",
"http",
"pin-project-lite 0.2.6",
]
[[package]]
name = "http-ui"
version = "0.2.1"
@@ -922,7 +981,7 @@ dependencies = [
"stderrlog",
"structopt",
"tempfile",
"tokio",
"tokio 0.2.25",
"warp",
]
@@ -960,20 +1019,59 @@ dependencies = [
"futures-channel",
"futures-core",
"futures-util",
"h2",
"h2 0.2.7",
"http",
"http-body",
"http-body 0.3.1",
"httparse",
"httpdate",
"itoa",
"pin-project 1.0.5",
"socket2",
"tokio",
"socket2 0.3.19",
"tokio 0.2.25",
"tower-service",
"tracing",
"want",
]
[[package]]
name = "hyper"
version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8bf09f61b52cfcf4c00de50df88ae423d6c02354e385a86341133b5338630ad1"
dependencies = [
"bytes 1.0.1",
"futures-channel",
"futures-core",
"futures-util",
"h2 0.3.3",
"http",
"http-body 0.4.2",
"httparse",
"httpdate",
"itoa",
"pin-project 1.0.5",
"socket2 0.4.0",
"tokio 1.6.0",
"tower-service",
"tracing",
"want",
]
[[package]]
name = "hyper-rustls"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f9f7a97316d44c0af9b0301e65010573a853a9fc97046d7331d7f6bc0fd5a64"
dependencies = [
"futures-util",
"hyper 0.14.5",
"log",
"rustls",
"tokio 1.6.0",
"tokio-rustls",
"webpki",
]
[[package]]
name = "idna"
version = "0.2.2"
@@ -1029,6 +1127,12 @@ dependencies = [
"libc",
]
[[package]]
name = "ipnet"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135"
[[package]]
name = "itertools"
version = "0.9.0"
@@ -1261,7 +1365,6 @@ dependencies = [
"bstr",
"byteorder",
"chrono",
"criterion",
"crossbeam-channel",
"csv",
"either",
@@ -1343,6 +1446,19 @@ dependencies = [
"winapi 0.2.8",
]
[[package]]
name = "mio"
version = "0.7.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956"
dependencies = [
"libc",
"log",
"miow 0.3.7",
"ntapi",
"winapi 0.3.9",
]
[[package]]
name = "mio-named-pipes"
version = "0.1.7"
@@ -1350,7 +1466,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656"
dependencies = [
"log",
"mio",
"mio 0.6.23",
"miow 0.3.7",
"winapi 0.3.9",
]
@@ -1363,7 +1479,7 @@ checksum = "afcb699eb26d4332647cc848492bbc15eafb26f08d0304550d5aa1f612e066f0"
dependencies = [
"iovec",
"libc",
"mio",
"mio 0.6.23",
]
[[package]]
@@ -1441,6 +1557,15 @@ dependencies = [
"version_check",
]
[[package]]
name = "ntapi"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44"
dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "num-integer"
version = "0.1.44"
@@ -1956,12 +2081,62 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "reqwest"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2296f2fac53979e8ccbc4a1136b25dcefd37be9ed7e4a1f6b05a6029c84ff124"
dependencies = [
"base64 0.13.0",
"bytes 1.0.1",
"encoding_rs",
"futures-core",
"futures-util",
"http",
"http-body 0.4.2",
"hyper 0.14.5",
"hyper-rustls",
"ipnet",
"js-sys",
"lazy_static",
"log",
"mime",
"percent-encoding",
"pin-project-lite 0.2.6",
"rustls",
"serde",
"serde_urlencoded 0.7.0",
"tokio 1.6.0",
"tokio-rustls",
"url",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
"webpki-roots",
"winreg",
]
[[package]]
name = "retain_mut"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1"
[[package]]
name = "ring"
version = "0.16.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc"
dependencies = [
"cc",
"libc",
"once_cell",
"spin",
"untrusted",
"web-sys",
"winapi 0.3.9",
]
[[package]]
name = "roaring"
version = "0.6.6"
@@ -1982,6 +2157,19 @@ dependencies = [
"semver",
]
[[package]]
name = "rustls"
version = "0.19.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7"
dependencies = [
"base64 0.13.0",
"log",
"ring",
"sct",
"webpki",
]
[[package]]
name = "ryu"
version = "1.0.5"
@@ -2015,6 +2203,16 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "sct"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce"
dependencies = [
"ring",
"untrusted",
]
[[package]]
name = "search"
version = "0.2.1"
@@ -2108,6 +2306,18 @@ dependencies = [
"url",
]
[[package]]
name = "serde_urlencoded"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edfa57a7f8d9c1d260a549e7224100f6c43d43f9103e06dd8b4095a9b2b43ce9"
dependencies = [
"form_urlencoded",
"itoa",
"ryu",
"serde",
]
[[package]]
name = "sha-1"
version = "0.8.2"
@@ -2193,6 +2403,22 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "socket2"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e3dfc207c526015c632472a77be09cf1b6e46866581aecae5cc38fb4235dea2"
dependencies = [
"libc",
"winapi 0.3.9",
]
[[package]]
name = "spin"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
[[package]]
name = "static_assertions"
version = "1.1.0"
@@ -2386,7 +2612,7 @@ dependencies = [
"lazy_static",
"libc",
"memchr",
"mio",
"mio 0.6.23",
"mio-named-pipes",
"mio-uds",
"num_cpus",
@@ -2397,6 +2623,21 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "tokio"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37"
dependencies = [
"autocfg",
"bytes 1.0.1",
"libc",
"memchr",
"mio 0.7.11",
"num_cpus",
"pin-project-lite 0.2.6",
]
[[package]]
name = "tokio-macros"
version = "0.2.6"
@@ -2408,6 +2649,17 @@ dependencies = [
"syn 1.0.64",
]
[[package]]
name = "tokio-rustls"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6"
dependencies = [
"rustls",
"tokio 1.6.0",
"webpki",
]
[[package]]
name = "tokio-tungstenite"
version = "0.11.0"
@@ -2417,7 +2669,7 @@ dependencies = [
"futures-util",
"log",
"pin-project 0.4.27",
"tokio",
"tokio 0.2.25",
"tungstenite",
]
@@ -2432,7 +2684,21 @@ dependencies = [
"futures-sink",
"log",
"pin-project-lite 0.1.12",
"tokio",
"tokio 0.2.25",
]
[[package]]
name = "tokio-util"
version = "0.6.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1caa0b0c8d94a049db56b5acf8cba99dc0623aab1b26d5b5f5e2d945846b3592"
dependencies = [
"bytes 1.0.1",
"futures-core",
"futures-sink",
"log",
"pin-project-lite 0.2.6",
"tokio 1.6.0",
]
[[package]]
@@ -2578,6 +2844,12 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564"
[[package]]
name = "untrusted"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
[[package]]
name = "url"
version = "2.2.1"
@@ -2654,7 +2926,7 @@ dependencies = [
"futures",
"headers",
"http",
"hyper",
"hyper 0.13.10",
"log",
"mime",
"mime_guess",
@@ -2663,8 +2935,8 @@ dependencies = [
"scoped-tls",
"serde",
"serde_json",
"serde_urlencoded",
"tokio",
"serde_urlencoded 0.6.1",
"tokio 0.2.25",
"tokio-tungstenite",
"tower-service",
"tracing",
@@ -2691,6 +2963,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fe8f61dba8e5d645a4d8132dc7a0a66861ed5e1045d2c0ed940fab33bac0fbe"
dependencies = [
"cfg-if 1.0.0",
"serde",
"serde_json",
"wasm-bindgen-macro",
]
@@ -2709,6 +2983,18 @@ dependencies = [
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73157efb9af26fb564bb59a009afd1c7c334a44db171d280690d0c3faaec3468"
dependencies = [
"cfg-if 1.0.0",
"js-sys",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.72"
@@ -2748,6 +3034,25 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "webpki"
version = "0.21.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea"
dependencies = [
"ring",
"untrusted",
]
[[package]]
name = "webpki-roots"
version = "0.21.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aabe153544e473b775453675851ecc86863d2a81d786d741f6b76778f2a48940"
dependencies = [
"webpki",
]
[[package]]
name = "whatlang"
version = "0.9.0"
@@ -2800,6 +3105,15 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "winreg"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69"
dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "ws2_32-sys"
version = "0.2.1"

View File

@@ -1,5 +1,5 @@
[workspace]
members = ["milli", "http-ui", "infos", "helpers", "search"]
members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"]
default-members = ["milli"]
[profile.release]

1
benchmarks/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
benches/datasets_paths.rs

28
benchmarks/Cargo.toml Normal file
View File

@@ -0,0 +1,28 @@
[package]
name = "benchmarks"
version = "0.1.0"
edition = "2018"
publish = false
[dependencies]
milli = { path = "../milli" }
[dev-dependencies]
heed = "*" # we want to use the version milli uses
criterion = "0.3.4"
[build-dependencies]
anyhow = "1.0"
bytes = "1.0"
flate2 = "1.0.20"
convert_case = "0.4"
reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }
[[bench]]
name = "songs"
harness = false
[[bench]]
name = "wiki"
harness = false

110
benchmarks/README.md Normal file
View File

@@ -0,0 +1,110 @@
Benchmarks
==========
## TOC
- [Datasets](#datasets)
- [Run the benchmarks](#run-the-benchmarks)
- [Comparison between benchmarks](#comparison-between-benchmarks)
## Datasets
The benchmarks are available for the following datasets:
- `songs`
- `wiki`
### Songs
`songs` is a subset of the [`songs.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
It was generated with this command:
```bash
xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv
```
_[Download the generated `songs` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)._
### Wiki
`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz).
It was generated with the following command:
```bash
xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv
```
_[Download the generated `wiki` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz)._
## Run the benchmarks
### On our private server
The Meili team self-hosts its own GitHub runner to run benchmarks on our dedicated bare metal server.
To trigger the benchmark workflow:
- Go to the `Actions` tab of this repository.
- Select the `Benchmarks` workflow on the left.
- Click on `Run workflow` in the blue banner.
- Select the branch on which you want to run the benchmarks and select the dataset you want (default: `songs`).
- Finally, click on `Run workflow`.
This GitHub workflow will run the benchmarks and push the `critcmp` report to a DigitalOcean Space (= S3).
The name of the uploaded file is displayed in the workflow.
_[More about critcmp](https://github.com/BurntSushi/critcmp)._
💡 To compare the just-uploaded benchmark with another one, check out the [next section](#comparison-between-benchmarks).
### On your machine
To run all the benchmarks (~4h):
```bash
cargo bench
```
To run only the `songs` (~1h) or `wiki` (~3h) benchmark:
```bash
cargo bench --bench <dataset name>
```
By default, the datasets will be downloaded and uncompressed automatically in the target directory.<br>
If you don't want to download the datasets every time you update something on the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`:
```bash
mkdir ~/datasets
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
touch build.rs
MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
```
## Comparison between benchmarks
The benchmark reports we push are generated with `critcmp`. Thus, we use `critcmp` to generate comparison results between 2 benchmarks.
We provide a script to download and display the comparison report.
Requirements:
- `grep`
- `curl`
- [`critcmp`](https://github.com/BurntSushi/critcmp)
List the available files in the DO Space:
```bash
./benchmarks/scripts/list.sh
```
```bash
songs_main_09a4321.json
songs_geosearch_24ec456.json
```
Run the comparison script:
```bash
./benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json
```

211
benchmarks/benches/songs.rs Normal file
View File

@@ -0,0 +1,211 @@
mod datasets_paths;
mod utils;
use criterion::{criterion_group, criterion_main};
use milli::update::Settings;
use utils::Conf;
/// Applies the settings shared by every `songs` benchmark: the displayed,
/// searchable and faceted fields of the dataset.
fn base_conf(builder: &mut Settings) {
    // Turns a list of field names into whichever owned collection the setter expects.
    fn owned<C: std::iter::FromIterator<String>>(names: &[&str]) -> C {
        names.iter().map(|name| name.to_string()).collect()
    }

    builder.set_displayed_fields(owned(&[
        "id", "title", "album", "artist", "genre", "country", "released", "duration",
    ]));

    builder.set_searchable_fields(owned(&["title", "album", "artist"]));

    builder.set_faceted_fields(owned(&[
        "released-timestamp",
        "duration-float",
        "genre",
        "country",
        "artist",
    ]));
}
/// Configuration every `songs` benchmark group starts from: the songs dataset,
/// a default set of queries, the `base_conf` settings and `id` as primary key.
/// Each group overrides individual fields with struct-update syntax.
// NOTE(review): the trailing numbers presumably are the number of matching
// documents for each query — confirm against the dataset.
const BASE_CONF: Conf = Conf {
dataset: datasets_paths::SMOL_SONGS,
queries: &[
"john ", // 9097
"david ", // 4794
"charles ", // 1957
"david bowie ", // 1200
"michael jackson ", // 600
"thelonious monk ", // 303
"charles mingus ", // 142
"marcus miller ", // 60
"tamo ", // 13
"Notstandskomitee ", // 4
],
configure: base_conf,
primary_key: Some("id"),
..Conf::BASE
};
/// Declares every benchmark group of the `songs` dataset and hands them to
/// `utils::run_benches`.
///
/// Each group is a `Conf` derived from `BASE_CONF`, overriding only the
/// queries, criteria, filter or optional-words flag it wants to measure.
fn bench_songs(c: &mut criterion::Criterion) {
// the default criteria of milli, rendered as strings so they can be chained
// after an extra asc/desc criterion
let default_criterion: Vec<String> = milli::default_criteria()
.iter()
.map(|criteria| criteria.to_string())
.collect();
let default_criterion = default_criterion.iter().map(|s| s.as_str());
let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)")
.chain(default_criterion.clone())
.collect();
let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)")
.chain(default_criterion.clone())
.collect();
// the base queries with every word wrapped in double quotes
let basic_with_quote: Vec<String> = BASE_CONF
.queries
.iter()
.map(|s| {
s.trim()
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
})
.collect();
// borrowed view of the quoted queries, as `Conf::queries` expects `&[&str]`
let basic_with_quote: &[&str] = &basic_with_quote
.iter()
.map(|s| s.as_str())
.collect::<Vec<&str>>();
let confs = &[
/* first we bench each criterion alone */
utils::Conf {
group_name: "proximity",
queries: &[
"black saint sinner lady ",
"les dangeureuses 1960 ",
"The Disneyland Sing-Along Chorus ",
"Under Great Northern Lights ",
"7000 Danses Un Jour Dans Notre Vie ",
],
criterion: Some(&["proximity"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "typo",
queries: &[
"mongus ",
"thelonius monk ",
"Disnaylande ",
"the white striper ",
"indochie ",
"indochien ",
"klub des loopers ",
"fear of the duck ",
"michel depech ",
"stromal ",
"dire straights ",
"Arethla Franklin ",
],
criterion: Some(&["typo"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "words",
queries: &[
"the black saint and the sinner lady and the good doggo ", // four words to pop
"les liaisons dangeureuses 1793 ", // one word to pop
"The Disneyland Children's Sing-Alone song ", // two words to pop
"seven nation mummy ", // one word to pop
"7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop
"Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop
"whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13
],
criterion: Some(&["words"]),
..BASE_CONF
},
utils::Conf {
group_name: "asc",
criterion: Some(&["asc(released-timestamp)"]),
..BASE_CONF
},
utils::Conf {
group_name: "desc",
criterion: Some(&["desc(released-timestamp)"]),
..BASE_CONF
},
/* then we bench the asc and desc criterion on top of the default criterion */
utils::Conf {
group_name: "asc + default",
criterion: Some(&asc_default[..]),
..BASE_CONF
},
utils::Conf {
group_name: "desc + default",
criterion: Some(&desc_default[..]),
..BASE_CONF
},
/* we bench the filters with the default request */
utils::Conf {
group_name: "basic filter: <=",
facet_condition: Some("released-timestamp <= 946728000"), // year 2000
..BASE_CONF
},
utils::Conf {
group_name: "basic filter: TO",
facet_condition: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010
..BASE_CONF
},
utils::Conf {
group_name: "big filter",
facet_condition: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"),
..BASE_CONF
},
/* then we bench some global / normal search with all the default criterion in the default
* order */
utils::Conf {
group_name: "basic placeholder",
queries: &[""],
..BASE_CONF
},
utils::Conf {
group_name: "basic without quote",
queries: &BASE_CONF
.queries
.iter()
.map(|s| s.trim()) // we remove the space at the end of each request
.collect::<Vec<&str>>(),
..BASE_CONF
},
utils::Conf {
group_name: "basic with quote",
queries: basic_with_quote,
..BASE_CONF
},
utils::Conf {
group_name: "prefix search",
queries: &[
"s", // 500k+ results
"a", //
"b", //
"i", //
"x", // only 7k results
],
..BASE_CONF
},
];
utils::run_benches(c, confs);
}
criterion_group!(benches, bench_songs);
criterion_main!(benches);

119
benchmarks/benches/utils.rs Normal file
View File

@@ -0,0 +1,119 @@
use std::fs::{create_dir_all, remove_dir_all, File};
use criterion::BenchmarkId;
use heed::EnvOpenOptions;
use milli::{
update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat},
FacetCondition, Index,
};
/// Description of one benchmark group: which dataset to index, how to
/// configure the index and which queries to run against it.
pub struct Conf<'a> {
/// where we are going to create our database.mmdb directory
/// each benchmark will first try to delete it and then recreate it
pub database_name: &'a str,
/// the dataset to be used, it must be an uncompressed csv
pub dataset: &'a str,
/// name of the group, shown in the criterion group label after the dataset path
pub group_name: &'a str,
/// the queries to run, each one is benchmarked separately inside the group
pub queries: &'a [&'a str],
/// here you can change which criterion are used and in which order.
/// - if you specify something all the base configuration will be thrown out
/// - if you don't specify anything (None) the default configuration will be kept
pub criterion: Option<&'a [&'a str]>,
/// the last chance to configure your database as you want
pub configure: fn(&mut Settings),
/// optional filter expression, parsed with `FacetCondition::from_str` and
/// applied to every search of the group
pub facet_condition: Option<&'a str>,
/// enable or disable the optional words on the query
pub optional_words: bool,
/// primary key, if there is None we'll auto-generate docids for every documents
pub primary_key: Option<&'a str>,
}
impl Conf<'_> {
/// Neutral starting point meant to be completed with struct-update syntax
/// (`..Conf::BASE`): `benches.mmdb` database, no dataset, no queries, default
/// criteria, no extra settings, no filter, optional words enabled and no
/// primary key.
pub const BASE: Self = Conf {
database_name: "benches.mmdb",
dataset: "",
group_name: "",
queries: &[],
criterion: None,
configure: |_| (),
facet_condition: None,
optional_words: true,
primary_key: None,
};
}
/// Builds a fresh index for `conf` and fills it with the whole dataset.
///
/// The database directory is wiped and recreated, the primary key (if any) is
/// stored, the settings are applied and finally the CSV dataset is indexed.
///
/// # Panics
///
/// Panics on any I/O or indexing error: a benchmark cannot run on a
/// half-built index, so there is nothing to recover from.
pub fn base_setup(conf: &Conf) -> Index {
    // Start from a clean directory; a missing one is not an error.
    match remove_dir_all(&conf.database_name) {
        Ok(_) => (),
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => (),
        Err(e) => panic!("{}", e),
    }
    create_dir_all(&conf.database_name).unwrap();

    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
    options.max_readers(10);
    let index = Index::new(options, conf.database_name).unwrap();

    if let Some(primary_key) = conf.primary_key {
        let mut wtxn = index.write_txn().unwrap();
        index.put_primary_key(&mut wtxn, primary_key).unwrap();
        // Without this commit the transaction is dropped at the end of the
        // block and the primary key is never persisted.
        wtxn.commit().unwrap();
    }

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.settings(&mut wtxn, &index);

    if let Some(criterion) = conf.criterion {
        // Custom criteria: throw the base configuration away first. `configure`
        // runs afterwards and may set some of these fields again.
        builder.reset_faceted_fields();
        builder.reset_criteria();
        builder.reset_stop_words();

        let criterion = criterion.iter().map(|s| s.to_string()).collect();
        builder.set_criteria(criterion);
    }

    (conf.configure)(&mut builder);

    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    let update_builder = UpdateBuilder::new(0);
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = update_builder.index_documents(&mut wtxn, &index);
    if conf.primary_key.is_none() {
        builder.enable_autogenerate_docids();
    }
    builder.update_format(UpdateFormat::Csv);
    builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
    // Only build the panic message when opening the dataset actually fails.
    let reader = File::open(conf.dataset).unwrap_or_else(|e| {
        panic!("could not find the dataset in: {}: {:?}", conf.dataset, e)
    });
    builder.execute(reader, |_, _| ()).unwrap();
    wtxn.commit().unwrap();

    index
}
/// Runs every benchmark group described by `confs`.
///
/// For each configuration a fresh index is built with `base_setup`, then each
/// query of the configuration is benchmarked in a criterion group labelled
/// `"<dataset>: <group name>"`.
pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
    for conf in confs {
        let index = base_setup(conf);
        let group_label = format!("{}: {}", conf.dataset, conf.group_name);
        let mut group = c.benchmark_group(&group_label);

        for query in conf.queries {
            let bench_id = BenchmarkId::from_parameter(query);
            group.bench_with_input(bench_id, query, |bencher, text| {
                bencher.iter(|| {
                    // Everything below is part of the measured work, including
                    // opening the read transaction and parsing the filter.
                    let rtxn = index.read_txn().unwrap();
                    let mut search = index.search(&rtxn);
                    search.query(*text).optional_words(conf.optional_words);
                    if let Some(raw_filter) = conf.facet_condition {
                        let parsed_filter =
                            FacetCondition::from_str(&rtxn, &index, raw_filter).unwrap();
                        search.facet_condition(parsed_filter);
                    }
                    let _ids = search.execute().unwrap();
                });
            });
        }

        group.finish();
    }
}

133
benchmarks/benches/wiki.rs Normal file
View File

@@ -0,0 +1,133 @@
mod datasets_paths;
mod utils;
use criterion::{criterion_group, criterion_main};
use milli::update::Settings;
use utils::Conf;
/// Applies the settings shared by every `wiki` benchmark: the displayed and
/// searchable fields of the dataset.
fn base_conf(builder: &mut Settings) {
    // Turns a list of field names into whichever owned collection the setter expects.
    fn owned<C: std::iter::FromIterator<String>>(names: &[&str]) -> C {
        names.iter().map(|name| name.to_string()).collect()
    }

    builder.set_displayed_fields(owned(&["title", "body", "url"]));
    builder.set_searchable_fields(owned(&["title", "body"]));
}
/// Configuration every `wiki` benchmark group starts from: the wiki articles
/// dataset, a default set of queries and the `base_conf` settings. No primary
/// key is set, so `Conf::BASE`'s `None` applies.
/// Each group overrides individual fields with struct-update syntax.
const BASE_CONF: Conf = Conf {
dataset: datasets_paths::SMOL_WIKI_ARTICLES,
queries: &[
"mingus ", // 46 candidates
"miles davis ", // 159
"rock and roll ", // 1007
"machine ", // 3448
"spain ", // 7002
"japan ", // 10.593
"france ", // 17.616
"film ", // 24.959
],
configure: base_conf,
..Conf::BASE
};
/// Declares every benchmark group of the `wiki` dataset and hands them to
/// `utils::run_benches`.
///
/// Each group is a `Conf` derived from `BASE_CONF`, overriding only the
/// queries, criteria or optional-words flag it wants to measure.
// NOTE(review): the name looks like a leftover from songs.rs; it is referenced
// by the `criterion_group!` invocation, so rename both together if desired.
fn bench_songs(c: &mut criterion::Criterion) {
// the base queries with every word wrapped in double quotes
let basic_with_quote: Vec<String> = BASE_CONF
.queries
.iter()
.map(|s| {
s.trim()
.split(' ')
.map(|s| format!(r#""{}""#, s))
.collect::<Vec<String>>()
.join(" ")
})
.collect();
// borrowed view of the quoted queries, as `Conf::queries` expects `&[&str]`
let basic_with_quote: &[&str] = &basic_with_quote
.iter()
.map(|s| s.as_str())
.collect::<Vec<&str>>();
let confs = &[
/* first we bench each criterion alone */
utils::Conf {
group_name: "proximity",
queries: &[
"herald sings ",
"april paris ",
"tea two ",
"diesel engine ",
],
criterion: Some(&["proximity"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "typo",
queries: &[
"migrosoft ",
"linax ",
"Disnaylande ",
"phytogropher ",
"nympalidea ",
"aritmetric ",
"the fronce ",
"sisan ",
],
criterion: Some(&["typo"]),
optional_words: false,
..BASE_CONF
},
utils::Conf {
group_name: "words",
queries: &[
"the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results
"Kameya Tokujirō mingus monk ", // two words to pop, 55
"Ulrich Hensel meilisearch milli ", // two words to pop, 306
"Idaho Bellevue pizza ", // one word to pop, 800
"Abraham machin ", // one word to pop, 1141
],
criterion: Some(&["words"]),
..BASE_CONF
},
/* then we bench some global / normal search with all the default criterion in the default
* order */
utils::Conf {
group_name: "basic placeholder",
queries: &[""],
..BASE_CONF
},
utils::Conf {
group_name: "basic without quote",
queries: &BASE_CONF
.queries
.iter()
.map(|s| s.trim()) // we remove the space at the end of each request
.collect::<Vec<&str>>(),
..BASE_CONF
},
utils::Conf {
group_name: "basic with quote",
queries: basic_with_quote,
..BASE_CONF
},
utils::Conf {
group_name: "prefix search",
queries: &[
"t", // 453k results
"c", // 405k
"g", // 318k
"j", // 227k
"q", // 71k
"x", // 17k
],
..BASE_CONF
},
];
utils::run_benches(c, confs);
}
criterion_group!(benches, bench_songs);
criterion_main!(benches);

80
benchmarks/build.rs Normal file
View File

@@ -0,0 +1,80 @@
use std::path::{Path, PathBuf};
use std::{env, fs};
use std::{
fs::File,
io::{Cursor, Read, Seek, Write},
};
use bytes::Bytes;
use convert_case::{Case, Casing};
use flate2::read::GzDecoder;
use reqwest::IntoUrl;
const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks";
const DATASET_SONGS: &str = "smol-songs";
const DATASET_WIKI: &str = "smol-wiki-articles";
/// The name of the environment variable used to select the path
/// of the directory containing the datasets
const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH";
/// Build script entry point.
///
/// Generates `benches/datasets_paths.rs`, which holds one `pub const` per
/// dataset pointing at its uncompressed CSV, and downloads + uncompresses
/// every dataset that is not already on disk.
///
/// The datasets are stored in the directory named by the
/// `MILLI_BENCH_DATASETS_PATH` environment variable when it is set, and in
/// cargo's `OUT_DIR` otherwise.
///
/// # Errors
///
/// Fails if a required environment variable is missing, if the generated file
/// cannot be written, or if a download or decompression fails.
fn main() -> anyhow::Result<()> {
    // Only read (and require) `OUT_DIR` when no custom directory was given.
    let out_dir = match env::var(BASE_DATASETS_PATH_KEY) {
        Ok(custom_dir) => PathBuf::from(custom_dir),
        Err(_) => PathBuf::from(env::var("OUT_DIR")?),
    };

    let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
    let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
    writeln!(
        manifest_paths_file,
        r#"//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]
"#
    )?;
    writeln!(manifest_paths_file)?;

    for dataset in &[DATASET_SONGS, DATASET_WIKI] {
        let out_path = out_dir.join(dataset);
        let out_file = out_path.with_extension("csv");

        // The constant is always emitted, even when the download is skipped.
        writeln!(
            &mut manifest_paths_file,
            r#"pub const {}: &str = {:?};"#,
            dataset.to_case(Case::ScreamingSnake),
            out_file.display(),
        )?;

        if out_file.exists() {
            eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset);
            continue;
        }
        let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
        eprintln!("downloading: {}", url);
        let bytes = download_dataset(url.clone())?;
        eprintln!("{} downloaded successfully", url);
        // Report the file that is actually written, extension included.
        eprintln!("uncompressing in {}", out_file.display());
        uncompress_in_file(bytes, &out_file)?;
    }

    Ok(())
}
/// Downloads `url` with a blocking client that has no timeout and returns the
/// whole response body wrapped in a `Cursor`.
fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
    // The request timeout is disabled on purpose.
    let client = reqwest::blocking::Client::builder().timeout(None).build()?;
    let response = client.get(url).send()?;
    let body = response.bytes()?;

    Ok(Cursor::new(body))
}
/// Gunzips `bytes` entirely in memory, then writes the result to `path`.
///
/// Nothing is written to `path` unless the whole stream decoded successfully.
fn uncompress_in_file<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
    let mut decoded = Vec::new();
    GzDecoder::new(bytes).read_to_end(&mut decoded)?;

    fs::write(path.as_ref(), decoded)?;

    Ok(())
}

58
benchmarks/scripts/compare.sh Executable file
View File

@@ -0,0 +1,58 @@
#!/usr/bin/env bash

# Downloads two critcmp reports from the benchmarks DO Space (unless they are
# already cached in /tmp) and prints their comparison.

# Requirements:
# - critcmp. See: https://github.com/BurntSushi/critcmp
# - curl

# Usage
# $ bash compare.sh json_file1 json_file2
# ex: bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json

# Checking that critcmp is installed
if ! command -v critcmp > /dev/null 2>&1; then
    echo 'You must install critcmp to make this script working.'
    echo '$ cargo install critcmp'
    echo 'See: https://github.com/BurntSushi/critcmp'
    exit 1
fi

if [[ $# -ne 2 ]]; then
    echo 'Need 2 arguments.'
    echo 'Usage: '
    echo ' $ ./compare.sh file_to_download1 file_to_download2'
    echo 'Ex:'
    echo ' $ ./compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json'
    exit 1
fi

s3_url='https://milli-benchmarks.fra1.digitaloceanspaces.com/critcmp_results'

# Downloads the report named $1 into /tmp unless a copy is already there.
# Exits the script if the download fails.
download_if_missing() {
    local file="$1"
    local local_path="/tmp/$file"

    if [[ -f "$local_path" ]]; then
        echo "$file already present in /tmp, no need to download."
        return 0
    fi

    # -o writes the body to the given path; -O takes no argument, so the path
    # was treated as a second URL and nothing was saved in /tmp.
    # --fail makes curl return an error on HTTP failures instead of saving the
    # error page as if it were a report.
    if ! curl --fail "$s3_url/$file" -o "$local_path"; then
        echo 'curl command failed.'
        # Do not leave a partial file that a later run would take for a cached report.
        rm -f "$local_path"
        exit 1
    fi
}

download_if_missing "$1"
download_if_missing "$2"

critcmp --color always "/tmp/$1" "/tmp/$2"

14
benchmarks/scripts/list.sh Executable file
View File

@@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Lists the critcmp reports available in the benchmarks DO Space.
# Every argument is a grep pattern; only the files matching all of them are kept.

# Requirements:
# - curl
# - grep

# Extract the <Key> entries of the bucket listing, keep the ones under
# critcmp_results/ and strip that prefix.
res=$(curl -s https://milli-benchmarks.fra1.digitaloceanspaces.com | grep -o '<Key>[^<]\+' | cut -c 5- | grep critcmp_results/ | cut -c 18-)

for pattern in "$@"
do
    # Quoted and passed with -e so that patterns containing spaces, globs or a
    # leading dash are handed to grep as a single pattern.
    res=$(echo "$res" | grep -e "$pattern")
done

echo "$res"

5
benchmarks/src/lib.rs Normal file
View File

@@ -0,0 +1,5 @@
//! This library is only used to isolate the benchmarks
//! from the original milli library.
//!
//! It does not include any functions of interest to milli library
//! users, only to milli contributors.

View File

@@ -53,13 +53,8 @@ tinytemplate = "=1.1.0"
[dev-dependencies]
big_s = "1.0.2"
criterion = "0.3.4"
maplit = "1.0.2"
rand = "0.8.3"
[features]
default = []
[[bench]]
name = "search"
harness = false

View File

@@ -1,36 +0,0 @@
use std::time::Duration;
use heed::EnvOpenOptions;
use milli::Index;
use criterion::{criterion_group, criterion_main, BenchmarkId};
/// Benchmarks a couple of fixed queries against an existing `books-4cpu.mmdb`
/// database, which is opened as is and not built by this benchmark.
fn bench_search(c: &mut criterion::Criterion) {
let database = "books-4cpu.mmdb";
let queries = [
"minogue kylie",
"minogue kylie live",
];
let mut options = EnvOpenOptions::new();
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
options.max_readers(10);
let index = Index::new(options, database).unwrap();
let mut group = c.benchmark_group("search");
// 10 samples per query, measured over 12 seconds
group.sample_size(10);
group.measurement_time(Duration::from_secs(12));
for query in &queries {
group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| {
b.iter(|| {
// opening the read transaction is part of the measured work
let rtxn = index.read_txn().unwrap();
let _documents_ids = index.search(&rtxn).query(*query).execute().unwrap();
});
});
}
group.finish();
}
criterion_group!(benches, bench_search);
criterion_main!(benches);