Compare commits

..

25 Commits

Author SHA1 Message Date
Kerollmops
681920ba3e Rename the hits and query output into facetHits and facetQuery respectively 2023-06-07 11:20:46 +02:00
Kerollmops
975a88a093 Fix the error code returned when the facetName field is missing 2023-06-07 11:14:14 +02:00
Kerollmops
ef0fe1195f Introduce a new invalid_facet_search_facet_name error code 2023-06-07 11:04:53 +02:00
Kerollmops
377ebb8a52 Use the right field id to write the string facet values FST 2023-06-07 10:52:35 +02:00
Kerollmops
3b174c6f15 Return an empty list of results if attribute is set as filterable 2023-06-07 10:33:11 +02:00
Clément Renault
25d49f5811 Use the minWordSizeForTypos index settings 2023-06-06 10:48:43 +02:00
Clément Renault
e9af506591 Format the code 2023-06-06 10:48:43 +02:00
Clément Renault
6ee4f4b544 Fix compilation issues 2023-06-06 10:48:42 +02:00
Clément Renault
e92576e0d4 Simplify the placeholder search of the facet-search route 2023-06-06 10:48:08 +02:00
Clément Renault
7e1a49e7fa Use the disableOnAttributes parameter on the facet-search route 2023-06-06 10:48:08 +02:00
Clément Renault
17e86e9c42 Use the disableOnWords parameter on the facet-search route 2023-06-06 10:48:08 +02:00
Clément Renault
f4f5ae70d6 Support the typoTolerant.enabled parameter 2023-06-06 10:48:08 +02:00
Clément Renault
edf3031dae Log an error when a facet value is missing from the database 2023-06-06 10:48:08 +02:00
Clément Renault
09d440a427 Rename the SearchForFacetValues struct 2023-06-06 10:48:08 +02:00
Clément Renault
8b66318a6b Return an internal error when a field id is missing 2023-06-06 10:48:08 +02:00
Clément Renault
196a2b3d58 Make clippy happy 2023-06-06 10:48:07 +02:00
Clément Renault
c153cbc593 Improve the returned errors from the facet-search route 2023-06-06 10:48:07 +02:00
Clément Renault
e731f1c8ba Fix the max number of facets to be returned to 100 2023-06-06 10:48:07 +02:00
Clément Renault
c39d830ff8 Return the correct response JSON object from the facet-search route 2023-06-06 10:48:07 +02:00
Clément Renault
2dca4d82d8 Send analytics about the facet-search route 2023-06-06 10:48:07 +02:00
Clément Renault
ce87ee8ea0 Make the search for facet work 2023-06-06 10:37:27 +02:00
Kerollmops
f06bb445a6 Introduce the facet search route 2023-06-06 10:37:26 +02:00
Kerollmops
81792eb5f7 Restrict the number of facet search results to 1000 2023-06-06 10:37:26 +02:00
Kerollmops
7a49bbc8df Introduce the SearchForFacetValue struct 2023-06-06 10:37:26 +02:00
Clément Renault
ca16aaaa30 Store the facet string values in multiple FSTs 2023-06-06 10:37:26 +02:00
57 changed files with 1969 additions and 2096 deletions

View File

@@ -2,3 +2,4 @@ target
Dockerfile
.dockerignore
.gitignore
**/.git

View File

@@ -35,7 +35,7 @@ jobs:
- name: Build deb package
run: cargo deb -p meilisearch -o target/debian/meilisearch.deb
- name: Upload debian pkg to release
uses: svenstaro/upload-release-action@2.6.1
uses: svenstaro/upload-release-action@2.5.0
with:
repo_token: ${{ secrets.MEILI_BOT_GH_PAT }}
file: target/debian/meilisearch.deb

View File

@@ -54,7 +54,7 @@ jobs:
# No need to upload binaries for dry run (cron)
- name: Upload binaries to release
if: github.event_name == 'release'
uses: svenstaro/upload-release-action@2.6.1
uses: svenstaro/upload-release-action@2.5.0
with:
repo_token: ${{ secrets.MEILI_BOT_GH_PAT }}
file: target/release/meilisearch
@@ -87,7 +87,7 @@ jobs:
# No need to upload binaries for dry run (cron)
- name: Upload binaries to release
if: github.event_name == 'release'
uses: svenstaro/upload-release-action@2.6.1
uses: svenstaro/upload-release-action@2.5.0
with:
repo_token: ${{ secrets.MEILI_BOT_GH_PAT }}
file: target/release/${{ matrix.artifact_name }}
@@ -121,7 +121,7 @@ jobs:
- name: Upload the binary to release
# No need to upload binaries for dry run (cron)
if: github.event_name == 'release'
uses: svenstaro/upload-release-action@2.6.1
uses: svenstaro/upload-release-action@2.5.0
with:
repo_token: ${{ secrets.MEILI_BOT_GH_PAT }}
file: target/${{ matrix.target }}/release/meilisearch
@@ -183,7 +183,7 @@ jobs:
- name: Upload the binary to release
# No need to upload binaries for dry run (cron)
if: github.event_name == 'release'
uses: svenstaro/upload-release-action@2.6.1
uses: svenstaro/upload-release-action@2.5.0
with:
repo_token: ${{ secrets.MEILI_BOT_GH_PAT }}
file: target/${{ matrix.target }}/release/meilisearch

View File

@@ -58,9 +58,13 @@ jobs:
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
with:
platforms: linux/amd64,linux/arm64
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
platforms: linux/amd64,linux/arm64
- name: Login to Docker Hub
uses: docker/login-action@v2
@@ -88,10 +92,13 @@ jobs:
push: true
platforms: linux/amd64,linux/arm64
tags: ${{ steps.meta.outputs.tags }}
builder: ${{ steps.buildx.outputs.name }}
build-args: |
COMMIT_SHA=${{ github.sha }}
COMMIT_DATE=${{ steps.build-metadata.outputs.date }}
GIT_TAG=${{ github.ref_name }}
cache-from: type=gha
cache-to: type=gha,mode=max
# /!\ Don't touch this without checking with Cloud team
- name: Send CI information to Cloud team

View File

@@ -3,11 +3,6 @@ name: SDKs tests
on:
workflow_dispatch:
inputs:
docker_image:
description: 'The Meilisearch Docker image used'
required: false
default: nightly
schedule:
- cron: "0 6 * * MON" # Every Monday at 6:00AM
@@ -16,28 +11,13 @@ env:
MEILI_NO_ANALYTICS: 'true'
jobs:
define-docker-image:
runs-on: ubuntu-latest
outputs:
docker-image: ${{ steps.define-image.outputs.docker-image }}
steps:
- uses: actions/checkout@v3
- name: Define the Docker image we need to use
id: define-image
run: |
event=${{ github.event.action }}
echo "docker-image=nightly" >> $GITHUB_OUTPUT
if [[ $event == 'workflow_dispatch' ]]; then
echo "docker-image=${{ github.event.inputs.docker_image }}" >> $GITHUB_OUTPUT
fi
meilisearch-js-tests:
needs: define-docker-image
name: JS SDK tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
image: getmeili/meilisearch:nightly
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@@ -67,12 +47,11 @@ jobs:
run: yarn test:env:browser
instant-meilisearch-tests:
needs: define-docker-image
name: instant-meilisearch tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
image: getmeili/meilisearch:nightly
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@@ -94,12 +73,11 @@ jobs:
run: yarn build
meilisearch-php-tests:
needs: define-docker-image
name: PHP SDK tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
image: getmeili/meilisearch:nightly
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@@ -125,12 +103,11 @@ jobs:
composer remove --dev guzzlehttp/guzzle http-interop/http-factory-guzzle
meilisearch-python-tests:
needs: define-docker-image
name: Python SDK tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
image: getmeili/meilisearch:nightly
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@@ -150,12 +127,11 @@ jobs:
run: pipenv run pytest
meilisearch-go-tests:
needs: define-docker-image
name: Go SDK tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
image: getmeili/meilisearch:nightly
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@@ -163,7 +139,7 @@ jobs:
- '7700:7700'
steps:
- name: Set up Go
uses: actions/setup-go@v4
uses: actions/setup-go@v3
with:
go-version: stable
- uses: actions/checkout@v3
@@ -180,12 +156,11 @@ jobs:
run: go test -v ./...
meilisearch-ruby-tests:
needs: define-docker-image
name: Ruby SDK tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
image: getmeili/meilisearch:nightly
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@@ -205,12 +180,11 @@ jobs:
run: bundle exec rspec
meilisearch-rust-tests:
needs: define-docker-image
name: Rust SDK tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
image: getmeili/meilisearch:nightly
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}

View File

@@ -43,7 +43,7 @@ jobs:
toolchain: nightly
override: true
- name: Cache dependencies
uses: Swatinem/rust-cache@v2.4.0
uses: Swatinem/rust-cache@v2.2.1
- name: Run cargo check without any default features
uses: actions-rs/cargo@v1
with:
@@ -65,7 +65,7 @@ jobs:
steps:
- uses: actions/checkout@v3
- name: Cache dependencies
uses: Swatinem/rust-cache@v2.4.0
uses: Swatinem/rust-cache@v2.2.1
- name: Run cargo check without any default features
uses: actions-rs/cargo@v1
with:
@@ -105,29 +105,6 @@ jobs:
command: test
args: --workspace --locked --release --all-features
test-disabled-tokenization:
name: Test disabled tokenization
runs-on: ubuntu-latest
container:
image: ubuntu:18.04
if: github.event_name == 'schedule'
steps:
- uses: actions/checkout@v3
- name: Install needed dependencies
run: |
apt-get update
apt-get install --assume-yes build-essential curl
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
- name: Run cargo tree without default features and check lindera is not present
run: |
cargo tree -f '{p} {f}' -e normal --no-default-features | grep lindera -vqz
- name: Run cargo tree with default features and check lindera is pressent
run: |
cargo tree -f '{p} {f}' -e normal | grep lindera -qz
# We run tests in debug also, to make sure that the debug_assertions are hit
test-debug:
name: Run tests in debug
@@ -146,7 +123,7 @@ jobs:
toolchain: stable
override: true
- name: Cache dependencies
uses: Swatinem/rust-cache@v2.4.0
uses: Swatinem/rust-cache@v2.2.1
- name: Run tests in debug
uses: actions-rs/cargo@v1
with:
@@ -165,7 +142,7 @@ jobs:
override: true
components: clippy
- name: Cache dependencies
uses: Swatinem/rust-cache@v2.4.0
uses: Swatinem/rust-cache@v2.2.1
- name: Run cargo clippy
uses: actions-rs/cargo@v1
with:
@@ -184,7 +161,7 @@ jobs:
override: true
components: rustfmt
- name: Cache dependencies
uses: Swatinem/rust-cache@v2.4.0
uses: Swatinem/rust-cache@v2.2.1
- name: Run cargo fmt
# Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate

64
Cargo.lock generated
View File

@@ -1207,12 +1207,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "doc-comment"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
[[package]]
name = "dump"
version = "1.2.0"
@@ -1769,15 +1763,6 @@ dependencies = [
"byteorder",
]
[[package]]
name = "hashbrown"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
dependencies = [
"ahash 0.7.6",
]
[[package]]
name = "hashbrown"
version = "0.12.3"
@@ -1879,22 +1864,6 @@ dependencies = [
"digest",
]
[[package]]
name = "hnsw"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b9740ebf8769ec4ad6762cc951ba18f39bba6dfbc2fbbe46285f7539af79752"
dependencies = [
"ahash 0.7.6",
"hashbrown 0.11.2",
"libm",
"num-traits",
"rand_core",
"serde",
"smallvec",
"space",
]
[[package]]
name = "http"
version = "0.2.9"
@@ -2025,7 +1994,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
dependencies = [
"autocfg",
"hashbrown 0.12.3",
"hashbrown",
"serde",
]
@@ -2117,7 +2086,7 @@ checksum = "37228e06c75842d1097432d94d02f37fe3ebfca9791c2e8fef6e9db17ed128c1"
dependencies = [
"cedarwood",
"fxhash",
"hashbrown 0.12.3",
"hashbrown",
"lazy_static",
"phf",
"phf_codegen",
@@ -2746,7 +2715,6 @@ dependencies = [
"bimap",
"bincode",
"bstr",
"bytemuck",
"byteorder",
"charabia",
"concat-arrays",
@@ -2762,7 +2730,6 @@ dependencies = [
"geoutils",
"grenad",
"heed",
"hnsw",
"insta",
"itertools",
"json-depth-checker",
@@ -2777,7 +2744,6 @@ dependencies = [
"once_cell",
"ordered-float",
"rand",
"rand_pcg",
"rayon",
"roaring",
"rstar",
@@ -2787,7 +2753,6 @@ dependencies = [
"smallstr",
"smallvec",
"smartstring",
"space",
"tempfile",
"thiserror",
"time",
@@ -3362,16 +3327,6 @@ dependencies = [
"getrandom",
]
[[package]]
name = "rand_pcg"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59cad018caf63deb318e5a4586d99a24424a364f40f1e5778c29aca23f4fc73e"
dependencies = [
"rand_core",
"serde",
]
[[package]]
name = "rayon"
version = "1.7.0"
@@ -3809,9 +3764,6 @@ name = "smallvec"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
dependencies = [
"serde",
]
[[package]]
name = "smartstring"
@@ -3834,16 +3786,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "space"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c5ab9701ae895386d13db622abf411989deff7109b13b46b6173bb4ce5c1d123"
dependencies = [
"doc-comment",
"num-traits",
]
[[package]]
name = "spin"
version = "0.5.2"
@@ -4491,7 +4433,7 @@ version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c531a2dc4c462b833788be2c07eef4e621d0e9edbd55bf280cc164c1c1aa043"
dependencies = [
"hashbrown 0.12.3",
"hashbrown",
"once_cell",
]

View File

@@ -1,3 +1,4 @@
# syntax=docker/dockerfile:1.4
# Compile
FROM rust:alpine3.16 AS compiler
@@ -11,7 +12,7 @@ ARG GIT_TAG
ENV VERGEN_GIT_SHA=${COMMIT_SHA} VERGEN_GIT_COMMIT_TIMESTAMP=${COMMIT_DATE} VERGEN_GIT_SEMVER_LIGHTWEIGHT=${GIT_TAG}
ENV RUSTFLAGS="-C target-feature=-crt-static"
COPY . .
COPY --link . .
RUN set -eux; \
apkArch="$(apk --print-arch)"; \
if [ "$apkArch" = "aarch64" ]; then \
@@ -30,7 +31,7 @@ RUN apk update --quiet \
# add meilisearch to the `/bin` so you can run it from anywhere and it's easy
# to find.
COPY --from=compiler /meilisearch/target/release/meilisearch /bin/meilisearch
COPY --from=compiler --link /meilisearch/target/release/meilisearch /bin/meilisearch
# To stay compatible with the older version of the container (pre v0.27.0) we're
# going to symlink the meilisearch binary in the path to `/meilisearch`
RUN ln -s /bin/meilisearch /meilisearch

File diff suppressed because it is too large Load Diff

View File

@@ -1,19 +0,0 @@
global:
scrape_interval: 15s # By default, scrape targets every 15 seconds.
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
external_labels:
monitor: 'codelab-monitor'
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: 'meilisearch'
# Override the global default and scrape targets from this job every 5 seconds.
scrape_interval: 5s
static_configs:
- targets: ['localhost:7700']

View File

@@ -1,131 +1,131 @@
# This file shows the default configuration of Meilisearch.
# All variables are defined here: https://www.meilisearch.com/docs/learn/configuration/instance_options#environment-variables
db_path = "./data.ms"
# Designates the location where database files will be created and retrieved.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#database-path
db_path = "./data.ms"
env = "development"
# Configures the instance's environment. Value must be either `production` or `development`.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#environment
env = "development"
# The address on which the HTTP server will listen.
http_addr = "localhost:7700"
# The address on which the HTTP server will listen.
# master_key = "YOUR_MASTER_KEY_VALUE"
# Sets the instance's master key, automatically protecting all routes except GET /health.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#master-key
# master_key = "YOUR_MASTER_KEY_VALUE"
# no_analytics = true
# Deactivates Meilisearch's built-in telemetry when provided.
# Meilisearch automatically collects data from all instances that do not opt out using this flag.
# All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted at any time.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#disable-analytics
# no_analytics = true
http_payload_size_limit = "100 MB"
# Sets the maximum size of accepted payloads.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#payload-limit-size
http_payload_size_limit = "100 MB"
log_level = "INFO"
# Defines how much detail should be present in Meilisearch's logs.
# Meilisearch currently supports six log levels, listed in order of increasing verbosity: `OFF`, `ERROR`, `WARN`, `INFO`, `DEBUG`, `TRACE`
# https://www.meilisearch.com/docs/learn/configuration/instance_options#log-level
log_level = "INFO"
# max_indexing_memory = "2 GiB"
# Sets the maximum amount of RAM Meilisearch can use when indexing.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#max-indexing-memory
# max_indexing_memory = "2 GiB"
# max_indexing_threads = 4
# Sets the maximum number of threads Meilisearch can use during indexing.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#max-indexing-threads
# max_indexing_threads = 4
#############
### DUMPS ###
#############
dump_dir = "dumps/"
# Sets the directory where Meilisearch will create dump files.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#dump-directory
dump_dir = "dumps/"
# import_dump = "./path/to/my/file.dump"
# Imports the dump file located at the specified path. Path must point to a .dump file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#import-dump
# import_dump = "./path/to/my/file.dump"
ignore_missing_dump = false
# Prevents Meilisearch from throwing an error when `import_dump` does not point to a valid dump file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-missing-dump
ignore_missing_dump = false
ignore_dump_if_db_exists = false
# Prevents a Meilisearch instance with an existing database from throwing an error when using `import_dump`.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-dump-if-db-exists
ignore_dump_if_db_exists = false
#################
### SNAPSHOTS ###
#################
schedule_snapshot = false
# Enables scheduled snapshots when true, disable when false (the default).
# If the value is given as an integer, then enables the scheduled snapshot with the passed value as the interval
# between each snapshot, in seconds.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#schedule-snapshot-creation
schedule_snapshot = false
snapshot_dir = "snapshots/"
# Sets the directory where Meilisearch will store snapshots.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#snapshot-destination
snapshot_dir = "snapshots/"
# import_snapshot = "./path/to/my/snapshot"
# Launches Meilisearch after importing a previously-generated snapshot at the given filepath.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#import-snapshot
# import_snapshot = "./path/to/my/snapshot"
ignore_missing_snapshot = false
# Prevents a Meilisearch instance from throwing an error when `import_snapshot` does not point to a valid snapshot file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-missing-snapshot
ignore_missing_snapshot = false
ignore_snapshot_if_db_exists = false
# Prevents a Meilisearch instance with an existing database from throwing an error when using `import_snapshot`.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ignore-snapshot-if-db-exists
ignore_snapshot_if_db_exists = false
###########
### SSL ###
###########
# ssl_auth_path = "./path/to/root"
# Enables client authentication in the specified path.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-authentication-path
# ssl_auth_path = "./path/to/root"
# ssl_cert_path = "./path/to/certfile"
# Sets the server's SSL certificates.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-certificates-path
# ssl_cert_path = "./path/to/certfile"
# ssl_key_path = "./path/to/private-key"
# Sets the server's SSL key files.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-key-path
# ssl_key_path = "./path/to/private-key"
# ssl_ocsp_path = "./path/to/ocsp-file"
# Sets the server's OCSP file.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-ocsp-path
# ssl_ocsp_path = "./path/to/ocsp-file"
ssl_require_auth = false
# Makes SSL authentication mandatory.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-require-auth
ssl_require_auth = false
ssl_resumption = false
# Activates SSL session resumption.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-resumption
ssl_resumption = false
ssl_tickets = false
# Activates SSL tickets.
# https://www.meilisearch.com/docs/learn/configuration/instance_options#ssl-tickets
ssl_tickets = false
#############################
### Experimental features ###
#############################
experimental_enable_metrics = false
# Experimental metrics feature. For more information, see: <https://github.com/meilisearch/meilisearch/discussions/3518>
# Enables the Prometheus metrics on the `GET /metrics` endpoint.
experimental_enable_metrics = false
# Experimental RAM reduction during indexing, do not use in production, see: <https://github.com/meilisearch/product/discussions/652>
experimental_reduce_indexing_memory_usage = false
# Experimental RAM reduction during indexing, do not use in production, see: <https://github.com/meilisearch/product/discussions/652>

File diff suppressed because it is too large Load Diff

View File

@@ -90,17 +90,8 @@ pub enum IndexStatus {
pub struct IndexStats {
/// Number of documents in the index.
pub number_of_documents: u64,
/// Size taken up by the index' DB, in bytes.
///
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
/// are not returned to the disk after a deletion, this number is typically larger than
/// `used_database_size` that only includes the size of the used pages.
/// Size of the index' DB, in bytes.
pub database_size: u64,
/// Size taken by the used pages of the index' DB, in bytes.
///
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
/// this value is typically smaller than `database_size`.
pub used_database_size: u64,
/// Association of every field name with the number of times it occurs in the documents.
pub field_distribution: FieldDistribution,
/// Creation date of the index.
@@ -116,10 +107,10 @@ impl IndexStats {
///
/// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
pub fn new(index: &Index, rtxn: &RoTxn) -> Result<Self> {
let database_size = index.on_disk_size()?;
Ok(IndexStats {
number_of_documents: index.number_of_documents(rtxn)?,
database_size: index.on_disk_size()?,
used_database_size: index.used_size()?,
database_size,
field_distribution: index.field_distribution(rtxn)?,
created_at: index.created_at(rtxn)?,
updated_at: index.updated_at(rtxn)?,

View File

@@ -31,7 +31,7 @@ mod uuid_codec;
pub type Result<T> = std::result::Result<T, Error>;
pub type TaskId = u32;
use std::collections::{BTreeMap, HashMap};
use std::collections::HashMap;
use std::ops::{Bound, RangeBounds};
use std::path::{Path, PathBuf};
use std::sync::atomic::AtomicBool;
@@ -573,16 +573,10 @@ impl IndexScheduler {
&self.index_mapper.indexer_config
}
/// Return the real database size (i.e.: The size **with** the free pages)
pub fn size(&self) -> Result<u64> {
Ok(self.env.real_disk_size()?)
}
/// Return the used database size (i.e.: The size **without** the free pages)
pub fn used_size(&self) -> Result<u64> {
Ok(self.env.non_free_pages_size()?)
}
/// Return the index corresponding to the name.
///
/// * If the index wasn't opened before, the index will be opened.
@@ -762,38 +756,6 @@ impl IndexScheduler {
Ok(tasks)
}
/// The returned structure contains:
/// 1. The name of the property being observed can be `statuses`, `types`, or `indexes`.
/// 2. The name of the specific data related to the property can be `enqueued` for the `statuses`, `settingsUpdate` for the `types`, or the name of the index for the `indexes`, for example.
/// 3. The number of times the properties appeared.
pub fn get_stats(&self) -> Result<BTreeMap<String, BTreeMap<String, u64>>> {
let rtxn = self.read_txn()?;
let mut res = BTreeMap::new();
res.insert(
"statuses".to_string(),
enum_iterator::all::<Status>()
.map(|s| Ok((s.to_string(), self.get_status(&rtxn, s)?.len())))
.collect::<Result<BTreeMap<String, u64>>>()?,
);
res.insert(
"types".to_string(),
enum_iterator::all::<Kind>()
.map(|s| Ok((s.to_string(), self.get_kind(&rtxn, s)?.len())))
.collect::<Result<BTreeMap<String, u64>>>()?,
);
res.insert(
"indexes".to_string(),
self.index_tasks
.iter(&rtxn)?
.map(|res| Ok(res.map(|(name, bitmap)| (name.to_string(), bitmap.len()))?))
.collect::<Result<BTreeMap<String, u64>>>()?,
);
Ok(res)
}
/// Return true iff there is at least one task associated with this index
/// that is processing.
pub fn is_index_processing(&self, index: &str) -> Result<bool> {

View File

@@ -45,11 +45,6 @@ impl AuthController {
self.store.size()
}
/// Return the used size of the `AuthController` database in bytes.
pub fn used_size(&self) -> Result<u64> {
self.store.used_size()
}
pub fn create_key(&self, create_key: CreateApiKey) -> Result<Key> {
match self.store.get_api_key(create_key.uid)? {
Some(_) => Err(AuthControllerError::ApiKeyAlreadyExists(create_key.uid.to_string())),

View File

@@ -75,11 +75,6 @@ impl HeedAuthStore {
Ok(self.env.real_disk_size()?)
}
/// Return the number of bytes actually used in the database
pub fn used_size(&self) -> Result<u64> {
Ok(self.env.non_free_pages_size()?)
}
pub fn set_drop_on_close(&mut self, v: bool) {
self.should_close_on_drop = v;
}

View File

@@ -151,6 +151,10 @@ make_missing_field_convenience_builder!(MissingApiKeyExpiresAt, missing_api_key_
make_missing_field_convenience_builder!(MissingApiKeyIndexes, missing_api_key_indexes);
make_missing_field_convenience_builder!(MissingSwapIndexes, missing_swap_indexes);
make_missing_field_convenience_builder!(MissingDocumentFilter, missing_document_filter);
make_missing_field_convenience_builder!(
MissingFacetSearchFacetName,
missing_facet_search_facet_name
);
// Integrate a sub-error into a [`DeserrError`] by taking its error message but using
// the default error code (C) from `Self`

View File

@@ -217,7 +217,6 @@ InvalidDocumentFields , InvalidRequest , BAD_REQUEST ;
MissingDocumentFilter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ;
InvalidVectorDimensions , InvalidRequest , BAD_REQUEST ;
InvalidDocumentId , InvalidRequest , BAD_REQUEST ;
InvalidDocumentLimit , InvalidRequest , BAD_REQUEST ;
InvalidDocumentOffset , InvalidRequest , BAD_REQUEST ;
@@ -231,6 +230,7 @@ InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ;
InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ;
InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ;
InvalidSearchFacets , InvalidRequest , BAD_REQUEST ;
InvalidFacetSearchFacetName , InvalidRequest , BAD_REQUEST ;
InvalidSearchFilter , InvalidRequest , BAD_REQUEST ;
InvalidSearchHighlightPostTag , InvalidRequest , BAD_REQUEST ;
InvalidSearchHighlightPreTag , InvalidRequest , BAD_REQUEST ;
@@ -240,8 +240,11 @@ InvalidSearchMatchingStrategy , InvalidRequest , BAD_REQUEST ;
InvalidSearchOffset , InvalidRequest , BAD_REQUEST ;
InvalidSearchPage , InvalidRequest , BAD_REQUEST ;
InvalidSearchQ , InvalidRequest , BAD_REQUEST ;
InvalidFacetSearchQuery , InvalidRequest , BAD_REQUEST ;
InvalidFacetSearchName , InvalidRequest , BAD_REQUEST ;
InvalidSearchShowMatchesPosition , InvalidRequest , BAD_REQUEST ;
InvalidSearchSort , InvalidRequest , BAD_REQUEST ;
InvalidSearchFacet , InvalidRequest , BAD_REQUEST ;
InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ;
InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ;
InvalidSettingsFaceting , InvalidRequest , BAD_REQUEST ;
@@ -278,6 +281,7 @@ MissingApiKeyIndexes , InvalidRequest , BAD_REQUEST ;
MissingAuthorizationHeader , Auth , UNAUTHORIZED ;
MissingContentType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ;
MissingDocumentId , InvalidRequest , BAD_REQUEST ;
MissingFacetSearchFacetName , InvalidRequest , BAD_REQUEST ;
MissingIndexUid , InvalidRequest , BAD_REQUEST ;
MissingMasterKey , Auth , UNAUTHORIZED ;
MissingPayload , InvalidRequest , BAD_REQUEST ;
@@ -331,9 +335,11 @@ impl ErrorCode for milli::Error {
UserError::SortRankingRuleMissing => Code::InvalidSearchSort,
UserError::InvalidFacetsDistribution { .. } => Code::InvalidSearchFacets,
UserError::InvalidSortableAttribute { .. } => Code::InvalidSearchSort,
UserError::InvalidFacetSearchFacetName { .. } => {
Code::InvalidFacetSearchFacetName
}
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
UserError::SortError(_) => Code::InvalidSearchSort,
UserError::InvalidMinTypoWordLenSetting(_, _) => {
Code::InvalidSettingsTypoTolerance

View File

@@ -38,6 +38,18 @@ impl MultiSearchAggregator {
pub fn succeed(&mut self) {}
}
#[derive(Default)]
pub struct FacetSearchAggregator;
#[allow(dead_code)]
impl FacetSearchAggregator {
pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
Self::default()
}
pub fn succeed(&mut self, _: &dyn Any) {}
}
impl MockAnalytics {
#[allow(clippy::new_ret_no_self)]
pub fn new(opt: &Opt) -> Arc<dyn Analytics> {
@@ -56,6 +68,7 @@ impl Analytics for MockAnalytics {
fn get_search(&self, _aggregate: super::SearchAggregator) {}
fn post_search(&self, _aggregate: super::SearchAggregator) {}
fn post_multi_search(&self, _aggregate: super::MultiSearchAggregator) {}
fn post_facet_search(&self, _aggregate: super::FacetSearchAggregator) {}
fn add_documents(
&self,
_documents_query: &UpdateDocumentsQuery,

View File

@@ -25,6 +25,8 @@ pub type SegmentAnalytics = mock_analytics::MockAnalytics;
pub type SearchAggregator = mock_analytics::SearchAggregator;
#[cfg(any(debug_assertions, not(feature = "analytics")))]
pub type MultiSearchAggregator = mock_analytics::MultiSearchAggregator;
#[cfg(any(debug_assertions, not(feature = "analytics")))]
pub type FacetSearchAggregator = mock_analytics::FacetSearchAggregator;
// if we are in release mode and the feature analytics was enabled
// we use the real analytics
@@ -34,6 +36,8 @@ pub type SegmentAnalytics = segment_analytics::SegmentAnalytics;
pub type SearchAggregator = segment_analytics::SearchAggregator;
#[cfg(all(not(debug_assertions), feature = "analytics"))]
pub type MultiSearchAggregator = segment_analytics::MultiSearchAggregator;
#[cfg(all(not(debug_assertions), feature = "analytics"))]
pub type FacetSearchAggregator = segment_analytics::FacetSearchAggregator;
/// The Meilisearch config dir:
/// `~/.config/Meilisearch` on *NIX or *BSD.
@@ -88,6 +92,9 @@ pub trait Analytics: Sync + Send {
/// This method should be called to aggregate a post array of searches
fn post_multi_search(&self, aggregate: MultiSearchAggregator);
/// This method should be called to aggregate post facet values searches
fn post_facet_search(&self, aggregate: FacetSearchAggregator);
// this method should be called to aggregate a add documents request
fn add_documents(
&self,

View File

@@ -1,5 +1,6 @@
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::fs;
use std::mem::take;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::{Duration, Instant};
@@ -29,11 +30,13 @@ use super::{
use crate::analytics::Analytics;
use crate::option::{default_http_addr, IndexerOpts, MaxMemory, MaxThreads, ScheduleSnapshot};
use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::indexes::facet_search::FacetSearchQuery;
use crate::routes::tasks::TasksFilterQuery;
use crate::routes::{create_all_stats, Stats};
use crate::search::{
SearchQuery, SearchQueryWithIndex, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
FacetSearchResult, MatchingStrategy, SearchQuery, SearchQueryWithIndex, SearchResult,
DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG,
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
};
use crate::Opt;
@@ -71,6 +74,7 @@ pub enum AnalyticsMsg {
AggregateGetSearch(SearchAggregator),
AggregatePostSearch(SearchAggregator),
AggregatePostMultiSearch(MultiSearchAggregator),
AggregatePostFacetSearch(FacetSearchAggregator),
AggregateAddDocuments(DocumentsAggregator),
AggregateDeleteDocuments(DocumentsDeletionAggregator),
AggregateUpdateDocuments(DocumentsAggregator),
@@ -139,6 +143,7 @@ impl SegmentAnalytics {
batcher,
post_search_aggregator: SearchAggregator::default(),
post_multi_search_aggregator: MultiSearchAggregator::default(),
post_facet_search_aggregator: FacetSearchAggregator::default(),
get_search_aggregator: SearchAggregator::default(),
add_documents_aggregator: DocumentsAggregator::default(),
delete_documents_aggregator: DocumentsDeletionAggregator::default(),
@@ -182,6 +187,10 @@ impl super::Analytics for SegmentAnalytics {
let _ = self.sender.try_send(AnalyticsMsg::AggregatePostSearch(aggregate));
}
fn post_facet_search(&self, aggregate: FacetSearchAggregator) {
let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFacetSearch(aggregate));
}
fn post_multi_search(&self, aggregate: MultiSearchAggregator) {
let _ = self.sender.try_send(AnalyticsMsg::AggregatePostMultiSearch(aggregate));
}
@@ -354,6 +363,7 @@ pub struct Segment {
get_search_aggregator: SearchAggregator,
post_search_aggregator: SearchAggregator,
post_multi_search_aggregator: MultiSearchAggregator,
post_facet_search_aggregator: FacetSearchAggregator,
add_documents_aggregator: DocumentsAggregator,
delete_documents_aggregator: DocumentsDeletionAggregator,
update_documents_aggregator: DocumentsAggregator,
@@ -418,6 +428,7 @@ impl Segment {
Some(AnalyticsMsg::AggregateGetSearch(agreg)) => self.get_search_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregatePostSearch(agreg)) => self.post_search_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregatePostMultiSearch(agreg)) => self.post_multi_search_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregatePostFacetSearch(agreg)) => self.post_facet_search_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateAddDocuments(agreg)) => self.add_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateDeleteDocuments(agreg)) => self.delete_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg),
@@ -461,55 +472,72 @@ impl Segment {
})
.await;
}
let get_search = std::mem::take(&mut self.get_search_aggregator)
.into_event(&self.user, "Documents Searched GET");
let post_search = std::mem::take(&mut self.post_search_aggregator)
.into_event(&self.user, "Documents Searched POST");
let post_multi_search = std::mem::take(&mut self.post_multi_search_aggregator)
.into_event(&self.user, "Documents Searched by Multi-Search POST");
let add_documents = std::mem::take(&mut self.add_documents_aggregator)
.into_event(&self.user, "Documents Added");
let delete_documents = std::mem::take(&mut self.delete_documents_aggregator)
.into_event(&self.user, "Documents Deleted");
let update_documents = std::mem::take(&mut self.update_documents_aggregator)
.into_event(&self.user, "Documents Updated");
let get_fetch_documents = std::mem::take(&mut self.get_fetch_documents_aggregator)
.into_event(&self.user, "Documents Fetched GET");
let post_fetch_documents = std::mem::take(&mut self.post_fetch_documents_aggregator)
.into_event(&self.user, "Documents Fetched POST");
let get_tasks =
std::mem::take(&mut self.get_tasks_aggregator).into_event(&self.user, "Tasks Seen");
let health =
std::mem::take(&mut self.health_aggregator).into_event(&self.user, "Health Seen");
if let Some(get_search) = get_search {
let Segment {
inbox: _,
opt: _,
batcher: _,
user,
get_search_aggregator,
post_search_aggregator,
post_multi_search_aggregator,
post_facet_search_aggregator,
add_documents_aggregator,
delete_documents_aggregator,
update_documents_aggregator,
get_fetch_documents_aggregator,
post_fetch_documents_aggregator,
get_tasks_aggregator,
health_aggregator,
} = self;
if let Some(get_search) =
take(get_search_aggregator).into_event(&user, "Documents Searched GET")
{
let _ = self.batcher.push(get_search).await;
}
if let Some(post_search) = post_search {
if let Some(post_search) =
take(post_search_aggregator).into_event(&user, "Documents Searched POST")
{
let _ = self.batcher.push(post_search).await;
}
if let Some(post_multi_search) = post_multi_search {
if let Some(post_multi_search) = take(post_multi_search_aggregator)
.into_event(&user, "Documents Searched by Multi-Search POST")
{
let _ = self.batcher.push(post_multi_search).await;
}
if let Some(add_documents) = add_documents {
if let Some(post_facet_search) = take(post_facet_search_aggregator)
.into_event(&user, "Documents Searched by Facet-Search POST")
{
let _ = self.batcher.push(post_facet_search).await;
}
if let Some(add_documents) =
take(add_documents_aggregator).into_event(&user, "Documents Added")
{
let _ = self.batcher.push(add_documents).await;
}
if let Some(delete_documents) = delete_documents {
if let Some(delete_documents) =
take(delete_documents_aggregator).into_event(&user, "Documents Deleted")
{
let _ = self.batcher.push(delete_documents).await;
}
if let Some(update_documents) = update_documents {
if let Some(update_documents) =
take(update_documents_aggregator).into_event(&user, "Documents Updated")
{
let _ = self.batcher.push(update_documents).await;
}
if let Some(get_fetch_documents) = get_fetch_documents {
if let Some(get_fetch_documents) =
take(get_fetch_documents_aggregator).into_event(&user, "Documents Fetched GET") {
let _ = self.batcher.push(get_fetch_documents).await;
}
if let Some(post_fetch_documents) = post_fetch_documents {
if let Some(post_fetch_documents) =
take(post_fetch_documents_aggregator).into_event(&user, "Documents Fetched POST") {
let _ = self.batcher.push(post_fetch_documents).await;
}
if let Some(get_tasks) = get_tasks {
if let Some(get_tasks) = take(get_tasks_aggregator).into_event(&user, "Tasks Seen") {
let _ = self.batcher.push(get_tasks).await;
}
if let Some(health) = health {
if let Some(health) = take(health_aggregator).into_event(&user, "Health Seen") {
let _ = self.batcher.push(health).await;
}
let _ = self.batcher.flush().await;
@@ -886,6 +914,144 @@ impl MultiSearchAggregator {
}
}
#[derive(Default)]
pub struct FacetSearchAggregator {
timestamp: Option<OffsetDateTime>,
// context
user_agents: HashSet<String>,
// requests
total_received: usize,
total_succeeded: usize,
time_spent: BinaryHeap<usize>,
// The set of all facetNames that were used
facet_names: HashSet<String>,
// As there been any other parameter than the facetName or facetQuery ones?
additional_search_parameters_provided: bool,
}
impl FacetSearchAggregator {
pub fn from_query(query: &FacetSearchQuery, request: &HttpRequest) -> Self {
let FacetSearchQuery {
facet_query: _,
facet_name,
q,
offset,
limit,
page,
hits_per_page,
attributes_to_retrieve,
attributes_to_crop,
crop_length,
attributes_to_highlight,
show_matches_position,
filter,
sort,
facets,
highlight_pre_tag,
highlight_post_tag,
crop_marker,
matching_strategy,
} = query;
let mut ret = Self::default();
ret.timestamp = Some(OffsetDateTime::now_utc());
ret.total_received = 1;
ret.user_agents = extract_user_agents(request).into_iter().collect();
ret.facet_names = Some(facet_name.clone()).into_iter().collect();
ret.additional_search_parameters_provided = q.is_some()
|| *offset != DEFAULT_SEARCH_OFFSET()
|| *limit != DEFAULT_SEARCH_LIMIT()
|| page.is_some()
|| hits_per_page.is_some()
|| attributes_to_retrieve.is_some()
|| attributes_to_crop.is_some()
|| *crop_length != DEFAULT_CROP_LENGTH()
|| attributes_to_highlight.is_some()
|| *show_matches_position
|| filter.is_some()
|| sort.is_some()
|| facets.is_some()
|| *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG()
|| *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG()
|| *crop_marker != DEFAULT_CROP_MARKER()
|| *matching_strategy != MatchingStrategy::default();
ret
}
pub fn succeed(&mut self, result: &FacetSearchResult) {
self.total_succeeded = self.total_succeeded.saturating_add(1);
self.time_spent.push(result.processing_time_ms as usize);
}
/// Aggregate one [SearchAggregator] into another.
pub fn aggregate(&mut self, mut other: Self) {
if self.timestamp.is_none() {
self.timestamp = other.timestamp;
}
// context
for user_agent in other.user_agents.into_iter() {
self.user_agents.insert(user_agent);
}
// request
self.total_received = self.total_received.saturating_add(other.total_received);
self.total_succeeded = self.total_succeeded.saturating_add(other.total_succeeded);
self.time_spent.append(&mut other.time_spent);
// facet_names
for facet_name in other.facet_names.into_iter() {
self.facet_names.insert(facet_name);
}
// additional_search_parameters_provided
self.additional_search_parameters_provided = self.additional_search_parameters_provided
| other.additional_search_parameters_provided;
}
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
if self.total_received == 0 {
None
} else {
// the index of the 99th percentage of value
let percentile_99th = 0.99 * (self.total_succeeded as f64 - 1.) + 1.;
// we get all the values in a sorted manner
let time_spent = self.time_spent.into_sorted_vec();
// We are only interested by the slowest value of the 99th fastest results
let time_spent = time_spent.get(percentile_99th as usize);
let properties = json!({
"user-agent": self.user_agents,
"requests": {
"99th_response_time": time_spent.map(|t| format!("{:.2}", t)),
"total_succeeded": self.total_succeeded,
"total_failed": self.total_received.saturating_sub(self.total_succeeded), // just to be sure we never panics
"total_received": self.total_received,
},
"facets": {
"total_distinct_facet_count": self.facet_names.len(),
},
"additional_search_parameters_provided": self.additional_search_parameters_provided,
});
Some(Track {
timestamp: self.timestamp,
user: user.clone(),
event: event_name.to_string(),
properties,
..Default::default()
})
}
}
}
#[derive(Default)]
pub struct DocumentsAggregator {
timestamp: Option<OffsetDateTime>,

View File

@@ -4,32 +4,20 @@ use prometheus::{
register_int_gauge_vec, HistogramVec, IntCounterVec, IntGauge, IntGaugeVec,
};
/// Create evenly distributed buckets
fn create_buckets() -> [f64; 29] {
(0..10)
.chain((10..100).step_by(10))
.chain((100..=1000).step_by(100))
.map(|i| i as f64 / 1000.)
.collect::<Vec<_>>()
.try_into()
.unwrap()
}
const HTTP_RESPONSE_TIME_CUSTOM_BUCKETS: &[f64; 14] = &[
0.0005, 0.0008, 0.00085, 0.0009, 0.00095, 0.001, 0.00105, 0.0011, 0.00115, 0.0012, 0.0015,
0.002, 0.003, 1.0,
];
lazy_static! {
pub static ref HTTP_RESPONSE_TIME_CUSTOM_BUCKETS: [f64; 29] = create_buckets();
pub static ref MEILISEARCH_HTTP_REQUESTS_TOTAL: IntCounterVec = register_int_counter_vec!(
opts!("meilisearch_http_requests_total", "Meilisearch HTTP requests total"),
pub static ref HTTP_REQUESTS_TOTAL: IntCounterVec = register_int_counter_vec!(
opts!("http_requests_total", "HTTP requests total"),
&["method", "path"]
)
.expect("Can't create a metric");
pub static ref MEILISEARCH_DB_SIZE_BYTES: IntGauge =
register_int_gauge!(opts!("meilisearch_db_size_bytes", "Meilisearch DB Size In Bytes"))
register_int_gauge!(opts!("meilisearch_db_size_bytes", "Meilisearch Db Size In Bytes"))
.expect("Can't create a metric");
pub static ref MEILISEARCH_USED_DB_SIZE_BYTES: IntGauge = register_int_gauge!(opts!(
"meilisearch_used_db_size_bytes",
"Meilisearch Used DB Size In Bytes"
))
.expect("Can't create a metric");
pub static ref MEILISEARCH_INDEX_COUNT: IntGauge =
register_int_gauge!(opts!("meilisearch_index_count", "Meilisearch Index Count"))
.expect("Can't create a metric");
@@ -38,16 +26,11 @@ lazy_static! {
&["index"]
)
.expect("Can't create a metric");
pub static ref MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS: HistogramVec = register_histogram_vec!(
pub static ref HTTP_RESPONSE_TIME_SECONDS: HistogramVec = register_histogram_vec!(
"http_response_time_seconds",
"HTTP response times",
&["method", "path"],
HTTP_RESPONSE_TIME_CUSTOM_BUCKETS.to_vec()
)
.expect("Can't create a metric");
pub static ref MEILISEARCH_NB_TASKS: IntGaugeVec = register_int_gauge_vec!(
opts!("meilisearch_nb_tasks", "Meilisearch Number of tasks"),
&["kind", "value"]
)
.expect("Can't create a metric");
}

View File

@@ -52,11 +52,11 @@ where
if is_registered_resource {
let request_method = req.method().to_string();
histogram_timer = Some(
crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
crate::metrics::HTTP_RESPONSE_TIME_SECONDS
.with_label_values(&[&request_method, request_path])
.start_timer(),
);
crate::metrics::MEILISEARCH_HTTP_REQUESTS_TOTAL
crate::metrics::HTTP_REQUESTS_TOTAL
.with_label_values(&[&request_method, request_path])
.inc();
}

View File

@@ -0,0 +1,133 @@
use std::collections::{BTreeSet, HashSet};
use actix_web::web::Data;
use actix_web::{web, HttpRequest, HttpResponse};
use deserr::actix_web::AwebJson;
use index_scheduler::IndexScheduler;
use log::debug;
use meilisearch_types::deserr::DeserrJsonError;
use meilisearch_types::error::deserr_codes::*;
use meilisearch_types::error::ResponseError;
use meilisearch_types::index_uid::IndexUid;
use serde_json::Value;
use crate::analytics::{Analytics, FacetSearchAggregator};
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::search::{
add_search_rules, perform_facet_search, MatchingStrategy, SearchQuery, DEFAULT_CROP_LENGTH,
DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG,
DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
};
pub fn configure(cfg: &mut web::ServiceConfig) {
cfg.service(web::resource("").route(web::post().to(search)));
}
// TODO improve the error messages
#[derive(Debug, Clone, Default, PartialEq, Eq, deserr::Deserr)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct FacetSearchQuery {
#[deserr(default, error = DeserrJsonError<InvalidFacetSearchQuery>)]
pub facet_query: Option<String>,
#[deserr(error = DeserrJsonError<MissingFacetSearchFacetName>, missing_field_error = DeserrJsonError::missing_facet_search_facet_name)]
pub facet_name: String,
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
pub q: Option<String>,
#[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
pub offset: usize,
#[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError<InvalidSearchLimit>)]
pub limit: usize,
#[deserr(default, error = DeserrJsonError<InvalidSearchPage>)]
pub page: Option<usize>,
#[deserr(default, error = DeserrJsonError<InvalidSearchHitsPerPage>)]
pub hits_per_page: Option<usize>,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)]
pub attributes_to_retrieve: Option<BTreeSet<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)]
pub attributes_to_crop: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())]
pub crop_length: usize,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToHighlight>)]
pub attributes_to_highlight: Option<HashSet<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchShowMatchesPosition>, default)]
pub show_matches_position: bool,
#[deserr(default, error = DeserrJsonError<InvalidSearchFilter>)]
pub filter: Option<Value>,
#[deserr(default, error = DeserrJsonError<InvalidSearchSort>)]
pub sort: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)]
pub facets: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPreTag>, default = DEFAULT_HIGHLIGHT_PRE_TAG())]
pub highlight_pre_tag: String,
#[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPostTag>, default = DEFAULT_HIGHLIGHT_POST_TAG())]
pub highlight_post_tag: String,
#[deserr(default, error = DeserrJsonError<InvalidSearchCropMarker>, default = DEFAULT_CROP_MARKER())]
pub crop_marker: String,
#[deserr(default, error = DeserrJsonError<InvalidSearchMatchingStrategy>, default)]
pub matching_strategy: MatchingStrategy,
}
pub async fn search(
index_scheduler: GuardedData<ActionPolicy<{ actions::SEARCH }>, Data<IndexScheduler>>,
index_uid: web::Path<String>,
params: AwebJson<FacetSearchQuery, DeserrJsonError>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let query = params.into_inner();
debug!("facet search called with params: {:?}", query);
let mut aggregate = FacetSearchAggregator::from_query(&query, &req);
let facet_query = query.facet_query.clone();
let facet_name = query.facet_name.clone();
let mut search_query = SearchQuery::from(query);
// Tenant token search_rules.
if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) {
add_search_rules(&mut search_query, search_rules);
}
let index = index_scheduler.index(&index_uid)?;
let search_result = tokio::task::spawn_blocking(move || {
perform_facet_search(&index, search_query, facet_query, facet_name)
})
.await?;
if let Ok(ref search_result) = search_result {
aggregate.succeed(search_result);
}
analytics.post_facet_search(aggregate);
let search_result = search_result?;
debug!("returns: {:?}", search_result);
Ok(HttpResponse::Ok().json(search_result))
}
impl From<FacetSearchQuery> for SearchQuery {
fn from(value: FacetSearchQuery) -> Self {
SearchQuery {
q: value.q,
offset: value.offset,
limit: value.limit,
page: value.page,
hits_per_page: value.hits_per_page,
attributes_to_retrieve: value.attributes_to_retrieve,
attributes_to_crop: value.attributes_to_crop,
crop_length: value.crop_length,
attributes_to_highlight: value.attributes_to_highlight,
show_matches_position: value.show_matches_position,
filter: value.filter,
sort: value.sort,
facets: value.facets,
highlight_pre_tag: value.highlight_pre_tag,
highlight_post_tag: value.highlight_post_tag,
crop_marker: value.crop_marker,
matching_strategy: value.matching_strategy,
}
}
}

View File

@@ -24,6 +24,7 @@ use crate::extractors::authentication::{AuthenticationError, GuardedData};
use crate::extractors::sequential_extractor::SeqHandler;
pub mod documents;
pub mod facet_search;
pub mod search;
pub mod settings;
@@ -44,6 +45,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
.service(web::resource("/stats").route(web::get().to(SeqHandler(get_index_stats))))
.service(web::scope("/documents").configure(documents::configure))
.service(web::scope("/search").configure(search::configure))
.service(web::scope("/facet-search").configure(facet_search::configure))
.service(web::scope("/settings").configure(settings::configure)),
);
}

View File

@@ -34,8 +34,6 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
pub struct SearchQueryGet {
#[deserr(default, error = DeserrQueryParamError<InvalidSearchQ>)]
q: Option<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchQ>)]
vector: Option<Vec<f32>>,
#[deserr(default = Param(DEFAULT_SEARCH_OFFSET()), error = DeserrQueryParamError<InvalidSearchOffset>)]
offset: Param<usize>,
#[deserr(default = Param(DEFAULT_SEARCH_LIMIT()), error = DeserrQueryParamError<InvalidSearchLimit>)]
@@ -82,7 +80,6 @@ impl From<SearchQueryGet> for SearchQuery {
Self {
q: other.q,
vector: other.vector,
offset: other.offset.0,
limit: other.limit.0,
page: other.page.as_deref().copied(),

View File

@@ -17,7 +17,7 @@ pub fn configure(config: &mut web::ServiceConfig) {
pub async fn get_metrics(
index_scheduler: GuardedData<ActionPolicy<{ actions::METRICS_GET }>, Data<IndexScheduler>>,
auth_controller: Data<AuthController>,
auth_controller: GuardedData<ActionPolicy<{ actions::METRICS_GET }>, Data<AuthController>>,
) -> Result<HttpResponse, ResponseError> {
let auth_filters = index_scheduler.filters();
if !auth_filters.all_indexes_authorized() {
@@ -28,10 +28,10 @@ pub async fn get_metrics(
return Err(error);
}
let response = create_all_stats((*index_scheduler).clone(), auth_controller, auth_filters)?;
let response =
create_all_stats((*index_scheduler).clone(), (*auth_controller).clone(), auth_filters)?;
crate::metrics::MEILISEARCH_DB_SIZE_BYTES.set(response.database_size as i64);
crate::metrics::MEILISEARCH_USED_DB_SIZE_BYTES.set(response.used_database_size as i64);
crate::metrics::MEILISEARCH_INDEX_COUNT.set(response.indexes.len() as i64);
for (index, value) in response.indexes.iter() {
@@ -40,14 +40,6 @@ pub async fn get_metrics(
.set(value.number_of_documents as i64);
}
for (kind, value) in index_scheduler.get_stats()? {
for (value, count) in value {
crate::metrics::MEILISEARCH_NB_TASKS
.with_label_values(&[&kind, &value])
.set(count as i64);
}
}
let encoder = TextEncoder::new();
let mut buffer = vec![];
encoder.encode(&prometheus::gather(), &mut buffer).expect("Failed to encode metrics");

View File

@@ -231,8 +231,6 @@ pub async fn running() -> HttpResponse {
#[serde(rename_all = "camelCase")]
pub struct Stats {
pub database_size: u64,
#[serde(skip)]
pub used_database_size: u64,
#[serde(serialize_with = "time::serde::rfc3339::option::serialize")]
pub last_update: Option<OffsetDateTime>,
pub indexes: BTreeMap<String, indexes::IndexStats>,
@@ -261,7 +259,6 @@ pub fn create_all_stats(
let mut last_task: Option<OffsetDateTime> = None;
let mut indexes = BTreeMap::new();
let mut database_size = 0;
let mut used_database_size = 0;
for index_uid in index_scheduler.index_names()? {
// Accumulate the size of all indexes, even unauthorized ones, so
@@ -269,7 +266,6 @@ pub fn create_all_stats(
// See <https://github.com/meilisearch/meilisearch/pull/3541#discussion_r1126747643> for context.
let stats = index_scheduler.index_stats(&index_uid)?;
database_size += stats.inner_stats.database_size;
used_database_size += stats.inner_stats.used_database_size;
if !filters.is_index_authorized(&index_uid) {
continue;
@@ -282,14 +278,10 @@ pub fn create_all_stats(
}
database_size += index_scheduler.size()?;
used_database_size += index_scheduler.used_size()?;
database_size += auth_controller.size()?;
used_database_size += auth_controller.used_size()?;
let update_file_size = index_scheduler.compute_update_file_size()?;
database_size += update_file_size;
used_database_size += update_file_size;
database_size += index_scheduler.compute_update_file_size()?;
let stats = Stats { database_size, used_database_size, last_update: last_task, indexes };
let stats = Stats { database_size, last_update: last_task, indexes };
Ok(stats)
}

View File

@@ -8,7 +8,9 @@ use either::Either;
use meilisearch_auth::IndexSearchRules;
use meilisearch_types::deserr::DeserrJsonError;
use meilisearch_types::error::deserr_codes::*;
use meilisearch_types::heed::RoTxn;
use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::{FacetValueHit, SearchForFacetValues};
use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
use meilisearch_types::{milli, Document};
use milli::tokenizer::TokenizerBuilder;
@@ -31,13 +33,11 @@ pub const DEFAULT_CROP_MARKER: fn() -> String = || "…".to_string();
pub const DEFAULT_HIGHLIGHT_PRE_TAG: fn() -> String = || "<em>".to_string();
pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "</em>".to_string();
#[derive(Debug, Clone, Default, PartialEq, Deserr)]
#[derive(Debug, Clone, Default, PartialEq, Eq, Deserr)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct SearchQuery {
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
pub q: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
pub vector: Option<Vec<f32>>,
#[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
pub offset: usize,
#[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError<InvalidSearchLimit>)]
@@ -82,15 +82,13 @@ impl SearchQuery {
// This struct contains the fields of `SearchQuery` inline.
// This is because neither deserr nor serde support `flatten` when using `deny_unknown_fields.
// The `From<SearchQueryWithIndex>` implementation ensures both structs remain up to date.
#[derive(Debug, Clone, PartialEq, Deserr)]
#[derive(Debug, Clone, PartialEq, Eq, Deserr)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct SearchQueryWithIndex {
#[deserr(error = DeserrJsonError<InvalidIndexUid>, missing_field_error = DeserrJsonError::missing_index_uid)]
pub index_uid: IndexUid,
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
pub q: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
pub vector: Option<Vec<f32>>,
#[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
pub offset: usize,
#[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError<InvalidSearchLimit>)]
@@ -130,7 +128,6 @@ impl SearchQueryWithIndex {
let SearchQueryWithIndex {
index_uid,
q,
vector,
offset,
limit,
page,
@@ -152,7 +149,6 @@ impl SearchQueryWithIndex {
index_uid,
SearchQuery {
q,
vector,
offset,
limit,
page,
@@ -176,7 +172,7 @@ impl SearchQueryWithIndex {
}
}
#[derive(Debug, Clone, PartialEq, Eq, Deserr)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, Deserr)]
#[deserr(rename_all = camelCase)]
pub enum MatchingStrategy {
/// Remove query words from last to first
@@ -247,6 +243,14 @@ pub struct FacetStats {
pub max: f64,
}
#[derive(Serialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct FacetSearchResult {
pub facet_hits: Vec<FacetValueHit>,
pub facet_query: Option<String>,
pub processing_time_ms: u128,
}
/// Incorporate search rules in search query
pub fn add_search_rules(query: &mut SearchQuery, rules: IndexSearchRules) {
query.filter = match (query.filter.take(), rules.filter) {
@@ -267,18 +271,12 @@ pub fn add_search_rules(query: &mut SearchQuery, rules: IndexSearchRules) {
}
}
pub fn perform_search(
index: &Index,
query: SearchQuery,
) -> Result<SearchResult, MeilisearchHttpError> {
let before_search = Instant::now();
let rtxn = index.read_txn()?;
let mut search = index.search(&rtxn);
if let Some(ref vector) = query.vector {
search.vector(vector.clone());
}
fn prepare_search<'t>(
index: &'t Index,
rtxn: &'t RoTxn,
query: &'t SearchQuery,
) -> Result<(milli::Search<'t>, bool, usize, usize), MeilisearchHttpError> {
let mut search = index.search(rtxn);
if let Some(ref query) = query.q {
search.query(query);
@@ -288,7 +286,7 @@ pub fn perform_search(
search.terms_matching_strategy(query.matching_strategy.into());
let max_total_hits = index
.pagination_max_total_hits(&rtxn)
.pagination_max_total_hits(rtxn)
.map_err(milli::Error::from)?
.unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS);
@@ -330,6 +328,19 @@ pub fn perform_search(
search.sort_criteria(sort);
}
Ok((search, is_finite_pagination, max_total_hits, offset))
}
pub fn perform_search(
index: &Index,
query: SearchQuery,
) -> Result<SearchResult, MeilisearchHttpError> {
let before_search = Instant::now();
let rtxn = index.read_txn()?;
let (search, is_finite_pagination, max_total_hits, offset) =
prepare_search(index, &rtxn, &query)?;
let milli::SearchResult { documents_ids, matching_words, candidates, .. } = search.execute()?;
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
@@ -483,6 +494,28 @@ pub fn perform_search(
Ok(result)
}
pub fn perform_facet_search(
index: &Index,
search_query: SearchQuery,
facet_query: Option<String>,
facet_name: String,
) -> Result<FacetSearchResult, MeilisearchHttpError> {
let before_search = Instant::now();
let rtxn = index.read_txn()?;
let (search, _, _, _) = prepare_search(index, &rtxn, &search_query)?;
let mut facet_search = SearchForFacetValues::new(facet_name, search);
if let Some(facet_query) = &facet_query {
facet_search.query(facet_query);
}
Ok(FacetSearchResult {
facet_hits: facet_search.execute()?,
facet_query,
processing_time_ms: before_search.elapsed().as_millis(),
})
}
fn insert_geo_distance(sorts: &[String], document: &mut Document) {
lazy_static::lazy_static! {
static ref GEO_REGEX: Regex =

View File

@@ -15,7 +15,6 @@ license.workspace = true
bimap = { version = "0.6.3", features = ["serde"] }
bincode = "1.3.3"
bstr = "1.4.0"
bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
byteorder = "1.4.3"
charabia = { version = "0.7.2", default-features = false }
concat-arrays = "0.1.2"
@@ -33,21 +32,18 @@ heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.6", default-f
"lmdb",
"sync-read-txn",
] }
hnsw = { version = "0.11.0", features = ["serde1"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memmap2 = "0.5.10"
obkv = "0.2.0"
once_cell = "1.17.1"
ordered-float = "3.6.0"
rand_pcg = { version = "0.3.1", features = ["serde1"] }
rayon = "1.7.0"
roaring = "0.10.1"
rstar = { version = "0.10.0", features = ["serde"] }
serde = { version = "1.0.160", features = ["derive"] }
serde_json = { version = "1.0.95", features = ["preserve_order"] }
slice-group-by = "0.3.0"
space = "0.17.0"
smallstr = { version = "0.3.0", features = ["serde"] }
smallvec = "1.10.0"
smartstring = "1.0.1"

View File

@@ -52,7 +52,6 @@ fn main() -> Result<(), Box<dyn Error>> {
let docs = execute_search(
&mut ctx,
&(!query.trim().is_empty()).then(|| query.trim().to_owned()),
&None,
TermsMatchingStrategy::Last,
false,
&None,

View File

@@ -1,34 +0,0 @@
use serde::{Deserialize, Serialize};
use space::Metric;
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
pub struct DotProduct;
impl Metric<Vec<f32>> for DotProduct {
type Unit = u32;
// TODO explain me this function, I don't understand why f32.to_bits is ordered.
// I tried to do this and it wasn't OK <https://stackoverflow.com/a/43305015/1941280>
//
// Following <https://docs.rs/space/0.17.0/space/trait.Metric.html>.
fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
let dist: f32 = a.iter().zip(b).map(|(a, b)| a * b).sum();
let dist = 1.0 - dist;
debug_assert!(!dist.is_nan());
dist.to_bits()
}
}
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
pub struct Euclidean;
impl Metric<Vec<f32>> for Euclidean {
type Unit = u32;
fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
let squared: f32 = a.iter().zip(b).map(|(a, b)| (a - b).powi(2)).sum();
let dist = squared.sqrt();
debug_assert!(!dist.is_nan());
dist.to_bits()
}
}

View File

@@ -110,11 +110,9 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
},
#[error(transparent)]
InvalidGeoField(#[from] GeoError),
#[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
InvalidVectorDimensions { expected: usize, found: usize },
#[error("{0}")]
InvalidFilter(String),
#[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]
#[error("Invalid type for filter subexpression: `expected {}, found: {1}`.", .0.join(", "))]
InvalidFilterExpression(&'static [&'static str], Value),
#[error("Attribute `{}` is not sortable. {}",
.field,
@@ -126,6 +124,16 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
}
)]
InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
#[error("Attribute `{}` is not filterable. {}",
.field,
match .valid_fields.is_empty() {
true => "This index does not have configured filterable attributes.".to_string(),
false => format!("Available filterable attributes are: `{}`.",
valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
),
}
)]
InvalidFacetSearchFacetName { field: String, valid_fields: BTreeSet<String> },
#[error("{}", HeedError::BadOpenOptions)]
InvalidLmdbOpenOptions,
#[error("You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.")]

View File

@@ -0,0 +1,23 @@
use std::borrow::Cow;
use fst::Set;
use heed::{BytesDecode, BytesEncode};
/// A codec for values of type `Set<&[u8]>`.
pub struct FstSetCodec;
impl<'a> BytesEncode<'a> for FstSetCodec {
type EItem = Set<Vec<u8>>;
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
Some(Cow::Borrowed(item.as_fst().as_bytes()))
}
}
impl<'a> BytesDecode<'a> for FstSetCodec {
type DItem = Set<&'a [u8]>;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
Set::new(bytes).ok()
}
}

View File

@@ -2,6 +2,7 @@ mod beu32_str_codec;
mod byte_slice_ref;
pub mod facet;
mod field_id_word_count_codec;
mod fst_set_codec;
mod obkv_codec;
mod roaring_bitmap;
mod roaring_bitmap_length;
@@ -15,6 +16,7 @@ pub use str_ref::StrRefCodec;
pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
pub use self::fst_set_codec::FstSetCodec;
pub use self::obkv_codec::ObkvCodec;
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
pub use self::roaring_bitmap_length::{

View File

@@ -49,7 +49,7 @@ impl CboRoaringBitmapCodec {
} else {
// Otherwise, it means we used the classic RoaringBitmapCodec and
// that the header takes threshold integers.
RoaringBitmap::deserialize_unchecked_from(bytes)
RoaringBitmap::deserialize_from(bytes)
}
}
@@ -69,7 +69,7 @@ impl CboRoaringBitmapCodec {
vec.push(integer);
}
} else {
roaring |= RoaringBitmap::deserialize_unchecked_from(bytes.as_ref())?;
roaring |= RoaringBitmap::deserialize_from(bytes.as_ref())?;
}
}

View File

@@ -8,7 +8,7 @@ impl heed::BytesDecode<'_> for RoaringBitmapCodec {
type DItem = RoaringBitmap;
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
RoaringBitmap::deserialize_unchecked_from(bytes).ok()
RoaringBitmap::deserialize_from(bytes).ok()
}
}

View File

@@ -8,12 +8,10 @@ use charabia::{Language, Script};
use heed::flags::Flags;
use heed::types::*;
use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
use rand_pcg::Pcg32;
use roaring::RoaringBitmap;
use rstar::RTree;
use time::OffsetDateTime;
use crate::distance::DotProduct;
use crate::error::{InternalError, UserError};
use crate::facet::FacetType;
use crate::fields_ids_map::FieldsIdsMap;
@@ -21,16 +19,14 @@ use crate::heed_codec::facet::{
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
FieldIdCodec, OrderedF64Codec,
};
use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
use crate::heed_codec::{FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32,
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
Search, U8StrStrCodec, BEU16, BEU32,
};
/// The HNSW data-structure that we serialize, fill and search in.
pub type Hnsw = hnsw::Hnsw<DotProduct, Vec<f32>, Pcg32, 12, 24>;
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9;
@@ -47,7 +43,6 @@ pub mod main_key {
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
pub const GEO_RTREE_KEY: &str = "geo-rtree";
pub const VECTOR_HNSW_KEY: &str = "vector-hnsw";
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
pub const PRIMARY_KEY_KEY: &str = "primary-key";
@@ -90,9 +85,9 @@ pub mod db_name {
pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst";
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
pub const DOCUMENTS: &str = "documents";
pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
}
@@ -117,6 +112,9 @@ pub struct Index {
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
/// Maps a word and a document id (u32) to all the positions where the given word appears.
pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
/// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
@@ -150,15 +148,14 @@ pub struct Index {
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
/// Maps the facet field id and ranges of strings with the docids that corresponds to them.
pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>,
/// Maps the facet field id of the string facets with an FST containing all the facets values.
pub facet_id_string_fst: Database<OwnedType<BEU16>, FstSetCodec>,
/// Maps the document id, the facet field id and the numbers.
pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>,
/// Maps the document id, the facet field id and the strings.
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
/// Maps a vector id to the document id that have it.
pub vector_id_docid: Database<OwnedType<BEU32>, OwnedType<BEU32>>,
/// Maps the document id to the document as an obkv store.
pub(crate) documents: Database<OwnedType<BEU32>, ObkvCodec>,
}
@@ -183,6 +180,7 @@ impl Index {
let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?;
let exact_word_prefix_docids =
env.create_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?;
let docid_word_positions = env.create_database(&mut wtxn, Some(DOCID_WORD_POSITIONS))?;
let word_pair_proximity_docids =
env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
let script_language_docids =
@@ -202,6 +200,7 @@ impl Index {
let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
let facet_id_string_docids =
env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
let facet_id_string_fst = env.create_database(&mut wtxn, Some(FACET_ID_STRING_FST))?;
let facet_id_exists_docids =
env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
let facet_id_is_null_docids =
@@ -212,7 +211,6 @@ impl Index {
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
let field_id_docid_facet_strings =
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?;
let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
wtxn.commit()?;
@@ -225,6 +223,7 @@ impl Index {
exact_word_docids,
word_prefix_docids,
exact_word_prefix_docids,
docid_word_positions,
word_pair_proximity_docids,
script_language_docids,
word_prefix_pair_proximity_docids,
@@ -236,12 +235,12 @@ impl Index {
field_id_word_count_docids,
facet_id_f64_docids,
facet_id_string_docids,
facet_id_string_fst,
facet_id_exists_docids,
facet_id_is_null_docids,
facet_id_is_empty_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
vector_id_docid,
documents,
})
}
@@ -513,26 +512,6 @@ impl Index {
}
}
/* vector HNSW */
/// Writes the provided `hnsw`.
pub(crate) fn put_vector_hnsw(&self, wtxn: &mut RwTxn, hnsw: &Hnsw) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<Hnsw>>(wtxn, main_key::VECTOR_HNSW_KEY, hnsw)
}
/// Delete the `hnsw`.
pub(crate) fn delete_vector_hnsw(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.delete::<_, Str>(wtxn, main_key::VECTOR_HNSW_KEY)
}
/// Returns the `hnsw`.
pub fn vector_hnsw(&self, rtxn: &RoTxn) -> Result<Option<Hnsw>> {
match self.main.get::<_, Str, SerdeBincode<Hnsw>>(rtxn, main_key::VECTOR_HNSW_KEY)? {
Some(hnsw) => Ok(Some(hnsw)),
None => Ok(None),
}
}
/* field distribution */
/// Writes the field distribution which associates every field name with
@@ -1497,9 +1476,9 @@ pub(crate) mod tests {
db_snap!(index, field_distribution,
@r###"
age 1 |
id 2 |
name 2 |
age 1
id 2
name 2
"###
);
@@ -1517,9 +1496,9 @@ pub(crate) mod tests {
db_snap!(index, field_distribution,
@r###"
age 1 |
id 2 |
name 2 |
age 1
id 2
name 2
"###
);
@@ -1533,9 +1512,9 @@ pub(crate) mod tests {
db_snap!(index, field_distribution,
@r###"
has_dog 1 |
id 2 |
name 2 |
has_dog 1
id 2
name 2
"###
);
}

View File

@@ -5,12 +5,57 @@
#[global_allocator]
pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
// #[cfg(test)]
// pub mod allocator {
// use std::alloc::{GlobalAlloc, System};
// use std::sync::atomic::{self, AtomicI64};
// #[global_allocator]
// pub static ALLOC: CountingAlloc = CountingAlloc {
// max_resident: AtomicI64::new(0),
// resident: AtomicI64::new(0),
// allocated: AtomicI64::new(0),
// };
// pub struct CountingAlloc {
// pub max_resident: AtomicI64,
// pub resident: AtomicI64,
// pub allocated: AtomicI64,
// }
// unsafe impl GlobalAlloc for CountingAlloc {
// unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
// self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
// let old_resident =
// self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
// let resident = old_resident + layout.size() as i64;
// self.max_resident.fetch_max(resident, atomic::Ordering::SeqCst);
// // if layout.size() > 1_000_000 {
// // eprintln!(
// // "allocating {} with new resident size: {resident}",
// // layout.size() / 1_000_000
// // );
// // // let trace = std::backtrace::Backtrace::capture();
// // // let t = trace.to_string();
// // // eprintln!("{t}");
// // }
// System.alloc(layout)
// }
// unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
// self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed);
// System.dealloc(ptr, layout)
// }
// }
// }
#[macro_use]
pub mod documents;
mod asc_desc;
mod criterion;
pub mod distance;
mod error;
mod external_documents_ids;
pub mod facet;
@@ -54,8 +99,9 @@ pub use self::heed_codec::{
};
pub use self::index::Index;
pub use self::search::{
FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search,
SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
FacetDistribution, FacetValueHit, Filter, FormatOptions, MatchBounds, MatcherBuilder,
MatchingWords, Search, SearchForFacetValues, SearchResult, TermsMatchingStrategy,
DEFAULT_VALUES_PER_FACET,
};
pub type Result<T> = std::result::Result<T, error::Error>;

View File

@@ -1,14 +1,19 @@
use std::fmt;
use fst::automaton::{Automaton, Str};
use fst::{IntoStreamer, Streamer};
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use log::error;
use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords};
use self::new::PartialSearchResult;
use crate::error::UserError;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
use crate::{
execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext, BEU16,
};
// Building these factories is not free.
@@ -16,13 +21,15 @@ static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
/// The maximum number of facets returned by the facet search route.
const MAX_NUMBER_OF_FACETS: usize = 100;
pub mod facet;
mod fst_utils;
pub mod new;
pub struct Search<'a> {
query: Option<String>,
vector: Option<Vec<f32>>,
// this should be linked to the String in the query
filter: Option<Filter<'a>>,
offset: usize,
@@ -40,7 +47,6 @@ impl<'a> Search<'a> {
pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> {
Search {
query: None,
vector: None,
filter: None,
offset: 0,
limit: 20,
@@ -59,11 +65,6 @@ impl<'a> Search<'a> {
self
}
pub fn vector(&mut self, vector: impl Into<Vec<f32>>) -> &mut Search<'a> {
self.vector = Some(vector.into());
self
}
pub fn offset(&mut self, offset: usize) -> &mut Search<'a> {
self.offset = offset;
self
@@ -113,7 +114,6 @@ impl<'a> Search<'a> {
execute_search(
&mut ctx,
&self.query,
&self.vector,
self.terms_matching_strategy,
self.exhaustive_number_hits,
&self.filter,
@@ -140,7 +140,6 @@ impl fmt::Debug for Search<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let Search {
query,
vector: _,
filter,
offset,
limit,
@@ -154,7 +153,6 @@ impl fmt::Debug for Search<'_> {
} = self;
f.debug_struct("Search")
.field("query", query)
.field("vector", &"[...]")
.field("filter", filter)
.field("offset", offset)
.field("limit", limit)
@@ -209,6 +207,170 @@ pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
}
}
pub struct SearchForFacetValues<'a> {
query: Option<String>,
facet: String,
search_query: Search<'a>,
}
impl<'a> SearchForFacetValues<'a> {
pub fn new(facet: String, search_query: Search<'a>) -> SearchForFacetValues<'a> {
SearchForFacetValues { query: None, facet, search_query }
}
pub fn query(&mut self, query: impl Into<String>) -> &mut Self {
self.query = Some(query.into());
self
}
pub fn execute(&self) -> Result<Vec<FacetValueHit>> {
let index = self.search_query.index;
let rtxn = self.search_query.rtxn;
let filterable_fields = index.filterable_fields(rtxn)?;
if !filterable_fields.contains(&self.facet) {
return Err(UserError::InvalidFacetSearchFacetName {
field: self.facet.clone(),
valid_fields: filterable_fields.into_iter().collect(),
}
.into());
}
let fields_ids_map = index.fields_ids_map(rtxn)?;
let fid = match fields_ids_map.id(&self.facet) {
Some(fid) => fid,
// we return an empty list of results when the attribute has been
// set as filterable but no document contains this field (yet).
None => return Ok(Vec::new()),
};
let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &BEU16::new(fid))? {
Some(fst) => fst,
None => return Ok(vec![]),
};
let search_candidates = self.search_query.execute()?.candidates;
match self.query.as_ref() {
Some(query) => {
let authorize_typos = self.search_query.index.authorize_typos(rtxn)?;
let field_authorizes_typos =
!self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid);
if authorize_typos && field_authorizes_typos {
let mut results = vec![];
let exact_words_fst = self.search_query.index.exact_words(rtxn)?;
if exact_words_fst.map_or(false, |fst| fst.contains(query)) {
let key =
FacetGroupKey { field_id: fid, level: 0, left_bound: query.as_ref() };
if let Some(FacetGroupValue { bitmap, .. }) =
index.facet_id_string_docids.get(rtxn, &key)?
{
let count = search_candidates.intersection_len(&bitmap);
if count != 0 {
results.push(FacetValueHit { value: query.to_string(), count });
}
}
} else {
let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?;
let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?;
let is_prefix = true;
let automaton = if query.len() < one_typo as usize {
build_dfa(query, 0, is_prefix)
} else if query.len() < two_typos as usize {
build_dfa(query, 1, is_prefix)
} else {
build_dfa(query, 2, is_prefix)
};
let mut stream = fst.search(automaton).into_stream();
let mut length = 0;
while let Some(facet_value) = stream.next() {
let value = std::str::from_utf8(facet_value)?;
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: value };
let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
Some(FacetGroupValue { bitmap, .. }) => bitmap,
None => {
error!(
"the facet value is missing from the facet database: {key:?}"
);
continue;
}
};
let count = search_candidates.intersection_len(&docids);
if count != 0 {
results.push(FacetValueHit { value: value.to_string(), count });
length += 1;
}
if length >= MAX_NUMBER_OF_FACETS {
break;
}
}
}
Ok(results)
} else {
let automaton = Str::new(query).starts_with();
let mut stream = fst.search(automaton).into_stream();
let mut results = vec![];
let mut length = 0;
while let Some(facet_value) = stream.next() {
let value = std::str::from_utf8(facet_value)?;
let key = FacetGroupKey { field_id: fid, level: 0, left_bound: value };
let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
Some(FacetGroupValue { bitmap, .. }) => bitmap,
None => {
error!(
"the facet value is missing from the facet database: {key:?}"
);
continue;
}
};
let count = search_candidates.intersection_len(&docids);
if count != 0 {
results.push(FacetValueHit { value: value.to_string(), count });
length += 1;
}
if length >= MAX_NUMBER_OF_FACETS {
break;
}
}
Ok(results)
}
}
None => {
let mut results = vec![];
let mut length = 0;
let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" };
for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? {
let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) =
result?;
let count = search_candidates.intersection_len(&bitmap);
if count != 0 {
results.push(FacetValueHit { value: left_bound.to_string(), count });
length += 1;
}
if length >= MAX_NUMBER_OF_FACETS {
break;
}
}
Ok(results)
}
}
}
}
#[derive(Debug, Clone, serde::Serialize, PartialEq)]
pub struct FacetValueHit {
/// The original facet value
pub value: String,
/// The number of documents associated to this facet
pub count: u64,
}
#[cfg(test)]
mod test {
#[allow(unused_imports)]

View File

@@ -509,7 +509,6 @@ mod tests {
let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
&mut ctx,
&Some(query.to_string()),
&None,
crate::TermsMatchingStrategy::default(),
false,
&None,

View File

@@ -28,7 +28,6 @@ use db_cache::DatabaseCache;
use exact_attribute::ExactAttribute;
use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo};
use heed::RoTxn;
use hnsw::Searcher;
use interner::{DedupInterner, Interner};
pub use logger::visual::VisualSearchLogger;
pub use logger::{DefaultSearchLogger, SearchLogger};
@@ -40,16 +39,13 @@ use ranking_rules::{
use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
use roaring::RoaringBitmap;
use sort::Sort;
use space::Neighbor;
use self::geo_sort::GeoSort;
pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
use crate::search::new::distinct::apply_distinct_rule;
use crate::{
AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError, BEU32,
};
use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
/// A structure used throughout the execution of a search query.
pub struct SearchContext<'ctx> {
@@ -353,7 +349,6 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
pub fn execute_search(
ctx: &mut SearchContext,
query: &Option<String>,
vector: &Option<Vec<f32>>,
terms_matching_strategy: TermsMatchingStrategy,
exhaustive_number_hits: bool,
filters: &Option<Filter>,
@@ -433,33 +428,6 @@ pub fn execute_search(
let BucketSortOutput { docids, mut all_candidates } = bucket_sort_output;
let docids = match vector {
Some(vector) => {
// return the nearest documents that are also part of the candidates.
let mut searcher = Searcher::new();
let hnsw = ctx.index.vector_hnsw(ctx.txn)?.unwrap_or_default();
let ef = hnsw.len().min(100);
let mut dest = vec![Neighbor { index: 0, distance: 0 }; ef];
let neighbors = hnsw.nearest(vector, ef, &mut searcher, &mut dest[..]);
let mut docids = Vec::new();
for Neighbor { index, distance: _ } in neighbors.iter() {
let index = BEU32::new(*index as u32);
let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap().get();
if universe.contains(docid) {
docids.push(docid);
if docids.len() == (from + length) {
break;
}
}
}
docids.into_iter().skip(from).take(length).collect()
}
// return the search docids if the vector field is not specified
None => docids,
};
// The candidates is the universe unless the exhaustive number of hits
// is requested and a distinct attribute is set.
if exhaustive_number_hits {

View File

@@ -79,7 +79,7 @@ pub fn located_query_terms_from_tokens(
TokenKind::Separator(separator_kind) => {
// add penalty for hard separators
if let SeparatorKind::Hard = separator_kind {
position = position.wrapping_add(7);
position = position.wrapping_add(1);
}
phrase = 'phrase: {

View File

@@ -89,6 +89,7 @@ Create a snapshot test of the given database.
- `exact_word_docids`
- `word_prefix_docids`
- `exact_word_prefix_docids`
- `docid_word_positions`
- `word_pair_proximity_docids`
- `word_prefix_pair_proximity_docids`
- `word_position_docids`
@@ -216,6 +217,11 @@ pub fn snap_exact_word_prefix_docids(index: &Index) -> String {
&format!("{s:<16} {}", display_bitmap(&b))
})
}
pub fn snap_docid_word_positions(index: &Index) -> String {
make_db_snap_from_iter!(index, docid_word_positions, |((idx, s), b)| {
&format!("{idx:<6} {s:<16} {}", display_bitmap(&b))
})
}
pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
make_db_snap_from_iter!(index, word_pair_proximity_docids, |((proximity, word1, word2), b)| {
&format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
@@ -318,7 +324,7 @@ pub fn snap_field_distributions(index: &Index) -> String {
let rtxn = index.read_txn().unwrap();
let mut snap = String::new();
for (field, count) in index.field_distribution(&rtxn).unwrap() {
writeln!(&mut snap, "{field:<16} {count:<6} |").unwrap();
writeln!(&mut snap, "{field:<16} {count:<6}").unwrap();
}
snap
}
@@ -328,7 +334,7 @@ pub fn snap_fields_ids_map(index: &Index) -> String {
let mut snap = String::new();
for field_id in fields_ids_map.ids() {
let name = fields_ids_map.name(field_id).unwrap();
writeln!(&mut snap, "{field_id:<3} {name:<16} |").unwrap();
writeln!(&mut snap, "{field_id:<3} {name:<16}").unwrap();
}
snap
}
@@ -471,6 +477,9 @@ macro_rules! full_snap_of_db {
($index:ident, exact_word_prefix_docids) => {{
$crate::snapshot_tests::snap_exact_word_prefix_docids(&$index)
}};
($index:ident, docid_word_positions) => {{
$crate::snapshot_tests::snap_docid_word_positions(&$index)
}};
($index:ident, word_pair_proximity_docids) => {{
$crate::snapshot_tests::snap_word_pair_proximity_docids(&$index)
}};

View File

@@ -1,7 +1,7 @@
---
source: milli/src/index.rs
---
age 1 |
id 2 |
name 2 |
age 1
id 2
name 2

View File

@@ -1,7 +1,7 @@
---
source: milli/src/index.rs
---
age 1 |
id 2 |
name 2 |
age 1
id 2
name 2

View File

@@ -23,6 +23,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
exact_word_docids,
word_prefix_docids,
exact_word_prefix_docids,
docid_word_positions,
word_pair_proximity_docids,
word_prefix_pair_proximity_docids,
prefix_word_pair_proximity_docids,
@@ -34,12 +35,12 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
script_language_docids,
facet_id_f64_docids,
facet_id_string_docids,
facet_id_string_fst: _,
facet_id_exists_docids,
facet_id_is_null_docids,
facet_id_is_empty_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
vector_id_docid,
documents,
} = self.index;
@@ -58,7 +59,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
self.index.delete_vector_hnsw(self.wtxn)?;
// We clean all the faceted documents ids.
for field_id in faceted_fields {
@@ -81,6 +81,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
exact_word_docids.clear(self.wtxn)?;
word_prefix_docids.clear(self.wtxn)?;
exact_word_prefix_docids.clear(self.wtxn)?;
docid_word_positions.clear(self.wtxn)?;
word_pair_proximity_docids.clear(self.wtxn)?;
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
prefix_word_pair_proximity_docids.clear(self.wtxn)?;
@@ -97,7 +98,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
facet_id_string_docids.clear(self.wtxn)?;
field_id_docid_facet_f64s.clear(self.wtxn)?;
field_id_docid_facet_strings.clear(self.wtxn)?;
vector_id_docid.clear(self.wtxn)?;
documents.clear(self.wtxn)?;
Ok(number_of_documents)
@@ -142,6 +142,7 @@ mod tests {
assert!(index.word_docids.is_empty(&rtxn).unwrap());
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
assert!(index.docid_word_positions.is_empty(&rtxn).unwrap());
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());

View File

@@ -1,13 +1,11 @@
use std::collections::btree_map::Entry;
use std::collections::{BTreeSet, HashMap, HashSet};
use std::collections::{HashMap, HashSet};
use fst::IntoStreamer;
use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
use heed::{BytesDecode, BytesEncode, Database, RwIter};
use hnsw::Searcher;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use space::KnnPoints;
use time::OffsetDateTime;
use super::facet::delete::FacetsDelete;
@@ -16,9 +14,9 @@ use crate::error::InternalError;
use crate::facet::FacetType;
use crate::heed_codec::facet::FieldDocIdFacetCodec;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::index::Hnsw;
use crate::{
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32,
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec,
SmallString32, BEU32,
};
pub struct DeleteDocuments<'t, 'u, 'i> {
@@ -234,6 +232,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
exact_word_docids,
word_prefix_docids,
exact_word_prefix_docids,
docid_word_positions,
word_pair_proximity_docids,
field_id_word_count_docids,
word_prefix_pair_proximity_docids,
@@ -244,18 +243,32 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
word_prefix_fid_docids,
facet_id_f64_docids: _,
facet_id_string_docids: _,
facet_id_string_fst: _,
field_id_docid_facet_f64s: _,
field_id_docid_facet_strings: _,
script_language_docids,
facet_id_exists_docids,
facet_id_is_null_docids,
facet_id_is_empty_docids,
vector_id_docid,
documents,
} = self.index;
// Remove from the documents database
// Retrieve the words contained in the documents.
let mut words = Vec::new();
for docid in &self.to_delete_docids {
documents.delete(self.wtxn, &BEU32::new(docid))?;
// We iterate through the words positions of the document id, retrieve the word and delete the positions.
// We create an iterator to be able to get the content and delete the key-value itself.
// It's faster to acquire a cursor to get and delete, as we avoid traversing the LMDB B-Tree two times but only once.
let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
while let Some(result) = iter.next() {
let ((_docid, word), _positions) = result?;
// This boolean will indicate if we must remove this word from the words FST.
words.push((SmallString32::from(word), false));
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? };
}
}
// We acquire the current external documents ids map...
// Note that its soft-deleted document ids field will be equal to the `to_delete_docids`
@@ -266,27 +279,42 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let new_external_documents_ids = new_external_documents_ids.into_static();
self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
let mut words_to_keep = BTreeSet::default();
let mut words_to_delete = BTreeSet::default();
// Maybe we can improve the get performance of the words
// if we sort the words first, keeping the LMDB pages in cache.
words.sort_unstable();
// We iterate over the words and delete the documents ids
// from the word docids database.
remove_from_word_docids(
self.wtxn,
word_docids,
&self.to_delete_docids,
&mut words_to_keep,
&mut words_to_delete,
)?;
remove_from_word_docids(
self.wtxn,
exact_word_docids,
&self.to_delete_docids,
&mut words_to_keep,
&mut words_to_delete,
)?;
for (word, must_remove) in &mut words {
remove_from_word_docids(
self.wtxn,
word_docids,
word.as_str(),
must_remove,
&self.to_delete_docids,
)?;
remove_from_word_docids(
self.wtxn,
exact_word_docids,
word.as_str(),
must_remove,
&self.to_delete_docids,
)?;
}
// We construct an FST set that contains the words to delete from the words FST.
let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?;
let words_to_delete =
words.iter().filter_map(
|(word, must_remove)| {
if *must_remove {
Some(word.as_str())
} else {
None
}
},
);
let words_to_delete = fst::Set::from_iter(words_to_delete)?;
let new_words_fst = {
// We retrieve the current words FST from the database.
@@ -440,30 +468,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
&self.to_delete_docids,
)?;
// An ugly and slow way to remove the vectors from the HNSW
// It basically reconstructs the HNSW from scratch without editing the current one.
let current_hnsw = self.index.vector_hnsw(self.wtxn)?.unwrap_or_default();
if !current_hnsw.is_empty() {
let mut new_hnsw = Hnsw::default();
let mut searcher = Searcher::new();
let mut new_vector_id_docids = Vec::new();
for result in vector_id_docid.iter(self.wtxn)? {
let (vector_id, docid) = result?;
if !self.to_delete_docids.contains(docid.get()) {
let vector = current_hnsw.get_point(vector_id.get() as usize).clone();
let vector_id = new_hnsw.insert(vector, &mut searcher);
new_vector_id_docids.push((vector_id as u32, docid));
}
}
vector_id_docid.clear(self.wtxn)?;
for (vector_id, docid) in new_vector_id_docids {
vector_id_docid.put(self.wtxn, &BEU32::new(vector_id), &docid)?;
}
self.index.put_vector_hnsw(self.wtxn, &new_hnsw)?;
}
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
Ok(DetailedDocumentDeletionResult {
@@ -529,24 +533,23 @@ fn remove_from_word_prefix_docids(
fn remove_from_word_docids(
txn: &mut heed::RwTxn,
db: &heed::Database<Str, RoaringBitmapCodec>,
word: &str,
must_remove: &mut bool,
to_remove: &RoaringBitmap,
words_to_keep: &mut BTreeSet<String>,
words_to_remove: &mut BTreeSet<String>,
) -> Result<()> {
// We create an iterator to be able to get the content and delete the word docids.
// It's faster to acquire a cursor to get and delete or put, as we avoid traversing
// the LMDB B-Tree two times but only once.
let mut iter = db.iter_mut(txn)?;
while let Some((key, mut docids)) = iter.next().transpose()? {
let previous_len = docids.len();
docids -= to_remove;
if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? };
words_to_remove.insert(key.to_owned());
} else {
words_to_keep.insert(key.to_owned());
if docids.len() != previous_len {
let mut iter = db.prefix_iter_mut(txn, word)?;
if let Some((key, mut docids)) = iter.next().transpose()? {
if key == word {
let previous_len = docids.len();
docids -= to_remove;
if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? };
*must_remove = true;
} else if docids.len() != previous_len {
let key = key.to_owned();
// safety: we don't keep references from inside the LMDB database.
unsafe { iter.put_current(&key, &docids)? };
@@ -625,7 +628,7 @@ mod tests {
use super::*;
use crate::index::tests::TempIndex;
use crate::{db_snap, Filter, Search};
use crate::{db_snap, Filter};
fn delete_documents<'t>(
wtxn: &mut RwTxn<'t, '_>,
@@ -1197,52 +1200,4 @@ mod tests {
DeletionStrategy::AlwaysSoft,
);
}
#[test]
fn delete_words_exact_attributes() {
let index = TempIndex::new();
index
.update_settings(|settings| {
settings.set_primary_key(S("id"));
settings.set_searchable_fields(vec![S("text"), S("exact")]);
settings.set_exact_attributes(vec![S("exact")].into_iter().collect());
})
.unwrap();
index
.add_documents(documents!([
{ "id": 0, "text": "hello" },
{ "id": 1, "exact": "hello"}
]))
.unwrap();
db_snap!(index, word_docids, 1, @r###"
hello [0, ]
"###);
db_snap!(index, exact_word_docids, 1, @r###"
hello [1, ]
"###);
db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
let mut wtxn = index.write_txn().unwrap();
let deleted_internal_ids =
delete_documents(&mut wtxn, &index, &["1"], DeletionStrategy::AlwaysHard);
wtxn.commit().unwrap();
db_snap!(index, word_docids, 2, @r###"
hello [0, ]
"###);
db_snap!(index, exact_word_docids, 2, @"");
db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
let txn = index.read_txn().unwrap();
let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);
let mut s = Search::new(&txn, &index);
s.query("hello");
let crate::SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
}
}

View File

@@ -78,15 +78,16 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
use std::fs::File;
use heed::types::DecodeIgnore;
use log::debug;
use time::OffsetDateTime;
use self::incremental::FacetsUpdateIncremental;
use super::FacetsUpdateBulk;
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec;
use crate::{Index, Result};
use crate::{Index, Result, BEU16};
pub mod bulk;
pub mod delete;
@@ -157,6 +158,43 @@ impl<'i> FacetsUpdate<'i> {
);
incremental_update.execute(wtxn)?;
}
// We compute one FST by string facet
let mut text_fsts = vec![];
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
for result in database.iter(wtxn)? {
let (facet_group_key, _) = result?;
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
current_fst = match current_fst.take() {
Some((fid, fst_builder)) if fid != field_id => {
let fst = fst_builder.into_set();
text_fsts.push((fid, fst));
Some((field_id, fst::SetBuilder::memory()))
}
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
None => Some((field_id, fst::SetBuilder::memory())),
};
if let Some((_, fst_builder)) = current_fst.as_mut() {
fst_builder.insert(left_bound)?;
}
}
}
if let Some((field_id, fst_builder)) = current_fst {
let fst = fst_builder.into_set();
text_fsts.push((field_id, fst));
}
// We remove all of the previous FSTs that were in this database
self.index.facet_id_string_fst.clear(wtxn)?;
// We write those FSTs in LMDB now
for (field_id, fst) in text_fsts {
self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?;
}
Ok(())
}
}

View File

@@ -1 +0,0 @@

View File

@@ -1,6 +1,6 @@
use std::collections::HashMap;
use std::fs::File;
use std::io;
use std::{cmp, io};
use grenad::Sorter;
@@ -54,10 +54,11 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
}
for position in read_u32_ne_bytes(value) {
let (field_id, _) = relative_from_absolute_position(position);
let (field_id, position) = relative_from_absolute_position(position);
let word_count = position as u32 + 1;
let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
*value += 1;
*value = cmp::max(*value, word_count);
}
}
@@ -82,7 +83,7 @@ fn drain_document_fid_wordcount_into_sorter(
let mut key_buffer = Vec::new();
for (fid, count) in document_fid_wordcount.drain() {
if count <= 30 {
if count <= 10 {
key_buffer.clear();
key_buffer.extend_from_slice(&fid.to_be_bytes());
key_buffer.push(count as u8);

View File

@@ -1,40 +0,0 @@
use std::fs::File;
use std::io;
use bytemuck::cast_slice;
use serde_json::from_slice;
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::{FieldId, InternalError, Result};
/// Extracts the embedding vector contained in each document under the `_vector` field.
///
/// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
#[logging_timer::time]
pub fn extract_vector_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
vector_fid: FieldId,
) -> Result<grenad::Reader<File>> {
let mut writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let mut cursor = obkv_documents.into_cursor()?;
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
let obkv = obkv::KvReader::new(value);
// first we get the _vector field
if let Some(vector) = obkv.get(vector_fid) {
// try to extract the vector
let vector: Vec<f32> = from_slice(vector).map_err(InternalError::SerdeJson).unwrap();
let bytes = cast_slice(&vector);
writer.insert(docid_bytes, bytes)?;
}
// else => the _vector object was `null`, there is nothing to do
}
writer_into_reader(writer)
}

View File

@@ -4,7 +4,6 @@ mod extract_facet_string_docids;
mod extract_fid_docid_facet_values;
mod extract_fid_word_count_docids;
mod extract_geo_points;
mod extract_vector_points;
mod extract_word_docids;
mod extract_word_fid_docids;
mod extract_word_pair_proximity_docids;
@@ -23,7 +22,6 @@ use self::extract_facet_string_docids::extract_facet_string_docids;
use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
use self::extract_geo_points::extract_geo_points;
use self::extract_vector_points::extract_vector_points;
use self::extract_word_docids::extract_word_docids;
use self::extract_word_fid_docids::extract_word_fid_docids;
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
@@ -47,7 +45,6 @@ pub(crate) fn data_from_obkv_documents(
faceted_fields: HashSet<FieldId>,
primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
vector_field_id: Option<FieldId>,
stop_words: Option<fst::Set<&[u8]>>,
max_positions_per_attributes: Option<u32>,
exact_attributes: HashSet<FieldId>,
@@ -72,7 +69,6 @@ pub(crate) fn data_from_obkv_documents(
&faceted_fields,
primary_key_id,
geo_fields_ids,
vector_field_id,
&stop_words,
max_positions_per_attributes,
)
@@ -283,7 +279,6 @@ fn send_and_extract_flattened_documents_data(
faceted_fields: &HashSet<FieldId>,
primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
vector_field_id: Option<FieldId>,
stop_words: &Option<fst::Set<&[u8]>>,
max_positions_per_attributes: Option<u32>,
) -> Result<(
@@ -312,20 +307,6 @@ fn send_and_extract_flattened_documents_data(
});
}
if let Some(vector_field_id) = vector_field_id {
let documents_chunk_cloned = flattened_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
rayon::spawn(move || {
let result = extract_vector_points(documents_chunk_cloned, indexer, vector_field_id);
let _ = match result {
Ok(vector_points) => {
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
}
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
};
});
}
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
rayon::join(
|| {
@@ -344,6 +325,8 @@ fn send_and_extract_flattened_documents_data(
// send docid_word_positions_chunk to DB writer
let docid_word_positions_chunk =
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
let _ = lmdb_writer_sx
.send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
let _ =
lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));

View File

@@ -4,6 +4,7 @@ use std::result::Result as StdResult;
use roaring::RoaringBitmap;
use super::read_u32_ne_bytes;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::update::index_documents::transform::Operation;
use crate::Result;
@@ -21,6 +22,10 @@ pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Co
}
}
pub fn roaring_bitmap_from_u32s_array(slice: &[u8]) -> RoaringBitmap {
read_u32_ne_bytes(slice).collect()
}
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
buffer.clear();
buffer.reserve(bitmap.serialized_size());

View File

@@ -14,8 +14,8 @@ pub use grenad_helpers::{
};
pub use merge_functions::{
concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, serialize_roaring_bitmap,
MergeFn,
merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn,
};
use crate::MAX_WORD_LENGTH;

View File

@@ -304,8 +304,6 @@ where
}
None => None,
};
// get the fid of the `_vector` field.
let vector_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vector");
let stop_words = self.index.stop_words(self.wtxn)?;
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
@@ -342,7 +340,6 @@ where
faceted_fields,
primary_key_id,
geo_fields_ids,
vector_field_id,
stop_words,
max_positions_per_attributes,
exact_attributes,
@@ -2474,11 +2471,11 @@ mod tests {
{
"id": 3,
"text": "a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a a a a a a
a a a a a a a a a a a a a a a a a a a a a "
}
]))
@@ -2516,5 +2513,6 @@ mod tests {
db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1");
}
}

View File

@@ -4,26 +4,27 @@ use std::convert::TryInto;
use std::fs::File;
use std::io;
use bytemuck::allocation::pod_collect_to_vec;
use charabia::{Language, Script};
use grenad::MergerBuilder;
use heed::types::ByteSlice;
use heed::RwTxn;
use hnsw::Searcher;
use heed::{BytesDecode, RwTxn};
use roaring::RoaringBitmap;
use space::KnnPoints;
use super::helpers::{
self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
self, merge_ignore_values, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap,
valid_lmdb_key, CursorClonableMmap,
};
use super::{ClonableMmap, MergeFn};
use crate::error::UserError;
use crate::facet::FacetType;
use crate::update::facet::FacetsUpdate;
use crate::update::index_documents::helpers::as_cloneable_grenad;
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
use crate::{
lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
Result,
};
pub(crate) enum TypedChunk {
DocidWordPositions(grenad::Reader<CursorClonableMmap>),
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
Documents(grenad::Reader<CursorClonableMmap>),
@@ -42,7 +43,6 @@ pub(crate) enum TypedChunk {
FieldIdFacetIsNullDocids(grenad::Reader<File>),
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
GeoPoints(grenad::Reader<File>),
VectorPoints(grenad::Reader<File>),
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
}
@@ -56,6 +56,29 @@ pub(crate) fn write_typed_chunk_into_index(
) -> Result<(RoaringBitmap, bool)> {
let mut is_merged_database = false;
match typed_chunk {
TypedChunk::DocidWordPositions(docid_word_positions_iter) => {
write_entries_into_database(
docid_word_positions_iter,
&index.docid_word_positions,
wtxn,
index_is_empty,
|value, buffer| {
// ensure that values are unique and ordered
let positions = roaring_bitmap_from_u32s_array(value);
BoRoaringBitmapCodec::serialize_into(&positions, buffer);
Ok(buffer)
},
|new_values, db_values, buffer| {
let new_values = roaring_bitmap_from_u32s_array(new_values);
let positions = match BoRoaringBitmapCodec::bytes_decode(db_values) {
Some(db_values) => new_values | db_values,
None => new_values, // should not happen
};
BoRoaringBitmapCodec::serialize_into(&positions, buffer);
Ok(())
},
)?;
}
TypedChunk::Documents(obkv_documents_iter) => {
let mut cursor = obkv_documents_iter.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
@@ -226,38 +249,6 @@ pub(crate) fn write_typed_chunk_into_index(
index.put_geo_rtree(wtxn, &rtree)?;
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
}
TypedChunk::VectorPoints(vector_points) => {
let mut hnsw = index.vector_hnsw(wtxn)?.unwrap_or_default();
let mut searcher = Searcher::new();
let mut expected_dimensions = match index.vector_id_docid.iter(wtxn)?.next() {
Some(result) => {
let (vector_id, _) = result?;
Some(hnsw.get_point(vector_id.get() as usize).len())
}
None => None,
};
let mut cursor = vector_points.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
// convert the key back to a u32 (4 bytes)
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
// convert the vector back to a Vec<f32>
let vector: Vec<f32> = pod_collect_to_vec(value);
// TODO Move this error in the vector extractor
let found = vector.len();
let expected = *expected_dimensions.get_or_insert(found);
if expected != found {
return Err(UserError::InvalidVectorDimensions { expected, found })?;
}
let vector_id = hnsw.insert(vector, &mut searcher) as u32;
index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?;
}
log::debug!("There are {} entries in the HNSW so far", hnsw.len());
index.put_vector_hnsw(wtxn, &hnsw)?;
}
TypedChunk::ScriptLanguageDocids(hash_pair) => {
let mut buffer = Vec::new();
for (key, value) in hash_pair {