mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-12-07 21:25:42 +00:00
Compare commits
313 Commits
prototype-
...
search-ref
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f8f190cd40 | ||
|
|
3a408e8287 | ||
|
|
d3e5b10e23 | ||
|
|
1aaf24ccbf | ||
|
|
90bc230820 | ||
|
|
342c4ff85d | ||
|
|
c85392ce40 | ||
|
|
8875d24a48 | ||
|
|
c470b67fa2 | ||
|
|
c0e081cd98 | ||
|
|
b60840ebff | ||
|
|
fdc1763838 | ||
|
|
75819bc940 | ||
|
|
7b8cc25625 | ||
|
|
2be641f373 | ||
|
|
d89d2efb7e | ||
|
|
f284a9c0dd | ||
|
|
134e7fc433 | ||
|
|
0cba919228 | ||
|
|
aa63091752 | ||
|
|
58735d6d8f | ||
|
|
1b514517f5 | ||
|
|
11f814821d | ||
|
|
30fb1153cc | ||
|
|
3b2c8b9f25 | ||
|
|
2a7f9adf78 | ||
|
|
608ceea440 | ||
|
|
79001b9c97 | ||
|
|
59b12fca87 | ||
|
|
48f5bb1693 | ||
|
|
93188b3c88 | ||
|
|
bc4efca611 | ||
|
|
feaf25a95d | ||
|
|
414b3fae89 | ||
|
|
899baa0ea5 | ||
|
|
374095d42c | ||
|
|
dd007dceca | ||
|
|
3ae587205c | ||
|
|
1bf2694604 | ||
|
|
ed9cc1af55 | ||
|
|
b41a6cbd7a | ||
|
|
c8af572697 | ||
|
|
249053e514 | ||
|
|
ff2cf2a5ae | ||
|
|
b448aca49c | ||
|
|
55bad07c16 | ||
|
|
380469665f | ||
|
|
3421125a55 | ||
|
|
0b2200e6e7 | ||
|
|
0fd5ab9fcc | ||
|
|
14293f6c8f | ||
|
|
d3a94e8b25 | ||
|
|
1944077a7f | ||
|
|
8195d366fa | ||
|
|
cfd1b2cc97 | ||
|
|
19b044b4e6 | ||
|
|
e0730b55b3 | ||
|
|
729fa3770d | ||
|
|
9cbc85b2f9 | ||
|
|
a3cf104736 | ||
|
|
a109802d45 | ||
|
|
2d8060df80 | ||
|
|
47b66e49b8 | ||
|
|
8f2e971879 | ||
|
|
654a3a9e19 | ||
|
|
d1fdbb63da | ||
|
|
fb9d9239b2 | ||
|
|
a7a0891210 | ||
|
|
84d9c731f8 | ||
|
|
11f4724957 | ||
|
|
85182497ab | ||
|
|
3e4a356638 | ||
|
|
dfd9c384aa | ||
|
|
f0b4046c43 | ||
|
|
4b953d62fb | ||
|
|
c2f4b6ced0 | ||
|
|
1e6cbcaf12 | ||
|
|
066c6bd875 | ||
|
|
fd583501d7 | ||
|
|
bff4bde0ce | ||
|
|
cd45d21d6e | ||
|
|
f9960be115 | ||
|
|
bd9aba4d77 | ||
|
|
8edad8291b | ||
|
|
b3f60ee805 | ||
|
|
5acf953298 | ||
|
|
d9cebff61c | ||
|
|
30f7bd03f6 | ||
|
|
df0d9bb878 | ||
|
|
5230ddb3ea | ||
|
|
d6a7c28e4d | ||
|
|
e55efc419e | ||
|
|
644e136aee | ||
|
|
b4fabce36d | ||
|
|
9350a7b017 | ||
|
|
be69ab320d | ||
|
|
d59d75c9cd | ||
|
|
38b7b31beb | ||
|
|
7a01f20df7 | ||
|
|
c20c38a7fa | ||
|
|
5ab46324c4 | ||
|
|
325f17488a | ||
|
|
e7ff987c46 | ||
|
|
244003e36f | ||
|
|
1f813a6f3b | ||
|
|
96183e804a | ||
|
|
7ab48ed8c7 | ||
|
|
a94e78ffb0 | ||
|
|
e7bb8c940f | ||
|
|
8cb85294ef | ||
|
|
d0e9d65025 | ||
|
|
540a396e49 | ||
|
|
a81165f0d8 | ||
|
|
d6585eb10b | ||
|
|
f7d90ad19f | ||
|
|
31630c85d0 | ||
|
|
ab09dc0167 | ||
|
|
618c54915d | ||
|
|
130d2061bd | ||
|
|
66ddee4390 | ||
|
|
90a6c01495 | ||
|
|
e58426109a | ||
|
|
f513cf930a | ||
|
|
8a13ed7e3f | ||
|
|
1b8e4d0301 | ||
|
|
996619b22a | ||
|
|
2c9822a337 | ||
|
|
7276deee0a | ||
|
|
f7e7f438f8 | ||
|
|
ba8dcc2d78 | ||
|
|
7ca91ebb71 | ||
|
|
1ba8a40d61 | ||
|
|
47f6a3ad3d | ||
|
|
ae17c62e24 | ||
|
|
a1148c09c2 | ||
|
|
9c5f64769a | ||
|
|
ebe23b04c9 | ||
|
|
13b7c826c1 | ||
|
|
5440f43fd3 | ||
|
|
d9460a76f4 | ||
|
|
d1ddaa223d | ||
|
|
f7ecea142e | ||
|
|
337e75b0e4 | ||
|
|
b5691802a3 | ||
|
|
6e50f23896 | ||
|
|
4c8a0179ba | ||
|
|
c69cbec64a | ||
|
|
ce328c329d | ||
|
|
959e4607bb | ||
|
|
4b4ffb8ec9 | ||
|
|
3951fe22ab | ||
|
|
4d5bc9df4c | ||
|
|
ec2f8e8040 | ||
|
|
406b8bd248 | ||
|
|
62b9c6fbee | ||
|
|
b439d36807 | ||
|
|
faceb661e3 | ||
|
|
4129d657e2 | ||
|
|
3f13608002 | ||
|
|
4708d9b016 | ||
|
|
0d2e7bcc13 | ||
|
|
55fbfb6124 | ||
|
|
58fe260c72 | ||
|
|
24e5f6f7a9 | ||
|
|
9b87c36200 | ||
|
|
12b26cd54e | ||
|
|
061b1e6d7c | ||
|
|
0d6e8b5c31 | ||
|
|
d48cdc67a0 | ||
|
|
35c16ad047 | ||
|
|
2997d1f186 | ||
|
|
2a5997fb20 | ||
|
|
ee8a9e0bad | ||
|
|
3b0737a092 | ||
|
|
fdd02105ac | ||
|
|
aa9592455c | ||
|
|
01e24dd630 | ||
|
|
ae6bb1ce17 | ||
|
|
5fd28620cd | ||
|
|
728710d63a | ||
|
|
fa81381865 | ||
|
|
b96a682f16 | ||
|
|
d0f048c068 | ||
|
|
223e82a10d | ||
|
|
9507ff5e31 | ||
|
|
c2b025946a | ||
|
|
3a818c5e87 | ||
|
|
d74134ce3a | ||
|
|
5ac129bfa1 | ||
|
|
abb4522f76 | ||
|
|
ef084ef042 | ||
|
|
3524bd1257 | ||
|
|
d4f6216966 | ||
|
|
77acafe534 | ||
|
|
53afda3237 | ||
|
|
abb19d368d | ||
|
|
b4a52a622e | ||
|
|
8d7d8cdc2f | ||
|
|
626a93b348 | ||
|
|
af65fe201a | ||
|
|
9b83b1deb0 | ||
|
|
e9eb271499 | ||
|
|
3281a88d08 | ||
|
|
5a644054ab | ||
|
|
16fefd364e | ||
|
|
00bad8c716 | ||
|
|
862714a18b | ||
|
|
d18ebe4f3a | ||
|
|
7169d85115 | ||
|
|
f5f5f03ec0 | ||
|
|
9b2653427d | ||
|
|
56b7209f26 | ||
|
|
9b1f439a91 | ||
|
|
01c7d2de8f | ||
|
|
a86aeba411 | ||
|
|
384fdc2df4 | ||
|
|
83e5b4ed0d | ||
|
|
272cd7ebbd | ||
|
|
c63c7377e6 | ||
|
|
9259cdb12e | ||
|
|
5b50e49522 | ||
|
|
65474c8de5 | ||
|
|
fbb1ba3de0 | ||
|
|
a59ca28e2c | ||
|
|
825f742000 | ||
|
|
dd491320e5 | ||
|
|
c6ff97a220 | ||
|
|
49240c367a | ||
|
|
1e6e624078 | ||
|
|
8b4e07e1a3 | ||
|
|
2853009987 | ||
|
|
aa59c3bc2c | ||
|
|
7b1d8f4c6d | ||
|
|
a49ddec9df | ||
|
|
05fe856e6e | ||
|
|
c0cdaf9f53 | ||
|
|
e9cf58d584 | ||
|
|
31628c5cd4 | ||
|
|
3004e281d7 | ||
|
|
14e8d0aaa2 | ||
|
|
1c58cf8426 | ||
|
|
5155fd2bf1 | ||
|
|
9ec9c204d3 | ||
|
|
78b9304d52 | ||
|
|
0465ba4a05 | ||
|
|
2099991dd1 | ||
|
|
c232cdabf5 | ||
|
|
4e266211bf | ||
|
|
57fa689131 | ||
|
|
10626dddfc | ||
|
|
9051065c22 | ||
|
|
e8c76cf7bf | ||
|
|
3f1729a17f | ||
|
|
cab2b6bcda | ||
|
|
c4979a2fda | ||
|
|
23931f8a4f | ||
|
|
aa414565bb | ||
|
|
1db152046e | ||
|
|
c27ea2677f | ||
|
|
caa1e1b923 | ||
|
|
71f18e4379 | ||
|
|
600e3dd1c5 | ||
|
|
362eb0de86 | ||
|
|
998d46ac10 | ||
|
|
6c85c0d95e | ||
|
|
0e1fbbf7c6 | ||
|
|
6806640ef0 | ||
|
|
173e37584c | ||
|
|
6ba4d5e987 | ||
|
|
dd12d44134 | ||
|
|
a61495d660 | ||
|
|
c8e251bf24 | ||
|
|
a938fbde4a | ||
|
|
dcf3f1d18a | ||
|
|
66d0c63694 | ||
|
|
132191360b | ||
|
|
345c99d5bd | ||
|
|
89d696c1e3 | ||
|
|
c645853529 | ||
|
|
a70ab8b072 | ||
|
|
48aae76b15 | ||
|
|
23bf572dea | ||
|
|
864f6410ed | ||
|
|
c9bf6bb2fa | ||
|
|
46249ea901 | ||
|
|
ce0d1e0e13 | ||
|
|
5065d8b0c1 | ||
|
|
a83007c013 | ||
|
|
79e0a6dd4e | ||
|
|
2d88089129 | ||
|
|
1d937f831b | ||
|
|
6c659dc12f | ||
|
|
a8531053a0 | ||
|
|
cf34d1c95f | ||
|
|
1a9c58a7ab | ||
|
|
64571c8288 | ||
|
|
72123c458b | ||
|
|
d5881519cb | ||
|
|
ea016d97af | ||
|
|
fa2ea4a379 | ||
|
|
030263caa3 | ||
|
|
c25779afba | ||
|
|
df48ac8803 | ||
|
|
ff86073288 | ||
|
|
0ad53784e7 | ||
|
|
e064c52544 | ||
|
|
e106b16148 | ||
|
|
b1d61f5a02 | ||
|
|
7dc04747fd | ||
|
|
7c0cd7172d | ||
|
|
43ff236df8 | ||
|
|
19ab4d1a15 | ||
|
|
9287858997 |
@@ -2,3 +2,4 @@ target
|
||||
Dockerfile
|
||||
.dockerignore
|
||||
.gitignore
|
||||
**/.git
|
||||
|
||||
19
.github/uffizzi/Dockerfile
vendored
19
.github/uffizzi/Dockerfile
vendored
@@ -1,19 +0,0 @@
|
||||
# Run
|
||||
FROM uffizzi/ttyd:alpine
|
||||
|
||||
ENV MEILI_HTTP_ADDR 0.0.0.0:7700
|
||||
ENV MEILI_SERVER_PROVIDER docker
|
||||
ENV MEILI_NO_ANALYTICS true
|
||||
|
||||
RUN apk update --quiet \
|
||||
&& apk add -q --no-cache libgcc tini curl
|
||||
|
||||
COPY target/x86_64-unknown-linux-musl/release/meilisearch /bin/meilisearch
|
||||
RUN ln -s /bin/meilisearch /meilisearch
|
||||
|
||||
WORKDIR /meili_data
|
||||
|
||||
EXPOSE 7700/tcp
|
||||
|
||||
ENTRYPOINT ["tini", "--"]
|
||||
CMD ["ttyd", "/bin/zsh"]
|
||||
26
.github/uffizzi/docker-compose.uffizzi.yml
vendored
26
.github/uffizzi/docker-compose.uffizzi.yml
vendored
@@ -1,26 +0,0 @@
|
||||
version: "3"
|
||||
|
||||
x-uffizzi:
|
||||
ingress:
|
||||
service: nginx
|
||||
port: 8081
|
||||
|
||||
services:
|
||||
meilisearch:
|
||||
image: "${MEILISEARCH_IMAGE}"
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "7681:7681"
|
||||
- "7700:7700"
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 500M
|
||||
|
||||
nginx:
|
||||
image: nginx:alpine
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "8081:8081"
|
||||
volumes:
|
||||
- ./.github/uffizzi/nginx:/etc/nginx
|
||||
28
.github/uffizzi/nginx/nginx.conf
vendored
28
.github/uffizzi/nginx/nginx.conf
vendored
@@ -1,28 +0,0 @@
|
||||
|
||||
events {
|
||||
worker_connections 4096; ## Default: 1024
|
||||
}
|
||||
|
||||
http {
|
||||
map $http_upgrade $connection_upgrade {
|
||||
default upgrade;
|
||||
'' close;
|
||||
}
|
||||
|
||||
server {
|
||||
listen 8081;
|
||||
|
||||
location / {
|
||||
proxy_pass http://localhost:7681;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Upgrade $http_upgrade;
|
||||
proxy_set_header Connection $connection_upgrade;
|
||||
}
|
||||
|
||||
location /meilisearch/ {
|
||||
# rewrite /meilisearch/(.*) /$1 break;
|
||||
proxy_pass http://localhost:7700/;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
7
.github/workflows/publish-docker-images.yml
vendored
7
.github/workflows/publish-docker-images.yml
vendored
@@ -58,9 +58,13 @@ jobs:
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
with:
|
||||
platforms: linux/amd64,linux/arm64
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
with:
|
||||
platforms: linux/amd64,linux/arm64
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v2
|
||||
@@ -88,10 +92,13 @@ jobs:
|
||||
push: true
|
||||
platforms: linux/amd64,linux/arm64
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
builder: ${{ steps.buildx.outputs.name }}
|
||||
build-args: |
|
||||
COMMIT_SHA=${{ github.sha }}
|
||||
COMMIT_DATE=${{ steps.build-metadata.outputs.date }}
|
||||
GIT_TAG=${{ github.ref_name }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
# /!\ Don't touch this without checking with Cloud team
|
||||
- name: Send CI information to Cloud team
|
||||
|
||||
200
.github/workflows/sdks-tests.yml
vendored
Normal file
200
.github/workflows/sdks-tests.yml
vendored
Normal file
@@ -0,0 +1,200 @@
|
||||
# If any test fails, the engine team should ensure the "breaking" changes are expected and contact the integration team
|
||||
name: SDKs tests
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: "0 6 * * MON" # Every Monday at 6:00AM
|
||||
|
||||
env:
|
||||
MEILI_MASTER_KEY: 'masterKey'
|
||||
MEILI_NO_ANALYTICS: 'true'
|
||||
|
||||
jobs:
|
||||
|
||||
meilisearch-js-tests:
|
||||
name: JS SDK tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:nightly
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-js
|
||||
- name: Setup node
|
||||
uses: actions/setup-node@v3
|
||||
with:
|
||||
cache: 'yarn'
|
||||
- name: Install dependencies
|
||||
run: yarn --dev
|
||||
- name: Run tests
|
||||
run: yarn test
|
||||
- name: Build project
|
||||
run: yarn build
|
||||
- name: Run ESM env
|
||||
run: yarn test:env:esm
|
||||
- name: Run Node.js env
|
||||
run: yarn test:env:nodejs
|
||||
- name: Run node typescript env
|
||||
run: yarn test:env:node-ts
|
||||
- name: Run Browser env
|
||||
run: yarn test:env:browser
|
||||
|
||||
instant-meilisearch-tests:
|
||||
name: instant-meilisearch tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:nightly
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/instant-meilisearch
|
||||
- name: Setup node
|
||||
uses: actions/setup-node@v3
|
||||
with:
|
||||
cache: yarn
|
||||
- name: Install dependencies
|
||||
run: yarn install
|
||||
- name: Run tests
|
||||
run: yarn test
|
||||
- name: Build all the playgrounds and the packages
|
||||
run: yarn build
|
||||
|
||||
meilisearch-php-tests:
|
||||
name: PHP SDK tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:nightly
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-php
|
||||
- name: Install PHP
|
||||
uses: shivammathur/setup-php@v2
|
||||
with:
|
||||
coverage: none
|
||||
- name: Validate composer.json and composer.lock
|
||||
run: composer validate
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
composer remove --dev friendsofphp/php-cs-fixer --no-update --no-interaction
|
||||
composer update --prefer-dist --no-progress
|
||||
- name: Run test suite - default HTTP client (Guzzle 7)
|
||||
run: |
|
||||
sh scripts/tests.sh
|
||||
composer remove --dev guzzlehttp/guzzle http-interop/http-factory-guzzle
|
||||
|
||||
meilisearch-python-tests:
|
||||
name: Python SDK tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:nightly
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-python
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
- name: Install pipenv
|
||||
uses: dschep/install-pipenv-action@v1
|
||||
- name: Install dependencies
|
||||
run: pipenv install --dev --python=${{ matrix.python-version }}
|
||||
- name: Test with pytest
|
||||
run: pipenv run pytest
|
||||
|
||||
meilisearch-go-tests:
|
||||
name: Go SDK tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:nightly
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v3
|
||||
with:
|
||||
go-version: stable
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-go
|
||||
- name: Get dependencies
|
||||
run: |
|
||||
go get -v -t -d ./...
|
||||
if [ -f Gopkg.toml ]; then
|
||||
curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh
|
||||
dep ensure
|
||||
fi
|
||||
- name: Run integration tests
|
||||
run: go test -v ./...
|
||||
|
||||
meilisearch-ruby-tests:
|
||||
name: Ruby SDK tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:nightly
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-ruby
|
||||
- name: Set up Ruby 3
|
||||
uses: ruby/setup-ruby@v1
|
||||
with:
|
||||
ruby-version: 3
|
||||
- name: Install ruby dependencies
|
||||
run: bundle install --with test
|
||||
- name: Run test suite
|
||||
run: bundle exec rspec
|
||||
|
||||
meilisearch-rust-tests:
|
||||
name: Rust SDK tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:nightly
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-rust
|
||||
- name: Build
|
||||
run: cargo build --verbose
|
||||
- name: Run tests
|
||||
run: cargo test --verbose
|
||||
5
.github/workflows/test-suite.yml
vendored
5
.github/workflows/test-suite.yml
vendored
@@ -138,7 +138,7 @@ jobs:
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
profile: minimal
|
||||
toolchain: 1.67.0
|
||||
toolchain: 1.69.0
|
||||
override: true
|
||||
components: clippy
|
||||
- name: Cache dependencies
|
||||
@@ -147,8 +147,7 @@ jobs:
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
command: clippy
|
||||
# allow unlined_format_args https://github.com/rust-lang/rust-clippy/issues/10087
|
||||
args: --all-targets -- --deny warnings --allow clippy::uninlined_format_args
|
||||
args: --all-targets -- --deny warnings
|
||||
|
||||
fmt:
|
||||
name: Run Rustfmt
|
||||
|
||||
120
.github/workflows/uffizzi-build.yml
vendored
120
.github/workflows/uffizzi-build.yml
vendored
@@ -1,120 +0,0 @@
|
||||
name: Uffizzi - Build PR Image
|
||||
on:
|
||||
pull_request:
|
||||
types: [opened,synchronize,reopened,closed]
|
||||
|
||||
jobs:
|
||||
build-meilisearch:
|
||||
name: Build and push `meilisearch`
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
if: ${{ github.event.action != 'closed' }}
|
||||
steps:
|
||||
- name: checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- run: sudo apt-get install musl-tools
|
||||
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
override: true
|
||||
target: x86_64-unknown-linux-musl
|
||||
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.2.1
|
||||
|
||||
- name: Run cargo check without any default features
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
command: build
|
||||
args: --target x86_64-unknown-linux-musl --release
|
||||
|
||||
- name: Remove dockerignore so we can use the target folder in our docker build
|
||||
run: rm -f .dockerignore
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
|
||||
- name: Generate UUID image name
|
||||
id: uuid
|
||||
run: echo "UUID_TAG=$(uuidgen)" >> $GITHUB_ENV
|
||||
|
||||
- name: Docker metadata
|
||||
id: meta
|
||||
uses: docker/metadata-action@v4
|
||||
with:
|
||||
images: registry.uffizzi.com/${{ env.UUID_TAG }}
|
||||
tags: |
|
||||
type=raw,value=60d
|
||||
|
||||
- name: Build Image
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: ./
|
||||
file: .github/uffizzi/Dockerfile
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
push: true
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
render-compose-file:
|
||||
name: Render Docker Compose File
|
||||
# Pass output of this workflow to another triggered by `workflow_run` event.
|
||||
runs-on: ubuntu-latest
|
||||
needs:
|
||||
- build-meilisearch
|
||||
outputs:
|
||||
compose-file-cache-key: ${{ env.COMPOSE_FILE_HASH }}
|
||||
steps:
|
||||
- name: Checkout git repo
|
||||
uses: actions/checkout@v3
|
||||
- name: Render Compose File
|
||||
run: |
|
||||
MEILISEARCH_IMAGE=$(echo ${{ needs.build-meilisearch.outputs.tags }})
|
||||
export MEILISEARCH_IMAGE
|
||||
# Render simple template from environment variables.
|
||||
envsubst < .github/uffizzi/docker-compose.uffizzi.yml > docker-compose.rendered.yml
|
||||
cat docker-compose.rendered.yml
|
||||
- name: Upload Rendered Compose File as Artifact
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: preview-spec
|
||||
path: docker-compose.rendered.yml
|
||||
retention-days: 2
|
||||
- name: Serialize PR Event to File
|
||||
run: |
|
||||
cat << EOF > event.json
|
||||
${{ toJSON(github.event) }}
|
||||
|
||||
EOF
|
||||
- name: Upload PR Event as Artifact
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: preview-spec
|
||||
path: event.json
|
||||
retention-days: 2
|
||||
|
||||
delete-preview:
|
||||
name: Call for Preview Deletion
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ github.event.action == 'closed' }}
|
||||
steps:
|
||||
# If this PR is closing, we will not render a compose file nor pass it to the next workflow.
|
||||
- name: Serialize PR Event to File
|
||||
run: |
|
||||
cat << EOF > event.json
|
||||
${{ toJSON(github.event) }}
|
||||
|
||||
EOF
|
||||
- name: Upload PR Event as Artifact
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: preview-spec
|
||||
path: event.json
|
||||
retention-days: 2
|
||||
103
.github/workflows/uffizzi-preview-deploy.yml
vendored
103
.github/workflows/uffizzi-preview-deploy.yml
vendored
@@ -1,103 +0,0 @@
|
||||
name: Uffizzi - Deploy Preview
|
||||
|
||||
on:
|
||||
workflow_run:
|
||||
workflows:
|
||||
- "Uffizzi - Build PR Image"
|
||||
types:
|
||||
- completed
|
||||
|
||||
jobs:
|
||||
cache-compose-file:
|
||||
name: Cache Compose File
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ github.event.workflow_run.conclusion == 'success' }}
|
||||
outputs:
|
||||
compose-file-cache-key: ${{ env.COMPOSE_FILE_HASH }}
|
||||
pr-number: ${{ env.PR_NUMBER }}
|
||||
expected-url: ${{ env.EXPECTED_URL }}
|
||||
steps:
|
||||
- name: 'Download artifacts'
|
||||
# Fetch output (zip archive) from the workflow run that triggered this workflow.
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
script: |
|
||||
let allArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
run_id: context.payload.workflow_run.id,
|
||||
});
|
||||
let matchArtifact = allArtifacts.data.artifacts.filter((artifact) => {
|
||||
return artifact.name == "preview-spec"
|
||||
})[0];
|
||||
let download = await github.rest.actions.downloadArtifact({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
artifact_id: matchArtifact.id,
|
||||
archive_format: 'zip',
|
||||
});
|
||||
let fs = require('fs');
|
||||
fs.writeFileSync(`${process.env.GITHUB_WORKSPACE}/preview-spec.zip`, Buffer.from(download.data));
|
||||
|
||||
- name: 'Unzip artifact'
|
||||
run: unzip preview-spec.zip
|
||||
|
||||
- name: Read Event into ENV
|
||||
run: |
|
||||
echo 'EVENT_JSON<<EOF' >> $GITHUB_ENV
|
||||
cat event.json >> $GITHUB_ENV
|
||||
echo 'EOF' >> $GITHUB_ENV
|
||||
|
||||
- name: Hash Rendered Compose File
|
||||
id: hash
|
||||
# If the previous workflow was triggered by a PR close event, we will not have a compose file artifact.
|
||||
if: ${{ fromJSON(env.EVENT_JSON).action != 'closed' }}
|
||||
run: echo "COMPOSE_FILE_HASH=$(md5sum docker-compose.rendered.yml | awk '{ print $1 }')" >> $GITHUB_ENV
|
||||
|
||||
- name: Cache Rendered Compose File
|
||||
if: ${{ fromJSON(env.EVENT_JSON).action != 'closed' }}
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: docker-compose.rendered.yml
|
||||
key: ${{ env.COMPOSE_FILE_HASH }}
|
||||
|
||||
- name: Read PR Number From Event Object
|
||||
id: pr
|
||||
run: echo "PR_NUMBER=${{ fromJSON(env.EVENT_JSON).number }}" >> $GITHUB_ENV
|
||||
|
||||
- name: DEBUG - Print Job Outputs
|
||||
if: ${{ runner.debug }}
|
||||
run: |
|
||||
echo "PR number: ${{ env.PR_NUMBER }}"
|
||||
echo "Compose file hash: ${{ env.COMPOSE_FILE_HASH }}"
|
||||
cat event.json
|
||||
|
||||
- name: Add expected URL env var
|
||||
if: ${{ runner.debug }}
|
||||
run: |
|
||||
REPO=$(echo ${{ github.repository }} | sed 's/\./+/g')
|
||||
echo "EXPECTED_URL=${{ inputs.server }}/github.com/$REPO/pull/${{ env.PR_NUMBER }}" >> $GITHUB_ENV
|
||||
|
||||
deploy-uffizzi-preview:
|
||||
name: Use Remote Workflow to Preview on Uffizzi
|
||||
needs:
|
||||
- cache-compose-file
|
||||
uses: UffizziCloud/preview-action/.github/workflows/reusable.yaml@v2
|
||||
with:
|
||||
# If this workflow was triggered by a PR close event, cache-key will be an empty string
|
||||
# and this reusable workflow will delete the preview deployment.
|
||||
compose-file-cache-key: ${{ needs.cache-compose-file.outputs.compose-file-cache-key }}
|
||||
compose-file-cache-path: docker-compose.rendered.yml
|
||||
server: https://app.uffizzi.com
|
||||
pr-number: ${{ needs.cache-compose-file.outputs.pr-number }}
|
||||
description: |
|
||||
The meilisearch preview environment contains a web terminal from where you can run the
|
||||
`meilisearch` command. You should be able to access this instance of meilisearch running in
|
||||
the preview from the link Meilisearch Endpoint link given below.
|
||||
|
||||
Web Terminal Endpoint : <uffizzi-url>
|
||||
Meilisearch Endpoint : <uffizzi-url>/meilisearch
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
id-token: write
|
||||
@@ -18,7 +18,7 @@ If Meilisearch does not offer optimized support for your language, please consid
|
||||
|
||||
## Assumptions
|
||||
|
||||
1. **You're familiar with [GitHub](https://github.com) and the [Pull Requests](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)(PR) workflow.**
|
||||
1. **You're familiar with [GitHub](https://github.com) and the [Pull Requests (PR)](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) workflow.**
|
||||
2. **You've read the Meilisearch [documentation](https://docs.meilisearch.com).**
|
||||
3. **You know about the [Meilisearch community](https://docs.meilisearch.com/learn/what_is_meilisearch/contact.html).
|
||||
Please use this for help.**
|
||||
|
||||
1142
Cargo.lock
generated
1142
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -17,7 +17,7 @@ members = [
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
version = "1.1.0"
|
||||
version = "1.1.1"
|
||||
authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
|
||||
description = "Meilisearch HTTP server"
|
||||
homepage = "https://meilisearch.com"
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
# syntax=docker/dockerfile:1.4
|
||||
# Compile
|
||||
FROM rust:alpine3.16 AS compiler
|
||||
|
||||
@@ -11,7 +12,7 @@ ARG GIT_TAG
|
||||
ENV VERGEN_GIT_SHA=${COMMIT_SHA} VERGEN_GIT_COMMIT_TIMESTAMP=${COMMIT_DATE} VERGEN_GIT_SEMVER_LIGHTWEIGHT=${GIT_TAG}
|
||||
ENV RUSTFLAGS="-C target-feature=-crt-static"
|
||||
|
||||
COPY . .
|
||||
COPY --link . .
|
||||
RUN set -eux; \
|
||||
apkArch="$(apk --print-arch)"; \
|
||||
if [ "$apkArch" = "aarch64" ]; then \
|
||||
@@ -30,7 +31,7 @@ RUN apk update --quiet \
|
||||
|
||||
# add meilisearch to the `/bin` so you can run it from anywhere and it's easy
|
||||
# to find.
|
||||
COPY --from=compiler /meilisearch/target/release/meilisearch /bin/meilisearch
|
||||
COPY --from=compiler --link /meilisearch/target/release/meilisearch /bin/meilisearch
|
||||
# To stay compatible with the older version of the container (pre v0.27.0) we're
|
||||
# going to symlink the meilisearch binary in the path to `/meilisearch`
|
||||
RUN ln -s /bin/meilisearch /meilisearch
|
||||
|
||||
@@ -68,11 +68,11 @@ Install one of our SDKs in your project for seamless integration between Meilise
|
||||
|
||||
Take a look at the complete [Meilisearch integration list](https://meilisearch.com/docs/learn/what_is_meilisearch/sdks).
|
||||
|
||||
[](https://meilisearch.com/docs/learn/what_is_meilisearch/sdks.html)
|
||||
[](https://www.meilisearch.com/docs/learn/what_is_meilisearch/sdks)
|
||||
|
||||
## ⚙️ Advanced usage
|
||||
|
||||
Experienced users will want to keep our [API Reference](https://meilisearch.com/docs/reference/api) close at hand.
|
||||
Experienced users will want to keep our [API Reference](https://www.meilisearch.com/docs/reference/api/overview) close at hand.
|
||||
|
||||
We also offer a wide range of dedicated guides to all Meilisearch features, such as [filtering](https://meilisearch.com/docs/learn/advanced/filtering), [sorting](https://meilisearch.com/docs/learn/advanced/sorting), [geosearch](https://meilisearch.com/docs/learn/advanced/geosearch), [API keys](https://meilisearch.com/docs/learn/security/master_api_keys), and [tenant tokens](https://meilisearch.com/docs/learn/security/tenant_tokens).
|
||||
|
||||
|
||||
@@ -11,11 +11,11 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.65"
|
||||
csv = "1.1.6"
|
||||
anyhow = "1.0.70"
|
||||
csv = "1.2.1"
|
||||
milli = { path = "../milli", default-features = false }
|
||||
mimalloc = { version = "0.1.29", default-features = false }
|
||||
serde_json = { version = "1.0.85", features = ["preserve_order"] }
|
||||
mimalloc = { version = "0.1.36", default-features = false }
|
||||
serde_json = { version = "1.0.95", features = ["preserve_order"] }
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = { version = "0.4.0", features = ["html_reports"] }
|
||||
@@ -24,11 +24,11 @@ rand_chacha = "0.3.1"
|
||||
roaring = "0.10.1"
|
||||
|
||||
[build-dependencies]
|
||||
anyhow = "1.0.65"
|
||||
bytes = "1.2.1"
|
||||
anyhow = "1.0.70"
|
||||
bytes = "1.4.0"
|
||||
convert_case = "0.6.0"
|
||||
flate2 = "1.0.24"
|
||||
reqwest = { version = "0.11.12", features = ["blocking", "rustls-tls"], default-features = false }
|
||||
flate2 = "1.0.25"
|
||||
reqwest = { version = "0.11.16", features = ["blocking", "rustls-tls"], default-features = false }
|
||||
|
||||
[features]
|
||||
default = ["milli/default"]
|
||||
@@ -48,7 +48,3 @@ harness = false
|
||||
[[bench]]
|
||||
name = "indexing"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "formatting"
|
||||
harness = false
|
||||
|
||||
@@ -1,67 +0,0 @@
|
||||
use std::rc::Rc;
|
||||
|
||||
use criterion::{criterion_group, criterion_main};
|
||||
use milli::tokenizer::TokenizerBuilder;
|
||||
use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords};
|
||||
|
||||
#[global_allocator]
|
||||
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
||||
|
||||
struct Conf<'a> {
|
||||
name: &'a str,
|
||||
text: &'a str,
|
||||
matching_words: MatcherBuilder<'a, Vec<u8>>,
|
||||
}
|
||||
|
||||
fn bench_formatting(c: &mut criterion::Criterion) {
|
||||
#[rustfmt::skip]
|
||||
let confs = &[
|
||||
Conf {
|
||||
name: "'the door d'",
|
||||
text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. But Theodor said that the doors don't work."#,
|
||||
matching_words: MatcherBuilder::new(MatchingWords::new(vec![
|
||||
(vec![Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap())], vec![0]),
|
||||
(vec![Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap())], vec![0]),
|
||||
(vec![Rc::new(MatchingWord::new("door".to_string(), 1, false).unwrap())], vec![1]),
|
||||
(vec![Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap())], vec![0]),
|
||||
(vec![Rc::new(MatchingWord::new("thedoor".to_string(), 1, false).unwrap())], vec![0, 1]),
|
||||
(vec![Rc::new(MatchingWord::new("d".to_string(), 0, true).unwrap())], vec![2]),
|
||||
(vec![Rc::new(MatchingWord::new("thedoord".to_string(), 1, true).unwrap())], vec![0, 1, 2]),
|
||||
(vec![Rc::new(MatchingWord::new("doord".to_string(), 1, true).unwrap())], vec![1, 2]),
|
||||
]
|
||||
).unwrap(), TokenizerBuilder::default().build()),
|
||||
},
|
||||
];
|
||||
|
||||
let format_options = &[
|
||||
FormatOptions { highlight: false, crop: None },
|
||||
FormatOptions { highlight: true, crop: None },
|
||||
FormatOptions { highlight: false, crop: Some(10) },
|
||||
FormatOptions { highlight: true, crop: Some(10) },
|
||||
FormatOptions { highlight: false, crop: Some(20) },
|
||||
FormatOptions { highlight: true, crop: Some(20) },
|
||||
];
|
||||
|
||||
for option in format_options {
|
||||
let highlight = if option.highlight { "highlight" } else { "no-highlight" };
|
||||
|
||||
let name = match option.crop {
|
||||
Some(size) => format!("{}-crop({})", highlight, size),
|
||||
None => format!("{}-no-crop", highlight),
|
||||
};
|
||||
|
||||
let mut group = c.benchmark_group(&name);
|
||||
for conf in confs {
|
||||
group.bench_function(conf.name, |b| {
|
||||
b.iter(|| {
|
||||
let mut matcher = conf.matching_words.build(conf.text);
|
||||
matcher.format(*option);
|
||||
})
|
||||
});
|
||||
}
|
||||
group.finish();
|
||||
}
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_formatting);
|
||||
criterion_main!(benches);
|
||||
@@ -103,7 +103,7 @@ not_available_failure_usage() {
|
||||
printf "$RED%s\n$DEFAULT" 'ERROR: Meilisearch binary is not available for your OS distribution or your architecture yet.'
|
||||
echo ''
|
||||
echo 'However, you can easily compile the binary from the source files.'
|
||||
echo 'Follow the steps at the page ("Source" tab): https://docs.meilisearch.com/learn/getting_started/installation.html'
|
||||
echo 'Follow the steps at the page ("Source" tab): https://www.meilisearch.com/docs/learn/getting_started/installation'
|
||||
}
|
||||
|
||||
fetch_release_failure_usage() {
|
||||
|
||||
@@ -11,22 +11,22 @@ readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.65"
|
||||
flate2 = "1.0.22"
|
||||
http = "0.2.8"
|
||||
anyhow = "1.0.70"
|
||||
flate2 = "1.0.25"
|
||||
http = "0.2.9"
|
||||
log = "0.4.17"
|
||||
meilisearch-auth = { path = "../meilisearch-auth" }
|
||||
meilisearch-types = { path = "../meilisearch-types" }
|
||||
once_cell = "1.15.0"
|
||||
regex = "1.6.0"
|
||||
roaring = { version = "0.10.0", features = ["serde"] }
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
serde_json = { version = "1.0.85", features = ["preserve_order"] }
|
||||
once_cell = "1.17.1"
|
||||
regex = "1.7.3"
|
||||
roaring = { version = "0.10.1", features = ["serde"] }
|
||||
serde = { version = "1.0.160", features = ["derive"] }
|
||||
serde_json = { version = "1.0.95", features = ["preserve_order"] }
|
||||
tar = "0.4.38"
|
||||
tempfile = "3.3.0"
|
||||
thiserror = "1.0.30"
|
||||
time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
uuid = { version = "1.1.2", features = ["serde", "v4"] }
|
||||
tempfile = "3.5.0"
|
||||
thiserror = "1.0.40"
|
||||
time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
uuid = { version = "1.3.1", features = ["serde", "v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
big_s = "1.0.2"
|
||||
|
||||
@@ -25,7 +25,6 @@ impl CompatV2ToV3 {
|
||||
CompatV2ToV3::Compat(compat) => compat.index_uuid(),
|
||||
};
|
||||
v2_uuids
|
||||
.into_iter()
|
||||
.into_iter()
|
||||
.map(|index| v3::meta::IndexUuid { uid: index.uid, uuid: index.uuid })
|
||||
.collect()
|
||||
|
||||
@@ -11,9 +11,9 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
tempfile = "3.3.0"
|
||||
thiserror = "1.0.30"
|
||||
uuid = { version = "1.1.2", features = ["serde", "v4"] }
|
||||
tempfile = "3.5.0"
|
||||
thiserror = "1.0.40"
|
||||
uuid = { version = "1.3.1", features = ["serde", "v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
faux = "0.1.8"
|
||||
faux = "0.1.9"
|
||||
|
||||
@@ -12,8 +12,8 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
nom = "7.1.1"
|
||||
nom_locate = "4.0.0"
|
||||
nom = "7.1.3"
|
||||
nom_locate = "4.1.0"
|
||||
|
||||
[dev-dependencies]
|
||||
insta = "1.21.0"
|
||||
insta = "1.29.0"
|
||||
|
||||
@@ -20,6 +20,8 @@ pub enum Condition<'a> {
|
||||
GreaterThanOrEqual(Token<'a>),
|
||||
Equal(Token<'a>),
|
||||
NotEqual(Token<'a>),
|
||||
Null,
|
||||
Empty,
|
||||
Exists,
|
||||
LowerThan(Token<'a>),
|
||||
LowerThanOrEqual(Token<'a>),
|
||||
@@ -44,6 +46,38 @@ pub fn parse_condition(input: Span) -> IResult<FilterCondition> {
|
||||
Ok((input, condition))
|
||||
}
|
||||
|
||||
/// null = value "IS" WS+ "NULL"
|
||||
pub fn parse_is_null(input: Span) -> IResult<FilterCondition> {
|
||||
let (input, key) = parse_value(input)?;
|
||||
|
||||
let (input, _) = tuple((tag("IS"), multispace1, tag("NULL")))(input)?;
|
||||
Ok((input, FilterCondition::Condition { fid: key, op: Null }))
|
||||
}
|
||||
|
||||
/// null = value "IS" WS+ "NOT" WS+ "NULL"
|
||||
pub fn parse_is_not_null(input: Span) -> IResult<FilterCondition> {
|
||||
let (input, key) = parse_value(input)?;
|
||||
|
||||
let (input, _) = tuple((tag("IS"), multispace1, tag("NOT"), multispace1, tag("NULL")))(input)?;
|
||||
Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Null }))))
|
||||
}
|
||||
|
||||
/// empty = value "IS" WS+ "EMPTY"
|
||||
pub fn parse_is_empty(input: Span) -> IResult<FilterCondition> {
|
||||
let (input, key) = parse_value(input)?;
|
||||
|
||||
let (input, _) = tuple((tag("IS"), multispace1, tag("EMPTY")))(input)?;
|
||||
Ok((input, FilterCondition::Condition { fid: key, op: Empty }))
|
||||
}
|
||||
|
||||
/// empty = value "IS" WS+ "NOT" WS+ "EMPTY"
|
||||
pub fn parse_is_not_empty(input: Span) -> IResult<FilterCondition> {
|
||||
let (input, key) = parse_value(input)?;
|
||||
|
||||
let (input, _) = tuple((tag("IS"), multispace1, tag("NOT"), multispace1, tag("EMPTY")))(input)?;
|
||||
Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Empty }))))
|
||||
}
|
||||
|
||||
/// exist = value "EXISTS"
|
||||
pub fn parse_exists(input: Span) -> IResult<FilterCondition> {
|
||||
let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?;
|
||||
|
||||
@@ -143,11 +143,9 @@ impl<'a> Display for Error<'a> {
|
||||
ErrorKind::MissingClosingDelimiter(c) => {
|
||||
writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)?
|
||||
}
|
||||
ErrorKind::InvalidPrimary if input.trim().is_empty() => {
|
||||
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing.")?
|
||||
}
|
||||
ErrorKind::InvalidPrimary => {
|
||||
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `{}`.", escaped_input)?
|
||||
let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) };
|
||||
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)?
|
||||
}
|
||||
ErrorKind::ExpectedEof => {
|
||||
writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?
|
||||
|
||||
@@ -47,7 +47,10 @@ mod value;
|
||||
use std::fmt::Debug;
|
||||
|
||||
pub use condition::{parse_condition, parse_to, Condition};
|
||||
use condition::{parse_exists, parse_not_exists};
|
||||
use condition::{
|
||||
parse_exists, parse_is_empty, parse_is_not_empty, parse_is_not_null, parse_is_null,
|
||||
parse_not_exists,
|
||||
};
|
||||
use error::{cut_with_err, ExpectedValueKind, NomErrorExt};
|
||||
pub use error::{Error, ErrorKind};
|
||||
use nom::branch::alt;
|
||||
@@ -442,6 +445,10 @@ fn parse_primary(input: Span, depth: usize) -> IResult<FilterCondition> {
|
||||
parse_in,
|
||||
parse_not_in,
|
||||
parse_condition,
|
||||
parse_is_null,
|
||||
parse_is_not_null,
|
||||
parse_is_empty,
|
||||
parse_is_not_empty,
|
||||
parse_exists,
|
||||
parse_not_exists,
|
||||
parse_to,
|
||||
@@ -526,14 +533,30 @@ pub mod tests {
|
||||
insta::assert_display_snapshot!(p("subscribers <= 1000"), @"{subscribers} <= {1000}");
|
||||
insta::assert_display_snapshot!(p("subscribers 100 TO 1000"), @"{subscribers} {100} TO {1000}");
|
||||
|
||||
// Test NOT + EXISTS
|
||||
insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS");
|
||||
// Test NOT
|
||||
insta::assert_display_snapshot!(p("NOT subscribers < 1000"), @"NOT ({subscribers} < {1000})");
|
||||
insta::assert_display_snapshot!(p("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})");
|
||||
|
||||
// Test NULL + NOT NULL
|
||||
insta::assert_display_snapshot!(p("subscribers IS NULL"), @"{subscribers} IS NULL");
|
||||
insta::assert_display_snapshot!(p("NOT subscribers IS NULL"), @"NOT ({subscribers} IS NULL)");
|
||||
insta::assert_display_snapshot!(p("subscribers IS NOT NULL"), @"NOT ({subscribers} IS NULL)");
|
||||
insta::assert_display_snapshot!(p("NOT subscribers IS NOT NULL"), @"{subscribers} IS NULL");
|
||||
insta::assert_display_snapshot!(p("subscribers IS NOT NULL"), @"NOT ({subscribers} IS NULL)");
|
||||
|
||||
// Test EMPTY + NOT EMPTY
|
||||
insta::assert_display_snapshot!(p("subscribers IS EMPTY"), @"{subscribers} IS EMPTY");
|
||||
insta::assert_display_snapshot!(p("NOT subscribers IS EMPTY"), @"NOT ({subscribers} IS EMPTY)");
|
||||
insta::assert_display_snapshot!(p("subscribers IS NOT EMPTY"), @"NOT ({subscribers} IS EMPTY)");
|
||||
insta::assert_display_snapshot!(p("NOT subscribers IS NOT EMPTY"), @"{subscribers} IS EMPTY");
|
||||
insta::assert_display_snapshot!(p("subscribers IS NOT EMPTY"), @"NOT ({subscribers} IS EMPTY)");
|
||||
|
||||
// Test EXISTS + NOT EXITS
|
||||
insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS");
|
||||
insta::assert_display_snapshot!(p("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)");
|
||||
insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)");
|
||||
insta::assert_display_snapshot!(p("NOT subscribers NOT EXISTS"), @"{subscribers} EXISTS");
|
||||
insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)");
|
||||
insta::assert_display_snapshot!(p("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})");
|
||||
|
||||
// Test nested NOT
|
||||
insta::assert_display_snapshot!(p("NOT NOT NOT NOT x = 5"), @"{x} = {5}");
|
||||
@@ -606,7 +629,7 @@ pub mod tests {
|
||||
"###);
|
||||
|
||||
insta::assert_display_snapshot!(p("'OR'"), @r###"
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `\'OR\'`.
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `\'OR\'`.
|
||||
1:5 'OR'
|
||||
"###);
|
||||
|
||||
@@ -616,12 +639,12 @@ pub mod tests {
|
||||
"###);
|
||||
|
||||
insta::assert_display_snapshot!(p("channel Ponce"), @r###"
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `channel Ponce`.
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `channel Ponce`.
|
||||
1:14 channel Ponce
|
||||
"###);
|
||||
|
||||
insta::assert_display_snapshot!(p("channel = Ponce OR"), @r###"
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing.
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing.
|
||||
19:19 channel = Ponce OR
|
||||
"###);
|
||||
|
||||
@@ -706,12 +729,12 @@ pub mod tests {
|
||||
"###);
|
||||
|
||||
insta::assert_display_snapshot!(p("colour NOT EXIST"), @r###"
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`.
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`.
|
||||
1:17 colour NOT EXIST
|
||||
"###);
|
||||
|
||||
insta::assert_display_snapshot!(p("subscribers 100 TO1000"), @r###"
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`.
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`.
|
||||
1:23 subscribers 100 TO1000
|
||||
"###);
|
||||
|
||||
@@ -772,6 +795,39 @@ pub mod tests {
|
||||
Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes.
|
||||
5:7 NOT OR EXISTS AND EXISTS NOT EXISTS
|
||||
"###);
|
||||
|
||||
insta::assert_display_snapshot!(p(r#"value NULL"#), @r###"
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value NULL`.
|
||||
1:11 value NULL
|
||||
"###);
|
||||
insta::assert_display_snapshot!(p(r#"value NOT NULL"#), @r###"
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value NOT NULL`.
|
||||
1:15 value NOT NULL
|
||||
"###);
|
||||
insta::assert_display_snapshot!(p(r#"value EMPTY"#), @r###"
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value EMPTY`.
|
||||
1:12 value EMPTY
|
||||
"###);
|
||||
insta::assert_display_snapshot!(p(r#"value NOT EMPTY"#), @r###"
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value NOT EMPTY`.
|
||||
1:16 value NOT EMPTY
|
||||
"###);
|
||||
insta::assert_display_snapshot!(p(r#"value IS"#), @r###"
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS`.
|
||||
1:9 value IS
|
||||
"###);
|
||||
insta::assert_display_snapshot!(p(r#"value IS NOT"#), @r###"
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS NOT`.
|
||||
1:13 value IS NOT
|
||||
"###);
|
||||
insta::assert_display_snapshot!(p(r#"value IS EXISTS"#), @r###"
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS EXISTS`.
|
||||
1:16 value IS EXISTS
|
||||
"###);
|
||||
insta::assert_display_snapshot!(p(r#"value IS NOT EXISTS"#), @r###"
|
||||
Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS NOT EXISTS`.
|
||||
1:20 value IS NOT EXISTS
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -853,6 +909,8 @@ impl<'a> std::fmt::Display for Condition<'a> {
|
||||
Condition::GreaterThanOrEqual(token) => write!(f, ">= {token}"),
|
||||
Condition::Equal(token) => write!(f, "= {token}"),
|
||||
Condition::NotEqual(token) => write!(f, "!= {token}"),
|
||||
Condition::Null => write!(f, "IS NULL"),
|
||||
Condition::Empty => write!(f, "IS EMPTY"),
|
||||
Condition::Exists => write!(f, "EXISTS"),
|
||||
Condition::LowerThan(token) => write!(f, "< {token}"),
|
||||
Condition::LowerThanOrEqual(token) => write!(f, "<= {token}"),
|
||||
|
||||
@@ -183,7 +183,20 @@ fn is_syntax_component(c: char) -> bool {
|
||||
}
|
||||
|
||||
fn is_keyword(s: &str) -> bool {
|
||||
matches!(s, "AND" | "OR" | "IN" | "NOT" | "TO" | "EXISTS" | "_geoRadius" | "_geoBoundingBox")
|
||||
matches!(
|
||||
s,
|
||||
"AND"
|
||||
| "OR"
|
||||
| "IN"
|
||||
| "NOT"
|
||||
| "TO"
|
||||
| "EXISTS"
|
||||
| "IS"
|
||||
| "NULL"
|
||||
| "EMPTY"
|
||||
| "_geoRadius"
|
||||
| "_geoBoundingBox"
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -4,51 +4,56 @@ use serde_json::{Map, Value};
|
||||
|
||||
pub fn flatten(json: &Map<String, Value>) -> Map<String, Value> {
|
||||
let mut obj = Map::new();
|
||||
let mut all_keys = vec![];
|
||||
insert_object(&mut obj, None, json, &mut all_keys);
|
||||
for key in all_keys {
|
||||
obj.entry(key).or_insert(Value::Array(vec![]));
|
||||
let mut all_entries = vec![];
|
||||
insert_object(&mut obj, None, json, &mut all_entries);
|
||||
for (key, old_val) in all_entries {
|
||||
obj.entry(key).or_insert(old_val.clone());
|
||||
}
|
||||
obj
|
||||
}
|
||||
|
||||
fn insert_object(
|
||||
fn insert_object<'a>(
|
||||
base_json: &mut Map<String, Value>,
|
||||
base_key: Option<&str>,
|
||||
object: &Map<String, Value>,
|
||||
all_keys: &mut Vec<String>,
|
||||
object: &'a Map<String, Value>,
|
||||
all_entries: &mut Vec<(String, &'a Value)>,
|
||||
) {
|
||||
for (key, value) in object {
|
||||
let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}"));
|
||||
all_keys.push(new_key.clone());
|
||||
all_entries.push((new_key.clone(), value));
|
||||
if let Some(array) = value.as_array() {
|
||||
insert_array(base_json, &new_key, array, all_keys);
|
||||
insert_array(base_json, &new_key, array, all_entries);
|
||||
} else if let Some(object) = value.as_object() {
|
||||
insert_object(base_json, Some(&new_key), object, all_keys);
|
||||
insert_object(base_json, Some(&new_key), object, all_entries);
|
||||
} else {
|
||||
insert_value(base_json, &new_key, value.clone());
|
||||
insert_value(base_json, &new_key, value.clone(), false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn insert_array(
|
||||
fn insert_array<'a>(
|
||||
base_json: &mut Map<String, Value>,
|
||||
base_key: &str,
|
||||
array: &Vec<Value>,
|
||||
all_keys: &mut Vec<String>,
|
||||
array: &'a Vec<Value>,
|
||||
all_entries: &mut Vec<(String, &'a Value)>,
|
||||
) {
|
||||
for value in array {
|
||||
if let Some(object) = value.as_object() {
|
||||
insert_object(base_json, Some(base_key), object, all_keys);
|
||||
insert_object(base_json, Some(base_key), object, all_entries);
|
||||
} else if let Some(sub_array) = value.as_array() {
|
||||
insert_array(base_json, base_key, sub_array, all_keys);
|
||||
insert_array(base_json, base_key, sub_array, all_entries);
|
||||
} else {
|
||||
insert_value(base_json, base_key, value.clone());
|
||||
insert_value(base_json, base_key, value.clone(), true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn insert_value(base_json: &mut Map<String, Value>, key: &str, to_insert: Value) {
|
||||
fn insert_value(
|
||||
base_json: &mut Map<String, Value>,
|
||||
key: &str,
|
||||
to_insert: Value,
|
||||
came_from_array: bool,
|
||||
) {
|
||||
debug_assert!(!to_insert.is_object());
|
||||
debug_assert!(!to_insert.is_array());
|
||||
|
||||
@@ -63,6 +68,8 @@ fn insert_value(base_json: &mut Map<String, Value>, key: &str, to_insert: Value)
|
||||
base_json[key] = Value::Array(vec![value, to_insert]);
|
||||
}
|
||||
// if it does not exist we can push the value untouched
|
||||
} else if came_from_array {
|
||||
base_json.insert(key.to_string(), Value::Array(vec![to_insert]));
|
||||
} else {
|
||||
base_json.insert(key.to_string(), to_insert);
|
||||
}
|
||||
@@ -113,7 +120,11 @@ mod tests {
|
||||
assert_eq!(
|
||||
&flat,
|
||||
json!({
|
||||
"a": [],
|
||||
"a": {
|
||||
"b": "c",
|
||||
"d": "e",
|
||||
"f": "g"
|
||||
},
|
||||
"a.b": "c",
|
||||
"a.d": "e",
|
||||
"a.f": "g"
|
||||
@@ -164,7 +175,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
&flat,
|
||||
json!({
|
||||
"a": 42,
|
||||
"a": [42],
|
||||
"a.b": ["c", "d", "e"],
|
||||
})
|
||||
.as_object()
|
||||
@@ -186,7 +197,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
&flat,
|
||||
json!({
|
||||
"a": null,
|
||||
"a": [null],
|
||||
"a.b": ["c", "d", "e"],
|
||||
})
|
||||
.as_object()
|
||||
@@ -208,7 +219,9 @@ mod tests {
|
||||
assert_eq!(
|
||||
&flat,
|
||||
json!({
|
||||
"a": [],
|
||||
"a": {
|
||||
"b": "c"
|
||||
},
|
||||
"a.b": ["c", "d"],
|
||||
})
|
||||
.as_object()
|
||||
@@ -234,7 +247,7 @@ mod tests {
|
||||
json!({
|
||||
"a.b": ["c", "d", "f"],
|
||||
"a.c": "e",
|
||||
"a": 35,
|
||||
"a": [35],
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
@@ -302,4 +315,53 @@ mod tests {
|
||||
.unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn flatten_nested_values_keep_original_values() {
|
||||
let mut base: Value = json!({
|
||||
"tags": {
|
||||
"t1": "v1"
|
||||
},
|
||||
"prices": {
|
||||
"p1": [null],
|
||||
"p1000": {"tamo": {"le": {}}}
|
||||
},
|
||||
"kiki": [[]]
|
||||
});
|
||||
let json = std::mem::take(base.as_object_mut().unwrap());
|
||||
let flat = flatten(&json);
|
||||
|
||||
println!("{}", serde_json::to_string_pretty(&flat).unwrap());
|
||||
|
||||
assert_eq!(
|
||||
&flat,
|
||||
json!({
|
||||
"prices": {
|
||||
"p1": [null],
|
||||
"p1000": {
|
||||
"tamo": {
|
||||
"le": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
"prices.p1": [null],
|
||||
"prices.p1000": {
|
||||
"tamo": {
|
||||
"le": {}
|
||||
}
|
||||
},
|
||||
"prices.p1000.tamo": {
|
||||
"le": {}
|
||||
},
|
||||
"prices.p1000.tamo.le": {},
|
||||
"tags": {
|
||||
"t1": "v1"
|
||||
},
|
||||
"tags.t1": "v1",
|
||||
"kiki": [[]]
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,29 +11,29 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.64"
|
||||
anyhow = "1.0.70"
|
||||
bincode = "1.3.3"
|
||||
csv = "1.1.6"
|
||||
derive_builder = "0.11.2"
|
||||
csv = "1.2.1"
|
||||
derive_builder = "0.12.0"
|
||||
dump = { path = "../dump" }
|
||||
enum-iterator = "1.1.3"
|
||||
enum-iterator = "1.4.0"
|
||||
file-store = { path = "../file-store" }
|
||||
log = "0.4.14"
|
||||
log = "0.4.17"
|
||||
meilisearch-auth = { path = "../meilisearch-auth" }
|
||||
meilisearch-types = { path = "../meilisearch-types" }
|
||||
page_size = "0.5.0"
|
||||
roaring = { version = "0.10.0", features = ["serde"] }
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
serde_json = { version = "1.0.85", features = ["preserve_order"] }
|
||||
roaring = { version = "0.10.1", features = ["serde"] }
|
||||
serde = { version = "1.0.160", features = ["derive"] }
|
||||
serde_json = { version = "1.0.95", features = ["preserve_order"] }
|
||||
synchronoise = "1.0.1"
|
||||
tempfile = "3.3.0"
|
||||
thiserror = "1.0.30"
|
||||
time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
uuid = { version = "1.1.2", features = ["serde", "v4"] }
|
||||
tempfile = "3.5.0"
|
||||
thiserror = "1.0.40"
|
||||
time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
uuid = { version = "1.3.1", features = ["serde", "v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
big_s = "1.0.2"
|
||||
crossbeam = "0.8.2"
|
||||
insta = { version = "1.19.1", features = ["json", "redactions"] }
|
||||
insta = { version = "1.29.0", features = ["json", "redactions"] }
|
||||
meili-snap = { path = "../meili-snap" }
|
||||
nelson = { git = "https://github.com/meilisearch/nelson.git", rev = "675f13885548fb415ead8fbb447e9e6d9314000a"}
|
||||
|
||||
@@ -311,18 +311,9 @@ impl BatchKind {
|
||||
})
|
||||
}
|
||||
(
|
||||
BatchKind::DocumentOperation { method, allow_index_creation, primary_key, mut operation_ids },
|
||||
this @ BatchKind::DocumentOperation { .. },
|
||||
K::DocumentDeletion,
|
||||
) => {
|
||||
operation_ids.push(id);
|
||||
|
||||
Continue(BatchKind::DocumentOperation {
|
||||
method,
|
||||
allow_index_creation,
|
||||
primary_key,
|
||||
operation_ids,
|
||||
})
|
||||
}
|
||||
) => Break(this),
|
||||
// but we can't autobatch documents if it's not the same kind
|
||||
// this match branch MUST be AFTER the previous one
|
||||
(
|
||||
@@ -345,35 +336,7 @@ impl BatchKind {
|
||||
deletion_ids.push(id);
|
||||
Continue(BatchKind::DocumentClear { ids: deletion_ids })
|
||||
}
|
||||
// we can autobatch the deletion and import if the index already exists
|
||||
(
|
||||
BatchKind::DocumentDeletion { mut deletion_ids },
|
||||
K::DocumentImport { method, allow_index_creation, primary_key }
|
||||
) if index_already_exists => {
|
||||
deletion_ids.push(id);
|
||||
|
||||
Continue(BatchKind::DocumentOperation {
|
||||
method,
|
||||
allow_index_creation,
|
||||
primary_key,
|
||||
operation_ids: deletion_ids,
|
||||
})
|
||||
}
|
||||
// we can autobatch the deletion and import if both can't create an index
|
||||
(
|
||||
BatchKind::DocumentDeletion { mut deletion_ids },
|
||||
K::DocumentImport { method, allow_index_creation, primary_key }
|
||||
) if !allow_index_creation => {
|
||||
deletion_ids.push(id);
|
||||
|
||||
Continue(BatchKind::DocumentOperation {
|
||||
method,
|
||||
allow_index_creation,
|
||||
primary_key,
|
||||
operation_ids: deletion_ids,
|
||||
})
|
||||
}
|
||||
// we can't autobatch a deletion and an import if the index does not exists but would be created by an addition
|
||||
// we can't autobatch a deletion and an import
|
||||
(
|
||||
this @ BatchKind::DocumentDeletion { .. },
|
||||
K::DocumentImport { .. }
|
||||
@@ -674,36 +637,36 @@ mod tests {
|
||||
debug_snapshot!(autobatch_from(false,None, [settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(false,None, [settings(false), settings(false), settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0, 1, 2] }, false))");
|
||||
|
||||
// We can autobatch document addition with document deletion
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
|
||||
// And the other way around
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
|
||||
// We can't autobatch document addition with document deletion
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###);
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###);
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###);
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###);
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###);
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###);
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###);
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###);
|
||||
// we also can't do the only way around
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
|
||||
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -61,6 +61,8 @@ pub enum Error {
|
||||
SwapDuplicateIndexesFound(Vec<String>),
|
||||
#[error("Index `{0}` not found.")]
|
||||
SwapIndexNotFound(String),
|
||||
#[error("Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.")]
|
||||
NoSpaceLeftInTaskQueue,
|
||||
#[error(
|
||||
"Indexes {} not found.",
|
||||
.0.iter().map(|s| format!("`{}`", s)).collect::<Vec<_>>().join(", ")
|
||||
@@ -152,6 +154,8 @@ impl ErrorCode for Error {
|
||||
Error::TaskNotFound(_) => Code::TaskNotFound,
|
||||
Error::TaskDeletionWithEmptyQuery => Code::MissingTaskFilters,
|
||||
Error::TaskCancelationWithEmptyQuery => Code::MissingTaskFilters,
|
||||
// TODO: not sure of the Code to use
|
||||
Error::NoSpaceLeftInTaskQueue => Code::NoSpaceLeftOnDevice,
|
||||
Error::Dump(e) => e.error_code(),
|
||||
Error::Milli(e) => e.error_code(),
|
||||
Error::ProcessBatchPanicked => Code::Internal,
|
||||
|
||||
@@ -828,6 +828,13 @@ impl IndexScheduler {
|
||||
pub fn register(&self, kind: KindWithContent) -> Result<Task> {
|
||||
let mut wtxn = self.env.write_txn()?;
|
||||
|
||||
// if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task
|
||||
if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } if !tasks.is_empty())
|
||||
&& (self.env.non_free_pages_size()? * 100) / self.env.map_size()? as u64 > 50
|
||||
{
|
||||
return Err(Error::NoSpaceLeftInTaskQueue);
|
||||
}
|
||||
|
||||
let mut task = Task {
|
||||
uid: self.next_task_id(&wtxn)?,
|
||||
enqueued_at: OffsetDateTime::now_utc(),
|
||||
@@ -1936,105 +1943,6 @@ mod tests {
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "both_task_succeeded");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn document_addition_and_document_deletion() {
|
||||
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
|
||||
|
||||
let content = r#"[
|
||||
{ "id": 1, "doggo": "jean bob" },
|
||||
{ "id": 2, "catto": "jorts" },
|
||||
{ "id": 3, "doggo": "bork" }
|
||||
]"#;
|
||||
|
||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
|
||||
let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap();
|
||||
file.persist().unwrap();
|
||||
index_scheduler
|
||||
.register(KindWithContent::DocumentAdditionOrUpdate {
|
||||
index_uid: S("doggos"),
|
||||
primary_key: Some(S("id")),
|
||||
method: ReplaceDocuments,
|
||||
content_file: uuid,
|
||||
documents_count,
|
||||
allow_index_creation: true,
|
||||
})
|
||||
.unwrap();
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task");
|
||||
index_scheduler
|
||||
.register(KindWithContent::DocumentDeletion {
|
||||
index_uid: S("doggos"),
|
||||
documents_ids: vec![S("1"), S("2")],
|
||||
})
|
||||
.unwrap();
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task");
|
||||
|
||||
handle.advance_one_successful_batch(); // The addition AND deletion should've been batched together
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_processing_the_batch");
|
||||
|
||||
let index = index_scheduler.index("doggos").unwrap();
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
|
||||
let documents = index
|
||||
.all_documents(&rtxn)
|
||||
.unwrap()
|
||||
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn document_deletion_and_document_addition() {
|
||||
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
|
||||
index_scheduler
|
||||
.register(KindWithContent::DocumentDeletion {
|
||||
index_uid: S("doggos"),
|
||||
documents_ids: vec![S("1"), S("2")],
|
||||
})
|
||||
.unwrap();
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task");
|
||||
|
||||
let content = r#"[
|
||||
{ "id": 1, "doggo": "jean bob" },
|
||||
{ "id": 2, "catto": "jorts" },
|
||||
{ "id": 3, "doggo": "bork" }
|
||||
]"#;
|
||||
|
||||
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
|
||||
let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap();
|
||||
file.persist().unwrap();
|
||||
index_scheduler
|
||||
.register(KindWithContent::DocumentAdditionOrUpdate {
|
||||
index_uid: S("doggos"),
|
||||
primary_key: Some(S("id")),
|
||||
method: ReplaceDocuments,
|
||||
content_file: uuid,
|
||||
documents_count,
|
||||
allow_index_creation: true,
|
||||
})
|
||||
.unwrap();
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task");
|
||||
|
||||
// The deletion should have failed because it can't create an index
|
||||
handle.advance_one_failed_batch();
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_failing_the_deletion");
|
||||
|
||||
// The addition should works
|
||||
handle.advance_one_successful_batch();
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_last_successful_addition");
|
||||
|
||||
let index = index_scheduler.index("doggos").unwrap();
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
|
||||
let documents = index
|
||||
.all_documents(&rtxn)
|
||||
.unwrap()
|
||||
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn do_not_batch_task_of_different_indexes() {
|
||||
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
### Autobatching Enabled = true
|
||||
### Processing Tasks:
|
||||
[]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
|
||||
1 {uid: 1, status: succeeded, details: { received_document_ids: 2, deleted_documents: Some(2) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued []
|
||||
succeeded [0,1,]
|
||||
----------------------------------------------------------------------
|
||||
### Kind:
|
||||
"documentAdditionOrUpdate" [0,]
|
||||
"documentDeletion" [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Tasks:
|
||||
doggos [0,1,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Mapper:
|
||||
doggos: { number_of_documents: 1, field_distribution: {"doggo": 1, "id": 1} }
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Canceled By:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Enqueued At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
[timestamp] [0,1,]
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
[timestamp] [0,1,]
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
[
|
||||
{
|
||||
"id": 3,
|
||||
"doggo": "bork"
|
||||
}
|
||||
]
|
||||
@@ -1,37 +0,0 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
### Autobatching Enabled = true
|
||||
### Processing Tasks:
|
||||
[]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Kind:
|
||||
"documentAdditionOrUpdate" [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Tasks:
|
||||
doggos [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Mapper:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Canceled By:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Enqueued At:
|
||||
[timestamp] [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
00000000-0000-0000-0000-000000000000
|
||||
|
||||
----------------------------------------------------------------------
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
### Autobatching Enabled = true
|
||||
### Processing Tasks:
|
||||
[]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
|
||||
1 {uid: 1, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued [0,1,]
|
||||
----------------------------------------------------------------------
|
||||
### Kind:
|
||||
"documentAdditionOrUpdate" [0,]
|
||||
"documentDeletion" [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Tasks:
|
||||
doggos [0,1,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Mapper:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Canceled By:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Enqueued At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
00000000-0000-0000-0000-000000000000
|
||||
|
||||
----------------------------------------------------------------------
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
### Autobatching Enabled = true
|
||||
### Processing Tasks:
|
||||
[]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: failed, error: ResponseError { code: 200, message: "Index `doggos` not found.", error_code: "index_not_found", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_not_found" }, details: { received_document_ids: 2, deleted_documents: Some(0) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
|
||||
1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued [1,]
|
||||
failed [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Kind:
|
||||
"documentAdditionOrUpdate" [1,]
|
||||
"documentDeletion" [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Tasks:
|
||||
doggos [0,1,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Mapper:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Canceled By:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Enqueued At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
[timestamp] [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
[timestamp] [0,]
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
00000000-0000-0000-0000-000000000000
|
||||
|
||||
----------------------------------------------------------------------
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
### Autobatching Enabled = true
|
||||
### Processing Tasks:
|
||||
[]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: failed, error: ResponseError { code: 200, message: "Index `doggos` not found.", error_code: "index_not_found", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_not_found" }, details: { received_document_ids: 2, deleted_documents: Some(0) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
|
||||
1 {uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued []
|
||||
succeeded [1,]
|
||||
failed [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Kind:
|
||||
"documentAdditionOrUpdate" [1,]
|
||||
"documentDeletion" [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Tasks:
|
||||
doggos [0,1,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Mapper:
|
||||
doggos: { number_of_documents: 3, field_distribution: {"catto": 1, "doggo": 2, "id": 3} }
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Canceled By:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Enqueued At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"doggo": "jean bob"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"catto": "jorts"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"doggo": "bork"
|
||||
}
|
||||
]
|
||||
@@ -1,36 +0,0 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
### Autobatching Enabled = true
|
||||
### Processing Tasks:
|
||||
[]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Kind:
|
||||
"documentDeletion" [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Tasks:
|
||||
doggos [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Mapper:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Canceled By:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Enqueued At:
|
||||
[timestamp] [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
### Autobatching Enabled = true
|
||||
### Processing Tasks:
|
||||
[]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
|
||||
1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued [0,1,]
|
||||
----------------------------------------------------------------------
|
||||
### Kind:
|
||||
"documentAdditionOrUpdate" [1,]
|
||||
"documentDeletion" [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Tasks:
|
||||
doggos [0,1,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Mapper:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Canceled By:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Enqueued At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
00000000-0000-0000-0000-000000000000
|
||||
|
||||
----------------------------------------------------------------------
|
||||
|
||||
@@ -11,6 +11,6 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
insta = { version = "^1.19.1", features = ["json", "redactions"] }
|
||||
insta = { version = "^1.29.0", features = ["json", "redactions"] }
|
||||
md5 = "0.7.0"
|
||||
once_cell = "1.15"
|
||||
once_cell = "1.17"
|
||||
|
||||
@@ -11,16 +11,16 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.13.1"
|
||||
enum-iterator = "1.1.3"
|
||||
base64 = "0.21.0"
|
||||
enum-iterator = "1.4.0"
|
||||
hmac = "0.12.1"
|
||||
maplit = "1.0.2"
|
||||
meilisearch-types = { path = "../meilisearch-types" }
|
||||
rand = "0.8.5"
|
||||
roaring = { version = "0.10.0", features = ["serde"] }
|
||||
serde = { version = "1.0.145", features = ["derive"] }
|
||||
serde_json = { version = "1.0.85", features = ["preserve_order"] }
|
||||
roaring = { version = "0.10.1", features = ["serde"] }
|
||||
serde = { version = "1.0.160", features = ["derive"] }
|
||||
serde_json = { version = "1.0.95", features = ["preserve_order"] }
|
||||
sha2 = "0.10.6"
|
||||
thiserror = "1.0.37"
|
||||
time = { version = "0.3.15", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
uuid = { version = "1.1.2", features = ["serde", "v4"] }
|
||||
thiserror = "1.0.40"
|
||||
time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
uuid = { version = "1.3.1", features = ["serde", "v4"] }
|
||||
|
||||
@@ -310,6 +310,7 @@ pub const MASTER_KEY_MIN_SIZE: usize = 16;
|
||||
const MASTER_KEY_GEN_SIZE: usize = 32;
|
||||
|
||||
pub fn generate_master_key() -> String {
|
||||
use base64::Engine;
|
||||
use rand::rngs::OsRng;
|
||||
use rand::RngCore;
|
||||
|
||||
@@ -320,5 +321,5 @@ pub fn generate_master_key() -> String {
|
||||
|
||||
// let's encode the random bytes to base64 to make them human-readable and not too long.
|
||||
// We're using the URL_SAFE alphabet that will produce keys without =, / or other unusual characters.
|
||||
base64::encode_config(buf, base64::URL_SAFE_NO_PAD)
|
||||
base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(buf)
|
||||
}
|
||||
|
||||
@@ -11,31 +11,31 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
actix-web = { version = "4.2.1", default-features = false }
|
||||
anyhow = "1.0.65"
|
||||
actix-web = { version = "4.3.1", default-features = false }
|
||||
anyhow = "1.0.70"
|
||||
convert_case = "0.6.0"
|
||||
csv = "1.1.6"
|
||||
csv = "1.2.1"
|
||||
deserr = "0.5.0"
|
||||
either = { version = "1.6.1", features = ["serde"] }
|
||||
enum-iterator = "1.1.3"
|
||||
either = { version = "1.8.1", features = ["serde"] }
|
||||
enum-iterator = "1.4.0"
|
||||
file-store = { path = "../file-store" }
|
||||
flate2 = "1.0.24"
|
||||
flate2 = "1.0.25"
|
||||
fst = "0.4.7"
|
||||
memmap2 = "0.5.7"
|
||||
memmap2 = "0.5.10"
|
||||
milli = { path = "../milli", default-features = false }
|
||||
roaring = { version = "0.10.0", features = ["serde"] }
|
||||
serde = { version = "1.0.145", features = ["derive"] }
|
||||
roaring = { version = "0.10.1", features = ["serde"] }
|
||||
serde = { version = "1.0.160", features = ["derive"] }
|
||||
serde-cs = "0.2.4"
|
||||
serde_json = "1.0.85"
|
||||
serde_json = "1.0.95"
|
||||
tar = "0.4.38"
|
||||
tempfile = "3.3.0"
|
||||
thiserror = "1.0.30"
|
||||
time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
tokio = "1.24"
|
||||
uuid = { version = "1.1.2", features = ["serde", "v4"] }
|
||||
tempfile = "3.5.0"
|
||||
thiserror = "1.0.40"
|
||||
time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
tokio = "1.27"
|
||||
uuid = { version = "1.3.1", features = ["serde", "v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
insta = "1.19.1"
|
||||
insta = "1.29.0"
|
||||
meili-snap = { path = "../meili-snap" }
|
||||
|
||||
[features]
|
||||
@@ -50,3 +50,6 @@ hebrew = ["milli/hebrew"]
|
||||
japanese = ["milli/japanese"]
|
||||
# thai specialized tokenization
|
||||
thai = ["milli/thai"]
|
||||
|
||||
# allow greek specialized tokenization
|
||||
greek = ["milli/greek"]
|
||||
|
||||
@@ -46,7 +46,7 @@ pub fn check_version_file(db_path: &Path) -> anyhow::Result<()> {
|
||||
pub enum VersionFileError {
|
||||
#[error(
|
||||
"Meilisearch (v{}) failed to infer the version of the database.
|
||||
To update Meilisearch please follow our guide on https://docs.meilisearch.com/learn/update_and_migration/updating.html.",
|
||||
To update Meilisearch please follow our guide on https://www.meilisearch.com/docs/learn/update_and_migration/updating.",
|
||||
env!("CARGO_PKG_VERSION").to_string()
|
||||
)]
|
||||
MissingVersionFile,
|
||||
@@ -54,7 +54,7 @@ pub enum VersionFileError {
|
||||
MalformedVersionFile,
|
||||
#[error(
|
||||
"Your database version ({major}.{minor}.{patch}) is incompatible with your current engine version ({}).\n\
|
||||
To migrate data between Meilisearch versions, please follow our guide on https://docs.meilisearch.com/learn/update_and_migration/updating.html.",
|
||||
To migrate data between Meilisearch versions, please follow our guide on https://www.meilisearch.com/docs/learn/update_and_migration/updating.",
|
||||
env!("CARGO_PKG_VERSION").to_string()
|
||||
)]
|
||||
VersionMismatch { major: String, minor: String, patch: String },
|
||||
|
||||
@@ -13,97 +13,97 @@ license.workspace = true
|
||||
default-run = "meilisearch"
|
||||
|
||||
[dependencies]
|
||||
actix-cors = "0.6.3"
|
||||
actix-http = { version = "3.2.2", default-features = false, features = ["compress-brotli", "compress-gzip", "rustls"] }
|
||||
actix-web = { version = "4.2.1", default-features = false, features = ["macros", "compress-brotli", "compress-gzip", "cookies", "rustls"] }
|
||||
actix-cors = "0.6.4"
|
||||
actix-http = { version = "3.3.1", default-features = false, features = ["compress-brotli", "compress-gzip", "rustls"] }
|
||||
actix-web = { version = "4.3.1", default-features = false, features = ["macros", "compress-brotli", "compress-gzip", "cookies", "rustls"] }
|
||||
actix-web-static-files = { git = "https://github.com/kilork/actix-web-static-files.git", rev = "2d3b6160", optional = true }
|
||||
anyhow = { version = "1.0.65", features = ["backtrace"] }
|
||||
async-stream = "0.3.3"
|
||||
async-trait = "0.1.57"
|
||||
bstr = "1.0.1"
|
||||
byte-unit = { version = "4.0.14", default-features = false, features = ["std", "serde"] }
|
||||
bytes = "1.2.1"
|
||||
clap = { version = "4.0.9", features = ["derive", "env"] }
|
||||
crossbeam-channel = "0.5.6"
|
||||
anyhow = { version = "1.0.70", features = ["backtrace"] }
|
||||
async-stream = "0.3.5"
|
||||
async-trait = "0.1.68"
|
||||
bstr = "1.4.0"
|
||||
byte-unit = { version = "4.0.19", default-features = false, features = ["std", "serde"] }
|
||||
bytes = "1.4.0"
|
||||
clap = { version = "4.2.1", features = ["derive", "env"] }
|
||||
crossbeam-channel = "0.5.8"
|
||||
deserr = "0.5.0"
|
||||
dump = { path = "../dump" }
|
||||
either = "1.8.0"
|
||||
env_logger = "0.9.1"
|
||||
either = "1.8.1"
|
||||
env_logger = "0.10.0"
|
||||
file-store = { path = "../file-store" }
|
||||
flate2 = "1.0.24"
|
||||
flate2 = "1.0.25"
|
||||
fst = "0.4.7"
|
||||
futures = "0.3.24"
|
||||
futures-util = "0.3.24"
|
||||
http = "0.2.8"
|
||||
futures = "0.3.28"
|
||||
futures-util = "0.3.28"
|
||||
http = "0.2.9"
|
||||
index-scheduler = { path = "../index-scheduler" }
|
||||
indexmap = { version = "1.9.1", features = ["serde-1"] }
|
||||
indexmap = { version = "1.9.3", features = ["serde-1"] }
|
||||
itertools = "0.10.5"
|
||||
jsonwebtoken = "8.1.1"
|
||||
jsonwebtoken = "8.3.0"
|
||||
lazy_static = "1.4.0"
|
||||
log = "0.4.17"
|
||||
meilisearch-auth = { path = "../meilisearch-auth" }
|
||||
meilisearch-types = { path = "../meilisearch-types" }
|
||||
mimalloc = { version = "0.1.29", default-features = false }
|
||||
mime = "0.3.16"
|
||||
num_cpus = "1.13.1"
|
||||
mimalloc = { version = "0.1.36", default-features = false }
|
||||
mime = "0.3.17"
|
||||
num_cpus = "1.15.0"
|
||||
obkv = "0.2.0"
|
||||
once_cell = "1.15.0"
|
||||
once_cell = "1.17.1"
|
||||
parking_lot = "0.12.1"
|
||||
permissive-json-pointer = { path = "../permissive-json-pointer" }
|
||||
pin-project-lite = "0.2.9"
|
||||
platform-dirs = "0.3.0"
|
||||
prometheus = { version = "0.13.2", features = ["process"] }
|
||||
prometheus = { version = "0.13.3", features = ["process"] }
|
||||
rand = "0.8.5"
|
||||
rayon = "1.5.3"
|
||||
regex = "1.6.0"
|
||||
reqwest = { version = "0.11.12", features = ["rustls-tls", "json"], default-features = false }
|
||||
rustls = "0.20.6"
|
||||
rustls-pemfile = "1.0.1"
|
||||
segment = { version = "0.2.1", optional = true }
|
||||
serde = { version = "1.0.145", features = ["derive"] }
|
||||
serde_json = { version = "1.0.85", features = ["preserve_order"] }
|
||||
rayon = "1.7.0"
|
||||
regex = "1.7.3"
|
||||
reqwest = { version = "0.11.16", features = ["rustls-tls", "json"], default-features = false }
|
||||
rustls = "0.20.8"
|
||||
rustls-pemfile = "1.0.2"
|
||||
segment = { version = "0.2.2", optional = true }
|
||||
serde = { version = "1.0.160", features = ["derive"] }
|
||||
serde_json = { version = "1.0.95", features = ["preserve_order"] }
|
||||
sha2 = "0.10.6"
|
||||
siphasher = "0.3.10"
|
||||
slice-group-by = "0.3.0"
|
||||
static-files = { version = "0.2.3", optional = true }
|
||||
sysinfo = "0.26.4"
|
||||
sysinfo = "0.28.4"
|
||||
tar = "0.4.38"
|
||||
tempfile = "3.3.0"
|
||||
thiserror = "1.0.37"
|
||||
time = { version = "0.3.15", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
tokio = { version = "1.24.2", features = ["full"] }
|
||||
tokio-stream = "0.1.10"
|
||||
toml = "0.5.9"
|
||||
uuid = { version = "1.1.2", features = ["serde", "v4"] }
|
||||
walkdir = "2.3.2"
|
||||
yaup = "0.2.0"
|
||||
tempfile = "3.5.0"
|
||||
thiserror = "1.0.40"
|
||||
time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
tokio = { version = "1.27.0", features = ["full"] }
|
||||
tokio-stream = "0.1.12"
|
||||
toml = "0.7.3"
|
||||
uuid = { version = "1.3.1", features = ["serde", "v4"] }
|
||||
walkdir = "2.3.3"
|
||||
yaup = "0.2.1"
|
||||
serde_urlencoded = "0.7.1"
|
||||
actix-utils = "3.0.1"
|
||||
atty = "0.2.14"
|
||||
termcolor = "1.1.3"
|
||||
termcolor = "1.2.0"
|
||||
|
||||
[dev-dependencies]
|
||||
actix-rt = "2.7.0"
|
||||
actix-rt = "2.8.0"
|
||||
assert-json-diff = "2.0.2"
|
||||
brotli = "3.3.4"
|
||||
insta = "1.19.1"
|
||||
insta = "1.29.0"
|
||||
manifest-dir-macros = "0.1.16"
|
||||
maplit = "1.0.2"
|
||||
meili-snap = {path = "../meili-snap"}
|
||||
temp-env = "0.3.1"
|
||||
temp-env = "0.3.3"
|
||||
urlencoding = "2.1.2"
|
||||
yaup = "0.2.1"
|
||||
|
||||
[build-dependencies]
|
||||
anyhow = { version = "1.0.65", optional = true }
|
||||
cargo_toml = { version = "0.14.0", optional = true }
|
||||
anyhow = { version = "1.0.70", optional = true }
|
||||
cargo_toml = { version = "0.15.2", optional = true }
|
||||
hex = { version = "0.4.3", optional = true }
|
||||
reqwest = { version = "0.11.12", features = ["blocking", "rustls-tls"], default-features = false, optional = true }
|
||||
sha-1 = { version = "0.10.0", optional = true }
|
||||
reqwest = { version = "0.11.16", features = ["blocking", "rustls-tls"], default-features = false, optional = true }
|
||||
sha-1 = { version = "0.10.1", optional = true }
|
||||
static-files = { version = "0.2.3", optional = true }
|
||||
tempfile = { version = "3.3.0", optional = true }
|
||||
vergen = { version = "7.4.2", default-features = false, features = ["git"] }
|
||||
zip = { version = "0.6.2", optional = true }
|
||||
tempfile = { version = "3.5.0", optional = true }
|
||||
vergen = { version = "7.5.1", default-features = false, features = ["git"] }
|
||||
zip = { version = "0.6.4", optional = true }
|
||||
|
||||
[features]
|
||||
default = ["analytics", "meilisearch-types/default", "mini-dashboard"]
|
||||
@@ -113,6 +113,7 @@ chinese = ["meilisearch-types/chinese"]
|
||||
hebrew = ["meilisearch-types/hebrew"]
|
||||
japanese = ["meilisearch-types/japanese"]
|
||||
thai = ["meilisearch-types/thai"]
|
||||
greek = ["meilisearch-types/greek"]
|
||||
|
||||
[package.metadata.mini-dashboard]
|
||||
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.7/build.zip"
|
||||
|
||||
@@ -149,7 +149,7 @@ pub fn print_launch_resume(
|
||||
"
|
||||
Thank you for using Meilisearch!
|
||||
|
||||
\nWe collect anonymized analytics to improve our product and your experience. To learn more, including how to turn off analytics, visit our dedicated documentation page: https://docs.meilisearch.com/learn/what_is_meilisearch/telemetry.html
|
||||
\nWe collect anonymized analytics to improve our product and your experience. To learn more, including how to turn off analytics, visit our dedicated documentation page: https://www.meilisearch.com/docs/learn/what_is_meilisearch/telemetry
|
||||
|
||||
Anonymous telemetry:\t\"Enabled\""
|
||||
);
|
||||
|
||||
@@ -68,7 +68,7 @@ const DEFAULT_LOG_EVERY_N: usize = 100_000;
|
||||
// The actual size of the virtual address space is computed at startup to determine how many 2TiB indexes can be
|
||||
// opened simultaneously.
|
||||
pub const INDEX_SIZE: u64 = 2 * 1024 * 1024 * 1024 * 1024; // 2 TiB
|
||||
pub const TASK_DB_SIZE: u64 = 10 * 1024 * 1024 * 1024; // 10 GiB
|
||||
pub const TASK_DB_SIZE: u64 = 20 * 1024 * 1024 * 1024; // 20 GiB
|
||||
|
||||
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "UPPERCASE")]
|
||||
@@ -323,10 +323,10 @@ impl Opt {
|
||||
.clone()
|
||||
.unwrap_or_else(|| PathBuf::from(DEFAULT_CONFIG_FILE_PATH));
|
||||
|
||||
match std::fs::read(&config_file_path) {
|
||||
match std::fs::read_to_string(&config_file_path) {
|
||||
Ok(config) => {
|
||||
// If the file is successfully read, we deserialize it with `toml`.
|
||||
let opt_from_config = toml::from_slice::<Opt>(&config)?;
|
||||
let opt_from_config = toml::from_str::<Opt>(&config)?;
|
||||
// Return an error if config file contains 'config_file_path'
|
||||
// Using that key in the config file doesn't make sense bc it creates a logical loop (config file referencing itself)
|
||||
if opt_from_config.config_file_path.is_some() {
|
||||
|
||||
@@ -60,7 +60,7 @@ async fn create_api_key_bad_uid() {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
"message": "Invalid value at `.uid`: invalid character: expected an optional prefix of `urn:uuid:` followed by [0-9a-zA-Z], found `o` at 2",
|
||||
"message": "Invalid value at `.uid`: invalid character: expected an optional prefix of `urn:uuid:` followed by [0-9a-fA-F-], found `o` at 2",
|
||||
"code": "invalid_api_key_uid",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_api_key_uid"
|
||||
|
||||
@@ -1773,7 +1773,7 @@ async fn error_add_documents_payload_size() {
|
||||
"content": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec metus erat, consequat in blandit venenatis, ultricies eu ipsum. Etiam luctus elit et mollis ultrices. Nam turpis risus, dictum non eros in, eleifend feugiat elit. Morbi non dolor pulvinar, sagittis mi sed, ultricies lorem. Nulla ultricies sem metus. Donec at suscipit quam, sed elementum mi. Suspendisse potenti. Fusce pharetra turpis tortor, sed eleifend odio dapibus ut. Nulla facilisi. Suspendisse elementum, dui eget aliquet dignissim, ex tellus aliquam nisl, at eleifend nisl metus tempus diam. Mauris fermentum sollicitudin efficitur. Donec dignissim est vitae elit finibus faucibus"
|
||||
}
|
||||
);
|
||||
let documents: Vec<_> = (0..16000).into_iter().map(|_| document.clone()).collect();
|
||||
let documents: Vec<_> = (0..16000).map(|_| document.clone()).collect();
|
||||
let documents = json!(documents);
|
||||
let (response, code) = index.add_documents(documents, None).await;
|
||||
|
||||
@@ -1934,7 +1934,6 @@ async fn batch_several_documents_addition() {
|
||||
let index = server.index("test");
|
||||
|
||||
let mut documents: Vec<_> = (0..150usize)
|
||||
.into_iter()
|
||||
.map(|id| {
|
||||
json!(
|
||||
{
|
||||
|
||||
@@ -547,7 +547,7 @@ async fn filter_invalid_syntax_object() {
|
||||
index.wait_task(1).await;
|
||||
|
||||
let expected_response = json!({
|
||||
"message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass",
|
||||
"message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass",
|
||||
"code": "invalid_search_filter",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
|
||||
@@ -572,7 +572,7 @@ async fn filter_invalid_syntax_array() {
|
||||
index.wait_task(1).await;
|
||||
|
||||
let expected_response = json!({
|
||||
"message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass",
|
||||
"message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass",
|
||||
"code": "invalid_search_filter",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
mod errors;
|
||||
|
||||
use byte_unit::{Byte, ByteUnit};
|
||||
use meili_snap::insta::assert_json_snapshot;
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use serde_json::json;
|
||||
use tempfile::TempDir;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::common::{default_settings, Server};
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn error_get_unexisting_task_status() {
|
||||
@@ -1000,3 +1003,117 @@ async fn test_summarized_dump_creation() {
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[actix_web::test]
|
||||
async fn test_task_queue_is_full() {
|
||||
let dir = TempDir::new().unwrap();
|
||||
let mut options = default_settings(dir.path());
|
||||
options.max_task_db_size = Byte::from_unit(500.0, ByteUnit::B).unwrap();
|
||||
|
||||
let server = Server::new_with_options(options).await.unwrap();
|
||||
|
||||
// the first task should be enqueued without issue
|
||||
let (result, code) = server.create_index(json!({ "uid": "doggo" })).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
snapshot!(json_string!(result, { ".enqueuedAt" => "[date]" }), @r###"
|
||||
{
|
||||
"taskUid": 0,
|
||||
"indexUid": "doggo",
|
||||
"status": "enqueued",
|
||||
"type": "indexCreation",
|
||||
"enqueuedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
loop {
|
||||
let (res, code) = server.create_index(json!({ "uid": "doggo" })).await;
|
||||
if code == 422 {
|
||||
break;
|
||||
}
|
||||
if res["taskUid"] == json!(null) {
|
||||
panic!(
|
||||
"Encountered the strange case:\n{}",
|
||||
serde_json::to_string_pretty(&res).unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let (result, code) = server.create_index(json!({ "uid": "doggo" })).await;
|
||||
snapshot!(code, @"422 Unprocessable Entity");
|
||||
snapshot!(json_string!(result), @r###"
|
||||
{
|
||||
"message": "Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.",
|
||||
"code": "no_space_left_on_device",
|
||||
"type": "system",
|
||||
"link": "https://docs.meilisearch.com/errors#no_space_left_on_device"
|
||||
}
|
||||
"###);
|
||||
|
||||
// But we should still be able to register tasks deletion IF they delete something
|
||||
let (result, code) = server.delete_tasks("uids=*").await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(result, { ".enqueuedAt" => "[date]", ".taskUid" => "uid" }), @r###"
|
||||
{
|
||||
"taskUid": "uid",
|
||||
"indexUid": null,
|
||||
"status": "enqueued",
|
||||
"type": "taskDeletion",
|
||||
"enqueuedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
let result = server.wait_task(result["taskUid"].as_u64().unwrap()).await;
|
||||
snapshot!(json_string!(result["status"]), @r###""succeeded""###);
|
||||
|
||||
// Now we should be able to register tasks again
|
||||
let (result, code) = server.create_index(json!({ "uid": "doggo" })).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
snapshot!(json_string!(result, { ".enqueuedAt" => "[date]", ".taskUid" => "uid" }), @r###"
|
||||
{
|
||||
"taskUid": "uid",
|
||||
"indexUid": "doggo",
|
||||
"status": "enqueued",
|
||||
"type": "indexCreation",
|
||||
"enqueuedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
// we're going to fill up the queue once again
|
||||
loop {
|
||||
let (res, code) = server.delete_tasks("uids=0").await;
|
||||
if code == 422 {
|
||||
break;
|
||||
}
|
||||
if res["taskUid"] == json!(null) {
|
||||
panic!(
|
||||
"Encountered the strange case:\n{}",
|
||||
serde_json::to_string_pretty(&res).unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// But we should NOT be able to register this task because it doesn't match any tasks
|
||||
let (result, code) = server.delete_tasks("uids=0").await;
|
||||
snapshot!(code, @"422 Unprocessable Entity");
|
||||
snapshot!(json_string!(result), @r###"
|
||||
{
|
||||
"message": "Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.",
|
||||
"code": "no_space_left_on_device",
|
||||
"type": "system",
|
||||
"link": "https://docs.meilisearch.com/errors#no_space_left_on_device"
|
||||
}
|
||||
"###);
|
||||
|
||||
// The deletion still works
|
||||
let (result, code) = server.delete_tasks("uids=*").await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(result, { ".enqueuedAt" => "[date]", ".taskUid" => "uid" }), @r###"
|
||||
{
|
||||
"taskUid": "uid",
|
||||
"indexUid": null,
|
||||
"status": "enqueued",
|
||||
"type": "taskDeletion",
|
||||
"enqueuedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
@@ -12,40 +12,40 @@ readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
bimap = { version = "0.6.2", features = ["serde"] }
|
||||
bimap = { version = "0.6.3", features = ["serde"] }
|
||||
bincode = "1.3.3"
|
||||
bstr = "1.0.1"
|
||||
bstr = "1.4.0"
|
||||
byteorder = "1.4.3"
|
||||
charabia = { version = "0.7.1", default-features = false }
|
||||
charabia = { version = "0.7.2", default-features = false }
|
||||
concat-arrays = "0.1.2"
|
||||
crossbeam-channel = "0.5.6"
|
||||
crossbeam-channel = "0.5.8"
|
||||
deserr = "0.5.0"
|
||||
either = "1.8.0"
|
||||
either = "1.8.1"
|
||||
flatten-serde-json = { path = "../flatten-serde-json" }
|
||||
fst = "0.4.7"
|
||||
fxhash = "0.2.1"
|
||||
geoutils = "0.5.1"
|
||||
grenad = { version = "0.4.3", default-features = false, features = ["tempfile"] }
|
||||
grenad = { version = "0.4.4", default-features = false, features = ["tempfile"] }
|
||||
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.5", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
||||
json-depth-checker = { path = "../json-depth-checker" }
|
||||
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
||||
memmap2 = "0.5.7"
|
||||
memmap2 = "0.5.10"
|
||||
obkv = "0.2.0"
|
||||
once_cell = "1.15.0"
|
||||
ordered-float = "3.2.0"
|
||||
rayon = "1.5.3"
|
||||
once_cell = "1.17.1"
|
||||
ordered-float = "3.6.0"
|
||||
rayon = "1.7.0"
|
||||
roaring = "0.10.1"
|
||||
rstar = { version = "0.9.3", features = ["serde"] }
|
||||
serde = { version = "1.0.145", features = ["derive"] }
|
||||
serde_json = { version = "1.0.85", features = ["preserve_order"] }
|
||||
rstar = { version = "0.10.0", features = ["serde"] }
|
||||
serde = { version = "1.0.160", features = ["derive"] }
|
||||
serde_json = { version = "1.0.95", features = ["preserve_order"] }
|
||||
slice-group-by = "0.3.0"
|
||||
smallstr = { version = "0.3.0", features = ["serde"] }
|
||||
smallvec = "1.10.0"
|
||||
smartstring = "1.0.1"
|
||||
tempfile = "3.3.0"
|
||||
thiserror = "1.0.37"
|
||||
time = { version = "0.3.15", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
uuid = { version = "1.1.2", features = ["v4"] }
|
||||
tempfile = "3.5.0"
|
||||
thiserror = "1.0.40"
|
||||
time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
uuid = { version = "1.3.1", features = ["v4"] }
|
||||
|
||||
filter-parser = { path = "../filter-parser" }
|
||||
|
||||
@@ -55,11 +55,12 @@ itertools = "0.10.5"
|
||||
# logging
|
||||
log = "0.4.17"
|
||||
logging_timer = "1.1.0"
|
||||
csv = "1.1.6"
|
||||
csv = "1.2.1"
|
||||
|
||||
[dev-dependencies]
|
||||
mimalloc = { version = "0.1.29", default-features = false }
|
||||
big_s = "1.0.2"
|
||||
insta = "1.21.0"
|
||||
insta = "1.29.0"
|
||||
maplit = "1.0.2"
|
||||
md5 = "0.7.0"
|
||||
rand = {version = "0.8.5", features = ["small_rng"] }
|
||||
@@ -89,3 +90,6 @@ korean = ["charabia/korean"]
|
||||
|
||||
# allow thai specialized tokenization
|
||||
thai = ["charabia/thai"]
|
||||
|
||||
# allow greek specialized tokenization
|
||||
greek = ["charabia/greek"]
|
||||
|
||||
114
milli/examples/index.rs
Normal file
114
milli/examples/index.rs
Normal file
@@ -0,0 +1,114 @@
|
||||
use std::error::Error;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, Cursor, Seek};
|
||||
use std::path::Path;
|
||||
|
||||
use heed::EnvOpenOptions;
|
||||
use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||
use milli::{Index, Object};
|
||||
|
||||
fn usage(error: &str, program_name: &str) -> String {
|
||||
format!(
|
||||
"{}. Usage: {} <PATH-TO-INDEX> <PATH-TO-DATASET> [searchable_fields] [filterable_fields]",
|
||||
error, program_name
|
||||
)
|
||||
}
|
||||
|
||||
fn main() -> Result<(), Box<dyn Error>> {
|
||||
let mut args = std::env::args();
|
||||
let program_name = args.next().expect("No program name");
|
||||
let index_path =
|
||||
args.next().unwrap_or_else(|| panic!("{}", usage("Missing path to index.", &program_name)));
|
||||
let dataset_path = args
|
||||
.next()
|
||||
.unwrap_or_else(|| panic!("{}", usage("Missing path to source dataset.", &program_name)));
|
||||
// let primary_key = args.next().unwrap_or_else(|| "id".into());
|
||||
// "title overview"
|
||||
let searchable_fields: Vec<String> = args
|
||||
.next()
|
||||
.map(|arg| arg.split_whitespace().map(ToString::to_string).collect())
|
||||
.unwrap_or_default();
|
||||
|
||||
println!("{searchable_fields:?}");
|
||||
// "release_date genres"
|
||||
let filterable_fields: Vec<String> = args
|
||||
.next()
|
||||
.map(|arg| arg.split_whitespace().map(ToString::to_string).collect())
|
||||
.unwrap_or_default();
|
||||
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
std::fs::create_dir_all(&index_path).unwrap();
|
||||
let index = Index::new(options, index_path).unwrap();
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
let config = IndexerConfig::default();
|
||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||
// builder.set_primary_key(primary_key);
|
||||
let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
|
||||
builder.set_searchable_fields(searchable_fields);
|
||||
let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
|
||||
builder.set_filterable_fields(filterable_fields);
|
||||
|
||||
builder.execute(|_| (), || false).unwrap();
|
||||
|
||||
let config = IndexerConfig::default();
|
||||
let indexing_config = IndexDocumentsConfig::default();
|
||||
|
||||
let builder =
|
||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
|
||||
|
||||
let documents = documents_from(
|
||||
&dataset_path,
|
||||
Path::new(&dataset_path).extension().unwrap_or_default().to_str().unwrap_or_default(),
|
||||
);
|
||||
let (builder, user_error) = builder.add_documents(documents).unwrap();
|
||||
user_error.unwrap();
|
||||
builder.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
Ok(())
|
||||
}
|
||||
fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
|
||||
let reader = File::open(filename)
|
||||
.unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
|
||||
let reader = BufReader::new(reader);
|
||||
let documents = match filetype {
|
||||
"csv" => documents_from_csv(reader).unwrap(),
|
||||
"json" => documents_from_json(reader).unwrap(),
|
||||
"jsonl" => documents_from_jsonl(reader).unwrap(),
|
||||
otherwise => panic!("invalid update format {:?}", otherwise),
|
||||
};
|
||||
DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
|
||||
}
|
||||
|
||||
fn documents_from_jsonl(reader: impl BufRead) -> milli::Result<Vec<u8>> {
|
||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||
|
||||
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
||||
let object = result.unwrap();
|
||||
documents.append_json_object(&object)?;
|
||||
}
|
||||
|
||||
documents.into_inner().map_err(Into::into)
|
||||
}
|
||||
|
||||
fn documents_from_json(reader: impl BufRead) -> milli::Result<Vec<u8>> {
|
||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||
|
||||
documents.append_json_array(reader)?;
|
||||
|
||||
documents.into_inner().map_err(Into::into)
|
||||
}
|
||||
|
||||
fn documents_from_csv(reader: impl BufRead) -> milli::Result<Vec<u8>> {
|
||||
let csv = csv::Reader::from_reader(reader);
|
||||
|
||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||
documents.append_csv(csv)?;
|
||||
|
||||
documents.into_inner().map_err(Into::into)
|
||||
}
|
||||
117
milli/examples/search.rs
Normal file
117
milli/examples/search.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
use std::error::Error;
|
||||
use std::io::stdin;
|
||||
use std::path::Path;
|
||||
use std::time::Instant;
|
||||
|
||||
use heed::EnvOpenOptions;
|
||||
use milli::{
|
||||
execute_search, DefaultSearchLogger, GeoSortStrategy, Index, SearchContext, SearchLogger,
|
||||
TermsMatchingStrategy,
|
||||
};
|
||||
|
||||
#[global_allocator]
|
||||
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
||||
|
||||
fn main() -> Result<(), Box<dyn Error>> {
|
||||
let mut args = std::env::args();
|
||||
let program_name = args.next().expect("No program name");
|
||||
let dataset = args.next().unwrap_or_else(|| {
|
||||
panic!(
|
||||
"Missing path to index. Usage: {} <PATH-TO-INDEX> [<logger-dir>] [print-documents]",
|
||||
program_name
|
||||
)
|
||||
});
|
||||
let detailed_logger_dir = args.next();
|
||||
let print_documents: bool =
|
||||
if let Some(arg) = args.next() { arg == "print-documents" } else { false };
|
||||
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
let index = Index::new(options, dataset)?;
|
||||
let txn = index.read_txn()?;
|
||||
let mut query = String::new();
|
||||
while stdin().read_line(&mut query)? > 0 {
|
||||
for _ in 0..2 {
|
||||
let mut default_logger = DefaultSearchLogger;
|
||||
// FIXME: consider resetting the state of the logger between search executions as otherwise panics are possible.
|
||||
// Workaround'd here by recreating the logger on each iteration of the loop
|
||||
let mut detailed_logger = detailed_logger_dir
|
||||
.as_ref()
|
||||
.map(|logger_dir| (milli::VisualSearchLogger::default(), logger_dir));
|
||||
let logger: &mut dyn SearchLogger<_> =
|
||||
if let Some((detailed_logger, _)) = detailed_logger.as_mut() {
|
||||
detailed_logger
|
||||
} else {
|
||||
&mut default_logger
|
||||
};
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
let mut ctx = SearchContext::new(&index, &txn);
|
||||
let docs = execute_search(
|
||||
&mut ctx,
|
||||
&(!query.trim().is_empty()).then(|| query.trim().to_owned()),
|
||||
TermsMatchingStrategy::Last,
|
||||
false,
|
||||
&None,
|
||||
&None,
|
||||
GeoSortStrategy::default(),
|
||||
0,
|
||||
20,
|
||||
None,
|
||||
&mut DefaultSearchLogger,
|
||||
logger,
|
||||
)?;
|
||||
if let Some((logger, dir)) = detailed_logger {
|
||||
logger.finish(&mut ctx, Path::new(dir))?;
|
||||
}
|
||||
let elapsed = start.elapsed();
|
||||
println!("new: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
|
||||
if print_documents {
|
||||
let documents = index
|
||||
.documents(&txn, docs.documents_ids.iter().copied())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(id, obkv)| {
|
||||
let mut object = serde_json::Map::default();
|
||||
for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
|
||||
let value = obkv.get(fid).unwrap();
|
||||
let value: serde_json::Value = serde_json::from_slice(value).unwrap();
|
||||
object.insert(fid_name.to_owned(), value);
|
||||
}
|
||||
(id, serde_json::to_string_pretty(&object).unwrap())
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for (id, document) in documents {
|
||||
println!("{id}:");
|
||||
println!("{document}");
|
||||
}
|
||||
|
||||
let documents = index
|
||||
.documents(&txn, docs.documents_ids.iter().copied())
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(id, obkv)| {
|
||||
let mut object = serde_json::Map::default();
|
||||
for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
|
||||
let value = obkv.get(fid).unwrap();
|
||||
let value: serde_json::Value = serde_json::from_slice(value).unwrap();
|
||||
object.insert(fid_name.to_owned(), value);
|
||||
}
|
||||
(id, serde_json::to_string_pretty(&object).unwrap())
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
|
||||
for (id, document) in documents {
|
||||
println!("{id}:");
|
||||
println!("{document}");
|
||||
}
|
||||
}
|
||||
}
|
||||
query.clear();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
33
milli/examples/settings.rs
Normal file
33
milli/examples/settings.rs
Normal file
@@ -0,0 +1,33 @@
|
||||
// use big_s::S;
|
||||
use heed::EnvOpenOptions;
|
||||
// use maplit::hashset;
|
||||
use milli::{
|
||||
update::{IndexerConfig, Settings},
|
||||
Criterion, Index,
|
||||
};
|
||||
|
||||
fn main() {
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
|
||||
|
||||
let index = Index::new(options, "data_movies.ms").unwrap();
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
let config = IndexerConfig::default();
|
||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||
|
||||
// builder.set_min_word_len_one_typo(5);
|
||||
// builder.set_min_word_len_two_typos(7);
|
||||
// builder.set_sortable_fields(hashset! { S("release_date") });
|
||||
builder.set_criteria(vec![
|
||||
Criterion::Words,
|
||||
Criterion::Typo,
|
||||
Criterion::Proximity,
|
||||
Criterion::Attribute,
|
||||
Criterion::Sort,
|
||||
Criterion::Exactness,
|
||||
]);
|
||||
|
||||
builder.execute(|_| (), || false).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
@@ -21,5 +21,5 @@ pub use self::roaring_bitmap_length::{
|
||||
BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
|
||||
};
|
||||
pub use self::script_language_codec::ScriptLanguageCodec;
|
||||
pub use self::str_beu32_codec::StrBEU32Codec;
|
||||
pub use self::str_beu32_codec::{StrBEU16Codec, StrBEU32Codec};
|
||||
pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
|
||||
|
||||
@@ -36,3 +36,39 @@ impl<'a> heed::BytesEncode<'a> for StrBEU32Codec {
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StrBEU16Codec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for StrBEU16Codec {
|
||||
type DItem = (&'a str, u16);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let footer_len = size_of::<u16>();
|
||||
|
||||
if bytes.len() < footer_len + 1 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let (word_plus_nul_byte, bytes) = bytes.split_at(bytes.len() - footer_len);
|
||||
let (_, word) = word_plus_nul_byte.split_last()?;
|
||||
let word = str::from_utf8(word).ok()?;
|
||||
let pos = bytes.try_into().map(u16::from_be_bytes).ok()?;
|
||||
|
||||
Some((word, pos))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for StrBEU16Codec {
|
||||
type EItem = (&'a str, u16);
|
||||
|
||||
fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let pos = pos.to_be_bytes();
|
||||
|
||||
let mut bytes = Vec::with_capacity(word.len() + 1 + pos.len());
|
||||
bytes.extend_from_slice(word.as_bytes());
|
||||
bytes.push(0);
|
||||
bytes.extend_from_slice(&pos[..]);
|
||||
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,12 +19,12 @@ use crate::heed_codec::facet::{
|
||||
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
||||
FieldIdCodec, OrderedF64Codec,
|
||||
};
|
||||
use crate::heed_codec::{ScriptLanguageCodec, StrRefCodec};
|
||||
use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
|
||||
use crate::{
|
||||
default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
|
||||
DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
|
||||
FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
|
||||
Search, StrBEU32Codec, U8StrStrCodec, BEU16, BEU32,
|
||||
Search, U8StrStrCodec, BEU16, BEU32,
|
||||
};
|
||||
|
||||
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
|
||||
@@ -76,10 +76,14 @@ pub mod db_name {
|
||||
pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
|
||||
pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
|
||||
pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
|
||||
pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids";
|
||||
pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
|
||||
pub const WORD_PREFIX_FIELD_ID_DOCIDS: &str = "word-prefix-field-id-docids";
|
||||
pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
|
||||
pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
|
||||
pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
|
||||
pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
|
||||
pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
|
||||
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
|
||||
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
||||
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
||||
@@ -118,17 +122,26 @@ pub struct Index {
|
||||
pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
|
||||
/// Maps the word and the position with the docids that corresponds to it.
|
||||
pub word_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
|
||||
pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||
/// Maps the word and the field id with the docids that corresponds to it.
|
||||
pub word_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||
|
||||
/// Maps the field id and the word count with the docids that corresponds to it.
|
||||
pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
|
||||
/// Maps the position of a word prefix with all the docids where this prefix appears.
|
||||
pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
|
||||
/// Maps the word prefix and a position with all the docids where the prefix appears at the position.
|
||||
pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||
/// Maps the word prefix and a field id with all the docids where the prefix appears inside the field
|
||||
pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
|
||||
|
||||
/// Maps the script and language with all the docids that corresponds to it.
|
||||
pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
|
||||
|
||||
/// Maps the facet field id and the docids for which this field exists
|
||||
pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
|
||||
/// Maps the facet field id and the docids for which this field is set as null
|
||||
pub facet_id_is_null_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
|
||||
/// Maps the facet field id and the docids for which this field is considered empty
|
||||
pub facet_id_is_empty_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
|
||||
|
||||
/// Maps the facet field id and ranges of numbers with the docids that corresponds to them.
|
||||
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
|
||||
@@ -153,7 +166,7 @@ impl Index {
|
||||
) -> Result<Index> {
|
||||
use db_name::*;
|
||||
|
||||
options.max_dbs(19);
|
||||
options.max_dbs(23);
|
||||
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
||||
|
||||
let env = options.open(path)?;
|
||||
@@ -170,11 +183,15 @@ impl Index {
|
||||
let prefix_word_pair_proximity_docids =
|
||||
env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
|
||||
let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?;
|
||||
let word_fid_docids = env.create_database(Some(WORD_FIELD_ID_DOCIDS))?;
|
||||
let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
|
||||
let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
|
||||
let word_prefix_fid_docids = env.create_database(Some(WORD_PREFIX_FIELD_ID_DOCIDS))?;
|
||||
let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
|
||||
let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
|
||||
let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
|
||||
let facet_id_is_null_docids = env.create_database(Some(FACET_ID_IS_NULL_DOCIDS))?;
|
||||
let facet_id_is_empty_docids = env.create_database(Some(FACET_ID_IS_EMPTY_DOCIDS))?;
|
||||
|
||||
let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?;
|
||||
let field_id_docid_facet_strings =
|
||||
@@ -196,11 +213,15 @@ impl Index {
|
||||
word_prefix_pair_proximity_docids,
|
||||
prefix_word_pair_proximity_docids,
|
||||
word_position_docids,
|
||||
word_fid_docids,
|
||||
word_prefix_position_docids,
|
||||
word_prefix_fid_docids,
|
||||
field_id_word_count_docids,
|
||||
facet_id_f64_docids,
|
||||
facet_id_string_docids,
|
||||
facet_id_exists_docids,
|
||||
facet_id_is_null_docids,
|
||||
facet_id_is_empty_docids,
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
documents,
|
||||
@@ -833,6 +854,30 @@ impl Index {
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve all the documents which contain this field id set as null
|
||||
pub fn null_faceted_documents_ids(
|
||||
&self,
|
||||
rtxn: &RoTxn,
|
||||
field_id: FieldId,
|
||||
) -> heed::Result<RoaringBitmap> {
|
||||
match self.facet_id_is_null_docids.get(rtxn, &BEU16::new(field_id))? {
|
||||
Some(docids) => Ok(docids),
|
||||
None => Ok(RoaringBitmap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve all the documents which contain this field id and that is considered empty
|
||||
pub fn empty_faceted_documents_ids(
|
||||
&self,
|
||||
rtxn: &RoTxn,
|
||||
field_id: FieldId,
|
||||
) -> heed::Result<RoaringBitmap> {
|
||||
match self.facet_id_is_empty_docids.get(rtxn, &BEU16::new(field_id))? {
|
||||
Some(docids) => Ok(docids),
|
||||
None => Ok(RoaringBitmap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve all the documents which contain this field id
|
||||
pub fn exists_faceted_documents_ids(
|
||||
&self,
|
||||
@@ -1284,10 +1329,10 @@ pub(crate) mod tests {
|
||||
let index_documents_config = IndexDocumentsConfig::default();
|
||||
Self { inner, indexer_config, index_documents_config, _tempdir }
|
||||
}
|
||||
/// Creates a temporary index, with a default `4096 * 1000` size. This should be enough for
|
||||
/// Creates a temporary index, with a default `4096 * 2000` size. This should be enough for
|
||||
/// most tests.
|
||||
pub fn new() -> Self {
|
||||
Self::new_with_map_size(4096 * 1000)
|
||||
Self::new_with_map_size(4096 * 2000)
|
||||
}
|
||||
pub fn add_documents_using_wtxn<'t, R>(
|
||||
&'t self,
|
||||
@@ -1416,11 +1461,11 @@ pub(crate) mod tests {
|
||||
db_snap!(index, field_distribution);
|
||||
|
||||
db_snap!(index, field_distribution,
|
||||
@"
|
||||
age 1
|
||||
id 2
|
||||
name 2
|
||||
"
|
||||
@r###"
|
||||
age 1
|
||||
id 2
|
||||
name 2
|
||||
"###
|
||||
);
|
||||
|
||||
// snapshot_index!(&index, "1", include: "^field_distribution$");
|
||||
@@ -1437,10 +1482,10 @@ pub(crate) mod tests {
|
||||
|
||||
db_snap!(index, field_distribution,
|
||||
@r###"
|
||||
age 1
|
||||
id 2
|
||||
name 2
|
||||
"###
|
||||
age 1
|
||||
id 2
|
||||
name 2
|
||||
"###
|
||||
);
|
||||
|
||||
// then we update a document by removing one field and another by adding one field
|
||||
@@ -1453,10 +1498,10 @@ pub(crate) mod tests {
|
||||
|
||||
db_snap!(index, field_distribution,
|
||||
@r###"
|
||||
has_dog 1
|
||||
id 2
|
||||
name 2
|
||||
"###
|
||||
has_dog 1
|
||||
id 2
|
||||
name 2
|
||||
"###
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,56 @@
|
||||
#![cfg_attr(all(test, fuzzing), feature(no_coverage))]
|
||||
#![allow(clippy::type_complexity)]
|
||||
|
||||
#[cfg(test)]
|
||||
#[global_allocator]
|
||||
pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
||||
|
||||
// #[cfg(test)]
|
||||
// pub mod allocator {
|
||||
// use std::alloc::{GlobalAlloc, System};
|
||||
// use std::sync::atomic::{self, AtomicI64};
|
||||
|
||||
// #[global_allocator]
|
||||
// pub static ALLOC: CountingAlloc = CountingAlloc {
|
||||
// max_resident: AtomicI64::new(0),
|
||||
// resident: AtomicI64::new(0),
|
||||
// allocated: AtomicI64::new(0),
|
||||
// };
|
||||
|
||||
// pub struct CountingAlloc {
|
||||
// pub max_resident: AtomicI64,
|
||||
// pub resident: AtomicI64,
|
||||
// pub allocated: AtomicI64,
|
||||
// }
|
||||
// unsafe impl GlobalAlloc for CountingAlloc {
|
||||
// unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
|
||||
// self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
|
||||
// let old_resident =
|
||||
// self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
|
||||
|
||||
// let resident = old_resident + layout.size() as i64;
|
||||
// self.max_resident.fetch_max(resident, atomic::Ordering::SeqCst);
|
||||
|
||||
// // if layout.size() > 1_000_000 {
|
||||
// // eprintln!(
|
||||
// // "allocating {} with new resident size: {resident}",
|
||||
// // layout.size() / 1_000_000
|
||||
// // );
|
||||
// // // let trace = std::backtrace::Backtrace::capture();
|
||||
// // // let t = trace.to_string();
|
||||
// // // eprintln!("{t}");
|
||||
// // }
|
||||
|
||||
// System.alloc(layout)
|
||||
// }
|
||||
|
||||
// unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
|
||||
// self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed);
|
||||
// System.dealloc(ptr, layout)
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
#[macro_use]
|
||||
pub mod documents;
|
||||
|
||||
@@ -26,6 +78,10 @@ use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer}
|
||||
pub use filter_parser::{Condition, FilterCondition, Span, Token};
|
||||
use fxhash::{FxHasher32, FxHasher64};
|
||||
pub use grenad::CompressionType;
|
||||
pub use search::new::{
|
||||
execute_search, DefaultSearchLogger, GeoSortStrategy, SearchContext, SearchLogger,
|
||||
VisualSearchLogger,
|
||||
};
|
||||
use serde_json::Value;
|
||||
pub use {charabia as tokenizer, heed};
|
||||
|
||||
@@ -43,9 +99,8 @@ pub use self::heed_codec::{
|
||||
};
|
||||
pub use self::index::Index;
|
||||
pub use self::search::{
|
||||
CriterionImplementationStrategy, FacetDistribution, Filter, FormatOptions, MatchBounds,
|
||||
MatcherBuilder, MatchingWord, MatchingWords, Search, SearchResult, TermsMatchingStrategy,
|
||||
DEFAULT_VALUES_PER_FACET,
|
||||
FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search,
|
||||
SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
|
||||
};
|
||||
|
||||
pub type Result<T> = std::result::Result<T, error::Error>;
|
||||
@@ -100,6 +155,23 @@ pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, Relative
|
||||
pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position {
|
||||
(field_id as u32) << 16 | (relative as u32)
|
||||
}
|
||||
// TODO: this is wrong, but will do for now
|
||||
/// Compute the "bucketed" absolute position from the field id and relative position in the field.
|
||||
///
|
||||
/// In a bucketed position, the accuracy of the relative position is reduced exponentially as it gets larger.
|
||||
pub fn bucketed_position(relative: u16) -> u16 {
|
||||
// The first few relative positions are kept intact.
|
||||
if relative < 16 {
|
||||
relative
|
||||
} else if relative < 24 {
|
||||
// Relative positions between 16 and 24 all become equal to 24
|
||||
24
|
||||
} else {
|
||||
// Then, groups of positions that have the same base-2 logarithm are reduced to
|
||||
// the same relative position: the smallest power of 2 that is greater than them
|
||||
(relative as f64).log2().ceil().exp2() as u16
|
||||
}
|
||||
}
|
||||
|
||||
/// Transform a raw obkv store into a JSON Object.
|
||||
pub fn obkv_to_json(
|
||||
|
||||
@@ -1,569 +0,0 @@
|
||||
use std::mem::take;
|
||||
|
||||
use heed::BytesDecode;
|
||||
use itertools::Itertools;
|
||||
use log::debug;
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{Criterion, CriterionParameters, CriterionResult};
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates};
|
||||
use crate::search::facet::{ascending_facet_sort, descending_facet_sort};
|
||||
use crate::search::query_tree::Operation;
|
||||
use crate::search::CriterionImplementationStrategy;
|
||||
use crate::{FieldId, Index, Result};
|
||||
|
||||
/// Threshold on the number of candidates that will make
|
||||
/// the system to choose between one algorithm or another.
|
||||
const CANDIDATES_THRESHOLD: u64 = 1000;
|
||||
|
||||
pub struct AscDesc<'t> {
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
field_name: String,
|
||||
field_id: Option<FieldId>,
|
||||
is_ascending: bool,
|
||||
query_tree: Option<Operation>,
|
||||
candidates: Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>,
|
||||
allowed_candidates: RoaringBitmap,
|
||||
initial_candidates: InitialCandidates,
|
||||
faceted_candidates: RoaringBitmap,
|
||||
implementation_strategy: CriterionImplementationStrategy,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
}
|
||||
|
||||
impl<'t> AscDesc<'t> {
|
||||
pub fn asc(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
field_name: String,
|
||||
implementation_strategy: CriterionImplementationStrategy,
|
||||
) -> Result<Self> {
|
||||
Self::new(index, rtxn, parent, field_name, true, implementation_strategy)
|
||||
}
|
||||
|
||||
pub fn desc(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
field_name: String,
|
||||
implementation_strategy: CriterionImplementationStrategy,
|
||||
) -> Result<Self> {
|
||||
Self::new(index, rtxn, parent, field_name, false, implementation_strategy)
|
||||
}
|
||||
|
||||
fn new(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
field_name: String,
|
||||
is_ascending: bool,
|
||||
implementation_strategy: CriterionImplementationStrategy,
|
||||
) -> Result<Self> {
|
||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||
let field_id = fields_ids_map.id(&field_name);
|
||||
let faceted_candidates = match field_id {
|
||||
Some(field_id) => {
|
||||
let number_faceted =
|
||||
index.faceted_documents_ids(rtxn, field_id, FacetType::Number)?;
|
||||
let string_faceted =
|
||||
index.faceted_documents_ids(rtxn, field_id, FacetType::String)?;
|
||||
number_faceted | string_faceted
|
||||
}
|
||||
None => RoaringBitmap::default(),
|
||||
};
|
||||
|
||||
Ok(AscDesc {
|
||||
index,
|
||||
rtxn,
|
||||
field_name,
|
||||
field_id,
|
||||
is_ascending,
|
||||
query_tree: None,
|
||||
candidates: Box::new(std::iter::empty()),
|
||||
allowed_candidates: RoaringBitmap::new(),
|
||||
faceted_candidates,
|
||||
initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()),
|
||||
implementation_strategy,
|
||||
parent,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Criterion for AscDesc<'t> {
|
||||
#[logging_timer::time("AscDesc::{}")]
|
||||
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
|
||||
// remove excluded candidates when next is called, instead of doing it in the loop.
|
||||
self.allowed_candidates -= params.excluded_candidates;
|
||||
|
||||
loop {
|
||||
debug!(
|
||||
"Facet {}({}) iteration",
|
||||
if self.is_ascending { "Asc" } else { "Desc" },
|
||||
self.field_name
|
||||
);
|
||||
|
||||
match self.candidates.next().transpose()? {
|
||||
None if !self.allowed_candidates.is_empty() => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.clone(),
|
||||
candidates: Some(take(&mut self.allowed_candidates)),
|
||||
filtered_candidates: None,
|
||||
initial_candidates: Some(self.initial_candidates.take()),
|
||||
}));
|
||||
}
|
||||
None => match self.parent.next(params)? {
|
||||
Some(CriterionResult {
|
||||
query_tree,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}) => {
|
||||
self.query_tree = query_tree;
|
||||
let mut candidates = match (&self.query_tree, candidates) {
|
||||
(_, Some(candidates)) => candidates,
|
||||
(Some(qt), None) => {
|
||||
let context = CriteriaBuilder::new(self.rtxn, self.index)?;
|
||||
resolve_query_tree(&context, qt, params.wdcache)?
|
||||
}
|
||||
(None, None) => self.index.documents_ids(self.rtxn)?,
|
||||
};
|
||||
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
|
||||
match initial_candidates {
|
||||
Some(initial_candidates) => {
|
||||
self.initial_candidates |= initial_candidates
|
||||
}
|
||||
None => self.initial_candidates.map_inplace(|c| c | &candidates),
|
||||
}
|
||||
|
||||
if candidates.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
self.allowed_candidates = &candidates - params.excluded_candidates;
|
||||
self.candidates = match self.field_id {
|
||||
Some(field_id) => facet_ordered(
|
||||
self.index,
|
||||
self.rtxn,
|
||||
field_id,
|
||||
self.is_ascending,
|
||||
candidates & &self.faceted_candidates,
|
||||
self.implementation_strategy,
|
||||
)?,
|
||||
None => Box::new(std::iter::empty()),
|
||||
};
|
||||
}
|
||||
None => return Ok(None),
|
||||
},
|
||||
Some(mut candidates) => {
|
||||
candidates -= params.excluded_candidates;
|
||||
self.allowed_candidates -= &candidates;
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.clone(),
|
||||
candidates: Some(candidates),
|
||||
filtered_candidates: None,
|
||||
initial_candidates: Some(self.initial_candidates.take()),
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn facet_ordered_iterative<'t>(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
field_id: FieldId,
|
||||
is_ascending: bool,
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> {
|
||||
let number_iter = iterative_facet_number_ordered_iter(
|
||||
index,
|
||||
rtxn,
|
||||
field_id,
|
||||
is_ascending,
|
||||
candidates.clone(),
|
||||
)?;
|
||||
let string_iter =
|
||||
iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?;
|
||||
Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box<dyn Iterator<Item = _>>)
|
||||
}
|
||||
|
||||
fn facet_extreme_value<'t>(
|
||||
mut extreme_it: impl Iterator<Item = heed::Result<(RoaringBitmap, &'t [u8])>> + 't,
|
||||
) -> Result<Option<f64>> {
|
||||
let extreme_value =
|
||||
if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) };
|
||||
let (_, extreme_value) = extreme_value?;
|
||||
|
||||
Ok(OrderedF64Codec::bytes_decode(extreme_value))
|
||||
}
|
||||
|
||||
pub fn facet_min_value<'t>(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
field_id: FieldId,
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<Option<f64>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
|
||||
let it = ascending_facet_sort(rtxn, db, field_id, candidates)?;
|
||||
facet_extreme_value(it)
|
||||
}
|
||||
|
||||
pub fn facet_max_value<'t>(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
field_id: FieldId,
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<Option<f64>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
|
||||
let it = descending_facet_sort(rtxn, db, field_id, candidates)?;
|
||||
facet_extreme_value(it)
|
||||
}
|
||||
|
||||
fn facet_ordered_set_based<'t>(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
field_id: FieldId,
|
||||
is_ascending: bool,
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> {
|
||||
let number_db =
|
||||
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
|
||||
let string_db =
|
||||
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
|
||||
|
||||
let (number_iter, string_iter) = if is_ascending {
|
||||
let number_iter = ascending_facet_sort(rtxn, number_db, field_id, candidates.clone())?;
|
||||
let string_iter = ascending_facet_sort(rtxn, string_db, field_id, candidates)?;
|
||||
|
||||
(itertools::Either::Left(number_iter), itertools::Either::Left(string_iter))
|
||||
} else {
|
||||
let number_iter = descending_facet_sort(rtxn, number_db, field_id, candidates.clone())?;
|
||||
let string_iter = descending_facet_sort(rtxn, string_db, field_id, candidates)?;
|
||||
|
||||
(itertools::Either::Right(number_iter), itertools::Either::Right(string_iter))
|
||||
};
|
||||
|
||||
Ok(Box::new(number_iter.chain(string_iter).map(|res| res.map(|(doc_ids, _)| doc_ids))))
|
||||
}
|
||||
|
||||
/// Returns an iterator over groups of the given candidates in ascending or descending order.
|
||||
///
|
||||
/// It will either use an iterative or a recursive method on the whole facet database depending
|
||||
/// on the number of candidates to rank.
|
||||
fn facet_ordered<'t>(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
field_id: FieldId,
|
||||
is_ascending: bool,
|
||||
candidates: RoaringBitmap,
|
||||
implementation_strategy: CriterionImplementationStrategy,
|
||||
) -> Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> {
|
||||
match implementation_strategy {
|
||||
CriterionImplementationStrategy::OnlyIterative => {
|
||||
facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates)
|
||||
}
|
||||
CriterionImplementationStrategy::OnlySetBased => {
|
||||
facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates)
|
||||
}
|
||||
CriterionImplementationStrategy::Dynamic => {
|
||||
if candidates.len() <= CANDIDATES_THRESHOLD {
|
||||
facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates)
|
||||
} else {
|
||||
facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Fetch the whole list of candidates facet number values one by one and order them by it.
|
||||
///
|
||||
/// This function is fast when the amount of candidates to rank is small.
|
||||
fn iterative_facet_number_ordered_iter<'t>(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
field_id: FieldId,
|
||||
is_ascending: bool,
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<impl Iterator<Item = RoaringBitmap> + 't> {
|
||||
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
|
||||
for docid in candidates.iter() {
|
||||
let left = (field_id, docid, f64::MIN);
|
||||
let right = (field_id, docid, f64::MAX);
|
||||
let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?;
|
||||
let entry = if is_ascending { iter.next() } else { iter.last() };
|
||||
if let Some(((_, _, value), ())) = entry.transpose()? {
|
||||
docids_values.push((docid, OrderedFloat(value)));
|
||||
}
|
||||
}
|
||||
docids_values.sort_unstable_by_key(|(_, v)| *v);
|
||||
let iter = docids_values.into_iter();
|
||||
let iter = if is_ascending {
|
||||
Box::new(iter) as Box<dyn Iterator<Item = _>>
|
||||
} else {
|
||||
Box::new(iter.rev())
|
||||
};
|
||||
|
||||
// The itertools GroupBy iterator doesn't provide an owned version, we are therefore
|
||||
// required to collect the result into an owned collection (a Vec).
|
||||
// https://github.com/rust-itertools/itertools/issues/499
|
||||
#[allow(clippy::needless_collect)]
|
||||
let vec: Vec<_> = iter
|
||||
.group_by(|(_, v)| *v)
|
||||
.into_iter()
|
||||
.map(|(_, ids)| ids.map(|(id, _)| id).collect())
|
||||
.collect();
|
||||
|
||||
Ok(vec.into_iter())
|
||||
}
|
||||
|
||||
/// Fetch the whole list of candidates facet string values one by one and order them by it.
|
||||
///
|
||||
/// This function is fast when the amount of candidates to rank is small.
|
||||
fn iterative_facet_string_ordered_iter<'t>(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
field_id: FieldId,
|
||||
is_ascending: bool,
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<impl Iterator<Item = RoaringBitmap> + 't> {
|
||||
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
|
||||
for docid in candidates.iter() {
|
||||
let left = (field_id, docid, "");
|
||||
let right = (field_id, docid.saturating_add(1), "");
|
||||
// FIXME Doing this means that it will never be possible to retrieve
|
||||
// the document with id 2^32, not sure this is a real problem.
|
||||
let mut iter = index.field_id_docid_facet_strings.range(rtxn, &(left..right))?;
|
||||
let entry = if is_ascending { iter.next() } else { iter.last() };
|
||||
if let Some(((_, _, value), _)) = entry.transpose()? {
|
||||
docids_values.push((docid, value));
|
||||
}
|
||||
}
|
||||
docids_values.sort_unstable_by_key(|(_, v)| *v);
|
||||
let iter = docids_values.into_iter();
|
||||
let iter = if is_ascending {
|
||||
Box::new(iter) as Box<dyn Iterator<Item = _>>
|
||||
} else {
|
||||
Box::new(iter.rev())
|
||||
};
|
||||
|
||||
// The itertools GroupBy iterator doesn't provide an owned version, we are therefore
|
||||
// required to collect the result into an owned collection (a Vec).
|
||||
// https://github.com/rust-itertools/itertools/issues/499
|
||||
#[allow(clippy::needless_collect)]
|
||||
let vec: Vec<_> = iter
|
||||
.group_by(|(_, v)| *v)
|
||||
.into_iter()
|
||||
.map(|(_, ids)| ids.map(|(id, _)| id).collect())
|
||||
.collect();
|
||||
|
||||
Ok(vec.into_iter())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
use big_s::S;
|
||||
use maplit::hashset;
|
||||
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::{AscDesc, Criterion, Filter, Search, SearchResult};
|
||||
|
||||
// Note that in this test, only the iterative sort algorithms are used. Set the CANDIDATES_THESHOLD
|
||||
// constant to 0 to ensure that the other sort algorithms are also correct.
|
||||
#[test]
|
||||
fn sort_criterion_placeholder() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key("id".to_owned());
|
||||
settings
|
||||
.set_sortable_fields(maplit::hashset! { S("id"), S("mod_10"), S("mod_20") });
|
||||
settings.set_criteria(vec![Criterion::Sort]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut docs = vec![];
|
||||
for i in 0..100 {
|
||||
docs.push(
|
||||
serde_json::json!({ "id": i, "mod_10": format!("{}", i % 10), "mod_20": i % 20 }),
|
||||
);
|
||||
}
|
||||
|
||||
index.add_documents(documents!(docs)).unwrap();
|
||||
|
||||
let all_ids = (0..100).collect::<Vec<_>>();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let mut search = Search::new(&rtxn, &index);
|
||||
search.sort_criteria(vec![AscDesc::from_str("mod_10:desc").unwrap()]);
|
||||
search.limit(100);
|
||||
|
||||
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 19, 29, 39, 49, 59, 69, 79, 89, 99, 8, 18, 28, 38, 48, 58, 68, 78, 88, 98, 7, 17, 27, 37, 47, 57, 67, 77, 87, 97, 6, 16, 26, 36, 46, 56, 66, 76, 86, 96, 5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 4, 14, 24, 34, 44, 54, 64, 74, 84, 94, 3, 13, 23, 33, 43, 53, 63, 73, 83, 93, 2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 0, 10, 20, 30, 40, 50, 60, 70, 80, 90]");
|
||||
documents_ids.sort();
|
||||
assert_eq!(all_ids, documents_ids);
|
||||
|
||||
let mut search = Search::new(&rtxn, &index);
|
||||
search.sort_criteria(vec![
|
||||
AscDesc::from_str("mod_10:desc").unwrap(),
|
||||
AscDesc::from_str("id:desc").unwrap(),
|
||||
]);
|
||||
search.limit(100);
|
||||
|
||||
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[99, 89, 79, 69, 59, 49, 39, 29, 19, 9, 98, 88, 78, 68, 58, 48, 38, 28, 18, 8, 97, 87, 77, 67, 57, 47, 37, 27, 17, 7, 96, 86, 76, 66, 56, 46, 36, 26, 16, 6, 95, 85, 75, 65, 55, 45, 35, 25, 15, 5, 94, 84, 74, 64, 54, 44, 34, 24, 14, 4, 93, 83, 73, 63, 53, 43, 33, 23, 13, 3, 92, 82, 72, 62, 52, 42, 32, 22, 12, 2, 91, 81, 71, 61, 51, 41, 31, 21, 11, 1, 90, 80, 70, 60, 50, 40, 30, 20, 10, 0]");
|
||||
documents_ids.sort();
|
||||
assert_eq!(all_ids, documents_ids);
|
||||
|
||||
let mut search = Search::new(&rtxn, &index);
|
||||
search.sort_criteria(vec![
|
||||
AscDesc::from_str("mod_10:desc").unwrap(),
|
||||
AscDesc::from_str("mod_20:asc").unwrap(),
|
||||
]);
|
||||
search.limit(100);
|
||||
|
||||
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 29, 49, 69, 89, 19, 39, 59, 79, 99, 8, 28, 48, 68, 88, 18, 38, 58, 78, 98, 7, 27, 47, 67, 87, 17, 37, 57, 77, 97, 6, 26, 46, 66, 86, 16, 36, 56, 76, 96, 5, 25, 45, 65, 85, 15, 35, 55, 75, 95, 4, 24, 44, 64, 84, 14, 34, 54, 74, 94, 3, 23, 43, 63, 83, 13, 33, 53, 73, 93, 2, 22, 42, 62, 82, 12, 32, 52, 72, 92, 1, 21, 41, 61, 81, 11, 31, 51, 71, 91, 0, 20, 40, 60, 80, 10, 30, 50, 70, 90]");
|
||||
documents_ids.sort();
|
||||
assert_eq!(all_ids, documents_ids);
|
||||
|
||||
let mut search = Search::new(&rtxn, &index);
|
||||
search.sort_criteria(vec![
|
||||
AscDesc::from_str("mod_10:desc").unwrap(),
|
||||
AscDesc::from_str("mod_20:desc").unwrap(),
|
||||
]);
|
||||
search.limit(100);
|
||||
|
||||
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 39, 59, 79, 99, 9, 29, 49, 69, 89, 18, 38, 58, 78, 98, 8, 28, 48, 68, 88, 17, 37, 57, 77, 97, 7, 27, 47, 67, 87, 16, 36, 56, 76, 96, 6, 26, 46, 66, 86, 15, 35, 55, 75, 95, 5, 25, 45, 65, 85, 14, 34, 54, 74, 94, 4, 24, 44, 64, 84, 13, 33, 53, 73, 93, 3, 23, 43, 63, 83, 12, 32, 52, 72, 92, 2, 22, 42, 62, 82, 11, 31, 51, 71, 91, 1, 21, 41, 61, 81, 10, 30, 50, 70, 90, 0, 20, 40, 60, 80]");
|
||||
documents_ids.sort();
|
||||
assert_eq!(all_ids, documents_ids);
|
||||
|
||||
let mut search = Search::new(&rtxn, &index);
|
||||
search.sort_criteria(vec![
|
||||
AscDesc::from_str("mod_10:desc").unwrap(),
|
||||
AscDesc::from_str("mod_20:desc").unwrap(),
|
||||
AscDesc::from_str("id:desc").unwrap(),
|
||||
]);
|
||||
search.limit(100);
|
||||
|
||||
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[99, 79, 59, 39, 19, 89, 69, 49, 29, 9, 98, 78, 58, 38, 18, 88, 68, 48, 28, 8, 97, 77, 57, 37, 17, 87, 67, 47, 27, 7, 96, 76, 56, 36, 16, 86, 66, 46, 26, 6, 95, 75, 55, 35, 15, 85, 65, 45, 25, 5, 94, 74, 54, 34, 14, 84, 64, 44, 24, 4, 93, 73, 53, 33, 13, 83, 63, 43, 23, 3, 92, 72, 52, 32, 12, 82, 62, 42, 22, 2, 91, 71, 51, 31, 11, 81, 61, 41, 21, 1, 90, 70, 50, 30, 10, 80, 60, 40, 20, 0]");
|
||||
documents_ids.sort();
|
||||
assert_eq!(all_ids, documents_ids);
|
||||
}
|
||||
|
||||
// Note that in this test, only the iterative sort algorithms are used. Set the CANDIDATES_THESHOLD
|
||||
// constant to 0 to ensure that the other sort algorithms are also correct.
|
||||
#[test]
|
||||
fn sort_criterion_non_placeholder() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key("id".to_owned());
|
||||
settings.set_filterable_fields(hashset! { S("id"), S("mod_10"), S("mod_20") });
|
||||
settings.set_sortable_fields(hashset! { S("id"), S("mod_10"), S("mod_20") });
|
||||
settings.set_criteria(vec![Criterion::Sort]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut docs = vec![];
|
||||
for i in 0..100 {
|
||||
docs.push(
|
||||
serde_json::json!({ "id": i, "mod_10": format!("{}", i % 10), "mod_20": i % 20 }),
|
||||
);
|
||||
}
|
||||
|
||||
index.add_documents(documents!(docs)).unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let mut search = Search::new(&rtxn, &index);
|
||||
search.filter(
|
||||
Filter::from_str("mod_10 IN [1, 0, 2] OR mod_20 IN [10, 13] OR id IN [5, 6]")
|
||||
.unwrap()
|
||||
.unwrap(),
|
||||
);
|
||||
search.sort_criteria(vec![
|
||||
AscDesc::from_str("mod_10:desc").unwrap(),
|
||||
AscDesc::from_str("mod_20:asc").unwrap(),
|
||||
AscDesc::from_str("id:desc").unwrap(),
|
||||
]);
|
||||
search.limit(100);
|
||||
|
||||
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
|
||||
// The order should be in increasing value of the id modulo 10, followed by increasing value of the id modulo 20, followed by decreasing value of the id
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 93, 73, 53, 33, 13, 82, 62, 42, 22, 2, 92, 72, 52, 32, 12, 81, 61, 41, 21, 1, 91, 71, 51, 31, 11, 80, 60, 40, 20, 0, 90, 70, 50, 30, 10]");
|
||||
let expected_ids = (0..100)
|
||||
.filter(|id| {
|
||||
[1, 0, 2].contains(&(id % 10))
|
||||
|| [10, 13].contains(&(id % 20))
|
||||
|| [5, 6].contains(id)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
documents_ids.sort();
|
||||
assert_eq!(expected_ids, documents_ids);
|
||||
|
||||
let mut search = Search::new(&rtxn, &index);
|
||||
search.filter(
|
||||
Filter::from_str("mod_10 IN [7, 8, 0] OR mod_20 IN [1, 15, 16] OR id IN [0, 4]")
|
||||
.unwrap()
|
||||
.unwrap(),
|
||||
);
|
||||
search.sort_criteria(vec![
|
||||
AscDesc::from_str("mod_10:asc").unwrap(),
|
||||
AscDesc::from_str("mod_20:asc").unwrap(),
|
||||
AscDesc::from_str("id:desc").unwrap(),
|
||||
]);
|
||||
search.limit(100);
|
||||
|
||||
let SearchResult { mut documents_ids, .. } = search.execute().unwrap();
|
||||
// The order should be in increasing value of the id modulo 10, followed by increasing value of the id modulo 20, followed by decreasing value of the id
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[80, 60, 40, 20, 0, 90, 70, 50, 30, 10, 81, 61, 41, 21, 1, 4, 95, 75, 55, 35, 15, 96, 76, 56, 36, 16, 87, 67, 47, 27, 7, 97, 77, 57, 37, 17, 88, 68, 48, 28, 8, 98, 78, 58, 38, 18]");
|
||||
let expected_ids = (0..100)
|
||||
.filter(|id| {
|
||||
[7, 8, 0].contains(&(id % 10))
|
||||
|| [1, 15, 16].contains(&(id % 20))
|
||||
|| [0, 4].contains(id)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
documents_ids.sort();
|
||||
assert_eq!(expected_ids, documents_ids);
|
||||
|
||||
let mut search = Search::new(&rtxn, &index);
|
||||
search.filter(
|
||||
Filter::from_str("mod_10 IN [1, 0, 2] OR mod_20 IN [10, 13] OR id IN [5, 6]")
|
||||
.unwrap()
|
||||
.unwrap(),
|
||||
);
|
||||
search.sort_criteria(vec![AscDesc::from_str("id:desc").unwrap()]);
|
||||
search.limit(100);
|
||||
|
||||
let SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||
// The order should be in decreasing value of the id
|
||||
let mut expected_ids = (0..100)
|
||||
.filter(|id| {
|
||||
[1, 0, 2].contains(&(id % 10))
|
||||
|| [10, 13].contains(&(id % 20))
|
||||
|| [5, 6].contains(id)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
expected_ids.sort();
|
||||
expected_ids.reverse();
|
||||
assert_eq!(expected_ids, documents_ids);
|
||||
}
|
||||
}
|
||||
@@ -1,709 +0,0 @@
|
||||
use std::cmp::{self, Ordering};
|
||||
use std::collections::binary_heap::PeekMut;
|
||||
use std::collections::{btree_map, BTreeMap, BinaryHeap, HashMap};
|
||||
use std::iter::Peekable;
|
||||
use std::mem::take;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
|
||||
use crate::search::criteria::{InitialCandidates, Query};
|
||||
use crate::search::query_tree::{Operation, QueryKind};
|
||||
use crate::search::{
|
||||
build_dfa, word_derivations, CriterionImplementationStrategy, WordDerivationsCache,
|
||||
};
|
||||
use crate::Result;
|
||||
|
||||
/// To be able to divide integers by the number of words in the query
|
||||
/// we want to find a multiplier that allow us to divide by any number between 1 and 10.
|
||||
/// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple).
|
||||
const LCM_10_FIRST_NUMBERS: u32 = 2520;
|
||||
|
||||
/// Threshold on the number of candidates that will make
|
||||
/// the system to choose between one algorithm or another.
|
||||
const CANDIDATES_THRESHOLD: u64 = 500;
|
||||
|
||||
type FlattenedQueryTree = Vec<Vec<Vec<Query>>>;
|
||||
|
||||
pub struct Attribute<'t> {
|
||||
ctx: &'t dyn Context<'t>,
|
||||
state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>,
|
||||
initial_candidates: InitialCandidates,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
linear_buckets: Option<btree_map::IntoIter<u64, RoaringBitmap>>,
|
||||
set_buckets: Option<BinaryHeap<Branch<'t>>>,
|
||||
implementation_strategy: CriterionImplementationStrategy,
|
||||
}
|
||||
|
||||
impl<'t> Attribute<'t> {
|
||||
pub fn new(
|
||||
ctx: &'t dyn Context<'t>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
implementation_strategy: CriterionImplementationStrategy,
|
||||
) -> Self {
|
||||
Attribute {
|
||||
ctx,
|
||||
state: None,
|
||||
initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()),
|
||||
parent,
|
||||
linear_buckets: None,
|
||||
set_buckets: None,
|
||||
implementation_strategy,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Criterion for Attribute<'t> {
|
||||
#[logging_timer::time("Attribute::{}")]
|
||||
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
|
||||
// remove excluded candidates when next is called, instead of doing it in the loop.
|
||||
if let Some((_, _, allowed_candidates)) = self.state.as_mut() {
|
||||
*allowed_candidates -= params.excluded_candidates;
|
||||
}
|
||||
|
||||
loop {
|
||||
match self.state.take() {
|
||||
Some((query_tree, _, allowed_candidates)) if allowed_candidates.is_empty() => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates: Some(RoaringBitmap::new()),
|
||||
filtered_candidates: None,
|
||||
initial_candidates: Some(self.initial_candidates.take()),
|
||||
}));
|
||||
}
|
||||
Some((query_tree, flattened_query_tree, mut allowed_candidates)) => {
|
||||
let found_candidates = if matches!(
|
||||
self.implementation_strategy,
|
||||
CriterionImplementationStrategy::OnlyIterative
|
||||
) || (matches!(
|
||||
self.implementation_strategy,
|
||||
CriterionImplementationStrategy::Dynamic
|
||||
) && allowed_candidates.len()
|
||||
< CANDIDATES_THRESHOLD)
|
||||
{
|
||||
let linear_buckets = match self.linear_buckets.as_mut() {
|
||||
Some(linear_buckets) => linear_buckets,
|
||||
None => {
|
||||
let new_buckets = initialize_linear_buckets(
|
||||
self.ctx,
|
||||
&flattened_query_tree,
|
||||
&allowed_candidates,
|
||||
)?;
|
||||
self.linear_buckets.get_or_insert(new_buckets.into_iter())
|
||||
}
|
||||
};
|
||||
|
||||
match linear_buckets.next() {
|
||||
Some((_score, candidates)) => candidates,
|
||||
None => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates: Some(RoaringBitmap::new()),
|
||||
filtered_candidates: None,
|
||||
initial_candidates: Some(self.initial_candidates.take()),
|
||||
}));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let set_buckets = match self.set_buckets.as_mut() {
|
||||
Some(set_buckets) => set_buckets,
|
||||
None => {
|
||||
let new_buckets = initialize_set_buckets(
|
||||
self.ctx,
|
||||
&flattened_query_tree,
|
||||
&allowed_candidates,
|
||||
params.wdcache,
|
||||
)?;
|
||||
self.set_buckets.get_or_insert(new_buckets)
|
||||
}
|
||||
};
|
||||
|
||||
match set_compute_candidates(set_buckets, &allowed_candidates)? {
|
||||
Some((_score, candidates)) => candidates,
|
||||
None => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates: Some(allowed_candidates),
|
||||
filtered_candidates: None,
|
||||
initial_candidates: Some(self.initial_candidates.take()),
|
||||
}));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
allowed_candidates -= &found_candidates;
|
||||
|
||||
self.state =
|
||||
Some((query_tree.clone(), flattened_query_tree, allowed_candidates));
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates: Some(found_candidates),
|
||||
filtered_candidates: None,
|
||||
initial_candidates: Some(self.initial_candidates.take()),
|
||||
}));
|
||||
}
|
||||
None => match self.parent.next(params)? {
|
||||
Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}) => {
|
||||
let mut candidates = match candidates {
|
||||
Some(candidates) => candidates,
|
||||
None => {
|
||||
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
|
||||
- params.excluded_candidates
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
|
||||
let flattened_query_tree = flatten_query_tree(&query_tree);
|
||||
|
||||
match initial_candidates {
|
||||
Some(initial_candidates) => {
|
||||
self.initial_candidates |= initial_candidates
|
||||
}
|
||||
None => self.initial_candidates.map_inplace(|c| c | &candidates),
|
||||
}
|
||||
|
||||
self.state = Some((query_tree, flattened_query_tree, candidates));
|
||||
self.linear_buckets = None;
|
||||
}
|
||||
Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}));
|
||||
}
|
||||
None => return Ok(None),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// QueryPositionIterator is an Iterator over positions of a Query,
|
||||
/// It contains iterators over words positions.
|
||||
struct QueryPositionIterator<'t> {
|
||||
#[allow(clippy::type_complexity)]
|
||||
inner:
|
||||
Vec<Peekable<Box<dyn Iterator<Item = heed::Result<((&'t str, u32), RoaringBitmap)>> + 't>>>,
|
||||
}
|
||||
|
||||
impl<'t> QueryPositionIterator<'t> {
|
||||
fn new(
|
||||
ctx: &'t dyn Context<'t>,
|
||||
queries: &[Query],
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<Self> {
|
||||
let mut inner = Vec::with_capacity(queries.len());
|
||||
for query in queries {
|
||||
let in_prefix_cache = query.prefix && ctx.in_prefix_cache(query.kind.word());
|
||||
match &query.kind {
|
||||
QueryKind::Exact { word, .. } => {
|
||||
if !query.prefix || in_prefix_cache {
|
||||
let word = query.kind.word();
|
||||
let iter = ctx.word_position_iterator(word, in_prefix_cache)?;
|
||||
inner.push(iter.peekable());
|
||||
} else {
|
||||
for (word, _) in word_derivations(word, true, 0, ctx.words_fst(), wdcache)?
|
||||
{
|
||||
let iter = ctx.word_position_iterator(word, in_prefix_cache)?;
|
||||
inner.push(iter.peekable());
|
||||
}
|
||||
}
|
||||
}
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
for (word, _) in
|
||||
word_derivations(word, query.prefix, *typo, ctx.words_fst(), wdcache)?
|
||||
{
|
||||
let iter = ctx.word_position_iterator(word, in_prefix_cache)?;
|
||||
inner.push(iter.peekable());
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
Ok(Self { inner })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Iterator for QueryPositionIterator<'t> {
|
||||
type Item = heed::Result<(u32, RoaringBitmap)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// sort inner words from the closest next position to the farthest next position.
|
||||
let expected_pos = self
|
||||
.inner
|
||||
.iter_mut()
|
||||
.filter_map(|wli| match wli.peek() {
|
||||
Some(Ok(((_, pos), _))) => Some(*pos),
|
||||
_ => None,
|
||||
})
|
||||
.min()?;
|
||||
|
||||
let mut candidates = None;
|
||||
for wli in self.inner.iter_mut() {
|
||||
if let Some(Ok(((_, pos), _))) = wli.peek() {
|
||||
if *pos > expected_pos {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
match wli.next() {
|
||||
Some(Ok((_, docids))) => {
|
||||
candidates = match candidates.take() {
|
||||
Some(candidates) => Some(candidates | docids),
|
||||
None => Some(docids),
|
||||
}
|
||||
}
|
||||
Some(Err(e)) => return Some(Err(e)),
|
||||
None => continue,
|
||||
}
|
||||
}
|
||||
|
||||
candidates.map(|candidates| Ok((expected_pos, candidates)))
|
||||
}
|
||||
}
|
||||
|
||||
/// A Branch is represent a possible alternative of the original query and is build with the Query Tree,
|
||||
/// This branch allows us to iterate over meta-interval of positions.
|
||||
struct Branch<'t> {
|
||||
query_level_iterator: Vec<(u32, RoaringBitmap, Peekable<QueryPositionIterator<'t>>)>,
|
||||
last_result: (u32, RoaringBitmap),
|
||||
branch_size: u32,
|
||||
}
|
||||
|
||||
impl<'t> Branch<'t> {
|
||||
fn new(
|
||||
ctx: &'t dyn Context<'t>,
|
||||
flatten_branch: &[Vec<Query>],
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
) -> Result<Self> {
|
||||
let mut query_level_iterator = Vec::new();
|
||||
for queries in flatten_branch {
|
||||
let mut qli = QueryPositionIterator::new(ctx, queries, wdcache)?.peekable();
|
||||
let (pos, docids) = qli.next().transpose()?.unwrap_or((0, RoaringBitmap::new()));
|
||||
query_level_iterator.push((pos, docids & allowed_candidates, qli));
|
||||
}
|
||||
|
||||
let mut branch = Self {
|
||||
query_level_iterator,
|
||||
last_result: (0, RoaringBitmap::new()),
|
||||
branch_size: flatten_branch.len() as u32,
|
||||
};
|
||||
|
||||
branch.update_last_result();
|
||||
|
||||
Ok(branch)
|
||||
}
|
||||
|
||||
/// return the next meta-interval of the branch,
|
||||
/// and update inner interval in order to be ranked by the BinaryHeap.
|
||||
fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result<bool> {
|
||||
// update the first query.
|
||||
let index = self.lowest_iterator_index();
|
||||
match self.query_level_iterator.get_mut(index) {
|
||||
Some((cur_pos, cur_docids, qli)) => match qli.next().transpose()? {
|
||||
Some((next_pos, next_docids)) => {
|
||||
*cur_pos = next_pos;
|
||||
*cur_docids |= next_docids & allowed_candidates;
|
||||
self.update_last_result();
|
||||
Ok(true)
|
||||
}
|
||||
None => Ok(false),
|
||||
},
|
||||
None => Ok(false),
|
||||
}
|
||||
}
|
||||
|
||||
fn lowest_iterator_index(&mut self) -> usize {
|
||||
let (index, _) = self
|
||||
.query_level_iterator
|
||||
.iter_mut()
|
||||
.map(|(pos, docids, qli)| {
|
||||
if docids.is_empty() {
|
||||
0
|
||||
} else {
|
||||
match qli.peek() {
|
||||
Some(result) => {
|
||||
result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0)
|
||||
}
|
||||
None => u32::MAX,
|
||||
}
|
||||
}
|
||||
})
|
||||
.enumerate()
|
||||
.min_by_key(|(_, diff)| *diff)
|
||||
.unwrap_or((0, 0));
|
||||
|
||||
index
|
||||
}
|
||||
|
||||
fn update_last_result(&mut self) {
|
||||
let mut result_pos = 0;
|
||||
let mut result_docids = None;
|
||||
|
||||
for (pos, docids, _qli) in self.query_level_iterator.iter() {
|
||||
result_pos += pos;
|
||||
result_docids = result_docids
|
||||
.take()
|
||||
.map_or_else(|| Some(docids.clone()), |candidates| Some(candidates & docids));
|
||||
}
|
||||
|
||||
// remove last result docids from inner iterators
|
||||
if let Some(docids) = result_docids.as_ref() {
|
||||
for (_, query_docids, _) in self.query_level_iterator.iter_mut() {
|
||||
*query_docids -= docids;
|
||||
}
|
||||
}
|
||||
|
||||
self.last_result = (result_pos, result_docids.unwrap_or_default());
|
||||
}
|
||||
|
||||
/// return the score of the current inner interval.
|
||||
fn compute_rank(&self) -> u32 {
|
||||
// we compute a rank from the position.
|
||||
let (pos, _) = self.last_result;
|
||||
pos.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size
|
||||
}
|
||||
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let self_rank = self.compute_rank();
|
||||
let other_rank = other.compute_rank();
|
||||
|
||||
// lower rank is better, and because BinaryHeap give the higher ranked branch, we reverse it.
|
||||
self_rank.cmp(&other_rank).reverse()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Ord for Branch<'t> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.cmp(other)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> PartialOrd for Branch<'t> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> PartialEq for Branch<'t> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Eq for Branch<'t> {}
|
||||
|
||||
fn initialize_set_buckets<'t>(
|
||||
ctx: &'t dyn Context<'t>,
|
||||
branches: &FlattenedQueryTree,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<BinaryHeap<Branch<'t>>> {
|
||||
let mut heap = BinaryHeap::new();
|
||||
for flatten_branch in branches {
|
||||
let branch = Branch::new(ctx, flatten_branch, wdcache, allowed_candidates)?;
|
||||
heap.push(branch);
|
||||
}
|
||||
|
||||
Ok(heap)
|
||||
}
|
||||
|
||||
fn set_compute_candidates(
|
||||
branches_heap: &mut BinaryHeap<Branch>,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
) -> Result<Option<(u32, RoaringBitmap)>> {
|
||||
let mut final_candidates: Option<(u32, RoaringBitmap)> = None;
|
||||
let mut allowed_candidates = allowed_candidates.clone();
|
||||
|
||||
while let Some(mut branch) = branches_heap.peek_mut() {
|
||||
// if current is worst than best we break to return
|
||||
// candidates that correspond to the best rank
|
||||
let branch_rank = branch.compute_rank();
|
||||
if let Some((best_rank, _)) = final_candidates {
|
||||
if branch_rank > best_rank {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let candidates = take(&mut branch.last_result.1);
|
||||
if candidates.is_empty() {
|
||||
// we don't have candidates, get next interval.
|
||||
if !branch.next(&allowed_candidates)? {
|
||||
PeekMut::pop(branch);
|
||||
}
|
||||
} else {
|
||||
allowed_candidates -= &candidates;
|
||||
final_candidates = match final_candidates.take() {
|
||||
// we add current candidates to best candidates
|
||||
Some((best_rank, mut best_candidates)) => {
|
||||
best_candidates |= candidates;
|
||||
branch.next(&allowed_candidates)?;
|
||||
Some((best_rank, best_candidates))
|
||||
}
|
||||
// we take current candidates as best candidates
|
||||
None => {
|
||||
branch.next(&allowed_candidates)?;
|
||||
Some((branch_rank, candidates))
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
Ok(final_candidates)
|
||||
}
|
||||
|
||||
fn initialize_linear_buckets(
|
||||
ctx: &dyn Context,
|
||||
branches: &FlattenedQueryTree,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
) -> Result<BTreeMap<u64, RoaringBitmap>> {
|
||||
fn compute_candidate_rank(
|
||||
branches: &FlattenedQueryTree,
|
||||
words_positions: HashMap<String, RoaringBitmap>,
|
||||
) -> u64 {
|
||||
let mut min_rank = u64::max_value();
|
||||
for branch in branches {
|
||||
let branch_len = branch.len();
|
||||
let mut branch_rank = Vec::with_capacity(branch_len);
|
||||
for derivates in branch {
|
||||
let mut position = None;
|
||||
for Query { prefix, kind } in derivates {
|
||||
// find the best position of the current word in the document.
|
||||
let current_position = match kind {
|
||||
QueryKind::Exact { word, .. } => {
|
||||
if *prefix {
|
||||
word_derivations(word, true, 0, &words_positions)
|
||||
.flat_map(|positions| positions.iter().next())
|
||||
.min()
|
||||
} else {
|
||||
words_positions
|
||||
.get(word)
|
||||
.and_then(|positions| positions.iter().next())
|
||||
}
|
||||
}
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
word_derivations(word, *prefix, *typo, &words_positions)
|
||||
.flat_map(|positions| positions.iter().next())
|
||||
.min()
|
||||
}
|
||||
};
|
||||
|
||||
match (position, current_position) {
|
||||
(Some(p), Some(cp)) => position = Some(cmp::min(p, cp)),
|
||||
(None, Some(cp)) => position = Some(cp),
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
// if a position is found, we add it to the branch score,
|
||||
// otherwise the branch is considered as unfindable in this document and we break.
|
||||
if let Some(position) = position {
|
||||
branch_rank.push(position as u64);
|
||||
} else {
|
||||
branch_rank.clear();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !branch_rank.is_empty() {
|
||||
branch_rank.sort_unstable();
|
||||
// because several words in same query can't match all a the position 0,
|
||||
// we substract the word index to the position.
|
||||
let branch_rank: u64 =
|
||||
branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum();
|
||||
// here we do the means of the words of the branch
|
||||
min_rank =
|
||||
min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64);
|
||||
}
|
||||
}
|
||||
|
||||
min_rank
|
||||
}
|
||||
|
||||
fn word_derivations<'a>(
|
||||
word: &str,
|
||||
is_prefix: bool,
|
||||
max_typo: u8,
|
||||
words_positions: &'a HashMap<String, RoaringBitmap>,
|
||||
) -> impl Iterator<Item = &'a RoaringBitmap> {
|
||||
let dfa = build_dfa(word, max_typo, is_prefix);
|
||||
words_positions.iter().filter_map(move |(document_word, positions)| {
|
||||
use levenshtein_automata::Distance;
|
||||
match dfa.eval(document_word) {
|
||||
Distance::Exact(_) => Some(positions),
|
||||
Distance::AtLeast(_) => None,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
let mut candidates = BTreeMap::new();
|
||||
for docid in allowed_candidates {
|
||||
let words_positions = ctx.docid_words_positions(docid)?;
|
||||
let rank = compute_candidate_rank(branches, words_positions);
|
||||
candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid);
|
||||
}
|
||||
|
||||
Ok(candidates)
|
||||
}
|
||||
|
||||
// TODO can we keep refs of Query
|
||||
fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
|
||||
use crate::search::criteria::Operation::{And, Or, Phrase};
|
||||
|
||||
fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree {
|
||||
match tail.split_first() {
|
||||
Some((thead, tail)) => {
|
||||
let tail = and_recurse(thead, tail);
|
||||
let mut out = Vec::new();
|
||||
for array in recurse(head) {
|
||||
for tail_array in &tail {
|
||||
let mut array = array.clone();
|
||||
array.extend(tail_array.iter().cloned());
|
||||
out.push(array);
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
None => recurse(head),
|
||||
}
|
||||
}
|
||||
|
||||
fn recurse(op: &Operation) -> FlattenedQueryTree {
|
||||
match op {
|
||||
And(ops) => ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)),
|
||||
Or(_, ops) => {
|
||||
if ops.iter().all(|op| op.query().is_some()) {
|
||||
vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]]
|
||||
} else {
|
||||
ops.iter().flat_map(recurse).collect()
|
||||
}
|
||||
}
|
||||
Phrase(words) => {
|
||||
let queries = words
|
||||
.iter()
|
||||
.filter_map(|w| w.as_ref())
|
||||
.map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }])
|
||||
.collect();
|
||||
vec![queries]
|
||||
}
|
||||
Operation::Query(query) => vec![vec![vec![query.clone()]]],
|
||||
}
|
||||
}
|
||||
|
||||
recurse(query_tree)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use big_s::S;
|
||||
|
||||
use super::*;
|
||||
use crate::search::criteria::QueryKind;
|
||||
|
||||
#[test]
|
||||
fn simple_flatten_query_tree() {
|
||||
let query_tree = Operation::Or(
|
||||
false,
|
||||
vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }),
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }),
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }),
|
||||
]),
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }),
|
||||
Operation::Or(
|
||||
false,
|
||||
vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact(S("thefish")),
|
||||
}),
|
||||
Operation::And(vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact(S("the")),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact(S("fish")),
|
||||
}),
|
||||
]),
|
||||
],
|
||||
),
|
||||
]),
|
||||
],
|
||||
);
|
||||
let result = flatten_query_tree(&query_tree);
|
||||
|
||||
insta::assert_debug_snapshot!(result, @r###"
|
||||
[
|
||||
[
|
||||
[
|
||||
Exact {
|
||||
word: "manythefish",
|
||||
},
|
||||
],
|
||||
],
|
||||
[
|
||||
[
|
||||
Exact {
|
||||
word: "manythe",
|
||||
},
|
||||
],
|
||||
[
|
||||
Exact {
|
||||
word: "fish",
|
||||
},
|
||||
],
|
||||
],
|
||||
[
|
||||
[
|
||||
Exact {
|
||||
word: "many",
|
||||
},
|
||||
],
|
||||
[
|
||||
Exact {
|
||||
word: "thefish",
|
||||
},
|
||||
],
|
||||
],
|
||||
[
|
||||
[
|
||||
Exact {
|
||||
word: "many",
|
||||
},
|
||||
],
|
||||
[
|
||||
Exact {
|
||||
word: "the",
|
||||
},
|
||||
],
|
||||
[
|
||||
Exact {
|
||||
word: "fish",
|
||||
},
|
||||
],
|
||||
],
|
||||
]
|
||||
"###);
|
||||
}
|
||||
}
|
||||
@@ -1,766 +0,0 @@
|
||||
use std::collections::btree_map::Entry;
|
||||
use std::collections::BTreeMap;
|
||||
use std::convert::TryFrom;
|
||||
use std::mem::take;
|
||||
|
||||
use log::debug;
|
||||
use roaring::{MultiOps, RoaringBitmap};
|
||||
|
||||
use crate::search::criteria::{
|
||||
resolve_phrase, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult,
|
||||
InitialCandidates,
|
||||
};
|
||||
use crate::search::query_tree::{Operation, PrimitiveQueryPart};
|
||||
use crate::{absolute_from_relative_position, FieldId, Result};
|
||||
|
||||
pub struct Exactness<'t> {
|
||||
ctx: &'t dyn Context<'t>,
|
||||
query_tree: Option<Operation>,
|
||||
state: Option<State>,
|
||||
initial_candidates: InitialCandidates,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
query: Vec<ExactQueryPart>,
|
||||
cache: Option<ExactWordsCombinationCache>,
|
||||
}
|
||||
|
||||
impl<'t> Exactness<'t> {
|
||||
pub fn new(
|
||||
ctx: &'t dyn Context<'t>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
primitive_query: &[PrimitiveQueryPart],
|
||||
) -> heed::Result<Self> {
|
||||
let mut query: Vec<_> = Vec::with_capacity(primitive_query.len());
|
||||
for part in primitive_query {
|
||||
query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?);
|
||||
}
|
||||
|
||||
Ok(Exactness {
|
||||
ctx,
|
||||
query_tree: None,
|
||||
state: None,
|
||||
initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()),
|
||||
parent,
|
||||
query,
|
||||
cache: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Criterion for Exactness<'t> {
|
||||
#[logging_timer::time("Exactness::{}")]
|
||||
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
|
||||
// remove excluded candidates when next is called, instead of doing it in the loop.
|
||||
if let Some(state) = self.state.as_mut() {
|
||||
state.difference_with(params.excluded_candidates);
|
||||
}
|
||||
loop {
|
||||
debug!("Exactness at state {:?}", self.state);
|
||||
|
||||
match self.state.as_mut() {
|
||||
Some(state) if state.is_empty() => {
|
||||
// reset state
|
||||
self.state = None;
|
||||
self.query_tree = None;
|
||||
// we don't need to reset the combinations cache since it only depends on
|
||||
// the primitive query, which does not change
|
||||
}
|
||||
Some(state) => {
|
||||
let (candidates, state) =
|
||||
resolve_state(self.ctx, take(state), &self.query, &mut self.cache)?;
|
||||
self.state = state;
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: self.query_tree.clone(),
|
||||
candidates: Some(candidates),
|
||||
filtered_candidates: None,
|
||||
initial_candidates: Some(self.initial_candidates.take()),
|
||||
}));
|
||||
}
|
||||
None => match self.parent.next(params)? {
|
||||
Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}) => {
|
||||
let mut candidates = match candidates {
|
||||
Some(candidates) => candidates,
|
||||
None => {
|
||||
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
|
||||
- params.excluded_candidates
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
|
||||
match initial_candidates {
|
||||
Some(initial_candidates) => {
|
||||
self.initial_candidates |= initial_candidates
|
||||
}
|
||||
None => self.initial_candidates.map_inplace(|c| c | &candidates),
|
||||
}
|
||||
|
||||
self.state = Some(State::new(candidates));
|
||||
self.query_tree = Some(query_tree);
|
||||
}
|
||||
Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}));
|
||||
}
|
||||
None => return Ok(None),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum State {
|
||||
/// Extract the documents that have an attribute that contains exactly the query.
|
||||
ExactAttribute(RoaringBitmap),
|
||||
/// Extract the documents that have an attribute that starts with exactly the query.
|
||||
AttributeStartsWith(RoaringBitmap),
|
||||
/// Rank the remaining documents by the number of exact words contained.
|
||||
ExactWords(RoaringBitmap),
|
||||
Remainings(Vec<RoaringBitmap>),
|
||||
}
|
||||
|
||||
impl State {
|
||||
fn new(candidates: RoaringBitmap) -> Self {
|
||||
Self::ExactAttribute(candidates)
|
||||
}
|
||||
|
||||
fn difference_with(&mut self, lhs: &RoaringBitmap) {
|
||||
match self {
|
||||
Self::ExactAttribute(candidates)
|
||||
| Self::AttributeStartsWith(candidates)
|
||||
| Self::ExactWords(candidates) => *candidates -= lhs,
|
||||
Self::Remainings(candidates_array) => {
|
||||
candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs);
|
||||
candidates_array.retain(|candidates| !candidates.is_empty());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
match self {
|
||||
Self::ExactAttribute(candidates)
|
||||
| Self::AttributeStartsWith(candidates)
|
||||
| Self::ExactWords(candidates) => candidates.is_empty(),
|
||||
Self::Remainings(candidates_array) => {
|
||||
candidates_array.iter().all(RoaringBitmap::is_empty)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for State {
|
||||
fn default() -> Self {
|
||||
Self::Remainings(vec![])
|
||||
}
|
||||
}
|
||||
#[logging_timer::time("Exactness::{}")]
|
||||
fn resolve_state(
|
||||
ctx: &dyn Context,
|
||||
state: State,
|
||||
query: &[ExactQueryPart],
|
||||
cache: &mut Option<ExactWordsCombinationCache>,
|
||||
) -> Result<(RoaringBitmap, Option<State>)> {
|
||||
use State::*;
|
||||
match state {
|
||||
ExactAttribute(mut allowed_candidates) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
if let Ok(query_len) = u8::try_from(query.len()) {
|
||||
let attributes_ids = ctx.searchable_fields_ids()?;
|
||||
for id in attributes_ids {
|
||||
if let Some(attribute_allowed_docids) =
|
||||
ctx.field_id_word_count_docids(id, query_len)?
|
||||
{
|
||||
let mut attribute_candidates_array =
|
||||
attribute_start_with_docids(ctx, id, query)?;
|
||||
attribute_candidates_array.push(attribute_allowed_docids);
|
||||
|
||||
candidates |= MultiOps::intersection(attribute_candidates_array);
|
||||
}
|
||||
}
|
||||
|
||||
// only keep allowed candidates
|
||||
candidates &= &allowed_candidates;
|
||||
// remove current candidates from allowed candidates
|
||||
allowed_candidates -= &candidates;
|
||||
}
|
||||
|
||||
Ok((candidates, Some(AttributeStartsWith(allowed_candidates))))
|
||||
}
|
||||
AttributeStartsWith(mut allowed_candidates) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
let attributes_ids = ctx.searchable_fields_ids()?;
|
||||
for id in attributes_ids {
|
||||
let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?;
|
||||
candidates |= MultiOps::intersection(attribute_candidates_array);
|
||||
}
|
||||
|
||||
// only keep allowed candidates
|
||||
candidates &= &allowed_candidates;
|
||||
// remove current candidates from allowed candidates
|
||||
allowed_candidates -= &candidates;
|
||||
Ok((candidates, Some(ExactWords(allowed_candidates))))
|
||||
}
|
||||
ExactWords(allowed_candidates) => {
|
||||
// Retrieve the cache if it already exist, otherwise create it.
|
||||
let owned_cache = if let Some(cache) = cache.take() {
|
||||
cache
|
||||
} else {
|
||||
compute_combinations(ctx, query)?
|
||||
};
|
||||
// The cache contains the sets of documents which contain exactly 1,2,3,.. exact words
|
||||
// from the query. It cannot be empty. All the candidates in it are disjoint.
|
||||
|
||||
let mut candidates_array = owned_cache.combinations.clone();
|
||||
for candidates in candidates_array.iter_mut() {
|
||||
*candidates &= &allowed_candidates;
|
||||
}
|
||||
*cache = Some(owned_cache);
|
||||
|
||||
let best_candidates = candidates_array.pop().unwrap();
|
||||
|
||||
candidates_array.insert(0, allowed_candidates);
|
||||
Ok((best_candidates, Some(Remainings(candidates_array))))
|
||||
}
|
||||
// pop remainings candidates until the emptiness
|
||||
Remainings(mut candidates_array) => {
|
||||
let candidates = candidates_array.pop().unwrap_or_default();
|
||||
if !candidates_array.is_empty() {
|
||||
Ok((candidates, Some(Remainings(candidates_array))))
|
||||
} else {
|
||||
Ok((candidates, None))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn attribute_start_with_docids(
|
||||
ctx: &dyn Context,
|
||||
attribute_id: FieldId,
|
||||
query: &[ExactQueryPart],
|
||||
) -> heed::Result<Vec<RoaringBitmap>> {
|
||||
let mut attribute_candidates_array = Vec::new();
|
||||
// start from attribute first position
|
||||
let mut pos = absolute_from_relative_position(attribute_id, 0);
|
||||
for part in query {
|
||||
use ExactQueryPart::*;
|
||||
match part {
|
||||
Synonyms(synonyms) => {
|
||||
let mut synonyms_candidates = RoaringBitmap::new();
|
||||
for word in synonyms {
|
||||
let wc = ctx.word_position_docids(word, pos)?;
|
||||
if let Some(word_candidates) = wc {
|
||||
synonyms_candidates |= word_candidates;
|
||||
}
|
||||
}
|
||||
attribute_candidates_array.push(synonyms_candidates);
|
||||
pos += 1;
|
||||
}
|
||||
Phrase(phrase) => {
|
||||
for word in phrase {
|
||||
if let Some(word) = word {
|
||||
let wc = ctx.word_position_docids(word, pos)?;
|
||||
if let Some(word_candidates) = wc {
|
||||
attribute_candidates_array.push(word_candidates);
|
||||
}
|
||||
}
|
||||
pos += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(attribute_candidates_array)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum ExactQueryPart {
|
||||
Phrase(Vec<Option<String>>),
|
||||
Synonyms(Vec<String>),
|
||||
}
|
||||
|
||||
impl ExactQueryPart {
|
||||
fn from_primitive_query_part(
|
||||
ctx: &dyn Context,
|
||||
part: &PrimitiveQueryPart,
|
||||
) -> heed::Result<Self> {
|
||||
let part = match part {
|
||||
PrimitiveQueryPart::Word(word, _) => {
|
||||
match ctx.synonyms(word)? {
|
||||
Some(synonyms) => {
|
||||
let mut synonyms: Vec<_> = synonyms
|
||||
.into_iter()
|
||||
.filter_map(|mut array| {
|
||||
// keep 1 word synonyms only.
|
||||
match array.pop() {
|
||||
Some(word) if array.is_empty() => Some(word),
|
||||
_ => None,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
synonyms.push(word.clone());
|
||||
ExactQueryPart::Synonyms(synonyms)
|
||||
}
|
||||
None => ExactQueryPart::Synonyms(vec![word.clone()]),
|
||||
}
|
||||
}
|
||||
PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()),
|
||||
};
|
||||
|
||||
Ok(part)
|
||||
}
|
||||
}
|
||||
|
||||
struct ExactWordsCombinationCache {
|
||||
// index 0 is only 1 word
|
||||
combinations: Vec<RoaringBitmap>,
|
||||
}
|
||||
|
||||
fn compute_combinations(
|
||||
ctx: &dyn Context,
|
||||
query: &[ExactQueryPart],
|
||||
) -> Result<ExactWordsCombinationCache> {
|
||||
let number_of_part = query.len();
|
||||
let mut parts_candidates_array = Vec::with_capacity(number_of_part);
|
||||
for part in query {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
use ExactQueryPart::*;
|
||||
match part {
|
||||
Synonyms(synonyms) => {
|
||||
for synonym in synonyms {
|
||||
if let Some(synonym_candidates) = ctx.word_docids(synonym)? {
|
||||
candidates |= synonym_candidates;
|
||||
}
|
||||
}
|
||||
}
|
||||
// compute intersection on pair of words with a proximity of 0.
|
||||
Phrase(phrase) => {
|
||||
candidates |= resolve_phrase(ctx, phrase)?;
|
||||
}
|
||||
}
|
||||
parts_candidates_array.push(candidates);
|
||||
}
|
||||
let combinations = create_disjoint_combinations(parts_candidates_array);
|
||||
|
||||
Ok(ExactWordsCombinationCache { combinations })
|
||||
}
|
||||
|
||||
/// Given a list of bitmaps `b0,b1,...,bn` , compute the list of bitmaps `X0,X1,...,Xn`
|
||||
/// such that `Xi` contains all the elements that are contained in **at least** `i+1` bitmaps among `b0,b1,...,bn`.
|
||||
///
|
||||
/// The returned vector is guaranteed to be of length `n`. It is equal to `vec![X0, X1, ..., Xn]`.
|
||||
///
|
||||
/// ## Implementation
|
||||
///
|
||||
/// We do so by iteratively building a map containing the union of all the different ways to intersect `J` bitmaps among `b0,b1,...,bn`.
|
||||
/// - The key of the map is the index `i` of the last bitmap in the intersections
|
||||
/// - The value is the union of all the possible intersections of J bitmaps such that the last bitmap in the intersection is `bi`
|
||||
///
|
||||
/// For example, with the bitmaps `b0,b1,b2,b3`, this map should look like this
|
||||
/// ```text
|
||||
/// Map 0: (first iteration, contains all the combinations of 1 bitmap)
|
||||
/// // What follows are unions of intersection of bitmaps asscociated with the index of their last component
|
||||
/// 0: [b0]
|
||||
/// 1: [b1]
|
||||
/// 2: [b2]
|
||||
/// 3: [b3]
|
||||
/// Map 1: (second iteration, combinations of 2 bitmaps)
|
||||
/// 1: [b0&b1]
|
||||
/// 2: [b0&b2 | b1&b2]
|
||||
/// 3: [b0&b3 | b1&b3 | b2&b3]
|
||||
/// Map 2: (third iteration, combinations of 3 bitmaps)
|
||||
/// 2: [b0&b1&b2]
|
||||
/// 3: [b0&b2&b3 | b1&b2&b3]
|
||||
/// Map 3: (fourth iteration, combinations of 4 bitmaps)
|
||||
/// 3: [b0&b1&b2&b3]
|
||||
/// ```
|
||||
///
|
||||
/// These maps are built one by one from the content of the preceding map.
|
||||
/// For example, to create Map 2, we look at each line of Map 1, for example:
|
||||
/// ```text
|
||||
/// 2: [b0&b2 | b1&b2]
|
||||
/// ```
|
||||
/// And then for each i > 2, we compute `(b0&b2 | b1&b2) & bi = b0&b2&bi | b1&b2&bi`
|
||||
/// and then add it the new map (Map 3) under the key `i` (if it is not empty):
|
||||
/// ```text
|
||||
/// 3: [b0&b2&b3 | b1&b2&b3]
|
||||
/// 4: [b0&b2&b4 | b1&b2&b4]
|
||||
/// 5: [b0&b2&b5 | b1&b2&b5]
|
||||
/// etc.
|
||||
/// ```
|
||||
/// We only keep two maps in memory at any one point. As soon as Map J is built, we flatten Map J-1 into
|
||||
/// a single bitmap by taking the union of all of its values. This union gives us Xj-1.
|
||||
///
|
||||
/// ## Memory Usage
|
||||
/// This function is expected to be called on a maximum of 10 bitmaps. The worst case thus happens when
|
||||
/// 10 identical large bitmaps are given.
|
||||
///
|
||||
/// In the context of Meilisearch, let's imagine that we are given 10 bitmaps containing all
|
||||
/// the document ids. If the dataset contains 16 million documents, then each bitmap will take
|
||||
/// around 2MB of memory.
|
||||
///
|
||||
/// When creating Map 3, we will have, in memory:
|
||||
/// 1. The 10 original bitmaps (20MB)
|
||||
/// 2. X0 : 2MB
|
||||
/// 3. Map 1, containing 9 bitmaps: 18MB
|
||||
/// 4. Map 2, containing 8 bitmaps: 16MB
|
||||
/// 5. X1: 2MB
|
||||
/// for a total of around 60MB of memory. This roughly represents the maximum memory usage of this function.
|
||||
///
|
||||
/// ## Time complexity
|
||||
/// Let N be the size of the given list of bitmaps and M the length of each individual bitmap.
|
||||
///
|
||||
/// We need to create N new bitmaps. The most expensive one to create is the second one, where we need to
|
||||
/// iterate over the N keys of Map 1, and for each of those keys `k_i`, we perform `N-k_i` bitmap unions.
|
||||
/// Unioning two bitmaps is O(M), and we need to do it O(N^2) times.
|
||||
///
|
||||
/// Therefore the time complexity is O(N^3 * M).
|
||||
fn create_non_disjoint_combinations(bitmaps: Vec<RoaringBitmap>) -> Vec<RoaringBitmap> {
|
||||
let nbr_parts = bitmaps.len();
|
||||
if nbr_parts == 1 {
|
||||
return bitmaps;
|
||||
}
|
||||
let mut flattened_levels = vec![];
|
||||
let mut last_level: BTreeMap<usize, RoaringBitmap> =
|
||||
bitmaps.clone().into_iter().enumerate().collect();
|
||||
|
||||
for _ in 2..=nbr_parts {
|
||||
let mut new_level = BTreeMap::new();
|
||||
for (last_part_index, base_combination) in last_level.iter() {
|
||||
#[allow(clippy::needless_range_loop)]
|
||||
for new_last_part_index in last_part_index + 1..nbr_parts {
|
||||
let new_combination = base_combination & &bitmaps[new_last_part_index];
|
||||
if !new_combination.is_empty() {
|
||||
match new_level.entry(new_last_part_index) {
|
||||
Entry::Occupied(mut b) => {
|
||||
*b.get_mut() |= new_combination;
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(new_combination);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Now flatten the last level to save memory
|
||||
let flattened_last_level = MultiOps::union(last_level.into_values());
|
||||
flattened_levels.push(flattened_last_level);
|
||||
last_level = new_level;
|
||||
}
|
||||
// Flatten the last level
|
||||
let flattened_last_level = MultiOps::union(last_level.into_values());
|
||||
flattened_levels.push(flattened_last_level);
|
||||
flattened_levels
|
||||
}
|
||||
|
||||
/// Given a list of bitmaps `b0,b1,...,bn` , compute the list of bitmaps `X0,X1,...,Xn`
|
||||
/// such that `Xi` contains all the elements that are contained in **exactly** `i+1` bitmaps among `b0,b1,...,bn`.
|
||||
///
|
||||
/// The returned vector is guaranteed to be of length `n`. It is equal to `vec![X0, X1, ..., Xn]`.
|
||||
fn create_disjoint_combinations(parts_candidates_array: Vec<RoaringBitmap>) -> Vec<RoaringBitmap> {
|
||||
let non_disjoint_combinations = create_non_disjoint_combinations(parts_candidates_array);
|
||||
let mut disjoint_combinations = vec![];
|
||||
let mut combinations = non_disjoint_combinations.into_iter().peekable();
|
||||
while let Some(mut combination) = combinations.next() {
|
||||
if let Some(forbidden) = combinations.peek() {
|
||||
combination -= forbidden;
|
||||
}
|
||||
disjoint_combinations.push(combination)
|
||||
}
|
||||
|
||||
disjoint_combinations
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use big_s::S;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::search::criteria::exactness::{
|
||||
create_disjoint_combinations, create_non_disjoint_combinations,
|
||||
};
|
||||
use crate::snapshot_tests::display_bitmap;
|
||||
use crate::{Criterion, SearchResult};
|
||||
|
||||
#[test]
|
||||
fn test_exact_words_subcriterion() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key(S("id"));
|
||||
settings.set_criteria(vec![Criterion::Exactness]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
// not relevant
|
||||
{ "id": "0", "text": "cat good dog bad" },
|
||||
// 1 exact word
|
||||
{ "id": "1", "text": "they said: cats arebetter thandogs" },
|
||||
// 3 exact words
|
||||
{ "id": "2", "text": "they said: cats arebetter than dogs" },
|
||||
// 5 exact words
|
||||
{ "id": "3", "text": "they said: cats are better than dogs" },
|
||||
// attribute starts with the exact words
|
||||
{ "id": "4", "text": "cats are better than dogs except on Saturday" },
|
||||
// attribute equal to the exact words
|
||||
{ "id": "5", "text": "cats are better than dogs" },
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let SearchResult { matching_words: _, candidates: _, documents_ids } =
|
||||
index.search(&rtxn).query("cats are better than dogs").execute().unwrap();
|
||||
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 4, 3, 2, 1]");
|
||||
}
|
||||
|
||||
fn print_combinations(rbs: &[RoaringBitmap]) -> String {
|
||||
let mut s = String::new();
|
||||
for rb in rbs {
|
||||
s.push_str(&format!("{}\n", &display_bitmap(rb)));
|
||||
}
|
||||
s
|
||||
}
|
||||
|
||||
// In these unit tests, the test bitmaps always contain all the multiple of a certain number.
|
||||
// This makes it easy to check the validity of the results of `create_disjoint_combinations` by
|
||||
// counting the number of dividers of elements in the returned bitmaps.
|
||||
fn assert_correct_combinations(combinations: &[RoaringBitmap], dividers: &[u32]) {
|
||||
for (i, set) in combinations.iter().enumerate() {
|
||||
let expected_nbr_dividers = i + 1;
|
||||
for el in set {
|
||||
let nbr_dividers = dividers.iter().map(|d| usize::from(el % d == 0)).sum::<usize>();
|
||||
assert_eq!(
|
||||
nbr_dividers, expected_nbr_dividers,
|
||||
"{el} is divisible by {nbr_dividers} elements, not {expected_nbr_dividers}."
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compute_combinations_1() {
|
||||
let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect();
|
||||
|
||||
let parts_candidates = vec![b0];
|
||||
|
||||
let combinations = create_disjoint_combinations(parts_candidates);
|
||||
insta::assert_snapshot!(print_combinations(&combinations), @r###"
|
||||
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, ]
|
||||
"###);
|
||||
|
||||
assert_correct_combinations(&combinations, &[2]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compute_combinations_2() {
|
||||
let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect();
|
||||
let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
|
||||
|
||||
let parts_candidates = vec![b0, b1];
|
||||
|
||||
let combinations = create_disjoint_combinations(parts_candidates);
|
||||
insta::assert_snapshot!(print_combinations(&combinations), @r###"
|
||||
[2, 3, 4, 8, 9, 10, 14, 15, 16, 20, 21, 22, 26, 27, 28, 32, 33, 34, 38, 39, 40, 44, 45, 46, 50, 51, 52, 56, 57, 58, 62, 63, 64, 68, 69, 70, 74, 75, 76, 80, 81, 82, 86, 87, 88, 92, 93, 94, 98, 99, 100, 104, 105, 106, 110, 111, 112, 116, 117, 118, 122, 123, 124, 128, 129, 130, 134, 135, 136, 140, 141, 142, 146, 147, 148, ]
|
||||
[0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108, 114, 120, 126, 132, 138, 144, ]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compute_combinations_4() {
|
||||
let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect();
|
||||
let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
|
||||
let b2: RoaringBitmap = (0..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect();
|
||||
let b3: RoaringBitmap = (0..).into_iter().map(|x| 7 * x).take_while(|x| *x < 150).collect();
|
||||
|
||||
let parts_candidates = vec![b0, b1, b2, b3];
|
||||
|
||||
let combinations = create_disjoint_combinations(parts_candidates);
|
||||
|
||||
insta::assert_snapshot!(print_combinations(&combinations), @r###"
|
||||
[2, 3, 4, 5, 7, 8, 9, 16, 22, 25, 26, 27, 32, 33, 34, 38, 39, 44, 46, 49, 51, 52, 55, 57, 58, 62, 64, 65, 68, 69, 74, 76, 77, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 95, 99, 104, 106, 111, 115, 116, 117, 118, 119, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 145, 146, 148, ]
|
||||
[6, 10, 12, 14, 15, 18, 20, 21, 24, 28, 35, 36, 40, 45, 48, 50, 54, 56, 63, 66, 72, 75, 78, 80, 96, 98, 100, 102, 108, 110, 112, 114, 130, 132, 135, 138, 144, 147, ]
|
||||
[30, 42, 60, 70, 84, 90, 105, 120, 126, 140, ]
|
||||
[0, ]
|
||||
"###);
|
||||
|
||||
// But we also check it programmatically
|
||||
assert_correct_combinations(&combinations, &[2, 3, 5, 7]);
|
||||
}
|
||||
#[test]
|
||||
fn compute_combinations_4_with_empty_results_at_end() {
|
||||
let b0: RoaringBitmap = (1..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect();
|
||||
let b1: RoaringBitmap = (1..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
|
||||
let b2: RoaringBitmap = (1..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect();
|
||||
let b3: RoaringBitmap = (1..).into_iter().map(|x| 7 * x).take_while(|x| *x < 150).collect();
|
||||
|
||||
let parts_candidates = vec![b0, b1, b2, b3];
|
||||
|
||||
let combinations = create_disjoint_combinations(parts_candidates);
|
||||
|
||||
insta::assert_snapshot!(print_combinations(&combinations), @r###"
|
||||
[2, 3, 4, 5, 7, 8, 9, 16, 22, 25, 26, 27, 32, 33, 34, 38, 39, 44, 46, 49, 51, 52, 55, 57, 58, 62, 64, 65, 68, 69, 74, 76, 77, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 95, 99, 104, 106, 111, 115, 116, 117, 118, 119, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 145, 146, 148, ]
|
||||
[6, 10, 12, 14, 15, 18, 20, 21, 24, 28, 35, 36, 40, 45, 48, 50, 54, 56, 63, 66, 72, 75, 78, 80, 96, 98, 100, 102, 108, 110, 112, 114, 130, 132, 135, 138, 144, 147, ]
|
||||
[30, 42, 60, 70, 84, 90, 105, 120, 126, 140, ]
|
||||
[]
|
||||
"###);
|
||||
|
||||
// But we also check it programmatically
|
||||
assert_correct_combinations(&combinations, &[2, 3, 5, 7]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compute_combinations_4_with_some_equal_bitmaps() {
|
||||
let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect();
|
||||
let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
|
||||
let b2: RoaringBitmap = (0..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect();
|
||||
// b3 == b1
|
||||
let b3: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
|
||||
|
||||
let parts_candidates = vec![b0, b1, b2, b3];
|
||||
|
||||
let combinations = create_disjoint_combinations(parts_candidates);
|
||||
|
||||
insta::assert_snapshot!(print_combinations(&combinations), @r###"
|
||||
[2, 4, 5, 8, 14, 16, 22, 25, 26, 28, 32, 34, 35, 38, 44, 46, 52, 55, 56, 58, 62, 64, 65, 68, 74, 76, 82, 85, 86, 88, 92, 94, 95, 98, 104, 106, 112, 115, 116, 118, 122, 124, 125, 128, 134, 136, 142, 145, 146, 148, ]
|
||||
[3, 9, 10, 20, 21, 27, 33, 39, 40, 50, 51, 57, 63, 69, 70, 80, 81, 87, 93, 99, 100, 110, 111, 117, 123, 129, 130, 140, 141, 147, ]
|
||||
[6, 12, 15, 18, 24, 36, 42, 45, 48, 54, 66, 72, 75, 78, 84, 96, 102, 105, 108, 114, 126, 132, 135, 138, 144, ]
|
||||
[0, 30, 60, 90, 120, ]
|
||||
"###);
|
||||
|
||||
// But we also check it programmatically
|
||||
assert_correct_combinations(&combinations, &[2, 3, 5, 3]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compute_combinations_10() {
|
||||
let dividers = [2, 3, 5, 7, 11, 6, 15, 35, 18, 14];
|
||||
let parts_candidates: Vec<RoaringBitmap> = dividers
|
||||
.iter()
|
||||
.map(|÷r| {
|
||||
(0..).into_iter().map(|x| divider * x).take_while(|x| *x <= 210).collect()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let combinations = create_disjoint_combinations(parts_candidates);
|
||||
insta::assert_snapshot!(print_combinations(&combinations), @r###"
|
||||
[2, 3, 4, 5, 7, 8, 9, 11, 16, 25, 26, 27, 32, 34, 38, 39, 46, 49, 51, 52, 57, 58, 62, 64, 65, 68, 69, 74, 76, 81, 82, 85, 86, 87, 91, 92, 93, 94, 95, 104, 106, 111, 115, 116, 117, 118, 119, 121, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 143, 145, 146, 148, 152, 153, 155, 158, 159, 161, 164, 166, 171, 172, 177, 178, 183, 184, 185, 187, 188, 194, 201, 202, 203, 205, 206, 207, 208, 209, ]
|
||||
[10, 20, 21, 22, 33, 40, 44, 50, 55, 63, 77, 80, 88, 99, 100, 130, 147, 160, 170, 176, 189, 190, 200, ]
|
||||
[6, 12, 14, 15, 24, 28, 35, 45, 48, 56, 75, 78, 96, 98, 102, 110, 112, 114, 135, 138, 156, 174, 175, 182, 186, 192, 195, 196, 204, ]
|
||||
[18, 36, 54, 66, 72, 108, 132, 144, 154, 162, 165, ]
|
||||
[30, 42, 60, 70, 84, 105, 120, 140, 150, 168, 198, ]
|
||||
[90, 126, 180, ]
|
||||
[]
|
||||
[210, ]
|
||||
[]
|
||||
[0, ]
|
||||
"###);
|
||||
|
||||
assert_correct_combinations(&combinations, ÷rs);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn compute_combinations_30() {
|
||||
let dividers: [u32; 30] = [
|
||||
1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4,
|
||||
5,
|
||||
];
|
||||
let parts_candidates: Vec<RoaringBitmap> = dividers
|
||||
.iter()
|
||||
.map(|divider| {
|
||||
(0..).into_iter().map(|x| divider * x).take_while(|x| *x <= 100).collect()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let combinations = create_non_disjoint_combinations(parts_candidates.clone());
|
||||
insta::assert_snapshot!(print_combinations(&combinations), @r###"
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ]
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ]
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ]
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ]
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ]
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ]
|
||||
[0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
|
||||
[0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
|
||||
[0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
|
||||
[0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
|
||||
[0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
|
||||
[0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
|
||||
[0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
|
||||
[0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
|
||||
[0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
|
||||
[0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
|
||||
[0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
|
||||
[0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
|
||||
[0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
|
||||
[0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
|
||||
[0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
|
||||
[0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
|
||||
[0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
|
||||
[0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
|
||||
[0, 60, ]
|
||||
[0, 60, ]
|
||||
[0, 60, ]
|
||||
[0, 60, ]
|
||||
[0, 60, ]
|
||||
[0, 60, ]
|
||||
"###);
|
||||
|
||||
let combinations = create_disjoint_combinations(parts_candidates);
|
||||
insta::assert_snapshot!(print_combinations(&combinations), @r###"
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[1, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 49, 53, 59, 61, 67, 71, 73, 77, 79, 83, 89, 91, 97, ]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[2, 3, 5, 9, 14, 21, 22, 25, 26, 27, 33, 34, 35, 38, 39, 46, 51, 55, 57, 58, 62, 63, 65, 69, 74, 81, 82, 85, 86, 87, 93, 94, 95, 98, 99, ]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[4, 6, 8, 10, 15, 16, 18, 28, 32, 42, 44, 45, 50, 52, 54, 56, 64, 66, 68, 70, 75, 76, 78, 88, 92, ]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[12, 20, 24, 30, 36, 40, 48, 72, 80, 84, 90, 96, 100, ]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[]
|
||||
[0, 60, ]
|
||||
"###);
|
||||
|
||||
assert_correct_combinations(&combinations, ÷rs);
|
||||
}
|
||||
}
|
||||
@@ -1,77 +0,0 @@
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
|
||||
use crate::search::criteria::InitialCandidates;
|
||||
use crate::search::query_tree::Operation;
|
||||
use crate::search::WordDerivationsCache;
|
||||
use crate::Result;
|
||||
|
||||
/// The result of a call to the fetcher.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct FinalResult {
|
||||
/// The query tree corresponding to the current bucket of the last criterion.
|
||||
pub query_tree: Option<Operation>,
|
||||
/// The candidates of the current bucket of the last criterion.
|
||||
pub candidates: RoaringBitmap,
|
||||
/// Candidates that comes from the current bucket of the initial criterion.
|
||||
pub initial_candidates: InitialCandidates,
|
||||
}
|
||||
|
||||
pub struct Final<'t> {
|
||||
ctx: &'t dyn Context<'t>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
wdcache: WordDerivationsCache,
|
||||
returned_candidates: RoaringBitmap,
|
||||
}
|
||||
|
||||
impl<'t> Final<'t> {
|
||||
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Final<'t> {
|
||||
Final {
|
||||
ctx,
|
||||
parent,
|
||||
wdcache: WordDerivationsCache::new(),
|
||||
returned_candidates: RoaringBitmap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
#[logging_timer::time("Final::{}")]
|
||||
pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> Result<Option<FinalResult>> {
|
||||
debug!("Final iteration");
|
||||
let excluded_candidates = &self.returned_candidates | excluded_candidates;
|
||||
let mut criterion_parameters = CriterionParameters {
|
||||
wdcache: &mut self.wdcache,
|
||||
// returned_candidates is merged with excluded_candidates to avoid duplicas
|
||||
excluded_candidates: &excluded_candidates,
|
||||
};
|
||||
|
||||
match self.parent.next(&mut criterion_parameters)? {
|
||||
Some(CriterionResult {
|
||||
query_tree,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}) => {
|
||||
let mut candidates = match (candidates, query_tree.as_ref()) {
|
||||
(Some(candidates), _) => candidates,
|
||||
(None, Some(qt)) => {
|
||||
resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates
|
||||
}
|
||||
(None, None) => self.ctx.documents_ids()? - excluded_candidates,
|
||||
};
|
||||
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
|
||||
let initial_candidates = initial_candidates
|
||||
.unwrap_or_else(|| InitialCandidates::Estimated(candidates.clone()));
|
||||
|
||||
self.returned_candidates |= &candidates;
|
||||
|
||||
Ok(Some(FinalResult { query_tree, candidates, initial_candidates }))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,154 +0,0 @@
|
||||
use std::iter;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
use rstar::RTree;
|
||||
|
||||
use super::{Criterion, CriterionParameters, CriterionResult};
|
||||
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates};
|
||||
use crate::{lat_lng_to_xyz, GeoPoint, Index, Result};
|
||||
|
||||
pub struct Geo<'t> {
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
ascending: bool,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
candidates: Box<dyn Iterator<Item = RoaringBitmap>>,
|
||||
allowed_candidates: RoaringBitmap,
|
||||
initial_candidates: InitialCandidates,
|
||||
rtree: Option<RTree<GeoPoint>>,
|
||||
point: [f64; 2],
|
||||
}
|
||||
|
||||
impl<'t> Geo<'t> {
|
||||
pub fn asc(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
point: [f64; 2],
|
||||
) -> Result<Self> {
|
||||
Self::new(index, rtxn, parent, point, true)
|
||||
}
|
||||
|
||||
pub fn desc(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
point: [f64; 2],
|
||||
) -> Result<Self> {
|
||||
Self::new(index, rtxn, parent, point, false)
|
||||
}
|
||||
|
||||
fn new(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
point: [f64; 2],
|
||||
ascending: bool,
|
||||
) -> Result<Self> {
|
||||
let candidates = Box::new(iter::empty());
|
||||
let allowed_candidates = index.geo_faceted_documents_ids(rtxn)?;
|
||||
let initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new());
|
||||
let rtree = index.geo_rtree(rtxn)?;
|
||||
|
||||
Ok(Self {
|
||||
index,
|
||||
rtxn,
|
||||
ascending,
|
||||
parent,
|
||||
candidates,
|
||||
allowed_candidates,
|
||||
initial_candidates,
|
||||
rtree,
|
||||
point,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Criterion for Geo<'_> {
|
||||
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
|
||||
let rtree = self.rtree.as_ref();
|
||||
|
||||
loop {
|
||||
match self.candidates.next() {
|
||||
Some(mut candidates) => {
|
||||
candidates -= params.excluded_candidates;
|
||||
self.allowed_candidates -= &candidates;
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates: Some(candidates),
|
||||
filtered_candidates: None,
|
||||
initial_candidates: Some(self.initial_candidates.clone()),
|
||||
}));
|
||||
}
|
||||
None => match self.parent.next(params)? {
|
||||
Some(CriterionResult {
|
||||
query_tree,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}) => {
|
||||
let mut candidates = match (&query_tree, candidates) {
|
||||
(_, Some(candidates)) => candidates,
|
||||
(Some(qt), None) => {
|
||||
let context = CriteriaBuilder::new(self.rtxn, self.index)?;
|
||||
resolve_query_tree(&context, qt, params.wdcache)?
|
||||
}
|
||||
(None, None) => self.index.documents_ids(self.rtxn)?,
|
||||
};
|
||||
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
|
||||
match initial_candidates {
|
||||
Some(initial_candidates) => {
|
||||
self.initial_candidates |= initial_candidates
|
||||
}
|
||||
None => self.initial_candidates.map_inplace(|c| c | &candidates),
|
||||
}
|
||||
|
||||
if candidates.is_empty() {
|
||||
continue;
|
||||
}
|
||||
self.allowed_candidates = &candidates - params.excluded_candidates;
|
||||
self.candidates = match rtree {
|
||||
Some(rtree) => geo_point(
|
||||
rtree,
|
||||
self.allowed_candidates.clone(),
|
||||
self.point,
|
||||
self.ascending,
|
||||
),
|
||||
None => Box::new(std::iter::empty()),
|
||||
};
|
||||
}
|
||||
None => return Ok(None),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn geo_point(
|
||||
rtree: &RTree<GeoPoint>,
|
||||
mut candidates: RoaringBitmap,
|
||||
point: [f64; 2],
|
||||
ascending: bool,
|
||||
) -> Box<dyn Iterator<Item = RoaringBitmap>> {
|
||||
let point = lat_lng_to_xyz(&point);
|
||||
|
||||
let mut results = Vec::new();
|
||||
for point in rtree.nearest_neighbor_iter(&point) {
|
||||
if candidates.remove(point.data.0) {
|
||||
results.push(std::iter::once(point.data.0).collect());
|
||||
if candidates.is_empty() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ascending {
|
||||
Box::new(results.into_iter())
|
||||
} else {
|
||||
Box::new(results.into_iter().rev())
|
||||
}
|
||||
}
|
||||
@@ -1,82 +0,0 @@
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{Criterion, CriterionParameters, CriterionResult};
|
||||
use crate::search::criteria::{resolve_query_tree, Context, InitialCandidates};
|
||||
use crate::search::query_tree::Operation;
|
||||
use crate::search::Distinct;
|
||||
use crate::Result;
|
||||
/// Initial is a mandatory criterion, it is always the first
|
||||
/// and is meant to initalize the CriterionResult used by the other criteria.
|
||||
/// It behave like an [Once Iterator](https://doc.rust-lang.org/std/iter/struct.Once.html) and will return Some(CriterionResult) only one time.
|
||||
pub struct Initial<'t, D> {
|
||||
ctx: &'t dyn Context<'t>,
|
||||
answer: Option<CriterionResult>,
|
||||
exhaustive_number_hits: bool,
|
||||
distinct: Option<D>,
|
||||
}
|
||||
|
||||
impl<'t, D> Initial<'t, D> {
|
||||
pub fn new(
|
||||
ctx: &'t dyn Context<'t>,
|
||||
query_tree: Option<Operation>,
|
||||
filtered_candidates: Option<RoaringBitmap>,
|
||||
exhaustive_number_hits: bool,
|
||||
distinct: Option<D>,
|
||||
) -> Initial<D> {
|
||||
let answer = CriterionResult {
|
||||
query_tree,
|
||||
candidates: None,
|
||||
filtered_candidates,
|
||||
initial_candidates: None,
|
||||
};
|
||||
Initial { ctx, answer: Some(answer), exhaustive_number_hits, distinct }
|
||||
}
|
||||
}
|
||||
|
||||
impl<D: Distinct> Criterion for Initial<'_, D> {
|
||||
#[logging_timer::time("Initial::{}")]
|
||||
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
|
||||
self.answer
|
||||
.take()
|
||||
.map(|mut answer| {
|
||||
if self.exhaustive_number_hits {
|
||||
// resolve the whole query tree to retrieve an exhaustive list of documents matching the query.
|
||||
let candidates = answer
|
||||
.query_tree
|
||||
.as_ref()
|
||||
.map(|query_tree| resolve_query_tree(self.ctx, query_tree, params.wdcache))
|
||||
.transpose()?;
|
||||
|
||||
// then intersect the candidates with the potential filtered candidates.
|
||||
let mut candidates = match (candidates, answer.filtered_candidates.take()) {
|
||||
(Some(candidates), Some(filtered)) => candidates & filtered,
|
||||
(Some(candidates), None) => candidates,
|
||||
(None, Some(filtered)) => filtered,
|
||||
(None, None) => self.ctx.documents_ids()?,
|
||||
};
|
||||
|
||||
// then remove the potential soft deleted documents.
|
||||
candidates -= params.excluded_candidates;
|
||||
|
||||
// because the initial_candidates should be an exhaustive count of the matching documents,
|
||||
// we precompute the distinct attributes.
|
||||
let initial_candidates = match &mut self.distinct {
|
||||
Some(distinct) => {
|
||||
let mut initial_candidates = RoaringBitmap::new();
|
||||
for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) {
|
||||
initial_candidates.insert(c?);
|
||||
}
|
||||
initial_candidates
|
||||
}
|
||||
None => candidates.clone(),
|
||||
};
|
||||
|
||||
answer.candidates = Some(candidates);
|
||||
answer.initial_candidates =
|
||||
Some(InitialCandidates::Exhaustive(initial_candidates));
|
||||
}
|
||||
Ok(answer)
|
||||
})
|
||||
.transpose()
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,712 +0,0 @@
|
||||
use std::collections::btree_map::{self, BTreeMap};
|
||||
use std::collections::hash_map::HashMap;
|
||||
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
use slice_group_by::GroupBy;
|
||||
|
||||
use super::{
|
||||
query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context,
|
||||
Criterion, CriterionParameters, CriterionResult,
|
||||
};
|
||||
use crate::search::criteria::InitialCandidates;
|
||||
use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind};
|
||||
use crate::search::{build_dfa, CriterionImplementationStrategy, WordDerivationsCache};
|
||||
use crate::{Position, Result};
|
||||
|
||||
type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>;
|
||||
|
||||
/// Threshold on the number of candidates that will make
|
||||
/// the system choose between one algorithm or another.
|
||||
const CANDIDATES_THRESHOLD: u64 = 1000;
|
||||
|
||||
/// Threshold on the number of proximity that will make
|
||||
/// the system choose between one algorithm or another.
|
||||
const PROXIMITY_THRESHOLD: u8 = 0;
|
||||
|
||||
pub struct Proximity<'t> {
|
||||
ctx: &'t dyn Context<'t>,
|
||||
/// (max_proximity, query_tree, allowed_candidates)
|
||||
state: Option<(u8, Operation, RoaringBitmap)>,
|
||||
proximity: u8,
|
||||
initial_candidates: InitialCandidates,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
candidates_cache: Cache,
|
||||
plane_sweep_cache: Option<btree_map::IntoIter<u8, RoaringBitmap>>,
|
||||
implementation_strategy: CriterionImplementationStrategy,
|
||||
}
|
||||
|
||||
impl<'t> Proximity<'t> {
|
||||
pub fn new(
|
||||
ctx: &'t dyn Context<'t>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
implementation_strategy: CriterionImplementationStrategy,
|
||||
) -> Self {
|
||||
Proximity {
|
||||
ctx,
|
||||
state: None,
|
||||
proximity: 0,
|
||||
initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()),
|
||||
parent,
|
||||
candidates_cache: Cache::new(),
|
||||
plane_sweep_cache: None,
|
||||
implementation_strategy,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Criterion for Proximity<'t> {
|
||||
#[logging_timer::time("Proximity::{}")]
|
||||
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
|
||||
// remove excluded candidates when next is called, instead of doing it in the loop.
|
||||
if let Some((_, _, allowed_candidates)) = self.state.as_mut() {
|
||||
*allowed_candidates -= params.excluded_candidates;
|
||||
}
|
||||
|
||||
loop {
|
||||
debug!(
|
||||
"Proximity at iteration {} (max prox {:?}) ({:?})",
|
||||
self.proximity,
|
||||
self.state.as_ref().map(|(mp, _, _)| mp),
|
||||
self.state.as_ref().map(|(_, _, cd)| cd),
|
||||
);
|
||||
|
||||
match &mut self.state {
|
||||
Some((max_prox, _, allowed_candidates))
|
||||
if allowed_candidates.is_empty() || self.proximity > *max_prox =>
|
||||
{
|
||||
self.state = None; // reset state
|
||||
}
|
||||
Some((_, query_tree, allowed_candidates)) => {
|
||||
let mut new_candidates = if matches!(
|
||||
self.implementation_strategy,
|
||||
CriterionImplementationStrategy::OnlyIterative
|
||||
) || (matches!(
|
||||
self.implementation_strategy,
|
||||
CriterionImplementationStrategy::Dynamic
|
||||
) && allowed_candidates.len()
|
||||
<= CANDIDATES_THRESHOLD
|
||||
&& self.proximity > PROXIMITY_THRESHOLD)
|
||||
{
|
||||
if let Some(cache) = self.plane_sweep_cache.as_mut() {
|
||||
match cache.next() {
|
||||
Some((p, candidates)) => {
|
||||
self.proximity = p;
|
||||
candidates
|
||||
}
|
||||
None => {
|
||||
self.state = None; // reset state
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let cache = resolve_plane_sweep_candidates(
|
||||
self.ctx,
|
||||
query_tree,
|
||||
allowed_candidates,
|
||||
)?;
|
||||
self.plane_sweep_cache = Some(cache.into_iter());
|
||||
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// use set theory based algorithm
|
||||
resolve_candidates(
|
||||
self.ctx,
|
||||
query_tree,
|
||||
self.proximity,
|
||||
&mut self.candidates_cache,
|
||||
params.wdcache,
|
||||
)?
|
||||
};
|
||||
|
||||
new_candidates &= &*allowed_candidates;
|
||||
*allowed_candidates -= &new_candidates;
|
||||
self.proximity += 1;
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(query_tree.clone()),
|
||||
candidates: Some(new_candidates),
|
||||
filtered_candidates: None,
|
||||
initial_candidates: Some(self.initial_candidates.take()),
|
||||
}));
|
||||
}
|
||||
None => match self.parent.next(params)? {
|
||||
Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}) => {
|
||||
let mut candidates = match candidates {
|
||||
Some(candidates) => candidates,
|
||||
None => {
|
||||
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
|
||||
- params.excluded_candidates
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(filtered_candidates) = filtered_candidates {
|
||||
candidates &= filtered_candidates;
|
||||
}
|
||||
|
||||
match initial_candidates {
|
||||
Some(initial_candidates) => {
|
||||
self.initial_candidates |= initial_candidates
|
||||
}
|
||||
None => self.initial_candidates.map_inplace(|c| c | &candidates),
|
||||
}
|
||||
|
||||
let maximum_proximity = maximum_proximity(&query_tree);
|
||||
self.state = Some((maximum_proximity as u8, query_tree, candidates));
|
||||
self.proximity = 0;
|
||||
self.plane_sweep_cache = None;
|
||||
}
|
||||
Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}));
|
||||
}
|
||||
None => return Ok(None),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_candidates(
|
||||
ctx: &dyn Context,
|
||||
query_tree: &Operation,
|
||||
proximity: u8,
|
||||
cache: &mut Cache,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<RoaringBitmap> {
|
||||
fn resolve_operation(
|
||||
ctx: &dyn Context,
|
||||
query_tree: &Operation,
|
||||
proximity: u8,
|
||||
cache: &mut Cache,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
|
||||
use Operation::{And, Or, Phrase};
|
||||
|
||||
let result = match query_tree {
|
||||
And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
|
||||
Phrase(words) => {
|
||||
if proximity == 0 {
|
||||
let most_left = words
|
||||
.iter()
|
||||
.filter_map(|o| o.as_ref())
|
||||
.next()
|
||||
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
|
||||
let most_right = words
|
||||
.iter()
|
||||
.rev()
|
||||
.filter_map(|o| o.as_ref())
|
||||
.next()
|
||||
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
|
||||
|
||||
match (most_left, most_right) {
|
||||
(Some(l), Some(r)) => vec![(l, r, resolve_phrase(ctx, words)?)],
|
||||
_otherwise => Default::default(),
|
||||
}
|
||||
} else {
|
||||
Default::default()
|
||||
}
|
||||
}
|
||||
Or(_, ops) => {
|
||||
let mut output = Vec::new();
|
||||
for op in ops {
|
||||
let result = resolve_operation(ctx, op, proximity, cache, wdcache)?;
|
||||
output.extend(result);
|
||||
}
|
||||
output
|
||||
}
|
||||
Operation::Query(q) => {
|
||||
if proximity == 0 {
|
||||
let candidates = query_docids(ctx, q, wdcache)?;
|
||||
vec![(q.clone(), q.clone(), candidates)]
|
||||
} else {
|
||||
Default::default()
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
fn mdfs_pair(
|
||||
ctx: &dyn Context,
|
||||
left: &Operation,
|
||||
right: &Operation,
|
||||
proximity: u8,
|
||||
cache: &mut Cache,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
|
||||
fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> {
|
||||
(0..=mana.min(left_max)).map(move |m| (m, mana - m))
|
||||
}
|
||||
|
||||
let pair_max_proximity = 7;
|
||||
|
||||
let mut output = Vec::new();
|
||||
|
||||
for (pair_p, left_right_p) in pair_combinations(proximity, pair_max_proximity) {
|
||||
for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) {
|
||||
let left_key = (left.clone(), left_p);
|
||||
if !cache.contains_key(&left_key) {
|
||||
let candidates = resolve_operation(ctx, left, left_p, cache, wdcache)?;
|
||||
cache.insert(left_key.clone(), candidates);
|
||||
}
|
||||
|
||||
let right_key = (right.clone(), right_p);
|
||||
if !cache.contains_key(&right_key) {
|
||||
let candidates = resolve_operation(ctx, right, right_p, cache, wdcache)?;
|
||||
cache.insert(right_key.clone(), candidates);
|
||||
}
|
||||
|
||||
let lefts = cache.get(&left_key).unwrap();
|
||||
let rights = cache.get(&right_key).unwrap();
|
||||
|
||||
for (ll, lr, lcandidates) in lefts {
|
||||
for (rl, rr, rcandidates) in rights {
|
||||
let mut candidates =
|
||||
query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?;
|
||||
if lcandidates.len() < rcandidates.len() {
|
||||
candidates &= lcandidates;
|
||||
candidates &= rcandidates;
|
||||
} else {
|
||||
candidates &= rcandidates;
|
||||
candidates &= lcandidates;
|
||||
}
|
||||
if !candidates.is_empty() {
|
||||
output.push((ll.clone(), rr.clone(), candidates));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
fn mdfs(
|
||||
ctx: &dyn Context,
|
||||
branches: &[Operation],
|
||||
proximity: u8,
|
||||
cache: &mut Cache,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
|
||||
// Extract the first two elements but gives the tail
|
||||
// that is just after the first element.
|
||||
let next =
|
||||
branches.split_first().map(|(h1, t)| (h1, t.split_first().map(|(h2, _)| (h2, t))));
|
||||
|
||||
match next {
|
||||
Some((head1, Some((head2, [_])))) => {
|
||||
mdfs_pair(ctx, head1, head2, proximity, cache, wdcache)
|
||||
}
|
||||
Some((head1, Some((head2, tail)))) => {
|
||||
let mut output = Vec::new();
|
||||
for p in 0..=proximity {
|
||||
for (lhead, _, head_candidates) in
|
||||
mdfs_pair(ctx, head1, head2, p, cache, wdcache)?
|
||||
{
|
||||
if !head_candidates.is_empty() {
|
||||
for (_, rtail, mut candidates) in
|
||||
mdfs(ctx, tail, proximity - p, cache, wdcache)?
|
||||
{
|
||||
candidates &= &head_candidates;
|
||||
if !candidates.is_empty() {
|
||||
output.push((lhead.clone(), rtail, candidates));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(output)
|
||||
}
|
||||
Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache),
|
||||
None => Ok(Default::default()),
|
||||
}
|
||||
}
|
||||
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache, wdcache)? {
|
||||
candidates |= cds;
|
||||
}
|
||||
Ok(candidates)
|
||||
}
|
||||
|
||||
fn resolve_plane_sweep_candidates(
|
||||
ctx: &dyn Context,
|
||||
query_tree: &Operation,
|
||||
allowed_candidates: &RoaringBitmap,
|
||||
) -> Result<BTreeMap<u8, RoaringBitmap>> {
|
||||
/// FIXME may be buggy with query like "new new york"
|
||||
fn plane_sweep(
|
||||
groups_positions: Vec<Vec<(Position, u8, Position)>>,
|
||||
consecutive: bool,
|
||||
) -> Result<Vec<(Position, u8, Position)>> {
|
||||
fn compute_groups_proximity(
|
||||
groups: &[(usize, (Position, u8, Position))],
|
||||
consecutive: bool,
|
||||
) -> Option<(Position, u8, Position)> {
|
||||
// take the inner proximity of the first group as initial
|
||||
let (_, (_, mut proximity, _)) = groups.first()?;
|
||||
let (_, (left_most_pos, _, _)) = groups.first()?;
|
||||
let (_, (_, _, right_most_pos)) =
|
||||
groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?;
|
||||
|
||||
for pair in groups.windows(2) {
|
||||
if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair {
|
||||
// if two positions are equal, meaning that they share at least a word, we return None
|
||||
if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let pair_proximity = {
|
||||
// if intervals are disjoint [..].(..)
|
||||
if lpos2 > rpos1 {
|
||||
lpos2 - rpos1
|
||||
}
|
||||
// if the second interval is a subset of the first [.(..).]
|
||||
else if rpos2 < rpos1 {
|
||||
(lpos2 - lpos1).min(rpos1 - rpos2)
|
||||
}
|
||||
// if intervals overlaps [.(..].)
|
||||
else {
|
||||
(lpos2 - lpos1).min(rpos2 - rpos1)
|
||||
}
|
||||
};
|
||||
|
||||
// if groups are in the good order (query order) we remove 1 to the proximity
|
||||
// the proximity is clamped to 7
|
||||
let pair_proximity =
|
||||
if i1 < i2 { (pair_proximity - 1).min(7) } else { pair_proximity.min(7) };
|
||||
|
||||
proximity += pair_proximity as u8 + prox2;
|
||||
}
|
||||
}
|
||||
|
||||
// if groups should be consecutives, we will only accept groups with a proximity of 0
|
||||
if !consecutive || proximity == 0 {
|
||||
Some((*left_most_pos, proximity, *right_most_pos))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
let groups_len = groups_positions.len();
|
||||
|
||||
let mut groups_positions: Vec<_> =
|
||||
groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
|
||||
|
||||
// Pop top elements of each list.
|
||||
let mut current = Vec::with_capacity(groups_len);
|
||||
for (i, positions) in groups_positions.iter_mut().enumerate() {
|
||||
match positions.next() {
|
||||
Some(p) => current.push((i, p)),
|
||||
// if a group return None, it means that the document does not contain all the words,
|
||||
// we return an empty result.
|
||||
None => return Ok(Vec::new()),
|
||||
}
|
||||
}
|
||||
|
||||
// Sort k elements by their positions.
|
||||
current.sort_unstable_by_key(|(_, p)| *p);
|
||||
|
||||
// Find leftmost and rightmost group and their positions.
|
||||
let mut leftmost = *current.first().unwrap();
|
||||
let mut rightmost = *current.last().unwrap();
|
||||
|
||||
let mut output = Vec::new();
|
||||
loop {
|
||||
// Find the position p of the next elements of a list of the leftmost group.
|
||||
// If the list is empty, break the loop.
|
||||
let p = groups_positions[leftmost.0].next().map(|p| (leftmost.0, p));
|
||||
|
||||
// let q be the position q of second group of the interval.
|
||||
let q = current[1];
|
||||
|
||||
// If p > r, then the interval [l, r] is minimal and
|
||||
// we insert it into the heap according to its size.
|
||||
if p.map_or(true, |p| p.1 > rightmost.1) {
|
||||
if let Some(group) = compute_groups_proximity(¤t, consecutive) {
|
||||
output.push(group);
|
||||
}
|
||||
}
|
||||
|
||||
let p = match p {
|
||||
Some(p) => p,
|
||||
None => break,
|
||||
};
|
||||
|
||||
// Replace the leftmost group P in the interval.
|
||||
current[0] = p;
|
||||
|
||||
if p.1 > rightmost.1 {
|
||||
// if [l, r] is minimal, let r = p and l = q.
|
||||
rightmost = p;
|
||||
leftmost = q;
|
||||
} else {
|
||||
// Ohterwise, let l = min{p,q}.
|
||||
leftmost = if p.1 < q.1 { p } else { q };
|
||||
}
|
||||
|
||||
// Then update the interval and order of groups_positions in the interval.
|
||||
current.sort_unstable_by_key(|(_, p)| *p);
|
||||
}
|
||||
|
||||
// Sort the list according to the size and the positions.
|
||||
output.sort_unstable();
|
||||
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
fn resolve_operation<'a>(
|
||||
query_tree: &'a Operation,
|
||||
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
|
||||
words_positions: &HashMap<String, RoaringBitmap>,
|
||||
) -> Result<Vec<(Position, u8, Position)>> {
|
||||
use Operation::{And, Or, Phrase};
|
||||
|
||||
if let Some(result) = rocache.get(query_tree) {
|
||||
return Ok(result.clone());
|
||||
}
|
||||
|
||||
let result = match query_tree {
|
||||
And(ops) => {
|
||||
let mut groups_positions = Vec::with_capacity(ops.len());
|
||||
for operation in ops {
|
||||
let positions = resolve_operation(operation, rocache, words_positions)?;
|
||||
groups_positions.push(positions);
|
||||
}
|
||||
plane_sweep(groups_positions, false)?
|
||||
}
|
||||
Phrase(words) => {
|
||||
let mut groups_positions = Vec::with_capacity(words.len());
|
||||
|
||||
// group stop_words together.
|
||||
for words in words.linear_group_by_key(Option::is_none) {
|
||||
// skip if it's a group of stop words.
|
||||
if matches!(words.first(), None | Some(None)) {
|
||||
continue;
|
||||
}
|
||||
// make a consecutive plane-sweep on the subgroup of words.
|
||||
let mut subgroup = Vec::with_capacity(words.len());
|
||||
for word in words.iter().map(|w| w.as_deref().unwrap()) {
|
||||
match words_positions.get(word) {
|
||||
Some(positions) => {
|
||||
subgroup.push(positions.iter().map(|p| (p, 0, p)).collect())
|
||||
}
|
||||
None => return Ok(vec![]),
|
||||
}
|
||||
}
|
||||
match subgroup.len() {
|
||||
0 => {}
|
||||
1 => groups_positions.push(subgroup.pop().unwrap()),
|
||||
_ => groups_positions.push(plane_sweep(subgroup, true)?),
|
||||
}
|
||||
}
|
||||
match groups_positions.len() {
|
||||
0 => vec![],
|
||||
1 => groups_positions.pop().unwrap(),
|
||||
_ => plane_sweep(groups_positions, false)?,
|
||||
}
|
||||
}
|
||||
Or(_, ops) => {
|
||||
let mut result = Vec::new();
|
||||
for op in ops {
|
||||
result.extend(resolve_operation(op, rocache, words_positions)?)
|
||||
}
|
||||
|
||||
result.sort_unstable();
|
||||
result
|
||||
}
|
||||
Operation::Query(Query { prefix, kind }) => {
|
||||
let mut result = Vec::new();
|
||||
match kind {
|
||||
QueryKind::Exact { word, .. } => {
|
||||
if *prefix {
|
||||
let iter = word_derivations(word, true, 0, words_positions)
|
||||
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
|
||||
result.extend(iter);
|
||||
} else if let Some(positions) = words_positions.get(word) {
|
||||
result.extend(positions.iter().map(|p| (p, 0, p)));
|
||||
}
|
||||
}
|
||||
QueryKind::Tolerant { typo, word } => {
|
||||
let iter = word_derivations(word, *prefix, *typo, words_positions)
|
||||
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
|
||||
result.extend(iter);
|
||||
}
|
||||
}
|
||||
|
||||
result.sort_unstable();
|
||||
result
|
||||
}
|
||||
};
|
||||
|
||||
rocache.insert(query_tree, result.clone());
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
fn word_derivations<'a>(
|
||||
word: &str,
|
||||
is_prefix: bool,
|
||||
max_typo: u8,
|
||||
words_positions: &'a HashMap<String, RoaringBitmap>,
|
||||
) -> impl Iterator<Item = &'a RoaringBitmap> {
|
||||
let dfa = build_dfa(word, max_typo, is_prefix);
|
||||
words_positions.iter().filter_map(move |(document_word, positions)| {
|
||||
use levenshtein_automata::Distance;
|
||||
match dfa.eval(document_word) {
|
||||
Distance::Exact(_) => Some(positions),
|
||||
Distance::AtLeast(_) => None,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
let mut resolve_operation_cache = HashMap::new();
|
||||
let mut candidates = BTreeMap::new();
|
||||
for docid in allowed_candidates {
|
||||
let words_positions = ctx.docid_words_positions(docid)?;
|
||||
resolve_operation_cache.clear();
|
||||
let positions =
|
||||
resolve_operation(query_tree, &mut resolve_operation_cache, &words_positions)?;
|
||||
let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity);
|
||||
let best_proximity = best_proximity.map(|(_, proximity, _)| proximity).unwrap_or(7);
|
||||
candidates.entry(best_proximity).or_insert_with(RoaringBitmap::new).insert(docid);
|
||||
}
|
||||
|
||||
Ok(candidates)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::io::Cursor;
|
||||
|
||||
use big_s::S;
|
||||
|
||||
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::{Criterion, CriterionImplementationStrategy, SearchResult};
|
||||
|
||||
fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
|
||||
let mut documents = Vec::new();
|
||||
for prefix in prefixes {
|
||||
for i in 0..500 {
|
||||
documents.push(
|
||||
serde_json::json!({
|
||||
"text": format!("{prefix}{i:x}"),
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
)
|
||||
}
|
||||
}
|
||||
documents
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_proximity_criterion_prefix_handling() {
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.autogenerate_docids = true;
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key(S("id"));
|
||||
settings.set_criteria(vec![
|
||||
Criterion::Words,
|
||||
Criterion::Typo,
|
||||
Criterion::Proximity,
|
||||
]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut documents = DocumentsBatchBuilder::new(Vec::new());
|
||||
|
||||
for doc in [
|
||||
// 0
|
||||
serde_json::json!({ "text": "zero is exactly the amount of configuration I want" }),
|
||||
// 1
|
||||
serde_json::json!({ "text": "zero bad configuration" }),
|
||||
// 2
|
||||
serde_json::json!({ "text": "zero configuration" }),
|
||||
// 3
|
||||
serde_json::json!({ "text": "zero config" }),
|
||||
// 4
|
||||
serde_json::json!({ "text": "zero conf" }),
|
||||
// 5
|
||||
serde_json::json!({ "text": "zero bad conf" }),
|
||||
] {
|
||||
documents.append_json_object(doc.as_object().unwrap()).unwrap();
|
||||
}
|
||||
for doc in documents_with_enough_different_words_for_prefixes(&["conf"]) {
|
||||
documents.append_json_object(&doc).unwrap();
|
||||
}
|
||||
let documents =
|
||||
DocumentsBatchReader::from_reader(Cursor::new(documents.into_inner().unwrap()))
|
||||
.unwrap();
|
||||
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let SearchResult { matching_words: _, candidates: _, documents_ids } = index
|
||||
.search(&rtxn)
|
||||
.query("zero c")
|
||||
.criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
|
||||
.execute()
|
||||
.unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]");
|
||||
|
||||
let SearchResult { matching_words: _, candidates: _, documents_ids } = index
|
||||
.search(&rtxn)
|
||||
.query("zero co")
|
||||
.criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
|
||||
.execute()
|
||||
.unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]");
|
||||
|
||||
let SearchResult { matching_words: _, candidates: _, documents_ids } = index
|
||||
.search(&rtxn)
|
||||
.query("zero con")
|
||||
.criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
|
||||
.execute()
|
||||
.unwrap();
|
||||
// Here searh results are degraded because `con` is in the prefix cache but it is too
|
||||
// long to be stored in the prefix proximity databases, and we don't want to iterate over
|
||||
// all of its word derivations
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]");
|
||||
|
||||
let SearchResult { matching_words: _, candidates: _, documents_ids } = index
|
||||
.search(&rtxn)
|
||||
.criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
|
||||
.query("zero conf")
|
||||
.execute()
|
||||
.unwrap();
|
||||
// Here search results are degraded as well, but we can still rank correctly documents
|
||||
// that contain `conf` exactly, and not as a prefix.
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 2, 3]");
|
||||
|
||||
let SearchResult { matching_words: _, candidates: _, documents_ids } = index
|
||||
.search(&rtxn)
|
||||
.criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
|
||||
.query("zero config")
|
||||
.execute()
|
||||
.unwrap();
|
||||
// `config` is not a common prefix, so the normal methods are used
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 4, 5]");
|
||||
}
|
||||
}
|
||||
@@ -1,493 +0,0 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::mem::take;
|
||||
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{
|
||||
query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters,
|
||||
CriterionResult,
|
||||
};
|
||||
use crate::search::criteria::{resolve_phrase, InitialCandidates};
|
||||
use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
|
||||
use crate::search::{word_derivations, WordDerivationsCache};
|
||||
use crate::Result;
|
||||
|
||||
/// Maximum number of typo for a word of any length.
|
||||
const MAX_TYPOS_PER_WORD: u8 = 2;
|
||||
|
||||
pub struct Typo<'t> {
|
||||
ctx: &'t dyn Context<'t>,
|
||||
/// (max_typos, query_tree, candidates)
|
||||
state: Option<(u8, Operation, Candidates)>,
|
||||
typos: u8,
|
||||
initial_candidates: Option<InitialCandidates>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
|
||||
}
|
||||
|
||||
impl<'t> Typo<'t> {
|
||||
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
|
||||
Typo {
|
||||
ctx,
|
||||
state: None,
|
||||
typos: 0,
|
||||
initial_candidates: None,
|
||||
parent,
|
||||
candidates_cache: HashMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Criterion for Typo<'t> {
|
||||
#[logging_timer::time("Typo::{}")]
|
||||
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
|
||||
use Candidates::{Allowed, Forbidden};
|
||||
// remove excluded candidates when next is called, instead of doing it in the loop.
|
||||
match self.state.as_mut() {
|
||||
Some((_, _, Allowed(candidates))) => *candidates -= params.excluded_candidates,
|
||||
Some((_, _, Forbidden(candidates))) => *candidates |= params.excluded_candidates,
|
||||
None => (),
|
||||
}
|
||||
|
||||
loop {
|
||||
debug!(
|
||||
"Typo at iteration {} (max typos {:?}) ({:?})",
|
||||
self.typos,
|
||||
self.state.as_ref().map(|(mt, _, _)| mt),
|
||||
self.state.as_ref().map(|(_, _, cd)| cd),
|
||||
);
|
||||
|
||||
match self.state.as_mut() {
|
||||
Some((max_typos, _, _)) if self.typos > *max_typos => {
|
||||
self.state = None; // reset state
|
||||
}
|
||||
Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => {
|
||||
self.state = None; // reset state
|
||||
}
|
||||
Some((_, query_tree, candidates_authorization)) => {
|
||||
let fst = self.ctx.words_fst();
|
||||
let new_query_tree = match self.typos {
|
||||
typos if typos < MAX_TYPOS_PER_WORD => alterate_query_tree(
|
||||
fst,
|
||||
query_tree.clone(),
|
||||
self.typos,
|
||||
params.wdcache,
|
||||
)?,
|
||||
MAX_TYPOS_PER_WORD => {
|
||||
// When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible,
|
||||
// we keep the altered query tree
|
||||
*query_tree = alterate_query_tree(
|
||||
fst,
|
||||
query_tree.clone(),
|
||||
self.typos,
|
||||
params.wdcache,
|
||||
)?;
|
||||
// we compute the allowed candidates
|
||||
let query_tree_allowed_candidates =
|
||||
resolve_query_tree(self.ctx, query_tree, params.wdcache)?;
|
||||
// we assign the allowed candidates to the candidates authorization.
|
||||
*candidates_authorization = match take(candidates_authorization) {
|
||||
Allowed(allowed_candidates) => {
|
||||
Allowed(query_tree_allowed_candidates & allowed_candidates)
|
||||
}
|
||||
Forbidden(forbidden_candidates) => {
|
||||
Allowed(query_tree_allowed_candidates - forbidden_candidates)
|
||||
}
|
||||
};
|
||||
query_tree.clone()
|
||||
}
|
||||
_otherwise => query_tree.clone(),
|
||||
};
|
||||
|
||||
let mut candidates = resolve_candidates(
|
||||
self.ctx,
|
||||
&new_query_tree,
|
||||
self.typos,
|
||||
&mut self.candidates_cache,
|
||||
params.wdcache,
|
||||
)?;
|
||||
|
||||
match candidates_authorization {
|
||||
Allowed(allowed_candidates) => {
|
||||
candidates &= &*allowed_candidates;
|
||||
*allowed_candidates -= &candidates;
|
||||
}
|
||||
Forbidden(forbidden_candidates) => {
|
||||
candidates -= &*forbidden_candidates;
|
||||
*forbidden_candidates |= &candidates;
|
||||
}
|
||||
}
|
||||
|
||||
let initial_candidates = match self.initial_candidates.as_mut() {
|
||||
Some(initial_candidates) => initial_candidates.take(),
|
||||
None => InitialCandidates::Estimated(candidates.clone()),
|
||||
};
|
||||
|
||||
self.typos += 1;
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(new_query_tree),
|
||||
candidates: Some(candidates),
|
||||
filtered_candidates: None,
|
||||
initial_candidates: Some(initial_candidates),
|
||||
}));
|
||||
}
|
||||
None => match self.parent.next(params)? {
|
||||
Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}) => {
|
||||
self.initial_candidates =
|
||||
match (self.initial_candidates.take(), initial_candidates) {
|
||||
(Some(self_ic), Some(parent_ic)) => Some(self_ic | parent_ic),
|
||||
(self_ic, parent_ic) => self_ic.or(parent_ic),
|
||||
};
|
||||
|
||||
let candidates = match candidates.or(filtered_candidates) {
|
||||
Some(candidates) => {
|
||||
Candidates::Allowed(candidates - params.excluded_candidates)
|
||||
}
|
||||
None => Candidates::Forbidden(params.excluded_candidates.clone()),
|
||||
};
|
||||
|
||||
let maximum_typos = maximum_typo(&query_tree) as u8;
|
||||
self.state = Some((maximum_typos, query_tree, candidates));
|
||||
self.typos = 0;
|
||||
}
|
||||
Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}));
|
||||
}
|
||||
None => return Ok(None),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Modify the query tree by replacing every tolerant query by an Or operation
|
||||
/// containing all of the corresponding exact words in the words FST. Each tolerant
|
||||
/// query will only be replaced by exact query with up to `number_typos` maximum typos.
|
||||
fn alterate_query_tree(
|
||||
words_fst: &fst::Set<Cow<[u8]>>,
|
||||
mut query_tree: Operation,
|
||||
number_typos: u8,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<Operation> {
|
||||
fn recurse(
|
||||
words_fst: &fst::Set<Cow<[u8]>>,
|
||||
operation: &mut Operation,
|
||||
number_typos: u8,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<()> {
|
||||
use Operation::{And, Or, Phrase};
|
||||
|
||||
match operation {
|
||||
And(ops) | Or(_, ops) => {
|
||||
ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache))
|
||||
}
|
||||
// Because Phrases don't allow typos, no alteration can be done.
|
||||
Phrase(_words) => Ok(()),
|
||||
Operation::Query(q) => {
|
||||
if let QueryKind::Tolerant { typo, word } = &q.kind {
|
||||
// if no typo is allowed we don't call word_derivations function,
|
||||
// and directly create an Exact query
|
||||
if number_typos == 0 {
|
||||
*operation = Operation::Query(Query {
|
||||
prefix: q.prefix,
|
||||
kind: QueryKind::Exact { original_typo: 0, word: word.clone() },
|
||||
});
|
||||
} else {
|
||||
let typo = *typo.min(&number_typos);
|
||||
let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?;
|
||||
let queries = words
|
||||
.iter()
|
||||
.map(|(word, typo)| {
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::Exact {
|
||||
original_typo: *typo,
|
||||
word: word.to_string(),
|
||||
},
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
*operation = Operation::or(false, queries);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
recurse(words_fst, &mut query_tree, number_typos, wdcache)?;
|
||||
Ok(query_tree)
|
||||
}
|
||||
|
||||
fn resolve_candidates(
|
||||
ctx: &dyn Context,
|
||||
query_tree: &Operation,
|
||||
number_typos: u8,
|
||||
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<RoaringBitmap> {
|
||||
fn resolve_operation(
|
||||
ctx: &dyn Context,
|
||||
query_tree: &Operation,
|
||||
number_typos: u8,
|
||||
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<RoaringBitmap> {
|
||||
use Operation::{And, Or, Phrase, Query};
|
||||
|
||||
match query_tree {
|
||||
And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache),
|
||||
Phrase(words) => resolve_phrase(ctx, words),
|
||||
Or(_, ops) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
for op in ops {
|
||||
let docids = resolve_operation(ctx, op, number_typos, cache, wdcache)?;
|
||||
candidates |= docids;
|
||||
}
|
||||
Ok(candidates)
|
||||
}
|
||||
Query(q) => {
|
||||
if q.kind.typo() == number_typos {
|
||||
Ok(query_docids(ctx, q, wdcache)?)
|
||||
} else {
|
||||
Ok(RoaringBitmap::new())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn mdfs(
|
||||
ctx: &dyn Context,
|
||||
branches: &[Operation],
|
||||
mana: u8,
|
||||
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
|
||||
wdcache: &mut WordDerivationsCache,
|
||||
) -> Result<RoaringBitmap> {
|
||||
match branches.split_first() {
|
||||
Some((head, [])) => {
|
||||
let cache_key = (head.clone(), mana);
|
||||
if let Some(candidates) = cache.get(&cache_key) {
|
||||
Ok(candidates.clone())
|
||||
} else {
|
||||
let candidates = resolve_operation(ctx, head, mana, cache, wdcache)?;
|
||||
cache.insert(cache_key, candidates.clone());
|
||||
Ok(candidates)
|
||||
}
|
||||
}
|
||||
Some((head, tail)) => {
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
|
||||
for m in 0..=mana {
|
||||
let mut head_candidates = {
|
||||
let cache_key = (head.clone(), m);
|
||||
if let Some(candidates) = cache.get(&cache_key) {
|
||||
candidates.clone()
|
||||
} else {
|
||||
let candidates = resolve_operation(ctx, head, m, cache, wdcache)?;
|
||||
cache.insert(cache_key, candidates.clone());
|
||||
candidates
|
||||
}
|
||||
};
|
||||
if !head_candidates.is_empty() {
|
||||
let tail_candidates = mdfs(ctx, tail, mana - m, cache, wdcache)?;
|
||||
head_candidates &= tail_candidates;
|
||||
candidates |= head_candidates;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(candidates)
|
||||
}
|
||||
None => Ok(RoaringBitmap::new()),
|
||||
}
|
||||
}
|
||||
|
||||
resolve_operation(ctx, query_tree, number_typos, cache, wdcache)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::super::initial::Initial;
|
||||
use super::super::test::TestContext;
|
||||
use super::*;
|
||||
use crate::search::NoopDistinct;
|
||||
|
||||
fn display_criteria(mut criteria: Typo, mut parameters: CriterionParameters) -> String {
|
||||
let mut result = String::new();
|
||||
while let Some(criterion) = criteria.next(&mut parameters).unwrap() {
|
||||
result.push_str(&format!("{criterion:?}\n\n"));
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn initial_placeholder_no_facets() {
|
||||
let context = TestContext::default();
|
||||
let query_tree = None;
|
||||
let facet_candidates = None;
|
||||
|
||||
let criterion_parameters = CriterionParameters {
|
||||
wdcache: &mut WordDerivationsCache::new(),
|
||||
excluded_candidates: &RoaringBitmap::new(),
|
||||
};
|
||||
|
||||
let parent =
|
||||
Initial::<NoopDistinct>::new(&context, query_tree, facet_candidates, false, None);
|
||||
let criteria = Typo::new(&context, Box::new(parent));
|
||||
|
||||
let result = display_criteria(criteria, criterion_parameters);
|
||||
insta::assert_snapshot!(result, @r###"
|
||||
CriterionResult { query_tree: None, candidates: None, filtered_candidates: None, initial_candidates: None }
|
||||
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn initial_query_tree_no_facets() {
|
||||
let context = TestContext::default();
|
||||
let query_tree = Operation::Or(
|
||||
false,
|
||||
vec![Operation::And(vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("split".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("this".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::tolerant(1, "world".to_string()),
|
||||
}),
|
||||
])],
|
||||
);
|
||||
|
||||
let facet_candidates = None;
|
||||
|
||||
let criterion_parameters = CriterionParameters {
|
||||
wdcache: &mut WordDerivationsCache::new(),
|
||||
excluded_candidates: &RoaringBitmap::new(),
|
||||
};
|
||||
let parent =
|
||||
Initial::<NoopDistinct>::new(&context, Some(query_tree), facet_candidates, false, None);
|
||||
let criteria = Typo::new(&context, Box::new(parent));
|
||||
|
||||
let result = display_criteria(criteria, criterion_parameters);
|
||||
insta::assert_snapshot!(result, @r###"
|
||||
CriterionResult { query_tree: Some(OR
|
||||
AND
|
||||
Exact { word: "split" }
|
||||
Exact { word: "this" }
|
||||
Exact { word: "world" }
|
||||
), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) }
|
||||
|
||||
CriterionResult { query_tree: Some(OR
|
||||
AND
|
||||
Exact { word: "split" }
|
||||
Exact { word: "this" }
|
||||
OR
|
||||
Exact { word: "word" }
|
||||
Exact { word: "world" }
|
||||
), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) }
|
||||
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn initial_placeholder_with_facets() {
|
||||
let context = TestContext::default();
|
||||
let query_tree = None;
|
||||
let facet_candidates = context.word_docids("earth").unwrap().unwrap();
|
||||
|
||||
let criterion_parameters = CriterionParameters {
|
||||
wdcache: &mut WordDerivationsCache::new(),
|
||||
excluded_candidates: &RoaringBitmap::new(),
|
||||
};
|
||||
let parent =
|
||||
Initial::<NoopDistinct>::new(&context, query_tree, Some(facet_candidates), false, None);
|
||||
let criteria = Typo::new(&context, Box::new(parent));
|
||||
|
||||
let result = display_criteria(criteria, criterion_parameters);
|
||||
insta::assert_snapshot!(result, @r###"
|
||||
CriterionResult { query_tree: None, candidates: None, filtered_candidates: Some(RoaringBitmap<8000 values between 986424 and 4294786076>), initial_candidates: None }
|
||||
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn initial_query_tree_with_facets() {
|
||||
let context = TestContext::default();
|
||||
let query_tree = Operation::Or(
|
||||
false,
|
||||
vec![Operation::And(vec![
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("split".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::exact("this".to_string()),
|
||||
}),
|
||||
Operation::Query(Query {
|
||||
prefix: false,
|
||||
kind: QueryKind::tolerant(1, "world".to_string()),
|
||||
}),
|
||||
])],
|
||||
);
|
||||
|
||||
let facet_candidates = context.word_docids("earth").unwrap().unwrap();
|
||||
|
||||
let criterion_parameters = CriterionParameters {
|
||||
wdcache: &mut WordDerivationsCache::new(),
|
||||
excluded_candidates: &RoaringBitmap::new(),
|
||||
};
|
||||
let parent = Initial::<NoopDistinct>::new(
|
||||
&context,
|
||||
Some(query_tree),
|
||||
Some(facet_candidates),
|
||||
false,
|
||||
None,
|
||||
);
|
||||
let criteria = Typo::new(&context, Box::new(parent));
|
||||
|
||||
let result = display_criteria(criteria, criterion_parameters);
|
||||
insta::assert_snapshot!(result, @r###"
|
||||
CriterionResult { query_tree: Some(OR
|
||||
AND
|
||||
Exact { word: "split" }
|
||||
Exact { word: "this" }
|
||||
Exact { word: "world" }
|
||||
), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) }
|
||||
|
||||
CriterionResult { query_tree: Some(OR
|
||||
AND
|
||||
Exact { word: "split" }
|
||||
Exact { word: "this" }
|
||||
OR
|
||||
Exact { word: "word" }
|
||||
Exact { word: "world" }
|
||||
), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) }
|
||||
|
||||
"###);
|
||||
}
|
||||
}
|
||||
@@ -1,106 +0,0 @@
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
|
||||
use crate::search::criteria::InitialCandidates;
|
||||
use crate::search::query_tree::Operation;
|
||||
use crate::Result;
|
||||
|
||||
pub struct Words<'t> {
|
||||
ctx: &'t dyn Context<'t>,
|
||||
query_trees: Vec<Operation>,
|
||||
candidates: Option<RoaringBitmap>,
|
||||
initial_candidates: Option<InitialCandidates>,
|
||||
filtered_candidates: Option<RoaringBitmap>,
|
||||
parent: Box<dyn Criterion + 't>,
|
||||
}
|
||||
|
||||
impl<'t> Words<'t> {
|
||||
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
|
||||
Words {
|
||||
ctx,
|
||||
query_trees: Vec::default(),
|
||||
candidates: None,
|
||||
initial_candidates: None,
|
||||
parent,
|
||||
filtered_candidates: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Criterion for Words<'t> {
|
||||
#[logging_timer::time("Words::{}")]
|
||||
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
|
||||
// remove excluded candidates when next is called, instead of doing it in the loop.
|
||||
if let Some(candidates) = self.candidates.as_mut() {
|
||||
*candidates -= params.excluded_candidates;
|
||||
}
|
||||
|
||||
loop {
|
||||
debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates);
|
||||
|
||||
match self.query_trees.pop() {
|
||||
Some(query_tree) => {
|
||||
let candidates = match self.candidates.as_mut() {
|
||||
Some(allowed_candidates) => {
|
||||
let mut candidates =
|
||||
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?;
|
||||
candidates &= &*allowed_candidates;
|
||||
*allowed_candidates -= &candidates;
|
||||
Some(candidates)
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
let initial_candidates = self.initial_candidates.clone();
|
||||
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates,
|
||||
filtered_candidates: self.filtered_candidates.clone(),
|
||||
initial_candidates,
|
||||
}));
|
||||
}
|
||||
None => match self.parent.next(params)? {
|
||||
Some(CriterionResult {
|
||||
query_tree: Some(query_tree),
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}) => {
|
||||
self.query_trees = explode_query_tree(query_tree);
|
||||
self.candidates = candidates;
|
||||
self.filtered_candidates = filtered_candidates;
|
||||
|
||||
self.initial_candidates =
|
||||
match (self.initial_candidates.take(), initial_candidates) {
|
||||
(Some(self_ic), Some(parent_ic)) => Some(self_ic | parent_ic),
|
||||
(self_ic, parent_ic) => self_ic.or(parent_ic),
|
||||
};
|
||||
}
|
||||
Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}) => {
|
||||
return Ok(Some(CriterionResult {
|
||||
query_tree: None,
|
||||
candidates,
|
||||
filtered_candidates,
|
||||
initial_candidates,
|
||||
}));
|
||||
}
|
||||
None => return Ok(None),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn explode_query_tree(query_tree: Operation) -> Vec<Operation> {
|
||||
match query_tree {
|
||||
Operation::Or(true, ops) => ops,
|
||||
otherwise => vec![otherwise],
|
||||
}
|
||||
}
|
||||
@@ -1,218 +0,0 @@
|
||||
use std::mem::size_of;
|
||||
|
||||
use concat_arrays::concat_arrays;
|
||||
use heed::types::{ByteSlice, Str, Unit};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{Distinct, DocIter};
|
||||
use crate::error::InternalError;
|
||||
use crate::heed_codec::facet::{FacetGroupKey, *};
|
||||
use crate::index::db_name;
|
||||
use crate::{DocumentId, FieldId, Index, Result};
|
||||
|
||||
const FID_SIZE: usize = size_of::<FieldId>();
|
||||
const DOCID_SIZE: usize = size_of::<DocumentId>();
|
||||
|
||||
/// A distinct implementer that is backed by facets.
|
||||
///
|
||||
/// On each iteration, the facet values for the
|
||||
/// distinct attribute of the first document are retrieved. The document ids for these facet values
|
||||
/// are then retrieved and taken out of the the candidate and added to the excluded set. We take
|
||||
/// care to keep the document we are currently on, and remove it from the excluded list. The next
|
||||
/// iterations will never contain any occurence of a document with the same distinct value as a
|
||||
/// document from previous iterations.
|
||||
#[derive(Clone)]
|
||||
pub struct FacetDistinct<'a> {
|
||||
distinct: FieldId,
|
||||
index: &'a Index,
|
||||
txn: &'a heed::RoTxn<'a>,
|
||||
}
|
||||
|
||||
impl<'a> FacetDistinct<'a> {
|
||||
pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self {
|
||||
Self { distinct, index, txn }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FacetDistinctIter<'a> {
|
||||
candidates: RoaringBitmap,
|
||||
distinct: FieldId,
|
||||
excluded: RoaringBitmap,
|
||||
index: &'a Index,
|
||||
iter_offset: usize,
|
||||
txn: &'a heed::RoTxn<'a>,
|
||||
}
|
||||
|
||||
impl<'a> FacetDistinctIter<'a> {
|
||||
fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
|
||||
self.index
|
||||
.facet_id_string_docids
|
||||
.get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key })
|
||||
.map(|opt| opt.map(|v| v.bitmap))
|
||||
}
|
||||
|
||||
fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
|
||||
// get facet docids on level 0
|
||||
self.index
|
||||
.facet_id_f64_docids
|
||||
.get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key })
|
||||
.map(|opt| opt.map(|v| v.bitmap))
|
||||
}
|
||||
|
||||
fn distinct_string(&mut self, id: DocumentId) -> Result<()> {
|
||||
let iter = facet_string_values(id, self.distinct, self.index, self.txn)?;
|
||||
|
||||
for item in iter {
|
||||
let ((_, _, value), _) = item?;
|
||||
let facet_docids =
|
||||
self.facet_string_docids(value)?.ok_or(InternalError::DatabaseMissingEntry {
|
||||
db_name: db_name::FACET_ID_STRING_DOCIDS,
|
||||
key: None,
|
||||
})?;
|
||||
self.excluded |= facet_docids;
|
||||
}
|
||||
|
||||
self.excluded.remove(id);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn distinct_number(&mut self, id: DocumentId) -> Result<()> {
|
||||
let iter = facet_number_values(id, self.distinct, self.index, self.txn)?;
|
||||
|
||||
for item in iter {
|
||||
let ((_, _, value), _) = item?;
|
||||
let facet_docids =
|
||||
self.facet_number_docids(value)?.ok_or(InternalError::DatabaseMissingEntry {
|
||||
db_name: db_name::FACET_ID_F64_DOCIDS,
|
||||
key: None,
|
||||
})?;
|
||||
self.excluded |= facet_docids;
|
||||
}
|
||||
|
||||
self.excluded.remove(id);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Performs the next iteration of the facet distinct. This is a convenience method that is
|
||||
/// called by the Iterator::next implementation that transposes the result. It makes error
|
||||
/// handling easier.
|
||||
fn next_inner(&mut self) -> Result<Option<DocumentId>> {
|
||||
// The first step is to remove all the excluded documents from our candidates
|
||||
self.candidates -= &self.excluded;
|
||||
|
||||
let mut candidates_iter = self.candidates.iter().skip(self.iter_offset);
|
||||
match candidates_iter.next() {
|
||||
Some(id) => {
|
||||
// We distinct the document id on its facet strings and facet numbers.
|
||||
self.distinct_string(id)?;
|
||||
self.distinct_number(id)?;
|
||||
|
||||
// The first document of each iteration is kept, since the next call to
|
||||
// `difference_with` will filter out all the documents for that facet value. By
|
||||
// increasing the offset we make sure to get the first valid value for the next
|
||||
// distinct document to keep.
|
||||
self.iter_offset += 1;
|
||||
|
||||
Ok(Some(id))
|
||||
}
|
||||
// no more candidate at this offset, return.
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] {
|
||||
concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
|
||||
}
|
||||
|
||||
fn facet_number_values<'a>(
|
||||
id: DocumentId,
|
||||
distinct: FieldId,
|
||||
index: &Index,
|
||||
txn: &'a heed::RoTxn,
|
||||
) -> Result<heed::RoPrefix<'a, FieldDocIdFacetF64Codec, Unit>> {
|
||||
let key = facet_values_prefix_key(distinct, id);
|
||||
|
||||
let iter = index
|
||||
.field_id_docid_facet_f64s
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(txn, &key)?
|
||||
.remap_key_type::<FieldDocIdFacetF64Codec>();
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
||||
fn facet_string_values<'a>(
|
||||
id: DocumentId,
|
||||
distinct: FieldId,
|
||||
index: &Index,
|
||||
txn: &'a heed::RoTxn,
|
||||
) -> Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, Str>> {
|
||||
let key = facet_values_prefix_key(distinct, id);
|
||||
|
||||
let iter = index
|
||||
.field_id_docid_facet_strings
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(txn, &key)?
|
||||
.remap_types::<FieldDocIdFacetStringCodec, Str>();
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
||||
impl Iterator for FacetDistinctIter<'_> {
|
||||
type Item = Result<DocumentId>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.next_inner().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
impl DocIter for FacetDistinctIter<'_> {
|
||||
fn into_excluded(self) -> RoaringBitmap {
|
||||
self.excluded
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Distinct for FacetDistinct<'a> {
|
||||
type Iter = FacetDistinctIter<'a>;
|
||||
|
||||
fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
|
||||
FacetDistinctIter {
|
||||
candidates,
|
||||
distinct: self.distinct,
|
||||
excluded,
|
||||
index: self.index,
|
||||
iter_offset: 0,
|
||||
txn: self.txn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::super::test::{generate_index, validate_distinct_candidates};
|
||||
use super::*;
|
||||
|
||||
macro_rules! test_facet_distinct {
|
||||
($name:ident, $distinct:literal) => {
|
||||
#[test]
|
||||
fn $name() {
|
||||
let (index, fid, candidates) = generate_index($distinct);
|
||||
let txn = index.read_txn().unwrap();
|
||||
let mut map_distinct = FacetDistinct::new(fid, &index, &txn);
|
||||
let excluded = RoaringBitmap::new();
|
||||
let mut iter = map_distinct.distinct(candidates.clone(), excluded);
|
||||
let count = validate_distinct_candidates(iter.by_ref(), fid, &index);
|
||||
let excluded = iter.into_excluded();
|
||||
assert_eq!(count as u64 + excluded.len(), candidates.len());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
test_facet_distinct!(test_string, "txt");
|
||||
test_facet_distinct!(test_strings, "txts");
|
||||
test_facet_distinct!(test_number, "cat-int");
|
||||
}
|
||||
@@ -1,155 +0,0 @@
|
||||
mod facet_distinct;
|
||||
mod noop_distinct;
|
||||
|
||||
pub use facet_distinct::FacetDistinct;
|
||||
pub use noop_distinct::NoopDistinct;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{DocumentId, Result};
|
||||
|
||||
/// A trait implemented by document interators that are returned by calls to `Distinct::distinct`.
|
||||
/// It provides a way to get back the ownership to the excluded set.
|
||||
pub trait DocIter: Iterator<Item = Result<DocumentId>> {
|
||||
/// Returns ownership on the internal exluded set.
|
||||
fn into_excluded(self) -> RoaringBitmap;
|
||||
}
|
||||
|
||||
/// A trait that is implemented by structs that perform a distinct on `candidates`. Calling distinct
|
||||
/// must return an iterator containing only distinct documents, and add the discarded documents to
|
||||
/// the excluded set. The excluded set can later be retrieved by calling `DocIter::excluded` on the
|
||||
/// returned iterator.
|
||||
pub trait Distinct {
|
||||
type Iter: DocIter;
|
||||
|
||||
fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter;
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::collections::HashSet;
|
||||
use std::io::Cursor;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::Rng;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::index::Index;
|
||||
use crate::update::{
|
||||
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
|
||||
};
|
||||
use crate::{DocumentId, FieldId, BEU32};
|
||||
|
||||
static JSON: Lazy<Vec<u8>> = Lazy::new(|| {
|
||||
let mut rng = rand::thread_rng();
|
||||
let num_docs = rng.gen_range(10..30);
|
||||
|
||||
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||
let txts = ["Toto", "Titi", "Tata"];
|
||||
let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
|
||||
let cat_ints = (1..10).collect::<Vec<_>>();
|
||||
|
||||
for i in 0..num_docs {
|
||||
let txt = txts.choose(&mut rng).unwrap();
|
||||
let mut sample_txts = cats.clone();
|
||||
sample_txts.shuffle(&mut rng);
|
||||
|
||||
let mut sample_ints = cat_ints.clone();
|
||||
sample_ints.shuffle(&mut rng);
|
||||
|
||||
let json = json!({
|
||||
"id": i,
|
||||
"txt": txt,
|
||||
"cat-int": rng.gen_range(0..3),
|
||||
"txts": sample_txts[..(rng.gen_range(0..3))],
|
||||
"cat-ints": sample_ints[..(rng.gen_range(0..3))],
|
||||
});
|
||||
|
||||
let object = match json {
|
||||
Value::Object(object) => object,
|
||||
_ => panic!(),
|
||||
};
|
||||
|
||||
builder.append_json_object(&object).unwrap();
|
||||
}
|
||||
|
||||
builder.into_inner().unwrap()
|
||||
});
|
||||
|
||||
/// Returns a temporary index populated with random test documents, the FieldId for the
|
||||
/// distinct attribute, and the RoaringBitmap with the document ids.
|
||||
pub(crate) fn generate_index(distinct: &str) -> (TempIndex, FieldId, RoaringBitmap) {
|
||||
let index = TempIndex::new();
|
||||
let mut txn = index.write_txn().unwrap();
|
||||
|
||||
// set distinct and faceted attributes for the index.
|
||||
let config = IndexerConfig::default();
|
||||
let mut update = Settings::new(&mut txn, &index, &config);
|
||||
update.set_distinct_field(distinct.to_string());
|
||||
update.execute(|_| (), || false).unwrap();
|
||||
|
||||
// add documents to the index
|
||||
let config = IndexerConfig::default();
|
||||
let indexing_config = IndexDocumentsConfig {
|
||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
..Default::default()
|
||||
};
|
||||
let addition =
|
||||
IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| (), || false)
|
||||
.unwrap();
|
||||
|
||||
let reader =
|
||||
crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice()))
|
||||
.unwrap();
|
||||
|
||||
let (addition, user_error) = addition.add_documents(reader).unwrap();
|
||||
user_error.unwrap();
|
||||
addition.execute().unwrap();
|
||||
|
||||
let fields_map = index.fields_ids_map(&txn).unwrap();
|
||||
let fid = fields_map.id(distinct).unwrap();
|
||||
|
||||
let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap();
|
||||
let map = (0..documents.documents_count()).collect();
|
||||
|
||||
txn.commit().unwrap();
|
||||
|
||||
(index, fid, map)
|
||||
}
|
||||
|
||||
/// Checks that all the candidates are distinct, and returns the candidates number.
|
||||
pub(crate) fn validate_distinct_candidates(
|
||||
candidates: impl Iterator<Item = crate::Result<DocumentId>>,
|
||||
distinct: FieldId,
|
||||
index: &Index,
|
||||
) -> usize {
|
||||
fn test(seen: &mut HashSet<String>, value: &Value) {
|
||||
match value {
|
||||
Value::Null | Value::Object(_) | Value::Bool(_) => (),
|
||||
Value::Number(_) | Value::String(_) => {
|
||||
let s = value.to_string();
|
||||
assert!(seen.insert(s));
|
||||
}
|
||||
Value::Array(values) => values.iter().for_each(|value| test(seen, value)),
|
||||
}
|
||||
}
|
||||
|
||||
let mut seen = HashSet::<String>::new();
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
let mut count = 0;
|
||||
for candidate in candidates {
|
||||
count += 1;
|
||||
let candidate = candidate.unwrap();
|
||||
let id = BEU32::new(candidate);
|
||||
let document = index.documents.get(&txn, &id).unwrap().unwrap();
|
||||
let value = document.get(distinct).unwrap();
|
||||
let value = serde_json::from_slice(value).unwrap();
|
||||
test(&mut seen, &value);
|
||||
}
|
||||
count
|
||||
}
|
||||
}
|
||||
@@ -1,55 +0,0 @@
|
||||
use roaring::bitmap::IntoIter;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{Distinct, DocIter};
|
||||
use crate::{DocumentId, Result};
|
||||
|
||||
/// A distinct implementer that does not perform any distinct,
|
||||
/// and simply returns an iterator to the candidates.
|
||||
pub struct NoopDistinct;
|
||||
|
||||
pub struct NoopDistinctIter {
|
||||
candidates: IntoIter,
|
||||
excluded: RoaringBitmap,
|
||||
}
|
||||
|
||||
impl Iterator for NoopDistinctIter {
|
||||
type Item = Result<DocumentId>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.candidates.next().map(Ok)
|
||||
}
|
||||
}
|
||||
|
||||
impl DocIter for NoopDistinctIter {
|
||||
fn into_excluded(self) -> RoaringBitmap {
|
||||
self.excluded
|
||||
}
|
||||
}
|
||||
|
||||
impl Distinct for NoopDistinct {
|
||||
type Iter = NoopDistinctIter;
|
||||
|
||||
fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
|
||||
NoopDistinctIter { candidates: candidates.into_iter(), excluded }
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_noop() {
|
||||
let candidates = (1..10).collect();
|
||||
let excluded = RoaringBitmap::new();
|
||||
let mut iter = NoopDistinct.distinct(candidates, excluded);
|
||||
assert_eq!(
|
||||
iter.by_ref().map(Result::unwrap).collect::<Vec<_>>(),
|
||||
(1..10).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
let excluded = iter.into_excluded();
|
||||
assert!(excluded.is_empty());
|
||||
}
|
||||
}
|
||||
@@ -73,7 +73,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
|
||||
let distribution_prelength = distribution.len();
|
||||
let db = self.index.field_id_docid_facet_f64s;
|
||||
for docid in candidates.into_iter() {
|
||||
for docid in candidates {
|
||||
key_buffer.truncate(mem::size_of::<FieldId>());
|
||||
key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||
let iter = db
|
||||
@@ -97,7 +97,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();
|
||||
|
||||
let db = self.index.field_id_docid_facet_strings;
|
||||
'outer: for docid in candidates.into_iter() {
|
||||
'outer: for docid in candidates {
|
||||
key_buffer.truncate(mem::size_of::<FieldId>());
|
||||
key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||
let iter = db
|
||||
@@ -309,7 +309,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
let mut distribution = BTreeMap::new();
|
||||
for (fid, name) in fields_ids_map.iter() {
|
||||
if crate::is_faceted(name, &fields) {
|
||||
let min_value = if let Some(min_value) = crate::search::criteria::facet_min_value(
|
||||
let min_value = if let Some(min_value) = crate::search::facet::facet_min_value(
|
||||
self.index,
|
||||
self.rtxn,
|
||||
fid,
|
||||
@@ -319,7 +319,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
let max_value = if let Some(max_value) = crate::search::criteria::facet_max_value(
|
||||
let max_value = if let Some(max_value) = crate::search::facet::facet_max_value(
|
||||
self.index,
|
||||
self.rtxn,
|
||||
fid,
|
||||
@@ -505,7 +505,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((0..10_000).into_iter().collect())
|
||||
.candidates((0..10_000).collect())
|
||||
.execute()
|
||||
.unwrap();
|
||||
|
||||
@@ -513,7 +513,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((0..5_000).into_iter().collect())
|
||||
.candidates((0..5_000).collect())
|
||||
.execute()
|
||||
.unwrap();
|
||||
|
||||
@@ -521,7 +521,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((0..5_000).into_iter().collect())
|
||||
.candidates((0..5_000).collect())
|
||||
.execute()
|
||||
.unwrap();
|
||||
|
||||
@@ -529,7 +529,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((0..5_000).into_iter().collect())
|
||||
.candidates((0..5_000).collect())
|
||||
.max_values_per_facet(1)
|
||||
.execute()
|
||||
.unwrap();
|
||||
@@ -546,7 +546,7 @@ mod tests {
|
||||
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
|
||||
.unwrap();
|
||||
|
||||
let facet_values = (0..1000).into_iter().map(|x| format!("{x:x}")).collect::<Vec<_>>();
|
||||
let facet_values = (0..1000).map(|x| format!("{x:x}")).collect::<Vec<_>>();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..10_000 {
|
||||
@@ -582,7 +582,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((0..10_000).into_iter().collect())
|
||||
.candidates((0..10_000).collect())
|
||||
.execute()
|
||||
.unwrap();
|
||||
|
||||
@@ -590,7 +590,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((0..5_000).into_iter().collect())
|
||||
.candidates((0..5_000).collect())
|
||||
.execute()
|
||||
.unwrap();
|
||||
|
||||
@@ -606,7 +606,7 @@ mod tests {
|
||||
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
|
||||
.unwrap();
|
||||
|
||||
let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
|
||||
let facet_values = (0..1000).collect::<Vec<_>>();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..1000 {
|
||||
@@ -634,7 +634,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((0..1000).into_iter().collect())
|
||||
.candidates((0..1000).collect())
|
||||
.compute_stats()
|
||||
.unwrap();
|
||||
|
||||
@@ -642,7 +642,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((217..777).into_iter().collect())
|
||||
.candidates((217..777).collect())
|
||||
.compute_stats()
|
||||
.unwrap();
|
||||
|
||||
@@ -658,7 +658,7 @@ mod tests {
|
||||
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
|
||||
.unwrap();
|
||||
|
||||
let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
|
||||
let facet_values = (0..1000).collect::<Vec<_>>();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..1000 {
|
||||
@@ -686,7 +686,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((0..1000).into_iter().collect())
|
||||
.candidates((0..1000).collect())
|
||||
.compute_stats()
|
||||
.unwrap();
|
||||
|
||||
@@ -694,7 +694,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((217..777).into_iter().collect())
|
||||
.candidates((217..777).collect())
|
||||
.compute_stats()
|
||||
.unwrap();
|
||||
|
||||
@@ -710,7 +710,7 @@ mod tests {
|
||||
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
|
||||
.unwrap();
|
||||
|
||||
let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
|
||||
let facet_values = (0..1000).collect::<Vec<_>>();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..1000 {
|
||||
@@ -738,7 +738,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((0..1000).into_iter().collect())
|
||||
.candidates((0..1000).collect())
|
||||
.compute_stats()
|
||||
.unwrap();
|
||||
|
||||
@@ -746,7 +746,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((217..777).into_iter().collect())
|
||||
.candidates((217..777).collect())
|
||||
.compute_stats()
|
||||
.unwrap();
|
||||
|
||||
@@ -762,7 +762,7 @@ mod tests {
|
||||
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
|
||||
.unwrap();
|
||||
|
||||
let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
|
||||
let facet_values = (0..1000).collect::<Vec<_>>();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..1000 {
|
||||
@@ -794,7 +794,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((0..1000).into_iter().collect())
|
||||
.candidates((0..1000).collect())
|
||||
.compute_stats()
|
||||
.unwrap();
|
||||
|
||||
@@ -802,7 +802,7 @@ mod tests {
|
||||
|
||||
let map = FacetDistribution::new(&txn, &index)
|
||||
.facets(std::iter::once("colour"))
|
||||
.candidates((217..777).into_iter().collect())
|
||||
.candidates((217..777).collect())
|
||||
.compute_stats()
|
||||
.unwrap();
|
||||
|
||||
|
||||
@@ -142,7 +142,7 @@ mod tests {
|
||||
let indexes = [get_simple_index(), get_random_looking_index()];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = (0..=255).into_iter().collect::<RoaringBitmap>();
|
||||
let candidates = (0..=255).collect::<RoaringBitmap>();
|
||||
let mut results = String::new();
|
||||
iterate_over_facet_distribution(
|
||||
&txn,
|
||||
@@ -166,7 +166,7 @@ mod tests {
|
||||
let indexes = [get_simple_index(), get_random_looking_index()];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = (0..=255).into_iter().collect::<RoaringBitmap>();
|
||||
let candidates = (0..=255).collect::<RoaringBitmap>();
|
||||
let mut results = String::new();
|
||||
let mut nbr_facets = 0;
|
||||
iterate_over_facet_distribution(
|
||||
|
||||
@@ -410,7 +410,7 @@ mod tests {
|
||||
|
||||
let mut results = String::new();
|
||||
|
||||
for i in (0..=255).into_iter().rev() {
|
||||
for i in (0..=255).rev() {
|
||||
let i = i as f64;
|
||||
let start = Bound::Included(i);
|
||||
let end = Bound::Included(255.);
|
||||
@@ -431,7 +431,7 @@ mod tests {
|
||||
|
||||
let mut results = String::new();
|
||||
|
||||
for i in (0..=255).into_iter().rev() {
|
||||
for i in (0..=255).rev() {
|
||||
let i = i as f64;
|
||||
let start = Bound::Excluded(i);
|
||||
let end = Bound::Excluded(255.);
|
||||
@@ -466,7 +466,7 @@ mod tests {
|
||||
|
||||
let mut results = String::new();
|
||||
|
||||
for i in (0..=128).into_iter().rev() {
|
||||
for i in (0..=128).rev() {
|
||||
let i = i as f64;
|
||||
let start = Bound::Included(i);
|
||||
let end = Bound::Included(255. - i);
|
||||
@@ -491,7 +491,7 @@ mod tests {
|
||||
|
||||
let mut results = String::new();
|
||||
|
||||
for i in (0..=128).into_iter().rev() {
|
||||
for i in (0..=128).rev() {
|
||||
let i = i as f64;
|
||||
let start = Bound::Excluded(i);
|
||||
let end = Bound::Excluded(255. - i);
|
||||
|
||||
@@ -132,7 +132,7 @@ mod tests {
|
||||
let indexes = [get_simple_index(), get_random_looking_index()];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
|
||||
let candidates = (200..=300).collect::<RoaringBitmap>();
|
||||
let mut results = String::new();
|
||||
let iter = ascending_facet_sort(&txn, index.content, 0, candidates).unwrap();
|
||||
for el in iter {
|
||||
@@ -154,7 +154,7 @@ mod tests {
|
||||
];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
|
||||
let candidates = (200..=300).collect::<RoaringBitmap>();
|
||||
let mut results = String::new();
|
||||
let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
|
||||
for el in iter {
|
||||
|
||||
@@ -142,7 +142,7 @@ mod tests {
|
||||
];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
|
||||
let candidates = (200..=300).collect::<RoaringBitmap>();
|
||||
let mut results = String::new();
|
||||
let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
|
||||
let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap();
|
||||
@@ -165,7 +165,7 @@ mod tests {
|
||||
];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
|
||||
let candidates = (200..=300).collect::<RoaringBitmap>();
|
||||
let mut results = String::new();
|
||||
let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
|
||||
let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap();
|
||||
|
||||
@@ -211,6 +211,14 @@ impl<'a> Filter<'a> {
|
||||
Condition::Between { from, to } => {
|
||||
(Included(from.parse_finite_float()?), Included(to.parse_finite_float()?))
|
||||
}
|
||||
Condition::Null => {
|
||||
let is_null = index.null_faceted_documents_ids(rtxn, field_id)?;
|
||||
return Ok(is_null);
|
||||
}
|
||||
Condition::Empty => {
|
||||
let is_empty = index.empty_faceted_documents_ids(rtxn, field_id)?;
|
||||
return Ok(is_empty);
|
||||
}
|
||||
Condition::Exists => {
|
||||
let exist = index.exists_faceted_documents_ids(rtxn, field_id)?;
|
||||
return Ok(exist);
|
||||
|
||||
@@ -2,11 +2,13 @@ pub use facet_sort_ascending::ascending_facet_sort;
|
||||
pub use facet_sort_descending::descending_facet_sort;
|
||||
use heed::types::{ByteSlice, DecodeIgnore};
|
||||
use heed::{BytesDecode, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET};
|
||||
pub use self::filter::{BadGeoError, Filter};
|
||||
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec};
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::{Index, Result};
|
||||
mod facet_distribution;
|
||||
mod facet_distribution_iter;
|
||||
mod facet_range_search;
|
||||
@@ -14,6 +16,38 @@ mod facet_sort_ascending;
|
||||
mod facet_sort_descending;
|
||||
mod filter;
|
||||
|
||||
fn facet_extreme_value<'t>(
|
||||
mut extreme_it: impl Iterator<Item = heed::Result<(RoaringBitmap, &'t [u8])>> + 't,
|
||||
) -> Result<Option<f64>> {
|
||||
let extreme_value =
|
||||
if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) };
|
||||
let (_, extreme_value) = extreme_value?;
|
||||
|
||||
Ok(OrderedF64Codec::bytes_decode(extreme_value))
|
||||
}
|
||||
|
||||
pub fn facet_min_value<'t>(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
field_id: u16,
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<Option<f64>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
|
||||
let it = ascending_facet_sort(rtxn, db, field_id, candidates)?;
|
||||
facet_extreme_value(it)
|
||||
}
|
||||
|
||||
pub fn facet_max_value<'t>(
|
||||
index: &'t Index,
|
||||
rtxn: &'t heed::RoTxn,
|
||||
field_id: u16,
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<Option<f64>> {
|
||||
let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
|
||||
let it = descending_facet_sort(rtxn, db, field_id, candidates)?;
|
||||
facet_extreme_value(it)
|
||||
}
|
||||
|
||||
/// Get the first facet value in the facet database
|
||||
pub(crate) fn get_first_facet_value<'t, BoundCodec>(
|
||||
txn: &'t RoTxn,
|
||||
|
||||
@@ -1,458 +0,0 @@
|
||||
use std::cmp::{min, Reverse};
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::ops::{Index, IndexMut};
|
||||
use std::rc::Rc;
|
||||
|
||||
use charabia::Token;
|
||||
use levenshtein_automata::{Distance, DFA};
|
||||
|
||||
use crate::error::InternalError;
|
||||
use crate::search::build_dfa;
|
||||
use crate::MAX_WORD_LENGTH;
|
||||
|
||||
type IsPrefix = bool;
|
||||
|
||||
/// Structure created from a query tree
|
||||
/// referencing words that match the given query tree.
|
||||
#[derive(Default)]
|
||||
pub struct MatchingWords {
|
||||
inner: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for MatchingWords {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
writeln!(f, "[")?;
|
||||
for (matching_words, primitive_word_id) in self.inner.iter() {
|
||||
writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?;
|
||||
}
|
||||
writeln!(f, "]")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl MatchingWords {
|
||||
pub fn new(
|
||||
mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
|
||||
) -> crate::Result<Self> {
|
||||
// if one of the matching_words vec doesn't contain a word.
|
||||
if matching_words.iter().any(|(mw, _)| mw.is_empty()) {
|
||||
return Err(InternalError::InvalidMatchingWords.into());
|
||||
}
|
||||
|
||||
// Sort word by len in DESC order prioritizing the longuest matches,
|
||||
// in order to highlight the longuest part of the matched word.
|
||||
matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
|
||||
|
||||
Ok(Self { inner: matching_words })
|
||||
}
|
||||
|
||||
/// Returns an iterator over terms that match or partially match the given token.
|
||||
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
|
||||
MatchesIter { inner: Box::new(self.inner.iter()), token }
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterator over terms that match the given token,
|
||||
/// This allow to lazily evaluate matches.
|
||||
pub struct MatchesIter<'a, 'b> {
|
||||
#[allow(clippy::type_complexity)]
|
||||
inner: Box<dyn Iterator<Item = &'a (Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)> + 'a>,
|
||||
token: &'b Token<'b>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for MatchesIter<'a, '_> {
|
||||
type Item = MatchType<'a>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.inner.next() {
|
||||
Some((matching_words, ids)) => match matching_words[0].match_token(self.token) {
|
||||
Some(char_len) => {
|
||||
if matching_words.len() > 1 {
|
||||
Some(MatchType::Partial(PartialMatch {
|
||||
matching_words: &matching_words[1..],
|
||||
ids,
|
||||
char_len,
|
||||
}))
|
||||
} else {
|
||||
Some(MatchType::Full { char_len, ids })
|
||||
}
|
||||
}
|
||||
None => self.next(),
|
||||
},
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Id of a matching term corespounding to a word written by the end user.
|
||||
pub type PrimitiveWordId = u8;
|
||||
|
||||
/// Structure used to match a specific term.
|
||||
pub struct MatchingWord {
|
||||
pub dfa: DFA,
|
||||
pub word: String,
|
||||
pub typo: u8,
|
||||
pub prefix: IsPrefix,
|
||||
}
|
||||
|
||||
impl fmt::Debug for MatchingWord {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("MatchingWord")
|
||||
.field("word", &self.word)
|
||||
.field("typo", &self.typo)
|
||||
.field("prefix", &self.prefix)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for MatchingWord {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.prefix == other.prefix && self.typo == other.typo && self.word == other.word
|
||||
}
|
||||
}
|
||||
|
||||
impl MatchingWord {
|
||||
pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option<Self> {
|
||||
if word.len() > MAX_WORD_LENGTH {
|
||||
return None;
|
||||
}
|
||||
let dfa = build_dfa(&word, typo, prefix);
|
||||
|
||||
Some(Self { dfa, word, typo, prefix })
|
||||
}
|
||||
|
||||
/// Returns the lenght in chars of the match in case of the token matches the term.
|
||||
pub fn match_token(&self, token: &Token) -> Option<usize> {
|
||||
match self.dfa.eval(token.lemma()) {
|
||||
Distance::Exact(t) if t <= self.typo => {
|
||||
if self.prefix {
|
||||
let len = bytes_to_highlight(token.lemma(), &self.word);
|
||||
Some(token.original_lengths(len).0)
|
||||
} else {
|
||||
Some(token.original_lengths(token.lemma().len()).0)
|
||||
}
|
||||
}
|
||||
_otherwise => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A given token can partially match a query word for several reasons:
|
||||
/// - split words
|
||||
/// - multi-word synonyms
|
||||
/// In these cases we need to match consecutively several tokens to consider that the match is full.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum MatchType<'a> {
|
||||
Full { char_len: usize, ids: &'a [PrimitiveWordId] },
|
||||
Partial(PartialMatch<'a>),
|
||||
}
|
||||
|
||||
/// Structure helper to match several tokens in a row in order to complete a partial match.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct PartialMatch<'a> {
|
||||
matching_words: &'a [Rc<MatchingWord>],
|
||||
ids: &'a [PrimitiveWordId],
|
||||
char_len: usize,
|
||||
}
|
||||
|
||||
impl<'a> PartialMatch<'a> {
|
||||
/// Returns:
|
||||
/// - None if the given token breaks the partial match
|
||||
/// - Partial if the given token matches the partial match but doesn't complete it
|
||||
/// - Full if the given token completes the partial match
|
||||
pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
|
||||
self.matching_words[0].match_token(token).map(|char_len| {
|
||||
if self.matching_words.len() > 1 {
|
||||
MatchType::Partial(PartialMatch {
|
||||
matching_words: &self.matching_words[1..],
|
||||
ids: self.ids,
|
||||
char_len,
|
||||
})
|
||||
} else {
|
||||
MatchType::Full { char_len, ids: self.ids }
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn char_len(&self) -> usize {
|
||||
self.char_len
|
||||
}
|
||||
}
|
||||
|
||||
// A simple wrapper around vec so we can get contiguous but index it like it's 2D array.
|
||||
struct N2Array<T> {
|
||||
y_size: usize,
|
||||
buf: Vec<T>,
|
||||
}
|
||||
|
||||
impl<T: Clone> N2Array<T> {
|
||||
fn new(x: usize, y: usize, value: T) -> N2Array<T> {
|
||||
N2Array { y_size: y, buf: vec![value; x * y] }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Index<(usize, usize)> for N2Array<T> {
|
||||
type Output = T;
|
||||
|
||||
#[inline]
|
||||
fn index(&self, (x, y): (usize, usize)) -> &T {
|
||||
&self.buf[(x * self.y_size) + y]
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> IndexMut<(usize, usize)> for N2Array<T> {
|
||||
#[inline]
|
||||
fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
|
||||
&mut self.buf[(x * self.y_size) + y]
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of **bytes** we want to highlight in the `source` word.
|
||||
/// Basically we want to highlight as much characters as possible in the source until it has too much
|
||||
/// typos (= 2)
|
||||
/// The algorithm is a modified
|
||||
/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
|
||||
fn bytes_to_highlight(source: &str, target: &str) -> usize {
|
||||
let n = source.chars().count();
|
||||
let m = target.chars().count();
|
||||
|
||||
if n == 0 {
|
||||
return 0;
|
||||
}
|
||||
// since we allow two typos we can send two characters even if it's completely wrong
|
||||
if m < 3 {
|
||||
return source.chars().take(m).map(|c| c.len_utf8()).sum();
|
||||
}
|
||||
if n == m && source == target {
|
||||
return source.len();
|
||||
}
|
||||
|
||||
let inf = n + m;
|
||||
let mut matrix = N2Array::new(n + 2, m + 2, 0);
|
||||
|
||||
matrix[(0, 0)] = inf;
|
||||
for i in 0..=n {
|
||||
matrix[(i + 1, 0)] = inf;
|
||||
matrix[(i + 1, 1)] = i;
|
||||
}
|
||||
for j in 0..=m {
|
||||
matrix[(0, j + 1)] = inf;
|
||||
matrix[(1, j + 1)] = j;
|
||||
}
|
||||
|
||||
let mut last_row = BTreeMap::new();
|
||||
|
||||
for (row, char_s) in source.chars().enumerate() {
|
||||
let mut last_match_col = 0;
|
||||
let row = row + 1;
|
||||
|
||||
for (col, char_t) in target.chars().enumerate() {
|
||||
let col = col + 1;
|
||||
let last_match_row = *last_row.get(&char_t).unwrap_or(&0);
|
||||
let cost = usize::from(char_s != char_t);
|
||||
|
||||
let dist_add = matrix[(row, col + 1)] + 1;
|
||||
let dist_del = matrix[(row + 1, col)] + 1;
|
||||
let dist_sub = matrix[(row, col)] + cost;
|
||||
let dist_trans = matrix[(last_match_row, last_match_col)]
|
||||
+ (row - last_match_row - 1)
|
||||
+ 1
|
||||
+ (col - last_match_col - 1);
|
||||
let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans));
|
||||
matrix[(row + 1, col + 1)] = dist;
|
||||
|
||||
if cost == 0 {
|
||||
last_match_col = col;
|
||||
}
|
||||
}
|
||||
|
||||
last_row.insert(char_s, row);
|
||||
}
|
||||
|
||||
let mut minimum = (u32::max_value(), 0);
|
||||
for x in 0..=m {
|
||||
let dist = matrix[(n + 1, x + 1)] as u32;
|
||||
if dist < minimum.0 {
|
||||
minimum = (dist, x);
|
||||
}
|
||||
}
|
||||
|
||||
// everything was done characters wise and now we want to returns a number of bytes
|
||||
source.chars().take(minimum.1).map(|c| c.len_utf8()).sum()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::borrow::Cow;
|
||||
use std::str::from_utf8;
|
||||
|
||||
use charabia::TokenKind;
|
||||
|
||||
use super::*;
|
||||
use crate::MatchingWords;
|
||||
|
||||
#[test]
|
||||
fn test_bytes_to_highlight() {
|
||||
struct TestBytesToHighlight {
|
||||
query: &'static str,
|
||||
text: &'static str,
|
||||
length: usize,
|
||||
}
|
||||
let tests = [
|
||||
TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() },
|
||||
TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() },
|
||||
TestBytesToHighlight {
|
||||
query: "Levenshtein",
|
||||
text: "Levenshtein",
|
||||
length: "Levenshtein".len(),
|
||||
},
|
||||
// we get to the end of our word with only one typo
|
||||
TestBytesToHighlight {
|
||||
query: "Levenste",
|
||||
text: "Levenshtein",
|
||||
length: "Levenste".len(),
|
||||
},
|
||||
// we get our third and last authorized typo right on the last character
|
||||
TestBytesToHighlight {
|
||||
query: "Levenstein",
|
||||
text: "Levenshte",
|
||||
length: "Levenste".len(),
|
||||
},
|
||||
// we get to the end of our word with only two typos at the beginning
|
||||
TestBytesToHighlight {
|
||||
query: "Bavenshtein",
|
||||
text: "Levenshtein",
|
||||
length: "Bavenshtein".len(),
|
||||
},
|
||||
TestBytesToHighlight {
|
||||
query: "Альфа", text: "Альфой", length: "Альф".len()
|
||||
},
|
||||
TestBytesToHighlight {
|
||||
query: "Go💼", text: "Go💼od luck.", length: "Go💼".len()
|
||||
},
|
||||
TestBytesToHighlight {
|
||||
query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len()
|
||||
},
|
||||
TestBytesToHighlight {
|
||||
query: "chäräcters",
|
||||
text: "chäräcters",
|
||||
length: "chäräcters".len(),
|
||||
},
|
||||
TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() },
|
||||
TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() },
|
||||
];
|
||||
|
||||
for test in &tests {
|
||||
let length = bytes_to_highlight(test.text, test.query);
|
||||
assert_eq!(length, test.length, r#"lenght between: "{}" "{}""#, test.query, test.text);
|
||||
assert!(
|
||||
from_utf8(&test.query.as_bytes()[..length]).is_ok(),
|
||||
r#"converting {}[..{}] to an utf8 str failed"#,
|
||||
test.query,
|
||||
length
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn matching_words() {
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()),
|
||||
Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
|
||||
];
|
||||
let matching_words = vec![
|
||||
(vec![all[0].clone()], vec![0]),
|
||||
(vec![all[1].clone()], vec![1]),
|
||||
(vec![all[2].clone()], vec![2]),
|
||||
];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
matching_words
|
||||
.match_token(&Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("word"),
|
||||
char_end: "word".chars().count(),
|
||||
byte_end: "word".len(),
|
||||
..Default::default()
|
||||
})
|
||||
.next(),
|
||||
Some(MatchType::Full { char_len: 3, ids: &[2] })
|
||||
);
|
||||
assert_eq!(
|
||||
matching_words
|
||||
.match_token(&Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("nyc"),
|
||||
char_end: "nyc".chars().count(),
|
||||
byte_end: "nyc".len(),
|
||||
..Default::default()
|
||||
})
|
||||
.next(),
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
matching_words
|
||||
.match_token(&Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("world"),
|
||||
char_end: "world".chars().count(),
|
||||
byte_end: "world".len(),
|
||||
..Default::default()
|
||||
})
|
||||
.next(),
|
||||
Some(MatchType::Full { char_len: 5, ids: &[2] })
|
||||
);
|
||||
assert_eq!(
|
||||
matching_words
|
||||
.match_token(&Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("splitted"),
|
||||
char_end: "splitted".chars().count(),
|
||||
byte_end: "splitted".len(),
|
||||
..Default::default()
|
||||
})
|
||||
.next(),
|
||||
Some(MatchType::Full { char_len: 5, ids: &[0] })
|
||||
);
|
||||
assert_eq!(
|
||||
matching_words
|
||||
.match_token(&Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("thisnew"),
|
||||
char_end: "thisnew".chars().count(),
|
||||
byte_end: "thisnew".len(),
|
||||
..Default::default()
|
||||
})
|
||||
.next(),
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
matching_words
|
||||
.match_token(&Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("borld"),
|
||||
char_end: "borld".chars().count(),
|
||||
byte_end: "borld".len(),
|
||||
..Default::default()
|
||||
})
|
||||
.next(),
|
||||
Some(MatchType::Full { char_len: 5, ids: &[2] })
|
||||
);
|
||||
assert_eq!(
|
||||
matching_words
|
||||
.match_token(&Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("wordsplit"),
|
||||
char_end: "wordsplit".chars().count(),
|
||||
byte_end: "wordsplit".len(),
|
||||
..Default::default()
|
||||
})
|
||||
.next(),
|
||||
Some(MatchType::Full { char_len: 4, ids: &[2] })
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1,42 +1,24 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::hash_map::{Entry, HashMap};
|
||||
use std::fmt;
|
||||
use std::mem::take;
|
||||
use std::result::Result as StdResult;
|
||||
use std::str::Utf8Error;
|
||||
use std::time::Instant;
|
||||
|
||||
use charabia::TokenizerBuilder;
|
||||
use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
|
||||
use fst::automaton::Str;
|
||||
use fst::{Automaton, IntoStreamer, Streamer};
|
||||
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
|
||||
use log::debug;
|
||||
use once_cell::sync::Lazy;
|
||||
use roaring::bitmap::RoaringBitmap;
|
||||
|
||||
pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
|
||||
use self::fst_utils::{Complement, Intersection, StartsWith, Union};
|
||||
pub use self::matches::{
|
||||
FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
|
||||
pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords};
|
||||
use self::new::PartialSearchResult;
|
||||
use crate::{
|
||||
execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
|
||||
};
|
||||
use self::query_tree::QueryTreeBuilder;
|
||||
use crate::error::UserError;
|
||||
use crate::search::criteria::r#final::{Final, FinalResult};
|
||||
use crate::search::criteria::InitialCandidates;
|
||||
use crate::{AscDesc, Criterion, DocumentId, Index, Member, Result};
|
||||
|
||||
// Building these factories is not free.
|
||||
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
||||
static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
|
||||
static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
|
||||
|
||||
mod criteria;
|
||||
mod distinct;
|
||||
pub mod facet;
|
||||
mod fst_utils;
|
||||
mod matches;
|
||||
mod query_tree;
|
||||
pub mod new;
|
||||
|
||||
pub struct Search<'a> {
|
||||
query: Option<String>,
|
||||
@@ -45,11 +27,10 @@ pub struct Search<'a> {
|
||||
offset: usize,
|
||||
limit: usize,
|
||||
sort_criteria: Option<Vec<AscDesc>>,
|
||||
geo_strategy: new::GeoSortStrategy,
|
||||
terms_matching_strategy: TermsMatchingStrategy,
|
||||
authorize_typos: bool,
|
||||
words_limit: usize,
|
||||
exhaustive_number_hits: bool,
|
||||
criterion_implementation_strategy: CriterionImplementationStrategy,
|
||||
rtxn: &'a heed::RoTxn<'a>,
|
||||
index: &'a Index,
|
||||
}
|
||||
@@ -62,11 +43,10 @@ impl<'a> Search<'a> {
|
||||
offset: 0,
|
||||
limit: 20,
|
||||
sort_criteria: None,
|
||||
geo_strategy: new::GeoSortStrategy::default(),
|
||||
terms_matching_strategy: TermsMatchingStrategy::default(),
|
||||
authorize_typos: true,
|
||||
exhaustive_number_hits: false,
|
||||
words_limit: 10,
|
||||
criterion_implementation_strategy: CriterionImplementationStrategy::default(),
|
||||
rtxn,
|
||||
index,
|
||||
}
|
||||
@@ -97,11 +77,6 @@ impl<'a> Search<'a> {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn authorize_typos(&mut self, value: bool) -> &mut Search<'a> {
|
||||
self.authorize_typos = value;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn words_limit(&mut self, value: usize) -> &mut Search<'a> {
|
||||
self.words_limit = value;
|
||||
self
|
||||
@@ -112,6 +87,12 @@ impl<'a> Search<'a> {
|
||||
self
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn geo_sort_strategy(&mut self, strategy: new::GeoSortStrategy) -> &mut Search<'a> {
|
||||
self.geo_strategy = strategy;
|
||||
self
|
||||
}
|
||||
|
||||
/// Force the search to exhastivelly compute the number of candidates,
|
||||
/// this will increase the search time but allows finite pagination.
|
||||
pub fn exhaustive_number_hits(&mut self, exhaustive_number_hits: bool) -> &mut Search<'a> {
|
||||
@@ -119,177 +100,31 @@ impl<'a> Search<'a> {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn criterion_implementation_strategy(
|
||||
&mut self,
|
||||
strategy: CriterionImplementationStrategy,
|
||||
) -> &mut Search<'a> {
|
||||
self.criterion_implementation_strategy = strategy;
|
||||
self
|
||||
}
|
||||
|
||||
fn is_typo_authorized(&self) -> Result<bool> {
|
||||
let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?;
|
||||
// only authorize typos if both the index and the query allow it.
|
||||
Ok(self.authorize_typos && index_authorizes_typos)
|
||||
}
|
||||
|
||||
pub fn execute(&self) -> Result<SearchResult> {
|
||||
// We create the query tree by spliting the query into tokens.
|
||||
let before = Instant::now();
|
||||
let (query_tree, primitive_query, matching_words) = match self.query.as_ref() {
|
||||
Some(query) => {
|
||||
let mut builder = QueryTreeBuilder::new(self.rtxn, self.index)?;
|
||||
builder.terms_matching_strategy(self.terms_matching_strategy);
|
||||
let mut ctx = SearchContext::new(self.index, self.rtxn);
|
||||
let PartialSearchResult { located_query_terms, candidates, documents_ids } =
|
||||
execute_search(
|
||||
&mut ctx,
|
||||
&self.query,
|
||||
self.terms_matching_strategy,
|
||||
self.exhaustive_number_hits,
|
||||
&self.filter,
|
||||
&self.sort_criteria,
|
||||
self.geo_strategy,
|
||||
self.offset,
|
||||
self.limit,
|
||||
Some(self.words_limit),
|
||||
&mut DefaultSearchLogger,
|
||||
&mut DefaultSearchLogger,
|
||||
)?;
|
||||
|
||||
builder.authorize_typos(self.is_typo_authorized()?);
|
||||
|
||||
builder.words_limit(self.words_limit);
|
||||
// We make sure that the analyzer is aware of the stop words
|
||||
// this ensures that the query builder is able to properly remove them.
|
||||
let mut tokbuilder = TokenizerBuilder::new();
|
||||
let stop_words = self.index.stop_words(self.rtxn)?;
|
||||
if let Some(ref stop_words) = stop_words {
|
||||
tokbuilder.stop_words(stop_words);
|
||||
}
|
||||
|
||||
let script_lang_map = self.index.script_language(self.rtxn)?;
|
||||
if !script_lang_map.is_empty() {
|
||||
tokbuilder.allow_list(&script_lang_map);
|
||||
}
|
||||
|
||||
let tokenizer = tokbuilder.build();
|
||||
let tokens = tokenizer.tokenize(query);
|
||||
builder
|
||||
.build(tokens)?
|
||||
.map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw)))
|
||||
}
|
||||
None => (None, None, None),
|
||||
// consume context and located_query_terms to build MatchingWords.
|
||||
let matching_words = match located_query_terms {
|
||||
Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
|
||||
None => MatchingWords::default(),
|
||||
};
|
||||
|
||||
debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed());
|
||||
|
||||
// We create the original candidates with the facet conditions results.
|
||||
let before = Instant::now();
|
||||
let filtered_candidates = match &self.filter {
|
||||
Some(condition) => Some(condition.evaluate(self.rtxn, self.index)?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
debug!("facet candidates: {:?} took {:.02?}", filtered_candidates, before.elapsed());
|
||||
|
||||
// We check that we are allowed to use the sort criteria, we check
|
||||
// that they are declared in the sortable fields.
|
||||
if let Some(sort_criteria) = &self.sort_criteria {
|
||||
let sortable_fields = self.index.sortable_fields(self.rtxn)?;
|
||||
for asc_desc in sort_criteria {
|
||||
match asc_desc.member() {
|
||||
Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => {
|
||||
return Err(UserError::InvalidSortableAttribute {
|
||||
field: field.to_string(),
|
||||
valid_fields: sortable_fields.into_iter().collect(),
|
||||
})?
|
||||
}
|
||||
Member::Geo(_) if !sortable_fields.contains("_geo") => {
|
||||
return Err(UserError::InvalidSortableAttribute {
|
||||
field: "_geo".to_string(),
|
||||
valid_fields: sortable_fields.into_iter().collect(),
|
||||
})?
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We check that the sort ranking rule exists and throw an
|
||||
// error if we try to use it and that it doesn't.
|
||||
let sort_ranking_rule_missing = !self.index.criteria(self.rtxn)?.contains(&Criterion::Sort);
|
||||
let empty_sort_criteria = self.sort_criteria.as_ref().map_or(true, |s| s.is_empty());
|
||||
if sort_ranking_rule_missing && !empty_sort_criteria {
|
||||
return Err(UserError::SortRankingRuleMissing.into());
|
||||
}
|
||||
|
||||
let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?;
|
||||
|
||||
match self.index.distinct_field(self.rtxn)? {
|
||||
None => {
|
||||
let criteria = criteria_builder.build::<NoopDistinct>(
|
||||
query_tree,
|
||||
primitive_query,
|
||||
filtered_candidates,
|
||||
self.sort_criteria.clone(),
|
||||
self.exhaustive_number_hits,
|
||||
None,
|
||||
self.criterion_implementation_strategy,
|
||||
)?;
|
||||
self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria)
|
||||
}
|
||||
Some(name) => {
|
||||
let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
|
||||
match field_ids_map.id(name) {
|
||||
Some(fid) => {
|
||||
let distinct = FacetDistinct::new(fid, self.index, self.rtxn);
|
||||
|
||||
let criteria = criteria_builder.build(
|
||||
query_tree,
|
||||
primitive_query,
|
||||
filtered_candidates,
|
||||
self.sort_criteria.clone(),
|
||||
self.exhaustive_number_hits,
|
||||
Some(distinct.clone()),
|
||||
self.criterion_implementation_strategy,
|
||||
)?;
|
||||
self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria)
|
||||
}
|
||||
None => Ok(SearchResult::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn perform_sort<D: Distinct>(
|
||||
&self,
|
||||
mut distinct: D,
|
||||
matching_words: MatchingWords,
|
||||
mut criteria: Final,
|
||||
) -> Result<SearchResult> {
|
||||
let mut offset = self.offset;
|
||||
let mut initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new());
|
||||
let mut excluded_candidates = self.index.soft_deleted_documents_ids(self.rtxn)?;
|
||||
let mut documents_ids = Vec::new();
|
||||
|
||||
while let Some(FinalResult { candidates, initial_candidates: ic, .. }) =
|
||||
criteria.next(&excluded_candidates)?
|
||||
{
|
||||
debug!("Number of candidates found {}", candidates.len());
|
||||
|
||||
let excluded = take(&mut excluded_candidates);
|
||||
let mut candidates = distinct.distinct(candidates, excluded);
|
||||
|
||||
initial_candidates |= ic;
|
||||
|
||||
if offset != 0 {
|
||||
let discarded = candidates.by_ref().take(offset).count();
|
||||
offset = offset.saturating_sub(discarded);
|
||||
}
|
||||
|
||||
for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) {
|
||||
documents_ids.push(candidate?);
|
||||
}
|
||||
|
||||
excluded_candidates |= candidates.into_excluded();
|
||||
|
||||
if documents_ids.len() == self.limit {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
initial_candidates.map_inplace(|c| c - excluded_candidates);
|
||||
|
||||
Ok(SearchResult {
|
||||
matching_words,
|
||||
candidates: initial_candidates.into_inner(),
|
||||
documents_ids,
|
||||
})
|
||||
Ok(SearchResult { matching_words, candidates, documents_ids })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -301,11 +136,10 @@ impl fmt::Debug for Search<'_> {
|
||||
offset,
|
||||
limit,
|
||||
sort_criteria,
|
||||
geo_strategy: _,
|
||||
terms_matching_strategy,
|
||||
authorize_typos,
|
||||
words_limit,
|
||||
exhaustive_number_hits,
|
||||
criterion_implementation_strategy,
|
||||
rtxn: _,
|
||||
index: _,
|
||||
} = self;
|
||||
@@ -316,9 +150,7 @@ impl fmt::Debug for Search<'_> {
|
||||
.field("limit", limit)
|
||||
.field("sort_criteria", sort_criteria)
|
||||
.field("terms_matching_strategy", terms_matching_strategy)
|
||||
.field("authorize_typos", authorize_typos)
|
||||
.field("exhaustive_number_hits", exhaustive_number_hits)
|
||||
.field("criterion_implementation_strategy", criterion_implementation_strategy)
|
||||
.field("words_limit", words_limit)
|
||||
.finish()
|
||||
}
|
||||
@@ -332,26 +164,10 @@ pub struct SearchResult {
|
||||
pub documents_ids: Vec<DocumentId>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, Copy)]
|
||||
pub enum CriterionImplementationStrategy {
|
||||
OnlyIterative,
|
||||
OnlySetBased,
|
||||
#[default]
|
||||
Dynamic,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum TermsMatchingStrategy {
|
||||
// remove last word first
|
||||
Last,
|
||||
// remove first word first
|
||||
First,
|
||||
// remove more frequent word first
|
||||
Frequency,
|
||||
// remove smallest word first
|
||||
Size,
|
||||
// only one of the word is mandatory
|
||||
Any,
|
||||
// all words are mandatory
|
||||
All,
|
||||
}
|
||||
@@ -362,69 +178,6 @@ impl Default for TermsMatchingStrategy {
|
||||
}
|
||||
}
|
||||
|
||||
pub type WordDerivationsCache = HashMap<(String, bool, u8), Vec<(String, u8)>>;
|
||||
|
||||
pub fn word_derivations<'c>(
|
||||
word: &str,
|
||||
is_prefix: bool,
|
||||
max_typo: u8,
|
||||
fst: &fst::Set<Cow<[u8]>>,
|
||||
cache: &'c mut WordDerivationsCache,
|
||||
) -> StdResult<&'c [(String, u8)], Utf8Error> {
|
||||
match cache.entry((word.to_string(), is_prefix, max_typo)) {
|
||||
Entry::Occupied(entry) => Ok(entry.into_mut()),
|
||||
Entry::Vacant(entry) => {
|
||||
let mut derived_words = Vec::new();
|
||||
if max_typo == 0 {
|
||||
if is_prefix {
|
||||
let prefix = Str::new(word).starts_with();
|
||||
let mut stream = fst.search(prefix).into_stream();
|
||||
|
||||
while let Some(word) = stream.next() {
|
||||
let word = std::str::from_utf8(word)?;
|
||||
derived_words.push((word.to_string(), 0));
|
||||
}
|
||||
} else if fst.contains(word) {
|
||||
derived_words.push((word.to_string(), 0));
|
||||
}
|
||||
} else if max_typo == 1 {
|
||||
let dfa = build_dfa(word, 1, is_prefix);
|
||||
let starts = StartsWith(Str::new(get_first(word)));
|
||||
let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream();
|
||||
|
||||
while let Some((word, state)) = stream.next() {
|
||||
let word = std::str::from_utf8(word)?;
|
||||
let d = dfa.distance(state.1);
|
||||
derived_words.push((word.to_string(), d.to_u8()));
|
||||
}
|
||||
} else {
|
||||
let starts = StartsWith(Str::new(get_first(word)));
|
||||
let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
|
||||
let second_dfa = build_dfa(word, 2, is_prefix);
|
||||
let second = Intersection(&second_dfa, &starts);
|
||||
let automaton = Union(first, &second);
|
||||
|
||||
let mut stream = fst.search_with_state(automaton).into_stream();
|
||||
|
||||
while let Some((found_word, state)) = stream.next() {
|
||||
let found_word = std::str::from_utf8(found_word)?;
|
||||
// in the case the typo is on the first letter, we know the number of typo
|
||||
// is two
|
||||
if get_first(found_word) != get_first(word) {
|
||||
derived_words.push((found_word.to_string(), 2));
|
||||
} else {
|
||||
// Else, we know that it is the second dfa that matched and compute the
|
||||
// correct distance
|
||||
let d = second_dfa.distance((state.1).0);
|
||||
derived_words.push((found_word.to_string(), d.to_u8()));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(entry.insert(derived_words))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_first(s: &str) -> &str {
|
||||
match s.chars().next() {
|
||||
Some(c) => &s[..c.len_utf8()],
|
||||
@@ -472,92 +225,4 @@ mod test {
|
||||
|
||||
assert_eq!(documents_ids, vec![1]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_is_authorized_typos() {
|
||||
let index = TempIndex::new();
|
||||
let mut txn = index.write_txn().unwrap();
|
||||
|
||||
let mut search = Search::new(&txn, &index);
|
||||
|
||||
// default is authorized
|
||||
assert!(search.is_typo_authorized().unwrap());
|
||||
|
||||
search.authorize_typos(false);
|
||||
assert!(!search.is_typo_authorized().unwrap());
|
||||
|
||||
index.put_authorize_typos(&mut txn, false).unwrap();
|
||||
txn.commit().unwrap();
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
let mut search = Search::new(&txn, &index);
|
||||
|
||||
assert!(!search.is_typo_authorized().unwrap());
|
||||
|
||||
search.authorize_typos(true);
|
||||
assert!(!search.is_typo_authorized().unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_one_typos_tolerance() {
|
||||
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
|
||||
let mut cache = HashMap::new();
|
||||
let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap();
|
||||
|
||||
assert_eq!(found, &[("zealand".to_string(), 1)]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_one_typos_first_letter() {
|
||||
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
|
||||
let mut cache = HashMap::new();
|
||||
let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap();
|
||||
|
||||
assert_eq!(found, &[]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_two_typos_tolerance() {
|
||||
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
|
||||
let mut cache = HashMap::new();
|
||||
let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap();
|
||||
|
||||
assert_eq!(found, &[("zealand".to_string(), 2)]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_two_typos_first_letter() {
|
||||
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
|
||||
let mut cache = HashMap::new();
|
||||
let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap();
|
||||
|
||||
assert_eq!(found, &[("zealand".to_string(), 2)]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prefix() {
|
||||
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
|
||||
let mut cache = HashMap::new();
|
||||
let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap();
|
||||
|
||||
assert_eq!(found, &[("zealand".to_string(), 0)]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bad_prefix() {
|
||||
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
|
||||
let mut cache = HashMap::new();
|
||||
let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap();
|
||||
|
||||
assert_eq!(found, &[]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prefix_with_typo() {
|
||||
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
|
||||
let mut cache = HashMap::new();
|
||||
let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap();
|
||||
|
||||
assert_eq!(found, &[("zealand".to_string(), 1)]);
|
||||
}
|
||||
}
|
||||
|
||||
245
milli/src/search/new/bucket_sort.rs
Normal file
245
milli/src/search/new/bucket_sort.rs
Normal file
@@ -0,0 +1,245 @@
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::logger::SearchLogger;
|
||||
use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait};
|
||||
use super::SearchContext;
|
||||
use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput};
|
||||
use crate::Result;
|
||||
|
||||
pub struct BucketSortOutput {
|
||||
pub docids: Vec<u32>,
|
||||
pub all_candidates: RoaringBitmap,
|
||||
}
|
||||
|
||||
pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
|
||||
query: &Q,
|
||||
universe: &RoaringBitmap,
|
||||
from: usize,
|
||||
length: usize,
|
||||
logger: &mut dyn SearchLogger<Q>,
|
||||
) -> Result<BucketSortOutput> {
|
||||
logger.initial_query(query);
|
||||
logger.ranking_rules(&ranking_rules);
|
||||
logger.initial_universe(universe);
|
||||
|
||||
let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? {
|
||||
ctx.index.fields_ids_map(ctx.txn)?.id(field)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
if universe.len() < from as u64 {
|
||||
return Ok(BucketSortOutput { docids: vec![], all_candidates: universe.clone() });
|
||||
}
|
||||
if ranking_rules.is_empty() {
|
||||
if let Some(distinct_fid) = distinct_fid {
|
||||
let mut excluded = RoaringBitmap::new();
|
||||
let mut results = vec![];
|
||||
for docid in universe.iter() {
|
||||
if results.len() >= from + length {
|
||||
break;
|
||||
}
|
||||
if excluded.contains(docid) {
|
||||
continue;
|
||||
}
|
||||
distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
|
||||
results.push(docid);
|
||||
}
|
||||
let mut all_candidates = universe - excluded;
|
||||
all_candidates.extend(results.iter().copied());
|
||||
return Ok(BucketSortOutput { docids: results, all_candidates });
|
||||
} else {
|
||||
let docids = universe.iter().skip(from).take(length).collect();
|
||||
return Ok(BucketSortOutput { docids, all_candidates: universe.clone() });
|
||||
};
|
||||
}
|
||||
|
||||
let ranking_rules_len = ranking_rules.len();
|
||||
|
||||
logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe);
|
||||
ranking_rules[0].start_iteration(ctx, logger, universe, query)?;
|
||||
|
||||
let mut ranking_rule_universes: Vec<RoaringBitmap> =
|
||||
vec![RoaringBitmap::default(); ranking_rules_len];
|
||||
ranking_rule_universes[0] = universe.clone();
|
||||
|
||||
let mut cur_ranking_rule_index = 0;
|
||||
|
||||
/// Finish iterating over the current ranking rule, yielding
|
||||
/// control to the parent (or finishing the search if not possible).
|
||||
/// Update the universes accordingly and inform the logger.
|
||||
macro_rules! back {
|
||||
() => {
|
||||
assert!(
|
||||
ranking_rule_universes[cur_ranking_rule_index].is_empty(),
|
||||
"The ranking rule {} did not sort its bucket exhaustively",
|
||||
ranking_rules[cur_ranking_rule_index].id()
|
||||
);
|
||||
logger.end_iteration_ranking_rule(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||
&ranking_rule_universes[cur_ranking_rule_index],
|
||||
);
|
||||
ranking_rule_universes[cur_ranking_rule_index].clear();
|
||||
ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger);
|
||||
if cur_ranking_rule_index == 0 {
|
||||
break;
|
||||
} else {
|
||||
cur_ranking_rule_index -= 1;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
let mut all_candidates = universe.clone();
|
||||
let mut valid_docids = vec![];
|
||||
let mut cur_offset = 0usize;
|
||||
|
||||
macro_rules! maybe_add_to_results {
|
||||
($candidates:expr) => {
|
||||
maybe_add_to_results(
|
||||
ctx,
|
||||
from,
|
||||
length,
|
||||
logger,
|
||||
&mut valid_docids,
|
||||
&mut all_candidates,
|
||||
&mut ranking_rule_universes,
|
||||
&mut ranking_rules,
|
||||
cur_ranking_rule_index,
|
||||
&mut cur_offset,
|
||||
distinct_fid,
|
||||
$candidates,
|
||||
)?;
|
||||
};
|
||||
}
|
||||
|
||||
while valid_docids.len() < length {
|
||||
// The universe for this bucket is zero or one element, so we don't need to sort
|
||||
// anything, just extend the results and go back to the parent ranking rule.
|
||||
if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 {
|
||||
let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
|
||||
maybe_add_to_results!(bucket);
|
||||
back!();
|
||||
continue;
|
||||
}
|
||||
|
||||
let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])? else {
|
||||
back!();
|
||||
continue;
|
||||
};
|
||||
|
||||
logger.next_bucket_ranking_rule(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||
&ranking_rule_universes[cur_ranking_rule_index],
|
||||
&next_bucket.candidates,
|
||||
);
|
||||
|
||||
debug_assert!(
|
||||
ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates)
|
||||
);
|
||||
ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;
|
||||
|
||||
if cur_ranking_rule_index == ranking_rules_len - 1
|
||||
|| next_bucket.candidates.len() <= 1
|
||||
|| cur_offset + (next_bucket.candidates.len() as usize) < from
|
||||
{
|
||||
maybe_add_to_results!(next_bucket.candidates);
|
||||
continue;
|
||||
}
|
||||
|
||||
cur_ranking_rule_index += 1;
|
||||
ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone();
|
||||
logger.start_iteration_ranking_rule(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||
&next_bucket.query,
|
||||
&ranking_rule_universes[cur_ranking_rule_index],
|
||||
);
|
||||
ranking_rules[cur_ranking_rule_index].start_iteration(
|
||||
ctx,
|
||||
logger,
|
||||
&next_bucket.candidates,
|
||||
&next_bucket.query,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(BucketSortOutput { docids: valid_docids, all_candidates })
|
||||
}
|
||||
|
||||
/// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset`
|
||||
/// into account and inform the logger.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>(
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
from: usize,
|
||||
length: usize,
|
||||
logger: &mut dyn SearchLogger<Q>,
|
||||
|
||||
valid_docids: &mut Vec<u32>,
|
||||
all_candidates: &mut RoaringBitmap,
|
||||
|
||||
ranking_rule_universes: &mut [RoaringBitmap],
|
||||
ranking_rules: &mut [BoxRankingRule<'ctx, Q>],
|
||||
cur_ranking_rule_index: usize,
|
||||
|
||||
cur_offset: &mut usize,
|
||||
distinct_fid: Option<u16>,
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
// First apply the distinct rule on the candidates, reducing the universes if necessary
|
||||
let candidates = if let Some(distinct_fid) = distinct_fid {
|
||||
let DistinctOutput { remaining, excluded } =
|
||||
apply_distinct_rule(ctx, distinct_fid, &candidates)?;
|
||||
for universe in ranking_rule_universes.iter_mut() {
|
||||
*universe -= &excluded;
|
||||
*all_candidates -= &excluded;
|
||||
}
|
||||
remaining
|
||||
} else {
|
||||
candidates.clone()
|
||||
};
|
||||
*all_candidates |= &candidates;
|
||||
|
||||
// if the candidates are empty, there is nothing to do;
|
||||
if candidates.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// if we still haven't reached the first document to return
|
||||
if *cur_offset < from {
|
||||
// and if no document from this bucket can be returned
|
||||
if *cur_offset + (candidates.len() as usize) < from {
|
||||
// then just skip the bucket
|
||||
logger.skip_bucket_ranking_rule(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||
&candidates,
|
||||
);
|
||||
} else {
|
||||
// otherwise, skip some of the documents and add some of the rest, in order of ids
|
||||
let candidates_vec = candidates.iter().collect::<Vec<_>>();
|
||||
let (skipped_candidates, candidates) = candidates_vec.split_at(from - *cur_offset);
|
||||
|
||||
logger.skip_bucket_ranking_rule(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||
&skipped_candidates.iter().collect(),
|
||||
);
|
||||
let candidates =
|
||||
candidates.iter().take(length - valid_docids.len()).copied().collect::<Vec<_>>();
|
||||
logger.add_to_results(&candidates);
|
||||
valid_docids.extend(&candidates);
|
||||
}
|
||||
} else {
|
||||
// if we have passed the offset already, add some of the documents (up to the limit)
|
||||
let candidates = candidates.iter().take(length - valid_docids.len()).collect::<Vec<u32>>();
|
||||
logger.add_to_results(&candidates);
|
||||
valid_docids.extend(&candidates);
|
||||
}
|
||||
|
||||
*cur_offset += candidates.len() as usize;
|
||||
Ok(())
|
||||
}
|
||||
436
milli/src/search/new/db_cache.rs
Normal file
436
milli/src/search/new/db_cache.rs
Normal file
@@ -0,0 +1,436 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::hash::Hash;
|
||||
|
||||
use fxhash::FxHashMap;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::{BytesDecode, BytesEncode, Database, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::interner::Interned;
|
||||
use super::Word;
|
||||
use crate::heed_codec::StrBEU16Codec;
|
||||
use crate::{
|
||||
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
|
||||
};
|
||||
|
||||
/// A cache storing pointers to values in the LMDB databases.
|
||||
///
|
||||
/// Used for performance reasons only. By using this cache, we avoid performing a
|
||||
/// database lookup and instead get a direct reference to the value using a fast
|
||||
/// local HashMap lookup.
|
||||
#[derive(Default)]
|
||||
pub struct DatabaseCache<'ctx> {
|
||||
pub word_pair_proximity_docids:
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'ctx [u8]>>,
|
||||
pub word_prefix_pair_proximity_docids:
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'ctx [u8]>>,
|
||||
pub prefix_word_pair_proximity_docids:
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'ctx [u8]>>,
|
||||
pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||
pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||
pub exact_word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
||||
|
||||
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
|
||||
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
||||
pub word_prefix_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
||||
pub word_positions: FxHashMap<Interned<String>, Vec<u16>>,
|
||||
pub word_prefix_positions: FxHashMap<Interned<String>, Vec<u16>>,
|
||||
|
||||
pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
||||
pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
||||
pub word_fids: FxHashMap<Interned<String>, Vec<u16>>,
|
||||
pub word_prefix_fids: FxHashMap<Interned<String>, Vec<u16>>,
|
||||
}
|
||||
impl<'ctx> DatabaseCache<'ctx> {
|
||||
fn get_value<'v, K1, KC>(
|
||||
txn: &'ctx RoTxn,
|
||||
cache_key: K1,
|
||||
db_key: &'v KC::EItem,
|
||||
cache: &mut FxHashMap<K1, Option<&'ctx [u8]>>,
|
||||
db: Database<KC, ByteSlice>,
|
||||
) -> Result<Option<&'ctx [u8]>>
|
||||
where
|
||||
K1: Copy + Eq + Hash,
|
||||
KC: BytesEncode<'v>,
|
||||
{
|
||||
let bitmap_ptr = match cache.entry(cache_key) {
|
||||
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
|
||||
Entry::Vacant(entry) => {
|
||||
let bitmap_ptr = db.get(txn, db_key)?;
|
||||
entry.insert(bitmap_ptr);
|
||||
bitmap_ptr
|
||||
}
|
||||
};
|
||||
Ok(bitmap_ptr)
|
||||
}
|
||||
}
|
||||
impl<'ctx> SearchContext<'ctx> {
|
||||
pub fn get_words_fst(&mut self) -> Result<fst::Set<Cow<'ctx, [u8]>>> {
|
||||
if let Some(fst) = self.db_cache.words_fst.clone() {
|
||||
Ok(fst)
|
||||
} else {
|
||||
let fst = self.index.words_fst(self.txn)?;
|
||||
self.db_cache.words_fst = Some(fst.clone());
|
||||
Ok(fst)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn word_docids(&mut self, word: Word) -> Result<Option<RoaringBitmap>> {
|
||||
match word {
|
||||
Word::Original(word) => {
|
||||
let exact = self.get_db_exact_word_docids(word)?;
|
||||
let tolerant = self.get_db_word_docids(word)?;
|
||||
Ok(match (exact, tolerant) {
|
||||
(None, None) => None,
|
||||
(None, Some(tolerant)) => Some(tolerant),
|
||||
(Some(exact), None) => Some(exact),
|
||||
(Some(exact), Some(tolerant)) => {
|
||||
let mut both = exact;
|
||||
both |= tolerant;
|
||||
Some(both)
|
||||
}
|
||||
})
|
||||
}
|
||||
Word::Derived(word) => self.get_db_word_docids(word),
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve or insert the given value in the `word_docids` database.
|
||||
fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
&mut self.db_cache.word_docids,
|
||||
self.index.word_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
fn get_db_exact_word_docids(
|
||||
&mut self,
|
||||
word: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
&mut self.db_cache.exact_word_docids,
|
||||
self.index.exact_word_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn word_prefix_docids(&mut self, prefix: Word) -> Result<Option<RoaringBitmap>> {
|
||||
match prefix {
|
||||
Word::Original(prefix) => {
|
||||
let exact = self.get_db_exact_word_prefix_docids(prefix)?;
|
||||
let tolerant = self.get_db_word_prefix_docids(prefix)?;
|
||||
Ok(match (exact, tolerant) {
|
||||
(None, None) => None,
|
||||
(None, Some(tolerant)) => Some(tolerant),
|
||||
(Some(exact), None) => Some(exact),
|
||||
(Some(exact), Some(tolerant)) => {
|
||||
let mut both = exact;
|
||||
both |= tolerant;
|
||||
Some(both)
|
||||
}
|
||||
})
|
||||
}
|
||||
Word::Derived(prefix) => self.get_db_word_prefix_docids(prefix),
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve or insert the given value in the `word_prefix_docids` database.
|
||||
fn get_db_word_prefix_docids(
|
||||
&mut self,
|
||||
prefix: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
&mut self.db_cache.word_prefix_docids,
|
||||
self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
fn get_db_exact_word_prefix_docids(
|
||||
&mut self,
|
||||
prefix: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
&mut self.db_cache.exact_word_prefix_docids,
|
||||
self.index.exact_word_prefix_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_pair_proximity_docids(
|
||||
&mut self,
|
||||
word1: Interned<String>,
|
||||
word2: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(proximity, word1, word2),
|
||||
&(
|
||||
proximity,
|
||||
self.word_interner.get(word1).as_str(),
|
||||
self.word_interner.get(word2).as_str(),
|
||||
),
|
||||
&mut self.db_cache.word_pair_proximity_docids,
|
||||
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_pair_proximity_docids_len(
|
||||
&mut self,
|
||||
word1: Interned<String>,
|
||||
word2: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<u64>> {
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(proximity, word1, word2),
|
||||
&(
|
||||
proximity,
|
||||
self.word_interner.get(word1).as_str(),
|
||||
self.word_interner.get(word2).as_str(),
|
||||
),
|
||||
&mut self.db_cache.word_pair_proximity_docids,
|
||||
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| {
|
||||
CboRoaringBitmapLenCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())
|
||||
})
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_prefix_pair_proximity_docids(
|
||||
&mut self,
|
||||
word1: Interned<String>,
|
||||
prefix2: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(proximity, word1, prefix2),
|
||||
&(
|
||||
proximity,
|
||||
self.word_interner.get(word1).as_str(),
|
||||
self.word_interner.get(prefix2).as_str(),
|
||||
),
|
||||
&mut self.db_cache.word_prefix_pair_proximity_docids,
|
||||
self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
pub fn get_db_prefix_word_pair_proximity_docids(
|
||||
&mut self,
|
||||
left_prefix: Interned<String>,
|
||||
right: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(proximity, left_prefix, right),
|
||||
&(
|
||||
proximity,
|
||||
self.word_interner.get(left_prefix).as_str(),
|
||||
self.word_interner.get(right).as_str(),
|
||||
),
|
||||
&mut self.db_cache.prefix_word_pair_proximity_docids,
|
||||
self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_fid_docids(
|
||||
&mut self,
|
||||
word: Interned<String>,
|
||||
fid: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(word, fid),
|
||||
&(self.word_interner.get(word).as_str(), fid),
|
||||
&mut self.db_cache.word_fid_docids,
|
||||
self.index.word_fid_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_prefix_fid_docids(
|
||||
&mut self,
|
||||
word_prefix: Interned<String>,
|
||||
fid: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(word_prefix, fid),
|
||||
&(self.word_interner.get(word_prefix).as_str(), fid),
|
||||
&mut self.db_cache.word_prefix_fid_docids,
|
||||
self.index.word_prefix_fid_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_fids(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
|
||||
let fids = match self.db_cache.word_fids.entry(word) {
|
||||
Entry::Occupied(fids) => fids.get().clone(),
|
||||
Entry::Vacant(entry) => {
|
||||
let mut key = self.word_interner.get(word).as_bytes().to_owned();
|
||||
key.push(0);
|
||||
let mut fids = vec![];
|
||||
let remap_key_type = self
|
||||
.index
|
||||
.word_fid_docids
|
||||
.remap_types::<ByteSlice, ByteSlice>()
|
||||
.prefix_iter(self.txn, &key)?
|
||||
.remap_key_type::<StrBEU16Codec>();
|
||||
for result in remap_key_type {
|
||||
let ((_, fid), value) = result?;
|
||||
// filling other caches to avoid searching for them again
|
||||
self.db_cache.word_fid_docids.insert((word, fid), Some(value));
|
||||
fids.push(fid);
|
||||
}
|
||||
entry.insert(fids.clone());
|
||||
fids
|
||||
}
|
||||
};
|
||||
Ok(fids)
|
||||
}
|
||||
|
||||
pub fn get_db_word_prefix_fids(&mut self, word_prefix: Interned<String>) -> Result<Vec<u16>> {
|
||||
let fids = match self.db_cache.word_prefix_fids.entry(word_prefix) {
|
||||
Entry::Occupied(fids) => fids.get().clone(),
|
||||
Entry::Vacant(entry) => {
|
||||
let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
|
||||
key.push(0);
|
||||
let mut fids = vec![];
|
||||
let remap_key_type = self
|
||||
.index
|
||||
.word_prefix_fid_docids
|
||||
.remap_types::<ByteSlice, ByteSlice>()
|
||||
.prefix_iter(self.txn, &key)?
|
||||
.remap_key_type::<StrBEU16Codec>();
|
||||
for result in remap_key_type {
|
||||
let ((_, fid), value) = result?;
|
||||
// filling other caches to avoid searching for them again
|
||||
self.db_cache.word_prefix_fid_docids.insert((word_prefix, fid), Some(value));
|
||||
fids.push(fid);
|
||||
}
|
||||
entry.insert(fids.clone());
|
||||
fids
|
||||
}
|
||||
};
|
||||
Ok(fids)
|
||||
}
|
||||
|
||||
pub fn get_db_word_position_docids(
|
||||
&mut self,
|
||||
word: Interned<String>,
|
||||
position: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(word, position),
|
||||
&(self.word_interner.get(word).as_str(), position),
|
||||
&mut self.db_cache.word_position_docids,
|
||||
self.index.word_position_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_prefix_position_docids(
|
||||
&mut self,
|
||||
word_prefix: Interned<String>,
|
||||
position: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(word_prefix, position),
|
||||
&(self.word_interner.get(word_prefix).as_str(), position),
|
||||
&mut self.db_cache.word_prefix_position_docids,
|
||||
self.index.word_prefix_position_docids.remap_data_type::<ByteSlice>(),
|
||||
)?
|
||||
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn get_db_word_positions(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
|
||||
let positions = match self.db_cache.word_positions.entry(word) {
|
||||
Entry::Occupied(positions) => positions.get().clone(),
|
||||
Entry::Vacant(entry) => {
|
||||
let mut key = self.word_interner.get(word).as_bytes().to_owned();
|
||||
key.push(0);
|
||||
let mut positions = vec![];
|
||||
let remap_key_type = self
|
||||
.index
|
||||
.word_position_docids
|
||||
.remap_types::<ByteSlice, ByteSlice>()
|
||||
.prefix_iter(self.txn, &key)?
|
||||
.remap_key_type::<StrBEU16Codec>();
|
||||
for result in remap_key_type {
|
||||
let ((_, position), value) = result?;
|
||||
// filling other caches to avoid searching for them again
|
||||
self.db_cache.word_position_docids.insert((word, position), Some(value));
|
||||
positions.push(position);
|
||||
}
|
||||
entry.insert(positions.clone());
|
||||
positions
|
||||
}
|
||||
};
|
||||
Ok(positions)
|
||||
}
|
||||
|
||||
pub fn get_db_word_prefix_positions(
|
||||
&mut self,
|
||||
word_prefix: Interned<String>,
|
||||
) -> Result<Vec<u16>> {
|
||||
let positions = match self.db_cache.word_prefix_positions.entry(word_prefix) {
|
||||
Entry::Occupied(positions) => positions.get().clone(),
|
||||
Entry::Vacant(entry) => {
|
||||
let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
|
||||
key.push(0);
|
||||
let mut positions = vec![];
|
||||
let remap_key_type = self
|
||||
.index
|
||||
.word_prefix_position_docids
|
||||
.remap_types::<ByteSlice, ByteSlice>()
|
||||
.prefix_iter(self.txn, &key)?
|
||||
.remap_key_type::<StrBEU16Codec>();
|
||||
for result in remap_key_type {
|
||||
let ((_, position), value) = result?;
|
||||
// filling other caches to avoid searching for them again
|
||||
self.db_cache
|
||||
.word_prefix_position_docids
|
||||
.insert((word_prefix, position), Some(value));
|
||||
positions.push(position);
|
||||
}
|
||||
entry.insert(positions.clone());
|
||||
positions
|
||||
}
|
||||
};
|
||||
Ok(positions)
|
||||
}
|
||||
}
|
||||
124
milli/src/search/new/distinct.rs
Normal file
124
milli/src/search/new/distinct.rs
Normal file
@@ -0,0 +1,124 @@
|
||||
use heed::types::{ByteSlice, Str, Unit};
|
||||
use heed::{Database, RoPrefix, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
const FID_SIZE: usize = 2;
|
||||
const DOCID_SIZE: usize = 4;
|
||||
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec,
|
||||
};
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::{Index, Result, SearchContext};
|
||||
|
||||
pub struct DistinctOutput {
|
||||
pub remaining: RoaringBitmap,
|
||||
pub excluded: RoaringBitmap,
|
||||
}
|
||||
|
||||
/// Return a [`DistinctOutput`] containing:
|
||||
/// - `remaining`: a set of docids built such that exactly one element from `candidates`
|
||||
/// is kept for each distinct value inside the given field. If the field does not exist, it
|
||||
/// is considered unique.
|
||||
/// - `excluded`: the set of document ids that contain a value for the given field that occurs
|
||||
/// in the given candidates.
|
||||
pub fn apply_distinct_rule(
|
||||
ctx: &mut SearchContext,
|
||||
field_id: u16,
|
||||
candidates: &RoaringBitmap,
|
||||
// TODO: add a universe here, such that the `excluded` are a subset of the universe?
|
||||
) -> Result<DistinctOutput> {
|
||||
let mut excluded = RoaringBitmap::new();
|
||||
let mut remaining = RoaringBitmap::new();
|
||||
for docid in candidates {
|
||||
if excluded.contains(docid) {
|
||||
continue;
|
||||
}
|
||||
distinct_single_docid(ctx.index, ctx.txn, field_id, docid, &mut excluded)?;
|
||||
remaining.push(docid);
|
||||
}
|
||||
Ok(DistinctOutput { remaining, excluded })
|
||||
}
|
||||
|
||||
/// Apply the distinct rule defined by [`apply_distinct_rule`] for a single document id.
|
||||
pub fn distinct_single_docid(
|
||||
index: &Index,
|
||||
txn: &RoTxn,
|
||||
field_id: u16,
|
||||
docid: u32,
|
||||
excluded: &mut RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
for item in facet_string_values(docid, field_id, index, txn)? {
|
||||
let ((_, _, facet_value), _) = item?;
|
||||
if let Some(facet_docids) = facet_value_docids(
|
||||
index.facet_id_string_docids.remap_types(),
|
||||
txn,
|
||||
field_id,
|
||||
facet_value,
|
||||
)? {
|
||||
*excluded |= facet_docids;
|
||||
}
|
||||
}
|
||||
for item in facet_number_values(docid, field_id, index, txn)? {
|
||||
let ((_, _, facet_value), _) = item?;
|
||||
if let Some(facet_docids) =
|
||||
facet_value_docids(index.facet_id_f64_docids.remap_types(), txn, field_id, facet_value)?
|
||||
{
|
||||
*excluded |= facet_docids;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return all the docids containing the given value in the given field
|
||||
fn facet_value_docids(
|
||||
database: Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
||||
txn: &RoTxn,
|
||||
field_id: u16,
|
||||
facet_value: &[u8],
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
database
|
||||
.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })
|
||||
.map(|opt| opt.map(|v| v.bitmap))
|
||||
}
|
||||
|
||||
/// Return an iterator over each number value in the given field of the given document.
|
||||
fn facet_number_values<'a>(
|
||||
docid: u32,
|
||||
field_id: u16,
|
||||
index: &Index,
|
||||
txn: &'a RoTxn,
|
||||
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<ByteSliceRefCodec>, Unit>> {
|
||||
let key = facet_values_prefix_key(field_id, docid);
|
||||
|
||||
let iter = index
|
||||
.field_id_docid_facet_f64s
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(txn, &key)?
|
||||
.remap_key_type();
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
||||
/// Return an iterator over each string value in the given field of the given document.
|
||||
fn facet_string_values<'a>(
|
||||
docid: u32,
|
||||
field_id: u16,
|
||||
index: &Index,
|
||||
txn: &'a RoTxn,
|
||||
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<ByteSliceRefCodec>, Str>> {
|
||||
let key = facet_values_prefix_key(field_id, docid);
|
||||
|
||||
let iter = index
|
||||
.field_id_docid_facet_strings
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(txn, &key)?
|
||||
.remap_types();
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
|
||||
concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
|
||||
}
|
||||
270
milli/src/search/new/exact_attribute.rs
Normal file
270
milli/src/search/new/exact_attribute.rs
Normal file
@@ -0,0 +1,270 @@
|
||||
use roaring::{MultiOps, RoaringBitmap};
|
||||
|
||||
use super::query_graph::QueryGraph;
|
||||
use super::ranking_rules::{RankingRule, RankingRuleOutput};
|
||||
use crate::search::new::query_graph::QueryNodeData;
|
||||
use crate::search::new::query_term::ExactTerm;
|
||||
use crate::{Result, SearchContext, SearchLogger};
|
||||
|
||||
/// A ranking rule that produces 3 disjoint buckets:
|
||||
///
|
||||
/// 1. Documents from the universe whose value is exactly the query.
|
||||
/// 2. Documents from the universe not in (1) whose value starts with the query.
|
||||
/// 3. Documents from the universe not in (1) or (2).
|
||||
pub struct ExactAttribute {
|
||||
state: State,
|
||||
}
|
||||
|
||||
impl ExactAttribute {
|
||||
pub fn new() -> Self {
|
||||
Self { state: Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
|
||||
fn id(&self) -> String {
|
||||
"exact_attribute".to_owned()
|
||||
}
|
||||
|
||||
fn start_iteration(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
universe: &roaring::RoaringBitmap,
|
||||
query: &QueryGraph,
|
||||
) -> Result<()> {
|
||||
self.state = State::start_iteration(ctx, universe, query)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn next_bucket(
|
||||
&mut self,
|
||||
_ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
universe: &roaring::RoaringBitmap,
|
||||
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
|
||||
let state = std::mem::take(&mut self.state);
|
||||
let (state, output) = State::next(state, universe);
|
||||
self.state = state;
|
||||
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
fn end_iteration(
|
||||
&mut self,
|
||||
_ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) {
|
||||
self.state = Default::default();
|
||||
}
|
||||
}
|
||||
|
||||
/// Inner state of the ranking rule.
|
||||
#[derive(Default)]
|
||||
enum State {
|
||||
/// State between two iterations
|
||||
#[default]
|
||||
Uninitialized,
|
||||
/// The next call to `next` will output the documents in the universe that have an attribute that is the exact query
|
||||
ExactAttribute(QueryGraph, Vec<FieldCandidates>),
|
||||
/// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query,
|
||||
/// but isn't the exact query.
|
||||
AttributeStarts(QueryGraph, Vec<FieldCandidates>),
|
||||
/// The next calls to `next` will output the input universe.
|
||||
Empty(QueryGraph),
|
||||
}
|
||||
|
||||
/// The candidates sorted by attributes
|
||||
///
|
||||
/// Each of the bitmap in a single `FieldCandidates` struct applies to the same field.
|
||||
struct FieldCandidates {
|
||||
/// The candidates that start with all the words of the query in the field
|
||||
start_with_exact: RoaringBitmap,
|
||||
/// The candidates that have the same number of words as the query in the field
|
||||
exact_word_count: RoaringBitmap,
|
||||
}
|
||||
|
||||
impl State {
|
||||
fn start_iteration(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
universe: &RoaringBitmap,
|
||||
query_graph: &QueryGraph,
|
||||
) -> Result<Self> {
|
||||
struct ExactTermInfo {
|
||||
exact_term: ExactTerm,
|
||||
start_position: u16,
|
||||
start_term_id: u8,
|
||||
position_count: usize,
|
||||
}
|
||||
|
||||
let mut exact_terms: Vec<ExactTermInfo> =
|
||||
Vec::with_capacity(query_graph.nodes.len() as usize);
|
||||
for (_, node) in query_graph.nodes.iter() {
|
||||
match &node.data {
|
||||
QueryNodeData::Term(term) => {
|
||||
let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
|
||||
exact_term
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
exact_terms.push(ExactTermInfo {
|
||||
exact_term,
|
||||
start_position: *term.positions.start(),
|
||||
start_term_id: *term.term_ids.start(),
|
||||
position_count: term.positions.len(),
|
||||
});
|
||||
}
|
||||
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
|
||||
}
|
||||
}
|
||||
|
||||
exact_terms.sort_by_key(|x| x.start_term_id);
|
||||
exact_terms.dedup_by_key(|x| x.start_term_id);
|
||||
let count_all_positions = exact_terms.iter().fold(0, |acc, x| acc + x.position_count);
|
||||
|
||||
// bail if there is a "hole" (missing word) in remaining query graph
|
||||
if let Some(e) = exact_terms.first() {
|
||||
if e.start_term_id != 0 {
|
||||
return Ok(State::Empty(query_graph.clone()));
|
||||
}
|
||||
} else {
|
||||
return Ok(State::Empty(query_graph.clone()));
|
||||
}
|
||||
let mut previous_id = 0;
|
||||
for e in exact_terms.iter() {
|
||||
if e.start_term_id < previous_id || e.start_term_id - previous_id > 1 {
|
||||
return Ok(State::Empty(query_graph.clone()));
|
||||
} else {
|
||||
previous_id = e.start_term_id;
|
||||
}
|
||||
}
|
||||
|
||||
// sample query: "sunflower are pretty"
|
||||
// sunflower at pos 0 in attr A
|
||||
// are at pos 1 in attr B
|
||||
// pretty at pos 2 in attr C
|
||||
// We want to eliminate such document
|
||||
|
||||
// first check that for each term, there exists some attribute that has this term at the correct position
|
||||
//"word-position-docids";
|
||||
let mut candidates = universe.clone();
|
||||
let words_positions: Vec<(Vec<_>, _)> = exact_terms
|
||||
.iter()
|
||||
.map(|e| (e.exact_term.interned_words(ctx).collect(), e.start_position))
|
||||
.collect();
|
||||
for (words, position) in &words_positions {
|
||||
if candidates.is_empty() {
|
||||
return Ok(State::Empty(query_graph.clone()));
|
||||
}
|
||||
|
||||
'words: for (offset, word) in words.iter().enumerate() {
|
||||
let offset = offset as u16;
|
||||
let word = if let Some(word) = word {
|
||||
word
|
||||
} else {
|
||||
continue 'words;
|
||||
};
|
||||
// Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of
|
||||
// longer phrases we'll be losing on precision here.
|
||||
let bucketed_position = crate::bucketed_position(position + offset);
|
||||
let word_position_docids =
|
||||
ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default()
|
||||
& universe;
|
||||
candidates &= word_position_docids;
|
||||
if candidates.is_empty() {
|
||||
return Ok(State::Empty(query_graph.clone()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let candidates = candidates;
|
||||
|
||||
if candidates.is_empty() {
|
||||
return Ok(State::Empty(query_graph.clone()));
|
||||
}
|
||||
|
||||
let searchable_fields_ids = {
|
||||
if let Some(fids) = ctx.index.searchable_fields_ids(ctx.txn)? {
|
||||
fids
|
||||
} else {
|
||||
ctx.index.fields_ids_map(ctx.txn)?.ids().collect()
|
||||
}
|
||||
};
|
||||
|
||||
let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len());
|
||||
// then check that there exists at least one attribute that has all of the terms
|
||||
for fid in searchable_fields_ids {
|
||||
let mut intersection = MultiOps::intersection(
|
||||
words_positions
|
||||
.iter()
|
||||
.flat_map(|(words, ..)| words.iter())
|
||||
// ignore stop words words in phrases
|
||||
.flatten()
|
||||
.map(|word| -> Result<_> {
|
||||
Ok(ctx.get_db_word_fid_docids(*word, fid)?.unwrap_or_default())
|
||||
}),
|
||||
)?;
|
||||
intersection &= &candidates;
|
||||
if !intersection.is_empty() {
|
||||
// TODO: although not really worth it in terms of performance,
|
||||
// if would be good to put this in cache for the sake of consistency
|
||||
let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize {
|
||||
ctx.index
|
||||
.field_id_word_count_docids
|
||||
.get(ctx.txn, &(fid, count_all_positions as u8))?
|
||||
.unwrap_or_default()
|
||||
& universe
|
||||
} else {
|
||||
RoaringBitmap::default()
|
||||
};
|
||||
candidates_per_attribute.push(FieldCandidates {
|
||||
start_with_exact: intersection,
|
||||
exact_word_count: candidates_with_exact_word_count,
|
||||
});
|
||||
}
|
||||
}
|
||||
// note we could have "false positives" where there both exist different attributes that collectively
|
||||
// have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order.
|
||||
|
||||
Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute))
|
||||
}
|
||||
|
||||
fn next(
|
||||
state: State,
|
||||
universe: &RoaringBitmap,
|
||||
) -> (State, Option<RankingRuleOutput<QueryGraph>>) {
|
||||
let (state, output) = match state {
|
||||
State::Uninitialized => (state, None),
|
||||
State::ExactAttribute(query_graph, candidates_per_attribute) => {
|
||||
let mut candidates = MultiOps::union(candidates_per_attribute.iter().map(
|
||||
|FieldCandidates { start_with_exact, exact_word_count }| {
|
||||
start_with_exact & exact_word_count
|
||||
},
|
||||
));
|
||||
candidates &= universe;
|
||||
(
|
||||
State::AttributeStarts(query_graph.clone(), candidates_per_attribute),
|
||||
Some(RankingRuleOutput { query: query_graph, candidates }),
|
||||
)
|
||||
}
|
||||
State::AttributeStarts(query_graph, candidates_per_attribute) => {
|
||||
let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map(
|
||||
|FieldCandidates { mut start_with_exact, exact_word_count }| {
|
||||
start_with_exact -= exact_word_count;
|
||||
start_with_exact
|
||||
},
|
||||
));
|
||||
candidates &= universe;
|
||||
(
|
||||
State::Empty(query_graph.clone()),
|
||||
Some(RankingRuleOutput { query: query_graph, candidates }),
|
||||
)
|
||||
}
|
||||
State::Empty(query_graph) => (
|
||||
State::Empty(query_graph.clone()),
|
||||
Some(RankingRuleOutput { query: query_graph, candidates: universe.clone() }),
|
||||
),
|
||||
};
|
||||
(state, output)
|
||||
}
|
||||
}
|
||||
272
milli/src/search/new/geo_sort.rs
Normal file
272
milli/src/search/new/geo_sort.rs
Normal file
@@ -0,0 +1,272 @@
|
||||
use std::collections::VecDeque;
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use heed::types::{ByteSlice, Unit};
|
||||
use heed::{RoPrefix, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
use rstar::RTree;
|
||||
|
||||
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
|
||||
use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec};
|
||||
use crate::{
|
||||
distance_between_two_points, lat_lng_to_xyz, GeoPoint, Index, Result, SearchContext,
|
||||
SearchLogger,
|
||||
};
|
||||
|
||||
const FID_SIZE: usize = 2;
|
||||
const DOCID_SIZE: usize = 4;
|
||||
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
|
||||
concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
|
||||
}
|
||||
|
||||
/// Return an iterator over each number value in the given field of the given document.
|
||||
fn facet_number_values<'a>(
|
||||
docid: u32,
|
||||
field_id: u16,
|
||||
index: &Index,
|
||||
txn: &'a RoTxn,
|
||||
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<OrderedF64Codec>, Unit>> {
|
||||
let key = facet_values_prefix_key(field_id, docid);
|
||||
|
||||
let iter = index
|
||||
.field_id_docid_facet_f64s
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(txn, &key)?
|
||||
.remap_key_type();
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
||||
/// Define the strategy used by the geo sort.
|
||||
/// The paramater represents the cache size, and, in the case of the Dynamic strategy,
|
||||
/// the point where we move from using the iterative strategy to the rtree.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum Strategy {
|
||||
AlwaysIterative(usize),
|
||||
AlwaysRtree(usize),
|
||||
Dynamic(usize),
|
||||
}
|
||||
|
||||
impl Default for Strategy {
|
||||
fn default() -> Self {
|
||||
Strategy::Dynamic(1000)
|
||||
}
|
||||
}
|
||||
|
||||
impl Strategy {
|
||||
pub fn use_rtree(&self, candidates: usize) -> bool {
|
||||
match self {
|
||||
Strategy::AlwaysIterative(_) => false,
|
||||
Strategy::AlwaysRtree(_) => true,
|
||||
Strategy::Dynamic(i) => candidates >= *i,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn cache_size(&self) -> usize {
|
||||
match self {
|
||||
Strategy::AlwaysIterative(i) | Strategy::AlwaysRtree(i) | Strategy::Dynamic(i) => *i,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct GeoSort<Q: RankingRuleQueryTrait> {
|
||||
query: Option<Q>,
|
||||
|
||||
strategy: Strategy,
|
||||
ascending: bool,
|
||||
point: [f64; 2],
|
||||
field_ids: Option<[u16; 2]>,
|
||||
rtree: Option<RTree<GeoPoint>>,
|
||||
|
||||
cached_sorted_docids: VecDeque<u32>,
|
||||
geo_candidates: RoaringBitmap,
|
||||
}
|
||||
|
||||
impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
|
||||
pub fn new(
|
||||
strategy: Strategy,
|
||||
geo_faceted_docids: RoaringBitmap,
|
||||
point: [f64; 2],
|
||||
ascending: bool,
|
||||
) -> Result<Self> {
|
||||
Ok(Self {
|
||||
query: None,
|
||||
strategy,
|
||||
ascending,
|
||||
point,
|
||||
geo_candidates: geo_faceted_docids,
|
||||
field_ids: None,
|
||||
rtree: None,
|
||||
cached_sorted_docids: VecDeque::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Refill the internal buffer of cached docids based on the strategy.
|
||||
/// Drop the rtree if we don't need it anymore.
|
||||
fn fill_buffer(&mut self, ctx: &mut SearchContext) -> Result<()> {
|
||||
debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng");
|
||||
debug_assert!(self.cached_sorted_docids.is_empty());
|
||||
|
||||
// lazily initialize the rtree if needed by the strategy, and cache it in `self.rtree`
|
||||
let rtree = if self.strategy.use_rtree(self.geo_candidates.len() as usize) {
|
||||
if let Some(rtree) = self.rtree.as_ref() {
|
||||
// get rtree from cache
|
||||
Some(rtree)
|
||||
} else {
|
||||
let rtree = ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree");
|
||||
// insert rtree in cache and returns it.
|
||||
// Can't use `get_or_insert_with` because getting the rtree from the DB is a fallible operation.
|
||||
Some(&*self.rtree.insert(rtree))
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let cache_size = self.strategy.cache_size();
|
||||
if let Some(rtree) = rtree {
|
||||
if self.ascending {
|
||||
let point = lat_lng_to_xyz(&self.point);
|
||||
for point in rtree.nearest_neighbor_iter(&point) {
|
||||
if self.geo_candidates.contains(point.data.0) {
|
||||
self.cached_sorted_docids.push_back(point.data.0);
|
||||
if self.cached_sorted_docids.len() >= cache_size {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// in the case of the desc geo sort we look for the closest point to the opposite of the queried point
|
||||
// and we insert the points in reverse order they get reversed when emptying the cache later on
|
||||
let point = lat_lng_to_xyz(&opposite_of(self.point));
|
||||
for point in rtree.nearest_neighbor_iter(&point) {
|
||||
if self.geo_candidates.contains(point.data.0) {
|
||||
self.cached_sorted_docids.push_front(point.data.0);
|
||||
if self.cached_sorted_docids.len() >= cache_size {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// the iterative version
|
||||
let [lat, lng] = self.field_ids.unwrap();
|
||||
|
||||
let mut documents = self
|
||||
.geo_candidates
|
||||
.iter()
|
||||
.map(|id| -> Result<_> {
|
||||
Ok((
|
||||
id,
|
||||
[
|
||||
facet_number_values(id, lat, ctx.index, ctx.txn)?
|
||||
.next()
|
||||
.expect("A geo faceted document doesn't contain any lat")?
|
||||
.0
|
||||
.2,
|
||||
facet_number_values(id, lng, ctx.index, ctx.txn)?
|
||||
.next()
|
||||
.expect("A geo faceted document doesn't contain any lng")?
|
||||
.0
|
||||
.2,
|
||||
],
|
||||
))
|
||||
})
|
||||
.collect::<Result<Vec<(u32, [f64; 2])>>>()?;
|
||||
// computing the distance between two points is expensive thus we cache the result
|
||||
documents
|
||||
.sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize);
|
||||
self.cached_sorted_docids.extend(documents.into_iter().map(|(doc_id, _)| doc_id));
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
|
||||
fn id(&self) -> String {
|
||||
"geo_sort".to_owned()
|
||||
}
|
||||
|
||||
fn start_iteration(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<Q>,
|
||||
universe: &RoaringBitmap,
|
||||
query: &Q,
|
||||
) -> Result<()> {
|
||||
assert!(self.query.is_none());
|
||||
|
||||
self.query = Some(query.clone());
|
||||
self.geo_candidates &= universe;
|
||||
|
||||
if self.geo_candidates.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let fid_map = ctx.index.fields_ids_map(ctx.txn)?;
|
||||
let lat = fid_map.id("_geo.lat").expect("geo candidates but no fid for lat");
|
||||
let lng = fid_map.id("_geo.lng").expect("geo candidates but no fid for lng");
|
||||
self.field_ids = Some([lat, lng]);
|
||||
self.fill_buffer(ctx)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::only_used_in_recursion)]
|
||||
fn next_bucket(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
logger: &mut dyn SearchLogger<Q>,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<Option<RankingRuleOutput<Q>>> {
|
||||
assert!(universe.len() > 1);
|
||||
let query = self.query.as_ref().unwrap().clone();
|
||||
self.geo_candidates &= universe;
|
||||
|
||||
if self.geo_candidates.is_empty() {
|
||||
return Ok(Some(RankingRuleOutput { query, candidates: universe.clone() }));
|
||||
}
|
||||
|
||||
let ascending = self.ascending;
|
||||
let next = |cache: &mut VecDeque<_>| {
|
||||
if ascending {
|
||||
cache.pop_front()
|
||||
} else {
|
||||
cache.pop_back()
|
||||
}
|
||||
};
|
||||
while let Some(id) = next(&mut self.cached_sorted_docids) {
|
||||
if self.geo_candidates.contains(id) {
|
||||
return Ok(Some(RankingRuleOutput {
|
||||
query,
|
||||
candidates: RoaringBitmap::from_iter([id]),
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
// if we got out of this loop it means we've exhausted our cache.
|
||||
// we need to refill it and run the function again.
|
||||
self.fill_buffer(ctx)?;
|
||||
self.next_bucket(ctx, logger, universe)
|
||||
}
|
||||
|
||||
fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) {
|
||||
// we do not reset the rtree here, it could be used in a next iteration
|
||||
self.query = None;
|
||||
self.cached_sorted_docids.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the antipodal coordinate of `coord`
|
||||
fn opposite_of(mut coord: [f64; 2]) -> [f64; 2] {
|
||||
coord[0] *= -1.;
|
||||
// in the case of x,0 we want to return x,180
|
||||
if coord[1] > 0. {
|
||||
coord[1] -= 180.;
|
||||
} else {
|
||||
coord[1] += 180.;
|
||||
}
|
||||
|
||||
coord
|
||||
}
|
||||
391
milli/src/search/new/graph_based_ranking_rule.rs
Normal file
391
milli/src/search/new/graph_based_ranking_rule.rs
Normal file
@@ -0,0 +1,391 @@
|
||||
/*! Implementation of a generic graph-based ranking rule.
|
||||
|
||||
A graph-based ranking rule is a ranking rule that works by representing
|
||||
its possible operations and their relevancy cost as a directed acyclic multi-graph
|
||||
built on top of the query graph. It then computes its buckets by finding the
|
||||
cheapest paths from the start node to the end node and computing the document ids
|
||||
that satisfy those paths.
|
||||
|
||||
For example, the proximity ranking rule builds a graph where the edges between two
|
||||
nodes represent a condition that the term of the source node is in a certain proximity
|
||||
to the term of the destination node. With the query "pretty house by" where the term
|
||||
"pretty" has three possible proximities to the term "house" and "house" has two
|
||||
proximities to "by", the graph will look like this:
|
||||
|
||||
```txt
|
||||
┌───────┐ ┌───────┐─────1────▶┌───────┐──1──▶┌─────┐ ┌───────┐
|
||||
│ START │──0─▶│pretty │─────2────▶│ house │ │ by │─0─▶│ END │
|
||||
└───────┘ └───────┘─────3────▶└───────┘──2-─▶└─────┘ └───────┘
|
||||
```
|
||||
The proximity ranking rule's first bucket will be determined by the union of all
|
||||
the shortest paths from START to END, which in this case is:
|
||||
```txt
|
||||
START --0-> pretty --1--> house --1--> by --0--> end
|
||||
```
|
||||
The path's corresponding document ids are found by taking the intersection of the
|
||||
document ids of each edge. That is, we find the documents where both `pretty` is
|
||||
1-close to `house` AND `house` is 1-close to `by`.
|
||||
|
||||
For the second bucket, we get the union of the second-cheapest paths, which are:
|
||||
```txt
|
||||
START --0-> pretty --1--> house --2--> by --0--> end
|
||||
START --0-> pretty --2--> house --1--> by --0--> end
|
||||
```
|
||||
That is we find the documents where either:
|
||||
- `pretty` is 1-close to `house` AND `house` is 2-close to `by`
|
||||
- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by`
|
||||
*/
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::ControlFlow;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::interner::{Interned, MappedInterner};
|
||||
use super::logger::SearchLogger;
|
||||
use super::query_graph::QueryNode;
|
||||
use super::ranking_rule_graph::{
|
||||
ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, FidGraph, PositionGraph, ProximityGraph,
|
||||
RankingRuleGraph, RankingRuleGraphTrait, TypoGraph,
|
||||
};
|
||||
use super::small_bitmap::SmallBitmap;
|
||||
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
|
||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||
use crate::search::new::ranking_rule_graph::PathVisitor;
|
||||
use crate::{Result, TermsMatchingStrategy};
|
||||
|
||||
pub type Proximity = GraphBasedRankingRule<ProximityGraph>;
|
||||
impl GraphBasedRankingRule<ProximityGraph> {
|
||||
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
|
||||
Self::new_with_id("proximity".to_owned(), terms_matching_strategy)
|
||||
}
|
||||
}
|
||||
pub type Fid = GraphBasedRankingRule<FidGraph>;
|
||||
impl GraphBasedRankingRule<FidGraph> {
|
||||
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
|
||||
Self::new_with_id("fid".to_owned(), terms_matching_strategy)
|
||||
}
|
||||
}
|
||||
pub type Position = GraphBasedRankingRule<PositionGraph>;
|
||||
impl GraphBasedRankingRule<PositionGraph> {
|
||||
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
|
||||
Self::new_with_id("position".to_owned(), terms_matching_strategy)
|
||||
}
|
||||
}
|
||||
pub type Typo = GraphBasedRankingRule<TypoGraph>;
|
||||
impl GraphBasedRankingRule<TypoGraph> {
|
||||
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
|
||||
Self::new_with_id("typo".to_owned(), terms_matching_strategy)
|
||||
}
|
||||
}
|
||||
pub type Exactness = GraphBasedRankingRule<ExactnessGraph>;
|
||||
impl GraphBasedRankingRule<ExactnessGraph> {
|
||||
pub fn new() -> Self {
|
||||
Self::new_with_id("exactness".to_owned(), None)
|
||||
}
|
||||
}
|
||||
|
||||
/// A generic graph-based ranking rule
|
||||
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
|
||||
id: String,
|
||||
terms_matching_strategy: Option<TermsMatchingStrategy>,
|
||||
// When the ranking rule is not iterating over its buckets,
|
||||
// its state is `None`.
|
||||
state: Option<GraphBasedRankingRuleState<G>>,
|
||||
}
|
||||
impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
|
||||
/// Creates the ranking rule with the given identifier
|
||||
pub fn new_with_id(id: String, terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
|
||||
Self { id, terms_matching_strategy, state: None }
|
||||
}
|
||||
}
|
||||
|
||||
/// The internal state of a graph-based ranking rule during iteration
|
||||
pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
|
||||
/// The current graph
|
||||
graph: RankingRuleGraph<G>,
|
||||
/// Cache to retrieve the docids associated with each edge
|
||||
conditions_cache: ConditionDocIdsCache<G>,
|
||||
/// Cache used to optimistically discard paths that resolve to no documents.
|
||||
dead_ends_cache: DeadEndsCache<G::Condition>,
|
||||
/// A structure giving the list of possible costs from each node to the end node
|
||||
all_costs: MappedInterner<QueryNode, Vec<u64>>,
|
||||
/// An index in the first element of `all_distances`, giving the cost of the next bucket
|
||||
cur_cost: u64,
|
||||
}
|
||||
|
||||
impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBasedRankingRule<G> {
|
||||
fn id(&self) -> String {
|
||||
self.id.clone()
|
||||
}
|
||||
fn start_iteration(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
_universe: &RoaringBitmap,
|
||||
query_graph: &QueryGraph,
|
||||
) -> Result<()> {
|
||||
let removal_cost = if let Some(terms_matching_strategy) = self.terms_matching_strategy {
|
||||
match terms_matching_strategy {
|
||||
TermsMatchingStrategy::Last => {
|
||||
let removal_order =
|
||||
query_graph.removal_order_for_terms_matching_strategy_last(ctx);
|
||||
let mut forbidden_nodes =
|
||||
SmallBitmap::for_interned_values_in(&query_graph.nodes);
|
||||
let mut costs = query_graph.nodes.map(|_| None);
|
||||
let mut cost = 100;
|
||||
for ns in removal_order {
|
||||
for n in ns.iter() {
|
||||
*costs.get_mut(n) = Some((cost, forbidden_nodes.clone()));
|
||||
}
|
||||
forbidden_nodes.union(&ns);
|
||||
cost += 100;
|
||||
}
|
||||
costs
|
||||
}
|
||||
TermsMatchingStrategy::All => query_graph.nodes.map(|_| None),
|
||||
}
|
||||
} else {
|
||||
query_graph.nodes.map(|_| None)
|
||||
};
|
||||
|
||||
let graph = RankingRuleGraph::build(ctx, query_graph.clone(), removal_cost)?;
|
||||
let condition_docids_cache = ConditionDocIdsCache::default();
|
||||
let dead_ends_cache = DeadEndsCache::new(&graph.conditions_interner);
|
||||
|
||||
// Then pre-compute the cost of all paths from each node to the end node
|
||||
let all_costs = graph.find_all_costs_to_end();
|
||||
|
||||
let state = GraphBasedRankingRuleState {
|
||||
graph,
|
||||
conditions_cache: condition_docids_cache,
|
||||
dead_ends_cache,
|
||||
all_costs,
|
||||
cur_cost: 0,
|
||||
};
|
||||
|
||||
self.state = Some(state);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn next_bucket(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
|
||||
// If universe.len() <= 1, the bucket sort algorithm
|
||||
// should not have called this function.
|
||||
assert!(universe.len() > 1);
|
||||
// Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`,
|
||||
// should never happen
|
||||
let mut state = self.state.take().unwrap();
|
||||
|
||||
// Retrieve the cost of the paths to compute
|
||||
let Some(&cost) = state
|
||||
.all_costs
|
||||
.get(state.graph.query_graph.root_node)
|
||||
.iter()
|
||||
.find(|c| **c >= state.cur_cost) else {
|
||||
self.state = None;
|
||||
return Ok(None);
|
||||
};
|
||||
state.cur_cost = cost + 1;
|
||||
|
||||
let mut bucket = RoaringBitmap::new();
|
||||
|
||||
let GraphBasedRankingRuleState {
|
||||
graph,
|
||||
conditions_cache: condition_docids_cache,
|
||||
dead_ends_cache,
|
||||
all_costs,
|
||||
cur_cost: _,
|
||||
} = &mut state;
|
||||
|
||||
let mut universe = universe.clone();
|
||||
|
||||
let mut used_conditions = SmallBitmap::for_interned_values_in(&graph.conditions_interner);
|
||||
let mut good_paths = vec![];
|
||||
let mut considered_paths = vec![];
|
||||
|
||||
// For each path of the given cost, we will compute its associated
|
||||
// document ids.
|
||||
// In case the path does not resolve to any document id, we try to figure out why
|
||||
// and update the `dead_ends_cache` accordingly.
|
||||
// Updating the dead_ends_cache helps speed up the execution of `visit_paths_of_cost` and reduces
|
||||
// the number of future candidate paths given by that same function.
|
||||
|
||||
let mut subpaths_docids: Vec<(Interned<G::Condition>, RoaringBitmap)> = vec![];
|
||||
|
||||
let mut nodes_with_removed_outgoing_conditions = BTreeSet::new();
|
||||
let visitor = PathVisitor::new(cost, graph, all_costs, dead_ends_cache);
|
||||
|
||||
visitor.visit_paths(&mut |path, graph, dead_ends_cache| {
|
||||
considered_paths.push(path.to_vec());
|
||||
// If the universe is empty, stop exploring the graph, since no docids will ever be found anymore.
|
||||
if universe.is_empty() {
|
||||
return Ok(ControlFlow::Break(()));
|
||||
}
|
||||
// `visit_paths` performs a depth-first search, so the previously visited path
|
||||
// is likely to share a prefix with the current one.
|
||||
// We stored the previous path and the docids associated to each of its prefixes in `subpaths_docids`.
|
||||
// We take advantage of this to avoid computing the docids associated with the common prefix between
|
||||
// the old and current path.
|
||||
let idx_of_first_different_condition = {
|
||||
let mut idx = 0;
|
||||
for (&last_c, cur_c) in path.iter().zip(subpaths_docids.iter().map(|x| x.0)) {
|
||||
if last_c == cur_c {
|
||||
idx += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
subpaths_docids.truncate(idx);
|
||||
idx
|
||||
};
|
||||
// Then for the remaining of the path, we continue computing docids.
|
||||
for latest_condition in path[idx_of_first_different_condition..].iter().copied() {
|
||||
let success = visit_path_condition(
|
||||
ctx,
|
||||
graph,
|
||||
&universe,
|
||||
dead_ends_cache,
|
||||
condition_docids_cache,
|
||||
&mut subpaths_docids,
|
||||
&mut nodes_with_removed_outgoing_conditions,
|
||||
latest_condition,
|
||||
)?;
|
||||
if !success {
|
||||
return Ok(ControlFlow::Continue(()));
|
||||
}
|
||||
}
|
||||
assert!(subpaths_docids.iter().map(|x| x.0).eq(path.iter().copied()));
|
||||
|
||||
let path_docids =
|
||||
subpaths_docids.pop().map(|x| x.1).unwrap_or_else(|| universe.clone());
|
||||
assert!(!path_docids.is_empty());
|
||||
|
||||
// Accumulate the path for logging purposes only
|
||||
good_paths.push(path.to_vec());
|
||||
for &condition in path {
|
||||
used_conditions.insert(condition);
|
||||
}
|
||||
bucket |= &path_docids;
|
||||
// Reduce the size of the universe so that we can more optimistically discard candidate paths
|
||||
universe -= &path_docids;
|
||||
for (_, docids) in subpaths_docids.iter_mut() {
|
||||
*docids -= &path_docids;
|
||||
}
|
||||
|
||||
if universe.is_empty() {
|
||||
Ok(ControlFlow::Break(()))
|
||||
} else {
|
||||
Ok(ControlFlow::Continue(()))
|
||||
}
|
||||
})?;
|
||||
logger.log_internal_state(graph);
|
||||
logger.log_internal_state(&good_paths);
|
||||
|
||||
// We modify the next query graph so that it only contains the subgraph
|
||||
// that was used to compute this bucket
|
||||
// But we only do it in case the bucket length is >1, because otherwise
|
||||
// we know the child ranking rule won't be called anyway
|
||||
|
||||
let paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>> = good_paths
|
||||
.into_iter()
|
||||
.map(|path| {
|
||||
path.into_iter()
|
||||
.map(|condition| {
|
||||
let (a, b) =
|
||||
condition_docids_cache.get_subsets_used_by_condition(condition);
|
||||
(a.clone(), b.clone())
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let next_query_graph = QueryGraph::build_from_paths(paths);
|
||||
|
||||
#[allow(clippy::comparison_chain)]
|
||||
if nodes_with_removed_outgoing_conditions.len() == 1 {
|
||||
graph.update_all_costs_before_node(
|
||||
*nodes_with_removed_outgoing_conditions.first().unwrap(),
|
||||
all_costs,
|
||||
);
|
||||
} else if nodes_with_removed_outgoing_conditions.len() > 1 {
|
||||
*all_costs = graph.find_all_costs_to_end();
|
||||
}
|
||||
|
||||
self.state = Some(state);
|
||||
|
||||
Ok(Some(RankingRuleOutput { query: next_query_graph, candidates: bucket }))
|
||||
}
|
||||
|
||||
fn end_iteration(
|
||||
&mut self,
|
||||
_ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) {
|
||||
self.state = None;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns false if the intersection between the condition
|
||||
/// docids and the previous path docids is empty.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn visit_path_condition<G: RankingRuleGraphTrait>(
|
||||
ctx: &mut SearchContext,
|
||||
graph: &mut RankingRuleGraph<G>,
|
||||
universe: &RoaringBitmap,
|
||||
dead_ends_cache: &mut DeadEndsCache<G::Condition>,
|
||||
condition_docids_cache: &mut ConditionDocIdsCache<G>,
|
||||
subpath: &mut Vec<(Interned<G::Condition>, RoaringBitmap)>,
|
||||
nodes_with_removed_outgoing_conditions: &mut BTreeSet<Interned<QueryNode>>,
|
||||
latest_condition: Interned<G::Condition>,
|
||||
) -> Result<bool> {
|
||||
let condition_docids = &condition_docids_cache
|
||||
.get_computed_condition(ctx, latest_condition, graph, universe)?
|
||||
.docids;
|
||||
if condition_docids.is_empty() {
|
||||
// 1. Store in the cache that this edge is empty for this universe
|
||||
dead_ends_cache.forbid_condition(latest_condition);
|
||||
// 2. remove all the edges with this condition from the ranking rule graph
|
||||
let source_nodes = graph.remove_edges_with_condition(latest_condition);
|
||||
nodes_with_removed_outgoing_conditions.extend(source_nodes);
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let latest_path_docids = if let Some((_, prev_docids)) = subpath.last() {
|
||||
prev_docids & condition_docids
|
||||
} else {
|
||||
condition_docids.clone()
|
||||
};
|
||||
if !latest_path_docids.is_empty() {
|
||||
subpath.push((latest_condition, latest_path_docids));
|
||||
return Ok(true);
|
||||
}
|
||||
// If the (sub)path is empty, we try to figure out why and update the caches accordingly.
|
||||
|
||||
// First, we know that this path is empty, and thus any path
|
||||
// that is a superset of it will also be empty.
|
||||
dead_ends_cache.forbid_condition_after_prefix(subpath.iter().map(|x| x.0), latest_condition);
|
||||
|
||||
if subpath.len() <= 1 {
|
||||
return Ok(false);
|
||||
}
|
||||
let mut subprefix = vec![];
|
||||
// Deadend if the intersection between this edge and any
|
||||
// previous prefix is disjoint with the universe
|
||||
// We already know that the intersection with the last one
|
||||
// is empty,
|
||||
for (past_condition, sp_docids) in subpath[..subpath.len() - 1].iter() {
|
||||
subprefix.push(*past_condition);
|
||||
if condition_docids.is_disjoint(sp_docids) {
|
||||
dead_ends_cache
|
||||
.forbid_condition_after_prefix(subprefix.iter().copied(), latest_condition);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(false)
|
||||
}
|
||||
259
milli/src/search/new/interner.rs
Normal file
259
milli/src/search/new/interner.rs
Normal file
@@ -0,0 +1,259 @@
|
||||
use std::fmt;
|
||||
use std::hash::Hash;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use fxhash::FxHashMap;
|
||||
|
||||
use super::small_bitmap::SmallBitmap;
|
||||
|
||||
/// An index within an interner ([`FixedSizeInterner`], [`DedupInterner`], or [`MappedInterner`]).
|
||||
pub struct Interned<T> {
|
||||
idx: u16,
|
||||
_phantom: PhantomData<T>,
|
||||
}
|
||||
impl<T> Interned<T> {
|
||||
/// Create an interned value manually from its raw index within the interner.
|
||||
pub fn from_raw(idx: u16) -> Self {
|
||||
Self { idx, _phantom: PhantomData }
|
||||
}
|
||||
/// Get the raw index from the interned value
|
||||
pub fn into_raw(self) -> u16 {
|
||||
self.idx
|
||||
}
|
||||
}
|
||||
|
||||
/// A [`DedupInterner`] is used to store a unique copy of a value of type `T`. This value
|
||||
/// is then identified by a lightweight index of type [`Interned<T>`], which can
|
||||
/// be copied, compared, and hashed efficiently. An immutable reference to the original value
|
||||
/// can be retrieved using `self.get(interned)`. A set of values within the interner can be
|
||||
/// efficiently managed using [`SmallBitmap<T>`](super::small_bitmap::SmallBitmap).
|
||||
///
|
||||
/// A dedup-interner can contain a maximum of `u16::MAX` values.
|
||||
#[derive(Clone)]
|
||||
pub struct DedupInterner<T> {
|
||||
stable_store: Vec<T>,
|
||||
lookup: FxHashMap<T, Interned<T>>, // TODO: Arc
|
||||
}
|
||||
impl<T> Default for DedupInterner<T> {
|
||||
fn default() -> Self {
|
||||
Self { stable_store: Default::default(), lookup: Default::default() }
|
||||
}
|
||||
}
|
||||
impl<T> DedupInterner<T> {
|
||||
/// Convert the dedup-interner into a fixed-size interner, such that new
|
||||
/// elements cannot be added to it anymore.
|
||||
pub fn freeze(self) -> FixedSizeInterner<T> {
|
||||
FixedSizeInterner { stable_store: self.stable_store }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> DedupInterner<T>
|
||||
where
|
||||
T: Clone + Eq + Hash,
|
||||
{
|
||||
/// Insert the given value into the dedup-interner, and return
|
||||
/// its index.
|
||||
pub fn insert(&mut self, s: T) -> Interned<T> {
|
||||
if let Some(interned) = self.lookup.get(&s) {
|
||||
*interned
|
||||
} else {
|
||||
assert!(self.stable_store.len() < u16::MAX as usize);
|
||||
self.stable_store.push(s.clone());
|
||||
let interned = Interned::from_raw(self.stable_store.len() as u16 - 1);
|
||||
self.lookup.insert(s, interned);
|
||||
interned
|
||||
}
|
||||
}
|
||||
/// Get a reference to the interned value.
|
||||
pub fn get(&self, interned: Interned<T>) -> &T {
|
||||
&self.stable_store[interned.idx as usize]
|
||||
}
|
||||
}
|
||||
|
||||
/// A fixed-length store for values of type `T`, where each value is identified
|
||||
/// by an index of type [`Interned<T>`].
|
||||
#[derive(Clone)]
|
||||
pub struct FixedSizeInterner<T> {
|
||||
stable_store: Vec<T>,
|
||||
}
|
||||
impl<T: Clone> FixedSizeInterner<T> {
|
||||
/// Create a fixed-size interner of the given length containing
|
||||
/// clones of the given value.
|
||||
pub fn new(length: u16, value: T) -> Self {
|
||||
Self { stable_store: vec![value; length as usize] }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> FixedSizeInterner<T> {
|
||||
pub fn from_vec(store: Vec<T>) -> Self {
|
||||
Self { stable_store: store }
|
||||
}
|
||||
pub fn all_interned_values(&self) -> SmallBitmap<T> {
|
||||
let mut b = SmallBitmap::for_interned_values_in(self);
|
||||
for i in self.indexes() {
|
||||
b.insert(i);
|
||||
}
|
||||
b
|
||||
}
|
||||
pub fn get(&self, interned: Interned<T>) -> &T {
|
||||
&self.stable_store[interned.idx as usize]
|
||||
}
|
||||
pub fn get_mut(&mut self, interned: Interned<T>) -> &mut T {
|
||||
&mut self.stable_store[interned.idx as usize]
|
||||
}
|
||||
|
||||
pub fn len(&self) -> u16 {
|
||||
self.stable_store.len() as u16
|
||||
}
|
||||
pub fn map_move<U>(self, map_f: impl Fn(T) -> U) -> FixedSizeInterner<U> {
|
||||
FixedSizeInterner { stable_store: self.stable_store.into_iter().map(map_f).collect() }
|
||||
}
|
||||
pub fn map<U>(&self, map_f: impl Fn(&T) -> U) -> MappedInterner<T, U> {
|
||||
MappedInterner {
|
||||
stable_store: self.stable_store.iter().map(map_f).collect(),
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
pub fn map_indexes<U>(&self, map_f: impl Fn(Interned<T>) -> U) -> MappedInterner<T, U> {
|
||||
MappedInterner { stable_store: self.indexes().map(map_f).collect(), _phantom: PhantomData }
|
||||
}
|
||||
pub fn indexes(&self) -> impl Iterator<Item = Interned<T>> {
|
||||
(0..self.stable_store.len()).map(|i| Interned::from_raw(i as u16))
|
||||
}
|
||||
pub fn iter(&self) -> impl Iterator<Item = (Interned<T>, &T)> {
|
||||
self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
||||
}
|
||||
pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
|
||||
self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
||||
}
|
||||
}
|
||||
|
||||
/// A fixed-length store for values of type `T`, where each value is identified
|
||||
/// by an index of type [`Interned<T>`].
|
||||
#[derive(Clone)]
|
||||
pub struct Interner<T> {
|
||||
stable_store: Vec<T>,
|
||||
}
|
||||
impl<T> Default for Interner<T> {
|
||||
fn default() -> Self {
|
||||
Self { stable_store: vec![] }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Interner<T> {
|
||||
pub fn from_vec(v: Vec<T>) -> Self {
|
||||
Self { stable_store: v }
|
||||
}
|
||||
pub fn get(&self, interned: Interned<T>) -> &T {
|
||||
&self.stable_store[interned.idx as usize]
|
||||
}
|
||||
pub fn get_mut(&mut self, interned: Interned<T>) -> &mut T {
|
||||
&mut self.stable_store[interned.idx as usize]
|
||||
}
|
||||
pub fn push(&mut self, value: T) -> Interned<T> {
|
||||
assert!(self.stable_store.len() < u16::MAX as usize);
|
||||
self.stable_store.push(value);
|
||||
Interned::from_raw(self.stable_store.len() as u16 - 1)
|
||||
}
|
||||
pub fn len(&self) -> u16 {
|
||||
self.stable_store.len() as u16
|
||||
}
|
||||
pub fn map<U>(&self, map_f: impl Fn(&T) -> U) -> MappedInterner<T, U> {
|
||||
MappedInterner {
|
||||
stable_store: self.stable_store.iter().map(map_f).collect(),
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
pub fn map_indexes<U>(&self, map_f: impl Fn(Interned<T>) -> U) -> MappedInterner<T, U> {
|
||||
MappedInterner { stable_store: self.indexes().map(map_f).collect(), _phantom: PhantomData }
|
||||
}
|
||||
pub fn indexes(&self) -> impl Iterator<Item = Interned<T>> {
|
||||
(0..self.stable_store.len()).map(|i| Interned::from_raw(i as u16))
|
||||
}
|
||||
pub fn iter(&self) -> impl Iterator<Item = (Interned<T>, &T)> {
|
||||
self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
||||
}
|
||||
pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
|
||||
self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
||||
}
|
||||
pub fn freeze(self) -> FixedSizeInterner<T> {
|
||||
FixedSizeInterner { stable_store: self.stable_store }
|
||||
}
|
||||
}
|
||||
|
||||
/// A store of values of type `T`, each linked to a value of type `From`
|
||||
/// stored in another interner. To create a mapped interner, use the
|
||||
/// `map` method on [`FixedSizeInterner`] or [`MappedInterner`].
|
||||
///
|
||||
/// Values in this interner are indexed with [`Interned<From>`].
|
||||
#[derive(Clone)]
|
||||
pub struct MappedInterner<From, T> {
|
||||
stable_store: Vec<T>,
|
||||
_phantom: PhantomData<From>,
|
||||
}
|
||||
|
||||
impl<From, T> MappedInterner<From, T> {
|
||||
pub fn get(&self, interned: Interned<From>) -> &T {
|
||||
&self.stable_store[interned.idx as usize]
|
||||
}
|
||||
pub fn get_mut(&mut self, interned: Interned<From>) -> &mut T {
|
||||
&mut self.stable_store[interned.idx as usize]
|
||||
}
|
||||
pub fn map<U>(&self, map_f: impl Fn(&T) -> U) -> MappedInterner<From, U> {
|
||||
MappedInterner {
|
||||
stable_store: self.stable_store.iter().map(map_f).collect(),
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
pub fn iter(&self) -> impl Iterator<Item = (Interned<From>, &T)> {
|
||||
self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
||||
}
|
||||
pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<From>, &mut T)> {
|
||||
self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
||||
}
|
||||
}
|
||||
// Interned<T> boilerplate implementations
|
||||
|
||||
impl<T> Hash for Interned<T> {
|
||||
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
||||
self.idx.hash(state);
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Ord for Interned<T> {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
self.idx.cmp(&other.idx)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> PartialOrd for Interned<T> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
self.idx.partial_cmp(&other.idx)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Eq for Interned<T> {}
|
||||
|
||||
impl<T> PartialEq for Interned<T> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.idx == other.idx
|
||||
}
|
||||
}
|
||||
impl<T> Clone for Interned<T> {
|
||||
fn clone(&self) -> Self {
|
||||
Self { idx: self.idx, _phantom: PhantomData }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Copy for Interned<T> {}
|
||||
|
||||
impl<T> fmt::Display for Interned<T> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::Display::fmt(&self.idx, f)
|
||||
}
|
||||
}
|
||||
impl<T> fmt::Debug for Interned<T> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::Debug::fmt(&self.idx, f)
|
||||
}
|
||||
}
|
||||
18
milli/src/search/new/limits.rs
Normal file
18
milli/src/search/new/limits.rs
Normal file
@@ -0,0 +1,18 @@
|
||||
/// Maximum number of tokens we consider in a single search.
|
||||
// TODO: Loic, find proper value here so we don't overflow the interner.
|
||||
pub const MAX_TOKEN_COUNT: usize = 1_000;
|
||||
|
||||
/// Maximum number of prefixes that can be derived from a single word.
|
||||
pub const MAX_PREFIX_COUNT: usize = 1_000;
|
||||
/// Maximum number of words that can be derived from a single word with a distance of one to that word.
|
||||
pub const MAX_ONE_TYPO_COUNT: usize = 150;
|
||||
/// Maximum number of words that can be derived from a single word with a distance of two to that word.
|
||||
pub const MAX_TWO_TYPOS_COUNT: usize = 50;
|
||||
|
||||
/// Maximum amount of synonym phrases that can be derived from a single word.
|
||||
pub const MAX_SYNONYM_PHRASE_COUNT: usize = 50;
|
||||
|
||||
/// Maximum amount of words inside of all the synonym phrases that can be derived from a single word.
|
||||
///
|
||||
/// This limit is meant to gracefully handle the case where a word would have very long phrases as synonyms.
|
||||
pub const MAX_SYNONYM_WORD_COUNT: usize = 100;
|
||||
81
milli/src/search/new/logger/mod.rs
Normal file
81
milli/src/search/new/logger/mod.rs
Normal file
@@ -0,0 +1,81 @@
|
||||
// #[cfg(test)]
|
||||
pub mod visual;
|
||||
|
||||
use std::any::Any;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::ranking_rules::BoxRankingRule;
|
||||
use super::{RankingRule, RankingRuleQueryTrait};
|
||||
|
||||
/// Trait for structure logging the execution of a search query.
|
||||
pub trait SearchLogger<Q: RankingRuleQueryTrait> {
|
||||
/// Logs the initial query
|
||||
fn initial_query(&mut self, _query: &Q);
|
||||
|
||||
/// Logs the value of the initial set of all candidates
|
||||
fn initial_universe(&mut self, _universe: &RoaringBitmap);
|
||||
|
||||
/// Logs the query that was used to compute the set of all candidates
|
||||
fn query_for_initial_universe(&mut self, _query: &Q);
|
||||
|
||||
/// Logs the ranking rules used to perform the search query
|
||||
fn ranking_rules(&mut self, _rr: &[BoxRankingRule<Q>]);
|
||||
|
||||
/// Logs the start of a ranking rule's iteration.
|
||||
fn start_iteration_ranking_rule(
|
||||
&mut self,
|
||||
_ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<Q>,
|
||||
_query: &Q,
|
||||
_universe: &RoaringBitmap,
|
||||
) {
|
||||
}
|
||||
/// Logs the end of the computation of a ranking rule bucket
|
||||
fn next_bucket_ranking_rule(
|
||||
&mut self,
|
||||
_ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<Q>,
|
||||
_universe: &RoaringBitmap,
|
||||
_candidates: &RoaringBitmap,
|
||||
) {
|
||||
}
|
||||
/// Logs the skipping of a ranking rule bucket
|
||||
fn skip_bucket_ranking_rule(
|
||||
&mut self,
|
||||
_ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<Q>,
|
||||
_candidates: &RoaringBitmap,
|
||||
) {
|
||||
}
|
||||
/// Logs the end of a ranking rule's iteration.
|
||||
fn end_iteration_ranking_rule(
|
||||
&mut self,
|
||||
_ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<Q>,
|
||||
_universe: &RoaringBitmap,
|
||||
) {
|
||||
}
|
||||
/// Logs the addition of document ids to the final results
|
||||
fn add_to_results(&mut self, _docids: &[u32]);
|
||||
|
||||
/// Logs an internal state in the search algorithms
|
||||
fn log_internal_state(&mut self, _rr: &dyn Any);
|
||||
}
|
||||
|
||||
/// A dummy [`SearchLogger`] which does nothing.
|
||||
pub struct DefaultSearchLogger;
|
||||
|
||||
impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
|
||||
fn initial_query(&mut self, _query: &Q) {}
|
||||
|
||||
fn initial_universe(&mut self, _universe: &RoaringBitmap) {}
|
||||
|
||||
fn query_for_initial_universe(&mut self, _query: &Q) {}
|
||||
|
||||
fn ranking_rules(&mut self, _rr: &[BoxRankingRule<Q>]) {}
|
||||
|
||||
fn add_to_results(&mut self, _docids: &[u32]) {}
|
||||
|
||||
fn log_internal_state(&mut self, _rr: &dyn Any) {}
|
||||
}
|
||||
557
milli/src/search/new/logger/visual.rs
Normal file
557
milli/src/search/new/logger/visual.rs
Normal file
@@ -0,0 +1,557 @@
|
||||
use std::any::Any;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::Instant;
|
||||
|
||||
// use rand::random;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::search::new::interner::Interned;
|
||||
use crate::search::new::query_graph::QueryNodeData;
|
||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||
use crate::search::new::ranking_rule_graph::{
|
||||
Edge, FidCondition, FidGraph, PositionCondition, PositionGraph, ProximityCondition,
|
||||
ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph,
|
||||
};
|
||||
use crate::search::new::ranking_rules::BoxRankingRule;
|
||||
use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger};
|
||||
use crate::Result;
|
||||
|
||||
pub enum SearchEvents {
|
||||
RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
|
||||
RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 },
|
||||
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
|
||||
RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 },
|
||||
ExtendResults { new: Vec<u32> },
|
||||
WordsGraph { query_graph: QueryGraph },
|
||||
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
|
||||
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
|
||||
TypoGraph { graph: RankingRuleGraph<TypoGraph> },
|
||||
TypoPaths { paths: Vec<Vec<Interned<TypoCondition>>> },
|
||||
FidGraph { graph: RankingRuleGraph<FidGraph> },
|
||||
FidPaths { paths: Vec<Vec<Interned<FidCondition>>> },
|
||||
PositionGraph { graph: RankingRuleGraph<PositionGraph> },
|
||||
PositionPaths { paths: Vec<Vec<Interned<PositionCondition>>> },
|
||||
}
|
||||
|
||||
enum Location {
|
||||
Words,
|
||||
Typo,
|
||||
Proximity,
|
||||
Fid,
|
||||
Position,
|
||||
Other,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct VisualSearchLogger {
|
||||
initial_query: Option<QueryGraph>,
|
||||
initial_query_time: Option<Instant>,
|
||||
query_for_universe: Option<QueryGraph>,
|
||||
initial_universe: Option<RoaringBitmap>,
|
||||
ranking_rules_ids: Option<Vec<String>>,
|
||||
events: Vec<SearchEvents>,
|
||||
location: Vec<Location>,
|
||||
}
|
||||
|
||||
impl SearchLogger<QueryGraph> for VisualSearchLogger {
|
||||
fn initial_query(&mut self, query: &QueryGraph) {
|
||||
self.initial_query = Some(query.clone());
|
||||
self.initial_query_time = Some(Instant::now());
|
||||
}
|
||||
|
||||
fn query_for_initial_universe(&mut self, query: &QueryGraph) {
|
||||
self.query_for_universe = Some(query.clone());
|
||||
}
|
||||
|
||||
fn initial_universe(&mut self, universe: &RoaringBitmap) {
|
||||
self.initial_universe = Some(universe.clone());
|
||||
}
|
||||
fn ranking_rules(&mut self, rr: &[BoxRankingRule<QueryGraph>]) {
|
||||
self.ranking_rules_ids = Some(rr.iter().map(|rr| rr.id()).collect());
|
||||
}
|
||||
|
||||
fn start_iteration_ranking_rule(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
ranking_rule: &dyn RankingRule<QueryGraph>,
|
||||
_query: &QueryGraph,
|
||||
universe: &RoaringBitmap,
|
||||
) {
|
||||
self.events.push(SearchEvents::RankingRuleStartIteration {
|
||||
ranking_rule_idx,
|
||||
universe_len: universe.len(),
|
||||
});
|
||||
self.location.push(match ranking_rule.id().as_str() {
|
||||
"words" => Location::Words,
|
||||
"typo" => Location::Typo,
|
||||
"proximity" => Location::Proximity,
|
||||
"fid" => Location::Fid,
|
||||
"position" => Location::Position,
|
||||
_ => Location::Other,
|
||||
});
|
||||
}
|
||||
|
||||
fn next_bucket_ranking_rule(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<QueryGraph>,
|
||||
universe: &RoaringBitmap,
|
||||
bucket: &RoaringBitmap,
|
||||
) {
|
||||
self.events.push(SearchEvents::RankingRuleNextBucket {
|
||||
ranking_rule_idx,
|
||||
universe_len: universe.len(),
|
||||
bucket_len: bucket.len(),
|
||||
});
|
||||
}
|
||||
fn skip_bucket_ranking_rule(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<QueryGraph>,
|
||||
bucket: &RoaringBitmap,
|
||||
) {
|
||||
self.events.push(SearchEvents::RankingRuleSkipBucket {
|
||||
ranking_rule_idx,
|
||||
bucket_len: bucket.len(),
|
||||
})
|
||||
}
|
||||
|
||||
fn end_iteration_ranking_rule(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<QueryGraph>,
|
||||
universe: &RoaringBitmap,
|
||||
) {
|
||||
self.events.push(SearchEvents::RankingRuleEndIteration {
|
||||
ranking_rule_idx,
|
||||
universe_len: universe.len(),
|
||||
});
|
||||
self.location.pop();
|
||||
}
|
||||
fn add_to_results(&mut self, docids: &[u32]) {
|
||||
self.events.push(SearchEvents::ExtendResults { new: docids.to_vec() });
|
||||
}
|
||||
|
||||
/// Logs the internal state of the ranking rule
|
||||
fn log_internal_state(&mut self, state: &dyn Any) {
|
||||
let Some(location) = self.location.last() else { return };
|
||||
match location {
|
||||
Location::Words => {
|
||||
if let Some(query_graph) = state.downcast_ref::<QueryGraph>() {
|
||||
self.events.push(SearchEvents::WordsGraph { query_graph: query_graph.clone() });
|
||||
}
|
||||
}
|
||||
Location::Typo => {
|
||||
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<TypoGraph>>() {
|
||||
self.events.push(SearchEvents::TypoGraph { graph: graph.clone() });
|
||||
}
|
||||
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<TypoCondition>>>>() {
|
||||
self.events.push(SearchEvents::TypoPaths { paths: paths.clone() });
|
||||
}
|
||||
}
|
||||
Location::Proximity => {
|
||||
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<ProximityGraph>>() {
|
||||
self.events.push(SearchEvents::ProximityGraph { graph: graph.clone() });
|
||||
}
|
||||
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<ProximityCondition>>>>()
|
||||
{
|
||||
self.events.push(SearchEvents::ProximityPaths { paths: paths.clone() });
|
||||
}
|
||||
}
|
||||
Location::Fid => {
|
||||
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<FidGraph>>() {
|
||||
self.events.push(SearchEvents::FidGraph { graph: graph.clone() });
|
||||
}
|
||||
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<FidCondition>>>>() {
|
||||
self.events.push(SearchEvents::FidPaths { paths: paths.clone() });
|
||||
}
|
||||
}
|
||||
Location::Position => {
|
||||
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<PositionGraph>>() {
|
||||
self.events.push(SearchEvents::PositionGraph { graph: graph.clone() });
|
||||
}
|
||||
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<PositionCondition>>>>() {
|
||||
self.events.push(SearchEvents::PositionPaths { paths: paths.clone() });
|
||||
}
|
||||
}
|
||||
Location::Other => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl VisualSearchLogger {
|
||||
pub fn finish<'ctx>(self, ctx: &'ctx mut SearchContext<'ctx>, folder: &Path) -> Result<()> {
|
||||
let mut f = DetailedLoggerFinish::new(ctx, folder)?;
|
||||
f.finish(self)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct DetailedLoggerFinish<'ctx> {
|
||||
ctx: &'ctx mut SearchContext<'ctx>,
|
||||
/// The folder where all the files should be printed
|
||||
folder_path: PathBuf,
|
||||
/// The main file visualising the search request
|
||||
index_file: BufWriter<File>,
|
||||
/// A vector of counters where each counter at index i represents the number of times
|
||||
/// that the ranking rule at idx i-1 was called since its last call to `start_iteration`.
|
||||
/// This is used to uniquely identify a point in the sequence diagram.
|
||||
rr_action_counter: Vec<usize>,
|
||||
/// The file storing information about the internal state of the latest active ranking rule
|
||||
file_for_internal_state: Option<BufWriter<File>>,
|
||||
}
|
||||
|
||||
impl<'ctx> DetailedLoggerFinish<'ctx> {
|
||||
fn cur_file(&mut self) -> &mut BufWriter<File> {
|
||||
if let Some(file) = self.file_for_internal_state.as_mut() {
|
||||
file
|
||||
} else {
|
||||
&mut self.index_file
|
||||
}
|
||||
}
|
||||
fn pop_rr_action(&mut self) {
|
||||
self.file_for_internal_state = None;
|
||||
self.rr_action_counter.pop();
|
||||
}
|
||||
fn push_new_rr_action(&mut self) {
|
||||
self.file_for_internal_state = None;
|
||||
self.rr_action_counter.push(0);
|
||||
}
|
||||
fn increment_cur_rr_action(&mut self) {
|
||||
self.file_for_internal_state = None;
|
||||
if let Some(c) = self.rr_action_counter.last_mut() {
|
||||
*c += 1;
|
||||
}
|
||||
}
|
||||
fn id_of_timestamp(&self) -> String {
|
||||
let mut s = String::new();
|
||||
for t in self.rr_action_counter.iter() {
|
||||
s.push_str(&format!("{t}_"));
|
||||
}
|
||||
s
|
||||
}
|
||||
fn id_of_extend_results(&self) -> String {
|
||||
let mut s = String::new();
|
||||
s.push_str("results.\"");
|
||||
s.push_str(&self.id_of_timestamp());
|
||||
s.push('"');
|
||||
s
|
||||
}
|
||||
fn id_of_last_rr_action(&self) -> String {
|
||||
let mut s = String::new();
|
||||
let rr_id = if self.rr_action_counter.is_empty() {
|
||||
"start.\"".to_owned()
|
||||
} else {
|
||||
format!("{}.\"", self.rr_action_counter.len() - 1)
|
||||
};
|
||||
s.push_str(&rr_id);
|
||||
s.push_str(&self.id_of_timestamp());
|
||||
s.push('"');
|
||||
s
|
||||
}
|
||||
fn make_new_file_for_internal_state_if_needed(&mut self) -> Result<()> {
|
||||
if self.file_for_internal_state.is_some() {
|
||||
return Ok(());
|
||||
}
|
||||
let timestamp = self.id_of_timestamp();
|
||||
let id = self.id_of_last_rr_action();
|
||||
let new_file_path = self.folder_path.join(format!("{timestamp}.d2"));
|
||||
self.file_for_internal_state = Some(BufWriter::new(File::create(new_file_path)?));
|
||||
|
||||
writeln!(
|
||||
&mut self.index_file,
|
||||
"{id} {{
|
||||
link: \"{timestamp}.d2.svg\"
|
||||
}}"
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
fn new(ctx: &'ctx mut SearchContext<'ctx>, folder_path: &Path) -> Result<Self> {
|
||||
let index_path = folder_path.join("index.d2");
|
||||
let index_file = BufWriter::new(File::create(index_path)?);
|
||||
|
||||
Ok(Self {
|
||||
ctx,
|
||||
folder_path: folder_path.to_owned(),
|
||||
index_file,
|
||||
rr_action_counter: vec![],
|
||||
file_for_internal_state: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn finish(&mut self, logger: VisualSearchLogger) -> Result<()> {
|
||||
writeln!(&mut self.index_file, "direction: right")?;
|
||||
if let Some(qg) = logger.initial_query {
|
||||
writeln!(&mut self.index_file, "Initial Query Graph: {{")?;
|
||||
self.write_query_graph(&qg)?;
|
||||
writeln!(&mut self.index_file, "}}")?;
|
||||
}
|
||||
if let Some(qg) = logger.query_for_universe {
|
||||
writeln!(&mut self.index_file, "Query Graph Used To Compute Universe: {{")?;
|
||||
self.write_query_graph(&qg)?;
|
||||
writeln!(&mut self.index_file, "}}")?;
|
||||
}
|
||||
let Some(ranking_rules_ids) = logger.ranking_rules_ids else { return Ok(()) };
|
||||
writeln!(&mut self.index_file, "Control Flow Between Ranking Rules: {{")?;
|
||||
writeln!(&mut self.index_file, "shape: sequence_diagram")?;
|
||||
writeln!(&mut self.index_file, "start")?;
|
||||
for (idx, rr_id) in ranking_rules_ids.iter().enumerate() {
|
||||
writeln!(&mut self.index_file, "{idx}: {rr_id}")?;
|
||||
}
|
||||
writeln!(&mut self.index_file, "results")?;
|
||||
for event in logger.events {
|
||||
self.write_event(event)?;
|
||||
}
|
||||
writeln!(&mut self.index_file, "}}")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_event(&mut self, e: SearchEvents) -> Result<()> {
|
||||
match e {
|
||||
SearchEvents::RankingRuleStartIteration { ranking_rule_idx, universe_len } => {
|
||||
assert!(ranking_rule_idx == self.rr_action_counter.len());
|
||||
self.write_start_iteration(universe_len)?;
|
||||
}
|
||||
SearchEvents::RankingRuleNextBucket { ranking_rule_idx, universe_len, bucket_len } => {
|
||||
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
||||
self.write_next_bucket(bucket_len, universe_len)?;
|
||||
}
|
||||
SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, bucket_len } => {
|
||||
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
||||
self.write_skip_bucket(bucket_len)?;
|
||||
}
|
||||
SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe_len: _ } => {
|
||||
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
||||
self.write_end_iteration()?;
|
||||
}
|
||||
SearchEvents::ExtendResults { new } => {
|
||||
self.write_extend_results(new)?;
|
||||
}
|
||||
SearchEvents::WordsGraph { query_graph } => self.write_words_graph(query_graph)?,
|
||||
SearchEvents::ProximityGraph { graph } => self.write_rr_graph(&graph)?,
|
||||
SearchEvents::ProximityPaths { paths } => {
|
||||
self.write_rr_graph_paths::<ProximityGraph>(paths)?;
|
||||
}
|
||||
SearchEvents::TypoGraph { graph } => self.write_rr_graph(&graph)?,
|
||||
SearchEvents::TypoPaths { paths } => {
|
||||
self.write_rr_graph_paths::<TypoGraph>(paths)?;
|
||||
}
|
||||
SearchEvents::FidGraph { graph } => self.write_rr_graph(&graph)?,
|
||||
SearchEvents::FidPaths { paths } => {
|
||||
self.write_rr_graph_paths::<FidGraph>(paths)?;
|
||||
}
|
||||
SearchEvents::PositionGraph { graph } => self.write_rr_graph(&graph)?,
|
||||
SearchEvents::PositionPaths { paths } => {
|
||||
self.write_rr_graph_paths::<PositionGraph>(paths)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
fn write_query_graph(&mut self, qg: &QueryGraph) -> Result<()> {
|
||||
writeln!(self.cur_file(), "direction: right")?;
|
||||
for (node_id, node) in qg.nodes.iter() {
|
||||
if matches!(node.data, QueryNodeData::Deleted) {
|
||||
continue;
|
||||
}
|
||||
self.write_query_node(node_id, node)?;
|
||||
|
||||
for edge in node.successors.iter() {
|
||||
writeln!(self.cur_file(), "{node_id} -> {edge};\n").unwrap();
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_start_iteration(&mut self, _universe_len: u64) -> Result<()> {
|
||||
let parent_action_id = self.id_of_last_rr_action();
|
||||
self.push_new_rr_action();
|
||||
let self_action_id = self.id_of_last_rr_action();
|
||||
writeln!(&mut self.index_file, "{parent_action_id} -> {self_action_id} : start iteration")?;
|
||||
writeln!(
|
||||
&mut self.index_file,
|
||||
"{self_action_id} {{
|
||||
style {{
|
||||
fill: \"#D8A7B1\"
|
||||
}}
|
||||
}}"
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn write_next_bucket(&mut self, bucket_len: u64, universe_len: u64) -> Result<()> {
|
||||
let cur_action_id = self.id_of_last_rr_action();
|
||||
self.increment_cur_rr_action();
|
||||
let next_action_id = self.id_of_last_rr_action();
|
||||
writeln!(
|
||||
&mut self.index_file,
|
||||
"{cur_action_id} -> {next_action_id} : next bucket {bucket_len}/{universe_len}"
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn write_skip_bucket(&mut self, bucket_len: u64) -> Result<()> {
|
||||
let cur_action_id = self.id_of_last_rr_action();
|
||||
self.increment_cur_rr_action();
|
||||
let next_action_id = self.id_of_last_rr_action();
|
||||
writeln!(
|
||||
&mut self.index_file,
|
||||
"{cur_action_id} -> {next_action_id} : skip bucket ({bucket_len})"
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn write_end_iteration(&mut self) -> Result<()> {
|
||||
let cur_action_id = self.id_of_last_rr_action();
|
||||
self.pop_rr_action();
|
||||
let parent_action_id = self.id_of_last_rr_action();
|
||||
|
||||
writeln!(&mut self.index_file, "{cur_action_id} -> {parent_action_id} : end iteration",)?;
|
||||
Ok(())
|
||||
}
|
||||
fn write_extend_results(&mut self, new: Vec<u32>) -> Result<()> {
|
||||
if new.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let cur_action_id = self.id_of_last_rr_action();
|
||||
let results_id = self.id_of_extend_results();
|
||||
let docids = new.iter().collect::<Vec<_>>();
|
||||
let len = new.len();
|
||||
|
||||
writeln!(
|
||||
&mut self.index_file,
|
||||
"{cur_action_id} -> {results_id} : \"add {len}\"
|
||||
{results_id} {{
|
||||
tooltip: \"{docids:?}\"
|
||||
style {{
|
||||
fill: \"#B6E2D3\"
|
||||
}}
|
||||
}}
|
||||
"
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_query_node(&mut self, node_idx: Interned<QueryNode>, node: &QueryNode) -> Result<()> {
|
||||
let Self {
|
||||
ctx, index_file, file_for_internal_state: active_ranking_rule_state_file, ..
|
||||
} = self;
|
||||
let file = if let Some(file) = active_ranking_rule_state_file.as_mut() {
|
||||
file
|
||||
} else {
|
||||
index_file
|
||||
};
|
||||
match &node.data {
|
||||
QueryNodeData::Term(LocatedQueryTermSubset {
|
||||
term_subset,
|
||||
positions: _,
|
||||
term_ids: _,
|
||||
}) => {
|
||||
writeln!(
|
||||
file,
|
||||
"{node_idx} : \"{}\" {{
|
||||
shape: class
|
||||
max_nbr_typo: {}",
|
||||
term_subset.description(ctx),
|
||||
term_subset.max_nbr_typos(ctx)
|
||||
)?;
|
||||
|
||||
for w in term_subset.all_single_words_except_prefix_db(ctx)? {
|
||||
let w = ctx.word_interner.get(w.interned());
|
||||
writeln!(file, "{w}: word")?;
|
||||
}
|
||||
for p in term_subset.all_phrases(ctx)? {
|
||||
writeln!(file, "{}: phrase", p.description(ctx))?;
|
||||
}
|
||||
if let Some(w) = term_subset.use_prefix_db(ctx) {
|
||||
let w = ctx.word_interner.get(w.interned());
|
||||
writeln!(file, "{w}: prefix db")?;
|
||||
}
|
||||
|
||||
writeln!(file, "}}")?;
|
||||
}
|
||||
QueryNodeData::Deleted => panic!(),
|
||||
QueryNodeData::Start => {
|
||||
writeln!(file, "{node_idx} : START")?;
|
||||
}
|
||||
QueryNodeData::End => {
|
||||
writeln!(file, "{node_idx} : END")?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
fn write_words_graph(&mut self, qg: QueryGraph) -> Result<()> {
|
||||
self.make_new_file_for_internal_state_if_needed()?;
|
||||
|
||||
self.write_query_graph(&qg)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn write_rr_graph<R: RankingRuleGraphTrait>(
|
||||
&mut self,
|
||||
graph: &RankingRuleGraph<R>,
|
||||
) -> Result<()> {
|
||||
self.make_new_file_for_internal_state_if_needed()?;
|
||||
|
||||
writeln!(self.cur_file(), "direction: right")?;
|
||||
|
||||
writeln!(self.cur_file(), "Graph {{")?;
|
||||
for (node_idx, node) in graph.query_graph.nodes.iter() {
|
||||
if matches!(&node.data, QueryNodeData::Deleted) {
|
||||
continue;
|
||||
}
|
||||
self.write_query_node(node_idx, node)?;
|
||||
}
|
||||
for (_edge_id, edge) in graph.edges_store.iter() {
|
||||
let Some(edge) = edge else { continue };
|
||||
let Edge { source_node, dest_node, condition: details, cost, nodes_to_skip: _ } = edge;
|
||||
|
||||
match &details {
|
||||
None => {
|
||||
writeln!(
|
||||
self.cur_file(),
|
||||
"{source_node} -> {dest_node} : \"always cost {cost}\"",
|
||||
)?;
|
||||
}
|
||||
Some(condition) => {
|
||||
writeln!(
|
||||
self.cur_file(),
|
||||
"{source_node} -> {dest_node} : \"{condition} cost {cost}\"",
|
||||
cost = edge.cost,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
writeln!(self.cur_file(), "}}")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_rr_graph_paths<R: RankingRuleGraphTrait>(
|
||||
&mut self,
|
||||
paths: Vec<Vec<Interned<R::Condition>>>,
|
||||
) -> Result<()> {
|
||||
self.make_new_file_for_internal_state_if_needed()?;
|
||||
let file = if let Some(file) = self.file_for_internal_state.as_mut() {
|
||||
file
|
||||
} else {
|
||||
&mut self.index_file
|
||||
};
|
||||
writeln!(file, "Path {{")?;
|
||||
for (path_idx, condition_indexes) in paths.iter().enumerate() {
|
||||
writeln!(file, "{path_idx} {{")?;
|
||||
for condition in condition_indexes.iter() {
|
||||
writeln!(file, "{condition}")?;
|
||||
}
|
||||
for couple_edges in condition_indexes.windows(2) {
|
||||
let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() };
|
||||
writeln!(file, "{src_edge_idx} -> {dest_edge_idx}")?;
|
||||
}
|
||||
writeln!(file, "}}")?;
|
||||
}
|
||||
writeln!(file, "}}")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
323
milli/src/search/new/matches/matching_words.rs
Normal file
323
milli/src/search/new/matches/matching_words.rs
Normal file
@@ -0,0 +1,323 @@
|
||||
use std::cmp::Reverse;
|
||||
use std::fmt;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use charabia::Token;
|
||||
|
||||
use super::super::interner::Interned;
|
||||
use super::super::query_term::LocatedQueryTerm;
|
||||
use super::super::{DedupInterner, Phrase};
|
||||
use crate::SearchContext;
|
||||
|
||||
pub struct LocatedMatchingPhrase {
|
||||
pub value: Interned<Phrase>,
|
||||
pub positions: RangeInclusive<WordId>,
|
||||
}
|
||||
|
||||
pub struct LocatedMatchingWords {
|
||||
pub value: Vec<Interned<String>>,
|
||||
pub positions: RangeInclusive<WordId>,
|
||||
pub is_prefix: bool,
|
||||
pub original_char_count: usize,
|
||||
}
|
||||
|
||||
/// Structure created from a query tree
|
||||
/// referencing words that match the given query tree.
|
||||
#[derive(Default)]
|
||||
pub struct MatchingWords {
|
||||
word_interner: DedupInterner<String>,
|
||||
phrase_interner: DedupInterner<Phrase>,
|
||||
phrases: Vec<LocatedMatchingPhrase>,
|
||||
words: Vec<LocatedMatchingWords>,
|
||||
}
|
||||
|
||||
impl MatchingWords {
|
||||
pub fn new(ctx: SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {
|
||||
let mut phrases = Vec::new();
|
||||
let mut words = Vec::new();
|
||||
|
||||
// Extract and centralize the different phrases and words to match stored in a QueryTerm
|
||||
// and wrap them in dedicated structures.
|
||||
for located_term in located_terms {
|
||||
let term = ctx.term_interner.get(located_term.value);
|
||||
let (matching_words, matching_phrases) = term.all_computed_derivations();
|
||||
|
||||
for matching_phrase in matching_phrases {
|
||||
phrases.push(LocatedMatchingPhrase {
|
||||
value: matching_phrase,
|
||||
positions: located_term.positions.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
words.push(LocatedMatchingWords {
|
||||
value: matching_words,
|
||||
positions: located_term.positions.clone(),
|
||||
is_prefix: term.is_cached_prefix(),
|
||||
original_char_count: term.original_word(&ctx).chars().count(),
|
||||
});
|
||||
}
|
||||
|
||||
// Sort word to put prefixes at the bottom prioritizing the exact matches.
|
||||
words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));
|
||||
|
||||
Self {
|
||||
phrases,
|
||||
words,
|
||||
word_interner: ctx.word_interner,
|
||||
phrase_interner: ctx.phrase_interner,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator over terms that match or partially match the given token.
|
||||
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
|
||||
MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
|
||||
}
|
||||
|
||||
/// Try to match the token with one of the located_words.
|
||||
fn match_unique_words<'a>(&'a self, token: &Token) -> Option<MatchType<'a>> {
|
||||
for located_words in &self.words {
|
||||
for word in &located_words.value {
|
||||
let word = self.word_interner.get(*word);
|
||||
// if the word is a prefix we match using starts_with.
|
||||
if located_words.is_prefix && token.lemma().starts_with(word) {
|
||||
let Some((char_index, c)) = word.char_indices().take(located_words.original_char_count).last() else {
|
||||
continue;
|
||||
};
|
||||
let prefix_length = char_index + c.len_utf8();
|
||||
let char_len = token.original_lengths(prefix_length).0;
|
||||
let ids = &located_words.positions;
|
||||
return Some(MatchType::Full { char_len, ids });
|
||||
// else we exact match the token.
|
||||
} else if token.lemma() == word {
|
||||
let char_len = token.char_end - token.char_start;
|
||||
let ids = &located_words.positions;
|
||||
return Some(MatchType::Full { char_len, ids });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterator over terms that match the given token,
|
||||
/// This allow to lazily evaluate matches.
|
||||
pub struct MatchesIter<'a, 'b> {
|
||||
matching_words: &'a MatchingWords,
|
||||
phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
|
||||
token: &'b Token<'b>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for MatchesIter<'a, '_> {
|
||||
type Item = MatchType<'a>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.phrases.next() {
|
||||
// Try to match all the phrases first.
|
||||
Some(located_phrase) => {
|
||||
let phrase = self.matching_words.phrase_interner.get(located_phrase.value);
|
||||
|
||||
// create a PartialMatch struct to make it compute the first match
|
||||
// instead of duplicating the code.
|
||||
let ids = &located_phrase.positions;
|
||||
// collect the references of words from the interner.
|
||||
let words = phrase
|
||||
.words
|
||||
.iter()
|
||||
.map(|word| {
|
||||
word.map(|word| self.matching_words.word_interner.get(word).as_str())
|
||||
})
|
||||
.collect();
|
||||
let partial = PartialMatch { matching_words: words, ids, char_len: 0 };
|
||||
|
||||
partial.match_token(self.token).or_else(|| self.next())
|
||||
}
|
||||
// If no phrases matches, try to match uiques words.
|
||||
None => self.matching_words.match_unique_words(self.token),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Id of a matching term corespounding to a word written by the end user.
|
||||
pub type WordId = u16;
|
||||
|
||||
/// A given token can partially match a query word for several reasons:
|
||||
/// - split words
|
||||
/// - multi-word synonyms
|
||||
/// In these cases we need to match consecutively several tokens to consider that the match is full.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum MatchType<'a> {
|
||||
Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
|
||||
Partial(PartialMatch<'a>),
|
||||
}
|
||||
|
||||
/// Structure helper to match several tokens in a row in order to complete a partial match.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct PartialMatch<'a> {
|
||||
matching_words: Vec<Option<&'a str>>,
|
||||
ids: &'a RangeInclusive<WordId>,
|
||||
char_len: usize,
|
||||
}
|
||||
|
||||
impl<'a> PartialMatch<'a> {
|
||||
/// Returns:
|
||||
/// - None if the given token breaks the partial match
|
||||
/// - Partial if the given token matches the partial match but doesn't complete it
|
||||
/// - Full if the given token completes the partial match
|
||||
pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
|
||||
let Self { mut matching_words, ids, .. } = self;
|
||||
|
||||
let is_matching = match matching_words.first()? {
|
||||
Some(word) => &token.lemma() == word,
|
||||
// a None value in the phrase corresponds to a stop word,
|
||||
// the walue is considered a match if the current token is categorized as a stop word.
|
||||
None => token.is_stopword(),
|
||||
};
|
||||
|
||||
let char_len = token.char_end - token.char_start;
|
||||
// if there are remaining words to match in the phrase and the current token is matching,
|
||||
// return a new Partial match allowing the highlighter to continue.
|
||||
if is_matching && matching_words.len() > 1 {
|
||||
matching_words.remove(0);
|
||||
Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))
|
||||
// if there is no remaining word to match in the phrase and the current token is matching,
|
||||
// return a Full match.
|
||||
} else if is_matching {
|
||||
Some(MatchType::Full { char_len, ids })
|
||||
// if the current token doesn't match, return None to break the match sequence.
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn char_len(&self) -> usize {
|
||||
self.char_len
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for MatchingWords {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let MatchingWords { word_interner, phrase_interner, phrases, words } = self;
|
||||
|
||||
let phrases: Vec<_> = phrases
|
||||
.iter()
|
||||
.map(|p| {
|
||||
(
|
||||
phrase_interner
|
||||
.get(p.value)
|
||||
.words
|
||||
.iter()
|
||||
.map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
|
||||
.collect::<Vec<_>>()
|
||||
.join(" "),
|
||||
p.positions.clone(),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let words: Vec<_> = words
|
||||
.iter()
|
||||
.flat_map(|w| {
|
||||
w.value
|
||||
.iter()
|
||||
.map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix))
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.collect();
|
||||
|
||||
f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use std::borrow::Cow;
|
||||
|
||||
use charabia::{TokenKind, TokenizerBuilder};
|
||||
|
||||
use super::super::super::located_query_terms_from_tokens;
|
||||
use super::*;
|
||||
use crate::index::tests::TempIndex;
|
||||
|
||||
pub(crate) fn temp_index_with_documents() -> TempIndex {
|
||||
let temp_index = TempIndex::new();
|
||||
temp_index
|
||||
.add_documents(documents!([
|
||||
{ "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
|
||||
]))
|
||||
.unwrap();
|
||||
temp_index
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn matching_words() {
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let mut ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let tokenizer = TokenizerBuilder::new().build();
|
||||
let tokens = tokenizer.tokenize("split this world");
|
||||
let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
|
||||
let matching_words = MatchingWords::new(ctx, query_terms);
|
||||
|
||||
assert_eq!(
|
||||
matching_words
|
||||
.match_token(&Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("split"),
|
||||
char_end: "split".chars().count(),
|
||||
byte_end: "split".len(),
|
||||
..Default::default()
|
||||
})
|
||||
.next(),
|
||||
Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
|
||||
);
|
||||
assert_eq!(
|
||||
matching_words
|
||||
.match_token(&Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("nyc"),
|
||||
char_end: "nyc".chars().count(),
|
||||
byte_end: "nyc".len(),
|
||||
..Default::default()
|
||||
})
|
||||
.next(),
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
matching_words
|
||||
.match_token(&Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("world"),
|
||||
char_end: "world".chars().count(),
|
||||
byte_end: "world".len(),
|
||||
..Default::default()
|
||||
})
|
||||
.next(),
|
||||
Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
|
||||
);
|
||||
assert_eq!(
|
||||
matching_words
|
||||
.match_token(&Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("worlded"),
|
||||
char_end: "worlded".chars().count(),
|
||||
byte_end: "worlded".len(),
|
||||
..Default::default()
|
||||
})
|
||||
.next(),
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
matching_words
|
||||
.match_token(&Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("thisnew"),
|
||||
char_end: "thisnew".chars().count(),
|
||||
byte_end: "thisnew".len(),
|
||||
..Default::default()
|
||||
})
|
||||
.next(),
|
||||
None
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,8 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use charabia::{SeparatorKind, Token, Tokenizer};
|
||||
use matching_words::{MatchType, PartialMatch, PrimitiveWordId};
|
||||
pub use matching_words::{MatchingWord, MatchingWords};
|
||||
pub use matching_words::MatchingWords;
|
||||
use matching_words::{MatchType, PartialMatch, WordId};
|
||||
use serde::Serialize;
|
||||
|
||||
pub mod matching_words;
|
||||
@@ -88,7 +88,7 @@ impl FormatOptions {
|
||||
pub struct Match {
|
||||
match_len: usize,
|
||||
// ids of the query words that matches.
|
||||
ids: Vec<PrimitiveWordId>,
|
||||
ids: Vec<WordId>,
|
||||
// position of the word in the whole text.
|
||||
word_position: usize,
|
||||
// position of the token in the whole text.
|
||||
@@ -137,11 +137,12 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
||||
}
|
||||
// partial match is now full, we keep this matches and we advance positions
|
||||
Some(MatchType::Full { char_len, ids }) => {
|
||||
let ids: Vec<_> = ids.clone().collect();
|
||||
// save previously matched tokens as matches.
|
||||
let iter = potential_matches.into_iter().map(
|
||||
|(token_position, word_position, match_len)| Match {
|
||||
match_len,
|
||||
ids: ids.to_vec(),
|
||||
ids: ids.clone(),
|
||||
word_position,
|
||||
token_position,
|
||||
},
|
||||
@@ -151,7 +152,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
||||
// save the token that closes the partial match as a match.
|
||||
matches.push(Match {
|
||||
match_len: char_len,
|
||||
ids: ids.to_vec(),
|
||||
ids,
|
||||
word_position,
|
||||
token_position,
|
||||
});
|
||||
@@ -191,9 +192,10 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
||||
// we match, we save the current token as a match,
|
||||
// then we continue the rest of the tokens.
|
||||
MatchType::Full { char_len, ids } => {
|
||||
let ids: Vec<_> = ids.clone().collect();
|
||||
matches.push(Match {
|
||||
match_len: char_len,
|
||||
ids: ids.to_vec(),
|
||||
ids,
|
||||
word_position,
|
||||
token_position,
|
||||
});
|
||||
@@ -334,7 +336,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
||||
/// 2) calculate distance between matches
|
||||
/// 3) count ordered matches
|
||||
fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
|
||||
let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
|
||||
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
|
||||
let mut order_score = 0;
|
||||
let mut distance_score = 0;
|
||||
|
||||
@@ -494,39 +496,29 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::rc::Rc;
|
||||
|
||||
use charabia::TokenizerBuilder;
|
||||
use matching_words::tests::temp_index_with_documents;
|
||||
|
||||
use super::super::located_query_terms_from_tokens;
|
||||
use super::*;
|
||||
use crate::search::matches::matching_words::MatchingWord;
|
||||
use crate::SearchContext;
|
||||
|
||||
fn matching_words() -> MatchingWords {
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
|
||||
];
|
||||
let matching_words = vec![
|
||||
(vec![all[0].clone()], vec![0]),
|
||||
(vec![all[1].clone()], vec![1]),
|
||||
(vec![all[2].clone()], vec![2]),
|
||||
];
|
||||
|
||||
MatchingWords::new(matching_words).unwrap()
|
||||
}
|
||||
|
||||
impl MatcherBuilder<'_, Vec<u8>> {
|
||||
pub fn from_matching_words(matching_words: MatchingWords) -> Self {
|
||||
Self::new(matching_words, TokenizerBuilder::default().build())
|
||||
impl<'a> MatcherBuilder<'a, &[u8]> {
|
||||
pub fn new_test(mut ctx: SearchContext, query: &'a str) -> Self {
|
||||
let tokenizer = TokenizerBuilder::new().build();
|
||||
let tokens = tokenizer.tokenize(query);
|
||||
let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
|
||||
let matching_words = MatchingWords::new(ctx, query_terms);
|
||||
Self::new(matching_words, TokenizerBuilder::new().build())
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn format_identity() {
|
||||
let matching_words = matching_words();
|
||||
|
||||
let builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "split the world");
|
||||
|
||||
let format_options = FormatOptions { highlight: false, crop: None };
|
||||
|
||||
@@ -551,9 +543,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn format_highlight() {
|
||||
let matching_words = matching_words();
|
||||
|
||||
let builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "split the world");
|
||||
|
||||
let format_options = FormatOptions { highlight: true, crop: None };
|
||||
|
||||
@@ -594,16 +587,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn highlight_unicode() {
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()),
|
||||
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
|
||||
];
|
||||
let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words).unwrap();
|
||||
|
||||
let builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "world");
|
||||
let format_options = FormatOptions { highlight: true, crop: None };
|
||||
|
||||
// Text containing prefix match.
|
||||
@@ -612,7 +599,7 @@ mod tests {
|
||||
// no crop should return complete text with highlighted matches.
|
||||
insta::assert_snapshot!(
|
||||
matcher.format(format_options),
|
||||
@"<em>Ŵôřlḑ</em>ôle"
|
||||
@"<em>Ŵôřlḑôle</em>"
|
||||
);
|
||||
|
||||
// Text containing unicode match.
|
||||
@@ -624,21 +611,26 @@ mod tests {
|
||||
@"<em>Ŵôřlḑ</em>"
|
||||
);
|
||||
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "westfali");
|
||||
let format_options = FormatOptions { highlight: true, crop: None };
|
||||
|
||||
// Text containing unicode match.
|
||||
let text = "Westfália";
|
||||
let mut matcher = builder.build(text);
|
||||
// no crop should return complete text with highlighted matches.
|
||||
insta::assert_snapshot!(
|
||||
matcher.format(format_options),
|
||||
@"<em>Westfáli</em>a"
|
||||
@"<em>Westfália</em>"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn format_crop() {
|
||||
let matching_words = matching_words();
|
||||
|
||||
let builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "split the world");
|
||||
|
||||
let format_options = FormatOptions { highlight: false, crop: Some(10) };
|
||||
|
||||
@@ -733,9 +725,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn format_highlight_crop() {
|
||||
let matching_words = matching_words();
|
||||
|
||||
let builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "split the world");
|
||||
|
||||
let format_options = FormatOptions { highlight: true, crop: Some(10) };
|
||||
|
||||
@@ -795,9 +788,10 @@ mod tests {
|
||||
#[test]
|
||||
fn smaller_crop_size() {
|
||||
//! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
|
||||
let matching_words = matching_words();
|
||||
|
||||
let builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let builder = MatcherBuilder::new_test(ctx, "split the world");
|
||||
|
||||
let text = "void void split the world void void.";
|
||||
|
||||
@@ -831,25 +825,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn partial_matches() {
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()),
|
||||
Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()),
|
||||
];
|
||||
let matching_words = vec![
|
||||
(vec![all[0].clone()], vec![0]),
|
||||
(vec![all[1].clone(), all[2].clone()], vec![0]),
|
||||
(vec![all[3].clone()], vec![1]),
|
||||
(vec![all[4].clone(), all[5].clone()], vec![1]),
|
||||
(vec![all[4].clone()], vec![2]),
|
||||
];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words).unwrap();
|
||||
|
||||
let mut builder = MatcherBuilder::from_matching_words(matching_words);
|
||||
let temp_index = temp_index_with_documents();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let ctx = SearchContext::new(&temp_index, &rtxn);
|
||||
let mut builder = MatcherBuilder::new_test(ctx, "the \"t he\" door \"do or\"");
|
||||
builder.highlight_prefix("_".to_string());
|
||||
builder.highlight_suffix("_".to_string());
|
||||
|
||||
@@ -859,7 +838,7 @@ mod tests {
|
||||
let mut matcher = builder.build(text);
|
||||
insta::assert_snapshot!(
|
||||
matcher.format(format_options),
|
||||
@"_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_"
|
||||
@"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
|
||||
);
|
||||
}
|
||||
}
|
||||
493
milli/src/search/new/mod.rs
Normal file
493
milli/src/search/new/mod.rs
Normal file
@@ -0,0 +1,493 @@
|
||||
mod bucket_sort;
|
||||
mod db_cache;
|
||||
mod distinct;
|
||||
mod geo_sort;
|
||||
mod graph_based_ranking_rule;
|
||||
mod interner;
|
||||
mod limits;
|
||||
mod logger;
|
||||
pub mod matches;
|
||||
mod query_graph;
|
||||
mod query_term;
|
||||
mod ranking_rule_graph;
|
||||
mod ranking_rules;
|
||||
mod resolve_query_graph;
|
||||
mod small_bitmap;
|
||||
|
||||
mod exact_attribute;
|
||||
// TODO: documentation + comments
|
||||
// implementation is currently an adaptation of the previous implementation to fit with the new model
|
||||
mod sort;
|
||||
// TODO: documentation + comments
|
||||
mod words;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use std::collections::HashSet;
|
||||
|
||||
use bucket_sort::{bucket_sort, BucketSortOutput};
|
||||
use charabia::TokenizerBuilder;
|
||||
use db_cache::DatabaseCache;
|
||||
use exact_attribute::ExactAttribute;
|
||||
use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo};
|
||||
use heed::RoTxn;
|
||||
use interner::{DedupInterner, Interner};
|
||||
pub use logger::visual::VisualSearchLogger;
|
||||
pub use logger::{DefaultSearchLogger, SearchLogger};
|
||||
use query_graph::{QueryGraph, QueryNode};
|
||||
use query_term::{located_query_terms_from_tokens, LocatedQueryTerm, Phrase, QueryTerm};
|
||||
use ranking_rules::{
|
||||
BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait,
|
||||
};
|
||||
use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
|
||||
use roaring::RoaringBitmap;
|
||||
use sort::Sort;
|
||||
use words::Words;
|
||||
|
||||
use self::geo_sort::GeoSort;
|
||||
pub use self::geo_sort::Strategy as GeoSortStrategy;
|
||||
use self::interner::Interned;
|
||||
use crate::search::new::distinct::apply_distinct_rule;
|
||||
use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
|
||||
|
||||
/// A structure used throughout the execution of a search query.
|
||||
pub struct SearchContext<'ctx> {
|
||||
pub index: &'ctx Index,
|
||||
pub txn: &'ctx RoTxn<'ctx>,
|
||||
pub db_cache: DatabaseCache<'ctx>,
|
||||
pub word_interner: DedupInterner<String>,
|
||||
pub phrase_interner: DedupInterner<Phrase>,
|
||||
pub term_interner: Interner<QueryTerm>,
|
||||
pub phrase_docids: PhraseDocIdsCache,
|
||||
}
|
||||
|
||||
impl<'ctx> SearchContext<'ctx> {
|
||||
pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self {
|
||||
Self {
|
||||
index,
|
||||
txn,
|
||||
db_cache: <_>::default(),
|
||||
word_interner: <_>::default(),
|
||||
phrase_interner: <_>::default(),
|
||||
term_interner: <_>::default(),
|
||||
phrase_docids: <_>::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, PartialOrd, Ord, Eq)]
|
||||
pub enum Word {
|
||||
Original(Interned<String>),
|
||||
Derived(Interned<String>),
|
||||
}
|
||||
|
||||
impl Word {
|
||||
pub fn interned(&self) -> Interned<String> {
|
||||
match self {
|
||||
Word::Original(word) => *word,
|
||||
Word::Derived(word) => *word,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it.
|
||||
fn resolve_maximally_reduced_query_graph(
|
||||
ctx: &mut SearchContext,
|
||||
universe: &RoaringBitmap,
|
||||
query_graph: &QueryGraph,
|
||||
matching_strategy: TermsMatchingStrategy,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut graph = query_graph.clone();
|
||||
|
||||
let nodes_to_remove = match matching_strategy {
|
||||
TermsMatchingStrategy::Last => query_graph
|
||||
.removal_order_for_terms_matching_strategy_last(ctx)
|
||||
.iter()
|
||||
.flat_map(|x| x.iter())
|
||||
.collect(),
|
||||
TermsMatchingStrategy::All => vec![],
|
||||
};
|
||||
graph.remove_nodes_keep_edges(&nodes_to_remove);
|
||||
|
||||
logger.query_for_initial_universe(&graph);
|
||||
let docids = compute_query_graph_docids(ctx, &graph, universe)?;
|
||||
|
||||
Ok(docids)
|
||||
}
|
||||
|
||||
fn resolve_universe(
|
||||
ctx: &mut SearchContext,
|
||||
initial_universe: &RoaringBitmap,
|
||||
query_graph: &QueryGraph,
|
||||
matching_strategy: TermsMatchingStrategy,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) -> Result<RoaringBitmap> {
|
||||
resolve_maximally_reduced_query_graph(
|
||||
ctx,
|
||||
initial_universe,
|
||||
query_graph,
|
||||
matching_strategy,
|
||||
logger,
|
||||
)
|
||||
}
|
||||
|
||||
/// Return the list of initialised ranking rules to be used for a placeholder search.
|
||||
fn get_ranking_rules_for_placeholder_search<'ctx>(
|
||||
ctx: &SearchContext<'ctx>,
|
||||
sort_criteria: &Option<Vec<AscDesc>>,
|
||||
geo_strategy: geo_sort::Strategy,
|
||||
) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> {
|
||||
let mut sort = false;
|
||||
let mut sorted_fields = HashSet::new();
|
||||
let mut geo_sorted = false;
|
||||
let mut ranking_rules: Vec<BoxRankingRule<PlaceholderQuery>> = vec![];
|
||||
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
|
||||
for rr in settings_ranking_rules {
|
||||
match rr {
|
||||
// These rules need a query to have an effect; ignore them in placeholder search
|
||||
crate::Criterion::Words
|
||||
| crate::Criterion::Typo
|
||||
| crate::Criterion::Attribute
|
||||
| crate::Criterion::Proximity
|
||||
| crate::Criterion::Exactness => continue,
|
||||
crate::Criterion::Sort => {
|
||||
if sort {
|
||||
continue;
|
||||
}
|
||||
resolve_sort_criteria(
|
||||
sort_criteria,
|
||||
ctx,
|
||||
&mut ranking_rules,
|
||||
&mut sorted_fields,
|
||||
&mut geo_sorted,
|
||||
geo_strategy,
|
||||
)?;
|
||||
sort = true;
|
||||
}
|
||||
crate::Criterion::Asc(field_name) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
|
||||
}
|
||||
crate::Criterion::Desc(field_name) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ranking_rules)
|
||||
}
|
||||
|
||||
/// Return the list of initialised ranking rules to be used for a query graph search.
|
||||
fn get_ranking_rules_for_query_graph_search<'ctx>(
|
||||
ctx: &SearchContext<'ctx>,
|
||||
sort_criteria: &Option<Vec<AscDesc>>,
|
||||
geo_strategy: geo_sort::Strategy,
|
||||
terms_matching_strategy: TermsMatchingStrategy,
|
||||
) -> Result<Vec<BoxRankingRule<'ctx, QueryGraph>>> {
|
||||
// query graph search
|
||||
let mut words = false;
|
||||
let mut typo = false;
|
||||
let mut proximity = false;
|
||||
let mut sort = false;
|
||||
let mut attribute = false;
|
||||
let mut exactness = false;
|
||||
let mut sorted_fields = HashSet::new();
|
||||
let mut geo_sorted = false;
|
||||
|
||||
let mut ranking_rules: Vec<BoxRankingRule<QueryGraph>> = vec![];
|
||||
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
|
||||
for rr in settings_ranking_rules {
|
||||
// Add Words before any of: typo, proximity, attribute
|
||||
match rr {
|
||||
crate::Criterion::Typo
|
||||
| crate::Criterion::Attribute
|
||||
| crate::Criterion::Proximity
|
||||
| crate::Criterion::Exactness => {
|
||||
if !words {
|
||||
ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
|
||||
words = true;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
match rr {
|
||||
crate::Criterion::Words => {
|
||||
if words {
|
||||
continue;
|
||||
}
|
||||
ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
|
||||
words = true;
|
||||
}
|
||||
crate::Criterion::Typo => {
|
||||
if typo {
|
||||
continue;
|
||||
}
|
||||
typo = true;
|
||||
ranking_rules.push(Box::new(Typo::new(None)));
|
||||
}
|
||||
crate::Criterion::Proximity => {
|
||||
if proximity {
|
||||
continue;
|
||||
}
|
||||
proximity = true;
|
||||
ranking_rules.push(Box::new(Proximity::new(None)));
|
||||
}
|
||||
crate::Criterion::Attribute => {
|
||||
if attribute {
|
||||
continue;
|
||||
}
|
||||
attribute = true;
|
||||
ranking_rules.push(Box::new(Fid::new(None)));
|
||||
ranking_rules.push(Box::new(Position::new(None)));
|
||||
}
|
||||
crate::Criterion::Sort => {
|
||||
if sort {
|
||||
continue;
|
||||
}
|
||||
resolve_sort_criteria(
|
||||
sort_criteria,
|
||||
ctx,
|
||||
&mut ranking_rules,
|
||||
&mut sorted_fields,
|
||||
&mut geo_sorted,
|
||||
geo_strategy,
|
||||
)?;
|
||||
sort = true;
|
||||
}
|
||||
crate::Criterion::Exactness => {
|
||||
if exactness {
|
||||
continue;
|
||||
}
|
||||
ranking_rules.push(Box::new(ExactAttribute::new()));
|
||||
ranking_rules.push(Box::new(Exactness::new()));
|
||||
exactness = true;
|
||||
}
|
||||
crate::Criterion::Asc(field_name) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
|
||||
}
|
||||
crate::Criterion::Desc(field_name) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ranking_rules)
|
||||
}
|
||||
|
||||
fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
|
||||
sort_criteria: &Option<Vec<AscDesc>>,
|
||||
ctx: &SearchContext<'ctx>,
|
||||
ranking_rules: &mut Vec<BoxRankingRule<'ctx, Query>>,
|
||||
sorted_fields: &mut HashSet<String>,
|
||||
geo_sorted: &mut bool,
|
||||
geo_strategy: geo_sort::Strategy,
|
||||
) -> Result<()> {
|
||||
let sort_criteria = sort_criteria.clone().unwrap_or_default();
|
||||
ranking_rules.reserve(sort_criteria.len());
|
||||
for criterion in sort_criteria {
|
||||
match criterion {
|
||||
AscDesc::Asc(Member::Field(field_name)) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
|
||||
}
|
||||
AscDesc::Desc(Member::Field(field_name)) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
|
||||
}
|
||||
AscDesc::Asc(Member::Geo(point)) => {
|
||||
if *geo_sorted {
|
||||
continue;
|
||||
}
|
||||
let geo_faceted_docids = ctx.index.geo_faceted_documents_ids(ctx.txn)?;
|
||||
ranking_rules.push(Box::new(GeoSort::new(
|
||||
geo_strategy,
|
||||
geo_faceted_docids,
|
||||
point,
|
||||
true,
|
||||
)?));
|
||||
}
|
||||
AscDesc::Desc(Member::Geo(point)) => {
|
||||
if *geo_sorted {
|
||||
continue;
|
||||
}
|
||||
let geo_faceted_docids = ctx.index.geo_faceted_documents_ids(ctx.txn)?;
|
||||
ranking_rules.push(Box::new(GeoSort::new(
|
||||
geo_strategy,
|
||||
geo_faceted_docids,
|
||||
point,
|
||||
false,
|
||||
)?));
|
||||
}
|
||||
};
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn execute_search(
|
||||
ctx: &mut SearchContext,
|
||||
query: &Option<String>,
|
||||
terms_matching_strategy: TermsMatchingStrategy,
|
||||
exhaustive_number_hits: bool,
|
||||
filters: &Option<Filter>,
|
||||
sort_criteria: &Option<Vec<AscDesc>>,
|
||||
geo_strategy: geo_sort::Strategy,
|
||||
from: usize,
|
||||
length: usize,
|
||||
words_limit: Option<usize>,
|
||||
placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
|
||||
query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) -> Result<PartialSearchResult> {
|
||||
let mut universe = if let Some(filters) = filters {
|
||||
filters.evaluate(ctx.txn, ctx.index)?
|
||||
} else {
|
||||
ctx.index.documents_ids(ctx.txn)?
|
||||
};
|
||||
|
||||
check_sort_criteria(ctx, sort_criteria.as_ref())?;
|
||||
|
||||
let mut located_query_terms = None;
|
||||
|
||||
let query_terms = if let Some(query) = query {
|
||||
// We make sure that the analyzer is aware of the stop words
|
||||
// this ensures that the query builder is able to properly remove them.
|
||||
let mut tokbuilder = TokenizerBuilder::new();
|
||||
let stop_words = ctx.index.stop_words(ctx.txn)?;
|
||||
if let Some(ref stop_words) = stop_words {
|
||||
tokbuilder.stop_words(stop_words);
|
||||
}
|
||||
|
||||
let script_lang_map = ctx.index.script_language(ctx.txn)?;
|
||||
if !script_lang_map.is_empty() {
|
||||
tokbuilder.allow_list(&script_lang_map);
|
||||
}
|
||||
|
||||
let tokenizer = tokbuilder.build();
|
||||
let tokens = tokenizer.tokenize(query);
|
||||
|
||||
let query_terms = located_query_terms_from_tokens(ctx, tokens, words_limit)?;
|
||||
if query_terms.is_empty() {
|
||||
// Do a placeholder search instead
|
||||
None
|
||||
} else {
|
||||
Some(query_terms)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let bucket_sort_output = if let Some(query_terms) = query_terms {
|
||||
let graph = QueryGraph::from_query(ctx, &query_terms)?;
|
||||
located_query_terms = Some(query_terms);
|
||||
|
||||
let ranking_rules = get_ranking_rules_for_query_graph_search(
|
||||
ctx,
|
||||
sort_criteria,
|
||||
geo_strategy,
|
||||
terms_matching_strategy,
|
||||
)?;
|
||||
|
||||
universe =
|
||||
resolve_universe(ctx, &universe, &graph, terms_matching_strategy, query_graph_logger)?;
|
||||
|
||||
bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)?
|
||||
} else {
|
||||
let ranking_rules =
|
||||
get_ranking_rules_for_placeholder_search(ctx, sort_criteria, geo_strategy)?;
|
||||
bucket_sort(
|
||||
ctx,
|
||||
ranking_rules,
|
||||
&PlaceholderQuery,
|
||||
&universe,
|
||||
from,
|
||||
length,
|
||||
placeholder_search_logger,
|
||||
)?
|
||||
};
|
||||
|
||||
let BucketSortOutput { docids, mut all_candidates } = bucket_sort_output;
|
||||
|
||||
// The candidates is the universe unless the exhaustive number of hits
|
||||
// is requested and a distinct attribute is set.
|
||||
if exhaustive_number_hits {
|
||||
if let Some(f) = ctx.index.distinct_field(ctx.txn)? {
|
||||
if let Some(distinct_fid) = ctx.index.fields_ids_map(ctx.txn)?.id(f) {
|
||||
all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(PartialSearchResult {
|
||||
candidates: all_candidates,
|
||||
documents_ids: docids,
|
||||
located_query_terms,
|
||||
})
|
||||
}
|
||||
|
||||
fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec<AscDesc>>) -> Result<()> {
|
||||
let sort_criteria = if let Some(sort_criteria) = sort_criteria {
|
||||
sort_criteria
|
||||
} else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
if sort_criteria.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// We check that the sort ranking rule exists and throw an
|
||||
// error if we try to use it and that it doesn't.
|
||||
let sort_ranking_rule_missing = !ctx.index.criteria(ctx.txn)?.contains(&crate::Criterion::Sort);
|
||||
if sort_ranking_rule_missing {
|
||||
return Err(UserError::SortRankingRuleMissing.into());
|
||||
}
|
||||
|
||||
// We check that we are allowed to use the sort criteria, we check
|
||||
// that they are declared in the sortable fields.
|
||||
let sortable_fields = ctx.index.sortable_fields(ctx.txn)?;
|
||||
for asc_desc in sort_criteria {
|
||||
match asc_desc.member() {
|
||||
Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => {
|
||||
return Err(UserError::InvalidSortableAttribute {
|
||||
field: field.to_string(),
|
||||
valid_fields: sortable_fields.into_iter().collect(),
|
||||
})?
|
||||
}
|
||||
Member::Geo(_) if !sortable_fields.contains("_geo") => {
|
||||
return Err(UserError::InvalidSortableAttribute {
|
||||
field: "_geo".to_string(),
|
||||
valid_fields: sortable_fields.into_iter().collect(),
|
||||
})?
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub struct PartialSearchResult {
|
||||
pub located_query_terms: Option<Vec<LocatedQueryTerm>>,
|
||||
pub candidates: RoaringBitmap,
|
||||
pub documents_ids: Vec<DocumentId>,
|
||||
}
|
||||
460
milli/src/search/new/query_graph.rs
Normal file
460
milli/src/search/new/query_graph.rs
Normal file
@@ -0,0 +1,460 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BTreeMap;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
use fxhash::{FxHashMap, FxHasher};
|
||||
|
||||
use super::interner::{FixedSizeInterner, Interned};
|
||||
use super::query_term::{
|
||||
self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, QueryTermSubset,
|
||||
};
|
||||
use super::small_bitmap::SmallBitmap;
|
||||
use super::SearchContext;
|
||||
use crate::search::new::interner::Interner;
|
||||
use crate::Result;
|
||||
|
||||
/// A node of the [`QueryGraph`].
|
||||
///
|
||||
/// There are four types of nodes:
|
||||
/// 1. `Start` : unique, represents the start of the query
|
||||
/// 2. `End` : unique, represents the end of a query
|
||||
/// 3. `Deleted` : represents a node that was deleted.
|
||||
/// All deleted nodes are unreachable from the start node.
|
||||
/// 4. `Term` is a regular node representing a word or combination of words
|
||||
/// from the user query.
|
||||
#[derive(Clone)]
|
||||
pub struct QueryNode {
|
||||
pub data: QueryNodeData,
|
||||
pub predecessors: SmallBitmap<QueryNode>,
|
||||
pub successors: SmallBitmap<QueryNode>,
|
||||
}
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub enum QueryNodeData {
|
||||
Term(LocatedQueryTermSubset),
|
||||
Deleted,
|
||||
Start,
|
||||
End,
|
||||
}
|
||||
|
||||
/**
|
||||
A graph representing all the ways to interpret the user's search query.
|
||||
|
||||
## Example 1
|
||||
For the search query `sunflower`, we need to register the following things:
|
||||
- we need to look for the exact word `sunflower`
|
||||
- but also any word which is 1 or 2 typos apart from `sunflower`
|
||||
- and every word that contains the prefix `sunflower`
|
||||
- and also the couple of adjacent words `sun flower`
|
||||
- as well as all the user-defined synonyms of `sunflower`
|
||||
|
||||
All these derivations of a word will be stored in [`QueryTerm`].
|
||||
|
||||
## Example 2:
|
||||
For the search query `summer house by`.
|
||||
|
||||
We also look for all word derivations of each term. And we also need to consider
|
||||
the potential n-grams `summerhouse`, `summerhouseby`, and `houseby`.
|
||||
Furthermore, we need to know which words these ngrams replace. This is done by creating the
|
||||
following graph, where each node also contains a list of derivations:
|
||||
```txt
|
||||
┌───────┐
|
||||
┌─│houseby│─────────┐
|
||||
│ └───────┘ │
|
||||
┌───────┐ ┌───────┐ │ ┌───────┐ ┌────┐ │ ┌───────┐
|
||||
│ START │─┬─│summer │─┴─│ house │┌─│ by │─┼─│ END │
|
||||
└───────┘ │ └───────┘ └───────┘│ └────┘ │ └───────┘
|
||||
│ ┌────────────┐ │ │
|
||||
├─│summerhouse │───────┘ │
|
||||
│ └────────────┘ │
|
||||
│ ┌─────────────┐ │
|
||||
└─────────│summerhouseby│───────┘
|
||||
└─────────────┘
|
||||
```
|
||||
Note also that each node has a range of positions associated with it,
|
||||
such that `summer` is known to be a word at the positions `0..=0` and `houseby`
|
||||
is registered with the positions `1..=2`. When two nodes are connected by an edge,
|
||||
it means that they are potentially next to each other in the user's search query
|
||||
(depending on the [`TermsMatchingStrategy`](crate::search::TermsMatchingStrategy)
|
||||
and the transformations that were done on the query graph).
|
||||
*/
|
||||
#[derive(Clone)]
|
||||
pub struct QueryGraph {
|
||||
/// The index of the start node within `self.nodes`
|
||||
pub root_node: Interned<QueryNode>,
|
||||
/// The index of the end node within `self.nodes`
|
||||
pub end_node: Interned<QueryNode>,
|
||||
/// The list of all query nodes
|
||||
pub nodes: FixedSizeInterner<QueryNode>,
|
||||
}
|
||||
|
||||
impl QueryGraph {
|
||||
/// Build the query graph from the parsed user search query.
|
||||
pub fn from_query(
|
||||
ctx: &mut SearchContext,
|
||||
// NOTE: the terms here must be consecutive
|
||||
terms: &[LocatedQueryTerm],
|
||||
) -> Result<QueryGraph> {
|
||||
let nbr_typos = number_of_typos_allowed(ctx)?;
|
||||
|
||||
let mut nodes_data: Vec<QueryNodeData> = vec![QueryNodeData::Start, QueryNodeData::End];
|
||||
let root_node = 0;
|
||||
let end_node = 1;
|
||||
|
||||
// TODO: we could consider generalizing to 4,5,6,7,etc. ngrams
|
||||
let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
|
||||
(vec![], vec![], vec![root_node]);
|
||||
|
||||
let original_terms_len = terms.len();
|
||||
for term_idx in 0..original_terms_len {
|
||||
let mut new_nodes = vec![];
|
||||
let new_node_idx = add_node(
|
||||
&mut nodes_data,
|
||||
QueryNodeData::Term(LocatedQueryTermSubset {
|
||||
term_subset: QueryTermSubset::full(Interned::from_raw(term_idx as u16)),
|
||||
positions: terms[term_idx].positions.clone(),
|
||||
term_ids: term_idx as u8..=term_idx as u8,
|
||||
}),
|
||||
);
|
||||
new_nodes.push(new_node_idx);
|
||||
|
||||
if !prev1.is_empty() {
|
||||
if let Some(ngram) =
|
||||
query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)?
|
||||
{
|
||||
let ngram_idx = add_node(
|
||||
&mut nodes_data,
|
||||
QueryNodeData::Term(LocatedQueryTermSubset {
|
||||
term_subset: QueryTermSubset::full(ngram.value),
|
||||
positions: ngram.positions,
|
||||
term_ids: term_idx as u8 - 1..=term_idx as u8,
|
||||
}),
|
||||
);
|
||||
new_nodes.push(ngram_idx);
|
||||
}
|
||||
}
|
||||
if !prev2.is_empty() {
|
||||
if let Some(ngram) =
|
||||
query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)?
|
||||
{
|
||||
let ngram_idx = add_node(
|
||||
&mut nodes_data,
|
||||
QueryNodeData::Term(LocatedQueryTermSubset {
|
||||
term_subset: QueryTermSubset::full(ngram.value),
|
||||
positions: ngram.positions,
|
||||
term_ids: term_idx as u8 - 2..=term_idx as u8,
|
||||
}),
|
||||
);
|
||||
new_nodes.push(ngram_idx);
|
||||
}
|
||||
}
|
||||
(prev0, prev1, prev2) = (new_nodes, prev0, prev1);
|
||||
}
|
||||
|
||||
let root_node = Interned::from_raw(root_node);
|
||||
let end_node = Interned::from_raw(end_node);
|
||||
let mut nodes = FixedSizeInterner::new(
|
||||
nodes_data.len() as u16,
|
||||
QueryNode {
|
||||
data: QueryNodeData::Deleted,
|
||||
predecessors: SmallBitmap::new(nodes_data.len() as u16),
|
||||
successors: SmallBitmap::new(nodes_data.len() as u16),
|
||||
},
|
||||
);
|
||||
for (node_idx, node_data) in nodes_data.into_iter().enumerate() {
|
||||
let node = nodes.get_mut(Interned::from_raw(node_idx as u16));
|
||||
node.data = node_data;
|
||||
}
|
||||
let mut graph = QueryGraph { root_node, end_node, nodes };
|
||||
graph.build_initial_edges();
|
||||
|
||||
Ok(graph)
|
||||
}
|
||||
|
||||
/// Remove the given nodes, connecting all their predecessors to all their successors.
|
||||
pub fn remove_nodes_keep_edges(&mut self, nodes: &[Interned<QueryNode>]) {
|
||||
for &node_id in nodes {
|
||||
let node = self.nodes.get(node_id);
|
||||
let old_node_pred = node.predecessors.clone();
|
||||
let old_node_succ = node.successors.clone();
|
||||
for pred in old_node_pred.iter() {
|
||||
let pred_successors = &mut self.nodes.get_mut(pred).successors;
|
||||
pred_successors.remove(node_id);
|
||||
pred_successors.union(&old_node_succ);
|
||||
}
|
||||
for succ in old_node_succ.iter() {
|
||||
let succ_predecessors = &mut self.nodes.get_mut(succ).predecessors;
|
||||
succ_predecessors.remove(node_id);
|
||||
succ_predecessors.union(&old_node_pred);
|
||||
}
|
||||
let node = self.nodes.get_mut(node_id);
|
||||
node.data = QueryNodeData::Deleted;
|
||||
node.predecessors.clear();
|
||||
node.successors.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove the given nodes and all their edges from the query graph.
|
||||
pub fn remove_nodes(&mut self, nodes: &[Interned<QueryNode>]) {
|
||||
for &node_id in nodes {
|
||||
let node = &self.nodes.get(node_id);
|
||||
let old_node_pred = node.predecessors.clone();
|
||||
let old_node_succ = node.successors.clone();
|
||||
|
||||
for pred in old_node_pred.iter() {
|
||||
self.nodes.get_mut(pred).successors.remove(node_id);
|
||||
}
|
||||
for succ in old_node_succ.iter() {
|
||||
self.nodes.get_mut(succ).predecessors.remove(node_id);
|
||||
}
|
||||
|
||||
let node = self.nodes.get_mut(node_id);
|
||||
node.data = QueryNodeData::Deleted;
|
||||
node.predecessors.clear();
|
||||
node.successors.clear();
|
||||
}
|
||||
}
|
||||
/// Simplify the query graph by removing all nodes that are disconnected from
|
||||
/// the start or end nodes.
|
||||
pub fn simplify(&mut self) {
|
||||
loop {
|
||||
let mut nodes_to_remove = vec![];
|
||||
for (node_idx, node) in self.nodes.iter() {
|
||||
if (!matches!(node.data, QueryNodeData::End | QueryNodeData::Deleted)
|
||||
&& node.successors.is_empty())
|
||||
|| (!matches!(node.data, QueryNodeData::Start | QueryNodeData::Deleted)
|
||||
&& node.predecessors.is_empty())
|
||||
{
|
||||
nodes_to_remove.push(node_idx);
|
||||
}
|
||||
}
|
||||
if nodes_to_remove.is_empty() {
|
||||
break;
|
||||
} else {
|
||||
self.remove_nodes(&nodes_to_remove);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn build_initial_edges(&mut self) {
|
||||
for (_, node) in self.nodes.iter_mut() {
|
||||
node.successors.clear();
|
||||
node.predecessors.clear();
|
||||
}
|
||||
for node_id in self.nodes.indexes() {
|
||||
let node = self.nodes.get(node_id);
|
||||
let end_prev_term_id = match &node.data {
|
||||
QueryNodeData::Term(term) => *term.term_ids.end() as i16,
|
||||
QueryNodeData::Start => -1,
|
||||
QueryNodeData::Deleted => continue,
|
||||
QueryNodeData::End => continue,
|
||||
};
|
||||
let successors = {
|
||||
let mut successors = SmallBitmap::for_interned_values_in(&self.nodes);
|
||||
let mut min = i16::MAX;
|
||||
for (node_id, node) in self.nodes.iter() {
|
||||
let start_next_term_id = match &node.data {
|
||||
QueryNodeData::Term(term) => *term.term_ids.start() as i16,
|
||||
QueryNodeData::End => i16::MAX,
|
||||
QueryNodeData::Start => continue,
|
||||
QueryNodeData::Deleted => continue,
|
||||
};
|
||||
if start_next_term_id <= end_prev_term_id {
|
||||
continue;
|
||||
}
|
||||
match start_next_term_id.cmp(&min) {
|
||||
Ordering::Less => {
|
||||
min = start_next_term_id;
|
||||
successors.clear();
|
||||
successors.insert(node_id);
|
||||
}
|
||||
Ordering::Equal => {
|
||||
successors.insert(node_id);
|
||||
}
|
||||
Ordering::Greater => continue,
|
||||
}
|
||||
}
|
||||
successors
|
||||
};
|
||||
let node = self.nodes.get_mut(node_id);
|
||||
node.successors = successors.clone();
|
||||
for successor in successors.iter() {
|
||||
let successor = self.nodes.get_mut(successor);
|
||||
successor.predecessors.insert(node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn removal_order_for_terms_matching_strategy_last(
|
||||
&self,
|
||||
ctx: &SearchContext,
|
||||
) -> Vec<SmallBitmap<QueryNode>> {
|
||||
let (first_term_idx, last_term_idx) = {
|
||||
let mut first_term_idx = u8::MAX;
|
||||
let mut last_term_idx = 0u8;
|
||||
for (_, node) in self.nodes.iter() {
|
||||
match &node.data {
|
||||
QueryNodeData::Term(t) => {
|
||||
if *t.term_ids.end() > last_term_idx {
|
||||
last_term_idx = *t.term_ids.end();
|
||||
}
|
||||
if *t.term_ids.start() < first_term_idx {
|
||||
first_term_idx = *t.term_ids.start();
|
||||
}
|
||||
}
|
||||
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
|
||||
}
|
||||
}
|
||||
(first_term_idx, last_term_idx)
|
||||
};
|
||||
if first_term_idx >= last_term_idx {
|
||||
return vec![];
|
||||
}
|
||||
let cost_of_term_idx = |term_idx: u8| {
|
||||
let rank = 1 + last_term_idx - term_idx;
|
||||
rank as u16
|
||||
};
|
||||
let mut nodes_to_remove = BTreeMap::<u16, SmallBitmap<QueryNode>>::new();
|
||||
let mut at_least_one_mandatory_term = false;
|
||||
for (node_id, node) in self.nodes.iter() {
|
||||
let QueryNodeData::Term(t) = &node.data else { continue };
|
||||
if t.term_subset.original_phrase(ctx).is_some() || t.term_subset.is_mandatory() {
|
||||
at_least_one_mandatory_term = true;
|
||||
continue;
|
||||
}
|
||||
let mut cost = 0;
|
||||
for id in t.term_ids.clone() {
|
||||
cost = std::cmp::max(cost, cost_of_term_idx(id));
|
||||
}
|
||||
nodes_to_remove
|
||||
.entry(cost)
|
||||
.or_insert_with(|| SmallBitmap::for_interned_values_in(&self.nodes))
|
||||
.insert(node_id);
|
||||
}
|
||||
let mut res: Vec<_> = nodes_to_remove.into_values().collect();
|
||||
if !at_least_one_mandatory_term {
|
||||
res.pop();
|
||||
}
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
fn add_node(nodes_data: &mut Vec<QueryNodeData>, node_data: QueryNodeData) -> u16 {
|
||||
let new_node_idx = nodes_data.len() as u16;
|
||||
nodes_data.push(node_data);
|
||||
new_node_idx
|
||||
}
|
||||
|
||||
impl QueryGraph {
|
||||
/*
|
||||
Build a query graph from a list of paths
|
||||
|
||||
The paths are composed of source and dest terms.
|
||||
|
||||
For example, consider the following paths:
|
||||
```txt
|
||||
PATH 1 : a -> b1 -> c1 -> d -> e1
|
||||
PATH 2 : a -> b2 -> c2 -> d -> e2
|
||||
```
|
||||
Then the resulting graph will be:
|
||||
```txt
|
||||
┌────┐ ┌────┐ ┌────┐ ┌────┐
|
||||
┌──│ b1 │──│ c1 │───│ d │───│ e1 │
|
||||
┌────┐ │ └────┘ └────┘ └────┘ └────┘
|
||||
│ a │─┤
|
||||
└────┘ │ ┌────┐ ┌────┐ ┌────┐ ┌────┐
|
||||
└──│ b2 │──│ c2 │───│ d │───│ e2 │
|
||||
└────┘ └────┘ └────┘ └────┘
|
||||
```
|
||||
*/
|
||||
pub fn build_from_paths(
|
||||
paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>>,
|
||||
) -> Self {
|
||||
let mut node_data = Interner::default();
|
||||
let root_node = node_data.push(QueryNodeData::Start);
|
||||
let end_node = node_data.push(QueryNodeData::End);
|
||||
|
||||
let mut paths_with_single_terms = vec![];
|
||||
|
||||
for path in paths {
|
||||
let mut processed_path = vec![];
|
||||
let mut prev_dest_term: Option<LocatedQueryTermSubset> = None;
|
||||
for (start_term, dest_term) in path {
|
||||
if let Some(prev_dest_term) = prev_dest_term.take() {
|
||||
if let Some(mut start_term) = start_term {
|
||||
if start_term.term_ids == prev_dest_term.term_ids {
|
||||
start_term.term_subset.intersect(&prev_dest_term.term_subset);
|
||||
processed_path.push(start_term);
|
||||
} else {
|
||||
processed_path.push(prev_dest_term);
|
||||
processed_path.push(start_term);
|
||||
}
|
||||
} else {
|
||||
processed_path.push(prev_dest_term);
|
||||
}
|
||||
} else if let Some(start_term) = start_term {
|
||||
processed_path.push(start_term);
|
||||
}
|
||||
prev_dest_term = Some(dest_term);
|
||||
}
|
||||
if let Some(prev_dest_term) = prev_dest_term {
|
||||
processed_path.push(prev_dest_term);
|
||||
}
|
||||
paths_with_single_terms.push(processed_path);
|
||||
}
|
||||
|
||||
let mut paths_with_single_terms_and_suffix_hash = vec![];
|
||||
for path in paths_with_single_terms {
|
||||
let mut hasher = FxHasher::default();
|
||||
let mut path_with_hash = vec![];
|
||||
for term in path.into_iter().rev() {
|
||||
term.hash(&mut hasher);
|
||||
path_with_hash.push((term, hasher.finish()));
|
||||
}
|
||||
path_with_hash.reverse();
|
||||
paths_with_single_terms_and_suffix_hash.push(path_with_hash);
|
||||
}
|
||||
|
||||
let mut node_data_id_for_term_and_suffix_hash =
|
||||
FxHashMap::<(LocatedQueryTermSubset, u64), Interned<QueryNodeData>>::default();
|
||||
|
||||
let mut paths_with_ids = vec![];
|
||||
for path in paths_with_single_terms_and_suffix_hash {
|
||||
let mut path_with_ids = vec![];
|
||||
for (term, suffix_hash) in path {
|
||||
let node_data_id = node_data_id_for_term_and_suffix_hash
|
||||
.entry((term.clone(), suffix_hash))
|
||||
.or_insert_with(|| node_data.push(QueryNodeData::Term(term)));
|
||||
path_with_ids.push(Interned::from_raw(node_data_id.into_raw()));
|
||||
}
|
||||
paths_with_ids.push(path_with_ids);
|
||||
}
|
||||
|
||||
let nodes_data = node_data.freeze();
|
||||
let nodes_data_len = nodes_data.len();
|
||||
let mut nodes = nodes_data.map_move(|n| QueryNode {
|
||||
data: n,
|
||||
predecessors: SmallBitmap::new(nodes_data_len),
|
||||
successors: SmallBitmap::new(nodes_data_len),
|
||||
});
|
||||
|
||||
let root_node = Interned::<QueryNode>::from_raw(root_node.into_raw());
|
||||
let end_node = Interned::<QueryNode>::from_raw(end_node.into_raw());
|
||||
|
||||
for path in paths_with_ids {
|
||||
let mut prev_node_id = root_node;
|
||||
for node_id in path {
|
||||
let prev_node = nodes.get_mut(prev_node_id);
|
||||
prev_node.successors.insert(node_id);
|
||||
let node = nodes.get_mut(node_id);
|
||||
node.predecessors.insert(prev_node_id);
|
||||
prev_node_id = node_id;
|
||||
}
|
||||
let prev_node = nodes.get_mut(prev_node_id);
|
||||
prev_node.successors.insert(end_node);
|
||||
let node = nodes.get_mut(end_node);
|
||||
node.predecessors.insert(prev_node_id);
|
||||
}
|
||||
|
||||
QueryGraph { root_node, end_node, nodes }
|
||||
}
|
||||
}
|
||||
404
milli/src/search/new/query_term/compute_derivations.rs
Normal file
404
milli/src/search/new/query_term/compute_derivations.rs
Normal file
@@ -0,0 +1,404 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::ControlFlow;
|
||||
|
||||
use fst::automaton::Str;
|
||||
use fst::{Automaton, IntoStreamer, Streamer};
|
||||
use heed::types::DecodeIgnore;
|
||||
|
||||
use super::*;
|
||||
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
|
||||
use crate::search::new::query_term::TwoTypoTerm;
|
||||
use crate::search::new::{limits, SearchContext};
|
||||
use crate::search::{build_dfa, get_first};
|
||||
use crate::{Result, MAX_WORD_LENGTH};
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum NumberOfTypos {
|
||||
Zero,
|
||||
One,
|
||||
Two,
|
||||
}
|
||||
|
||||
pub enum ZeroOrOneTypo {
|
||||
Zero,
|
||||
One,
|
||||
}
|
||||
|
||||
impl Interned<QueryTerm> {
|
||||
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
|
||||
let s = ctx.term_interner.get_mut(self);
|
||||
if s.max_nbr_typos == 0 {
|
||||
s.one_typo = Lazy::Init(OneTypoTerm::default());
|
||||
s.two_typo = Lazy::Init(TwoTypoTerm::default());
|
||||
} else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() {
|
||||
assert!(s.two_typo.is_uninit());
|
||||
self.initialize_one_typo_subterm(ctx)?;
|
||||
let s = ctx.term_interner.get_mut(self);
|
||||
assert!(s.one_typo.is_init());
|
||||
s.two_typo = Lazy::Init(TwoTypoTerm::default());
|
||||
} else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() {
|
||||
assert!(s.two_typo.is_uninit());
|
||||
self.initialize_one_and_two_typo_subterm(ctx)?;
|
||||
let s = ctx.term_interner.get_mut(self);
|
||||
assert!(s.one_typo.is_init() && s.two_typo.is_init());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn find_zero_typo_prefix_derivations(
|
||||
word_interned: Interned<String>,
|
||||
fst: fst::Set<Cow<[u8]>>,
|
||||
word_interner: &mut DedupInterner<String>,
|
||||
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
|
||||
) -> Result<()> {
|
||||
let word = word_interner.get(word_interned).to_owned();
|
||||
let word = word.as_str();
|
||||
let prefix = Str::new(word).starts_with();
|
||||
let mut stream = fst.search(prefix).into_stream();
|
||||
|
||||
while let Some(derived_word) = stream.next() {
|
||||
let derived_word = std::str::from_utf8(derived_word)?.to_owned();
|
||||
let derived_word_interned = word_interner.insert(derived_word);
|
||||
if derived_word_interned != word_interned {
|
||||
let cf = visit(derived_word_interned)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn find_zero_one_typo_derivations(
|
||||
ctx: &mut SearchContext,
|
||||
word_interned: Interned<String>,
|
||||
is_prefix: bool,
|
||||
mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
|
||||
) -> Result<()> {
|
||||
let fst = ctx.get_words_fst()?;
|
||||
let word = ctx.word_interner.get(word_interned).to_owned();
|
||||
let word = word.as_str();
|
||||
|
||||
let dfa = build_dfa(word, 1, is_prefix);
|
||||
let starts = StartsWith(Str::new(get_first(word)));
|
||||
let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream();
|
||||
|
||||
while let Some((derived_word, state)) = stream.next() {
|
||||
let derived_word = std::str::from_utf8(derived_word)?;
|
||||
let derived_word = ctx.word_interner.insert(derived_word.to_owned());
|
||||
let d = dfa.distance(state.1);
|
||||
match d.to_u8() {
|
||||
0 => {
|
||||
if derived_word != word_interned {
|
||||
let cf = visit(derived_word, ZeroOrOneTypo::Zero)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
1 => {
|
||||
let cf = visit(derived_word, ZeroOrOneTypo::One)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
unreachable!("One typo dfa produced multiple typos")
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn find_zero_one_two_typo_derivations(
|
||||
word_interned: Interned<String>,
|
||||
is_prefix: bool,
|
||||
fst: fst::Set<Cow<[u8]>>,
|
||||
word_interner: &mut DedupInterner<String>,
|
||||
mut visit: impl FnMut(Interned<String>, NumberOfTypos) -> Result<ControlFlow<()>>,
|
||||
) -> Result<()> {
|
||||
let word = word_interner.get(word_interned).to_owned();
|
||||
let word = word.as_str();
|
||||
|
||||
let starts = StartsWith(Str::new(get_first(word)));
|
||||
let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
|
||||
let second_dfa = build_dfa(word, 2, is_prefix);
|
||||
let second = Intersection(&second_dfa, &starts);
|
||||
let automaton = Union(first, &second);
|
||||
|
||||
let mut stream = fst.search_with_state(automaton).into_stream();
|
||||
|
||||
while let Some((derived_word, state)) = stream.next() {
|
||||
let derived_word = std::str::from_utf8(derived_word)?;
|
||||
let derived_word_interned = word_interner.insert(derived_word.to_owned());
|
||||
// in the case the typo is on the first letter, we know the number of typo
|
||||
// is two
|
||||
if get_first(derived_word) != get_first(word) {
|
||||
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Else, we know that it is the second dfa that matched and compute the
|
||||
// correct distance
|
||||
let d = second_dfa.distance((state.1).0);
|
||||
match d.to_u8() {
|
||||
0 => {
|
||||
if derived_word_interned != word_interned {
|
||||
let cf = visit(derived_word_interned, NumberOfTypos::Zero)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
1 => {
|
||||
let cf = visit(derived_word_interned, NumberOfTypos::One)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
2 => {
|
||||
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ => unreachable!("2 typos DFA produced a distance greater than 2"),
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn partially_initialized_term_from_word(
|
||||
ctx: &mut SearchContext,
|
||||
word: &str,
|
||||
max_typo: u8,
|
||||
is_prefix: bool,
|
||||
is_ngram: bool,
|
||||
) -> Result<QueryTerm> {
|
||||
let word_interned = ctx.word_interner.insert(word.to_owned());
|
||||
|
||||
if word.len() > MAX_WORD_LENGTH {
|
||||
return Ok({
|
||||
QueryTerm {
|
||||
original: ctx.word_interner.insert(word.to_owned()),
|
||||
ngram_words: None,
|
||||
is_prefix: false,
|
||||
max_nbr_typos: 0,
|
||||
zero_typo: <_>::default(),
|
||||
one_typo: Lazy::Init(<_>::default()),
|
||||
two_typo: Lazy::Init(<_>::default()),
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
let fst = ctx.index.words_fst(ctx.txn)?;
|
||||
|
||||
let use_prefix_db = is_prefix
|
||||
&& (ctx
|
||||
.index
|
||||
.word_prefix_docids
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.get(ctx.txn, word)?
|
||||
.is_some()
|
||||
|| (!is_ngram
|
||||
&& ctx
|
||||
.index
|
||||
.exact_word_prefix_docids
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.get(ctx.txn, word)?
|
||||
.is_some()));
|
||||
let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };
|
||||
|
||||
let mut zero_typo = None;
|
||||
let mut prefix_of = BTreeSet::new();
|
||||
|
||||
if fst.contains(word) {
|
||||
zero_typo = Some(word_interned);
|
||||
}
|
||||
|
||||
if is_prefix && use_prefix_db.is_none() {
|
||||
find_zero_typo_prefix_derivations(
|
||||
word_interned,
|
||||
fst,
|
||||
&mut ctx.word_interner,
|
||||
|derived_word| {
|
||||
if prefix_of.len() < limits::MAX_PREFIX_COUNT {
|
||||
prefix_of.insert(derived_word);
|
||||
Ok(ControlFlow::Continue(()))
|
||||
} else {
|
||||
Ok(ControlFlow::Break(()))
|
||||
}
|
||||
},
|
||||
)?;
|
||||
}
|
||||
let synonyms = ctx.index.synonyms(ctx.txn)?;
|
||||
let mut synonym_word_count = 0;
|
||||
let synonyms = synonyms
|
||||
.get(&vec![word.to_owned()])
|
||||
.cloned()
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.take(limits::MAX_SYNONYM_PHRASE_COUNT)
|
||||
.filter_map(|words| {
|
||||
if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT {
|
||||
return None;
|
||||
}
|
||||
synonym_word_count += words.len();
|
||||
let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
|
||||
Some(ctx.phrase_interner.insert(Phrase { words }))
|
||||
})
|
||||
.collect();
|
||||
let zero_typo =
|
||||
ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db };
|
||||
|
||||
Ok(QueryTerm {
|
||||
original: word_interned,
|
||||
ngram_words: None,
|
||||
max_nbr_typos: max_typo,
|
||||
is_prefix,
|
||||
zero_typo,
|
||||
one_typo: Lazy::Uninit,
|
||||
two_typo: Lazy::Uninit,
|
||||
})
|
||||
}
|
||||
|
||||
fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Interned<Phrase>>> {
|
||||
if let Some((l, r)) = split_best_frequency(ctx, word)? {
|
||||
Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
impl Interned<QueryTerm> {
|
||||
fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
|
||||
let self_mut = ctx.term_interner.get_mut(self);
|
||||
let QueryTerm { original, is_prefix, one_typo, .. } = self_mut;
|
||||
let original = *original;
|
||||
let is_prefix = *is_prefix;
|
||||
// let original_str = ctx.word_interner.get(*original).to_owned();
|
||||
if one_typo.is_init() {
|
||||
return Ok(());
|
||||
}
|
||||
let mut one_typo_words = BTreeSet::new();
|
||||
|
||||
find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
|
||||
match nbr_typos {
|
||||
ZeroOrOneTypo::Zero => {}
|
||||
ZeroOrOneTypo::One => {
|
||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||
one_typo_words.insert(derived_word);
|
||||
} else {
|
||||
return Ok(ControlFlow::Break(()));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ControlFlow::Continue(()))
|
||||
})?;
|
||||
let original_str = ctx.word_interner.get(original).to_owned();
|
||||
let split_words = find_split_words(ctx, original_str.as_str())?;
|
||||
|
||||
let self_mut = ctx.term_interner.get_mut(self);
|
||||
|
||||
// Only add the split words to the derivations if:
|
||||
// 1. the term is not an ngram; OR
|
||||
// 2. the term is an ngram, but the split words are different from the ngram's component words
|
||||
let split_words = if let Some((ngram_words, split_words)) =
|
||||
self_mut.ngram_words.as_ref().zip(split_words.as_ref())
|
||||
{
|
||||
let Phrase { words } = ctx.phrase_interner.get(*split_words);
|
||||
if ngram_words.iter().ne(words.iter().flatten()) {
|
||||
Some(*split_words)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
split_words
|
||||
};
|
||||
let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words };
|
||||
|
||||
self_mut.one_typo = Lazy::Init(one_typo);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
|
||||
let self_mut = ctx.term_interner.get_mut(self);
|
||||
let QueryTerm { original, is_prefix, two_typo, .. } = self_mut;
|
||||
let original_str = ctx.word_interner.get(*original).to_owned();
|
||||
if two_typo.is_init() {
|
||||
return Ok(());
|
||||
}
|
||||
let mut one_typo_words = BTreeSet::new();
|
||||
let mut two_typo_words = BTreeSet::new();
|
||||
|
||||
find_zero_one_two_typo_derivations(
|
||||
*original,
|
||||
*is_prefix,
|
||||
ctx.index.words_fst(ctx.txn)?,
|
||||
&mut ctx.word_interner,
|
||||
|derived_word, nbr_typos| {
|
||||
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
|
||||
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
|
||||
{
|
||||
// No chance we will add either one- or two-typo derivations anymore, stop iterating.
|
||||
return Ok(ControlFlow::Break(()));
|
||||
}
|
||||
match nbr_typos {
|
||||
NumberOfTypos::Zero => {}
|
||||
NumberOfTypos::One => {
|
||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||
one_typo_words.insert(derived_word);
|
||||
}
|
||||
}
|
||||
NumberOfTypos::Two => {
|
||||
if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
|
||||
two_typo_words.insert(derived_word);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ControlFlow::Continue(()))
|
||||
},
|
||||
)?;
|
||||
let split_words = find_split_words(ctx, original_str.as_str())?;
|
||||
let self_mut = ctx.term_interner.get_mut(self);
|
||||
|
||||
let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words };
|
||||
|
||||
let two_typo = TwoTypoTerm { two_typos: two_typo_words };
|
||||
|
||||
self_mut.one_typo = Lazy::Init(one_typo);
|
||||
self_mut.two_typo = Lazy::Init(two_typo);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Split the original word into the two words that appear the
|
||||
/// most next to each other in the index.
|
||||
///
|
||||
/// Return `None` if the original word cannot be split.
|
||||
fn split_best_frequency(
|
||||
ctx: &mut SearchContext,
|
||||
original: &str,
|
||||
) -> Result<Option<(Interned<String>, Interned<String>)>> {
|
||||
let chars = original.char_indices().skip(1);
|
||||
let mut best = None;
|
||||
|
||||
for (i, _) in chars {
|
||||
let (left, right) = original.split_at(i);
|
||||
let left = ctx.word_interner.insert(left.to_owned());
|
||||
let right = ctx.word_interner.insert(right.to_owned());
|
||||
|
||||
if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? {
|
||||
if best.map_or(true, |(old, _, _)| frequency > old) {
|
||||
best = Some((frequency, left, right));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(best.map(|(_, left, right)| (left, right)))
|
||||
}
|
||||
498
milli/src/search/new/query_term/mod.rs
Normal file
498
milli/src/search/new/query_term/mod.rs
Normal file
@@ -0,0 +1,498 @@
|
||||
mod compute_derivations;
|
||||
mod ntypo_subset;
|
||||
mod parse_query;
|
||||
mod phrase;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::iter::FromIterator;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use compute_derivations::partially_initialized_term_from_word;
|
||||
use either::Either;
|
||||
pub use ntypo_subset::NTypoTermSubset;
|
||||
pub use parse_query::{located_query_terms_from_tokens, make_ngram, number_of_typos_allowed};
|
||||
pub use phrase::Phrase;
|
||||
|
||||
use super::interner::{DedupInterner, Interned};
|
||||
use super::{limits, SearchContext, Word};
|
||||
use crate::Result;
|
||||
|
||||
/// A set of word derivations attached to a location in the search query.
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub struct LocatedQueryTermSubset {
|
||||
pub term_subset: QueryTermSubset,
|
||||
pub positions: RangeInclusive<u16>,
|
||||
pub term_ids: RangeInclusive<u8>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct QueryTermSubset {
|
||||
original: Interned<QueryTerm>,
|
||||
zero_typo_subset: NTypoTermSubset,
|
||||
one_typo_subset: NTypoTermSubset,
|
||||
two_typo_subset: NTypoTermSubset,
|
||||
/// `true` if the term cannot be deleted through the term matching strategy
|
||||
///
|
||||
/// Note that there are other reasons for which a term cannot be deleted, such as
|
||||
/// being a phrase. In that case, this field could be set to `false`, but it
|
||||
/// still wouldn't be deleteable by the term matching strategy.
|
||||
mandatory: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub struct QueryTerm {
|
||||
original: Interned<String>,
|
||||
ngram_words: Option<Vec<Interned<String>>>,
|
||||
max_nbr_typos: u8,
|
||||
is_prefix: bool,
|
||||
zero_typo: ZeroTypoTerm,
|
||||
// May not be computed yet
|
||||
one_typo: Lazy<OneTypoTerm>,
|
||||
// May not be computed yet
|
||||
two_typo: Lazy<TwoTypoTerm>,
|
||||
}
|
||||
|
||||
// SubTerms will be in a dedup interner
|
||||
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||
struct ZeroTypoTerm {
|
||||
/// The original phrase, if any
|
||||
phrase: Option<Interned<Phrase>>,
|
||||
/// A single word equivalent to the original term, with zero typos
|
||||
exact: Option<Interned<String>>,
|
||||
/// All the words that contain the original word as prefix
|
||||
prefix_of: BTreeSet<Interned<String>>,
|
||||
/// All the synonyms of the original word or phrase
|
||||
synonyms: BTreeSet<Interned<Phrase>>,
|
||||
/// A prefix in the prefix databases matching the original word
|
||||
use_prefix_db: Option<Interned<String>>,
|
||||
}
|
||||
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||
struct OneTypoTerm {
|
||||
/// The original word split into multiple consecutive words
|
||||
split_words: Option<Interned<Phrase>>,
|
||||
/// Words that are 1 typo away from the original word
|
||||
one_typo: BTreeSet<Interned<String>>,
|
||||
}
|
||||
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||
struct TwoTypoTerm {
|
||||
/// Words that are 2 typos away from the original word
|
||||
two_typos: BTreeSet<Interned<String>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub enum Lazy<T> {
|
||||
Uninit,
|
||||
Init(T),
|
||||
}
|
||||
impl<T> Lazy<T> {
|
||||
pub fn is_init(&self) -> bool {
|
||||
match self {
|
||||
Lazy::Uninit => false,
|
||||
Lazy::Init(_) => true,
|
||||
}
|
||||
}
|
||||
pub fn is_uninit(&self) -> bool {
|
||||
match self {
|
||||
Lazy::Uninit => true,
|
||||
Lazy::Init(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum ExactTerm {
|
||||
Phrase(Interned<Phrase>),
|
||||
Word(Interned<String>),
|
||||
}
|
||||
|
||||
impl ExactTerm {
|
||||
pub fn interned_words<'ctx>(
|
||||
&self,
|
||||
ctx: &'ctx SearchContext<'ctx>,
|
||||
) -> impl Iterator<Item = Option<Interned<String>>> + 'ctx {
|
||||
match *self {
|
||||
ExactTerm::Phrase(phrase) => {
|
||||
let phrase = ctx.phrase_interner.get(phrase);
|
||||
Either::Left(phrase.words.iter().copied())
|
||||
}
|
||||
ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl QueryTermSubset {
|
||||
pub fn is_mandatory(&self) -> bool {
|
||||
self.mandatory
|
||||
}
|
||||
pub fn make_mandatory(&mut self) {
|
||||
self.mandatory = true;
|
||||
}
|
||||
pub fn exact_term(&self, ctx: &SearchContext) -> Option<ExactTerm> {
|
||||
let full_query_term = ctx.term_interner.get(self.original);
|
||||
if full_query_term.ngram_words.is_some() {
|
||||
return None;
|
||||
}
|
||||
// TODO: included in subset
|
||||
if let Some(phrase) = full_query_term.zero_typo.phrase {
|
||||
self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
|
||||
} else if let Some(word) = full_query_term.zero_typo.exact {
|
||||
self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn empty(for_term: Interned<QueryTerm>) -> Self {
|
||||
Self {
|
||||
original: for_term,
|
||||
zero_typo_subset: NTypoTermSubset::Nothing,
|
||||
one_typo_subset: NTypoTermSubset::Nothing,
|
||||
two_typo_subset: NTypoTermSubset::Nothing,
|
||||
mandatory: false,
|
||||
}
|
||||
}
|
||||
pub fn full(for_term: Interned<QueryTerm>) -> Self {
|
||||
Self {
|
||||
original: for_term,
|
||||
zero_typo_subset: NTypoTermSubset::All,
|
||||
one_typo_subset: NTypoTermSubset::All,
|
||||
two_typo_subset: NTypoTermSubset::All,
|
||||
mandatory: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn union(&mut self, other: &Self) {
|
||||
assert!(self.original == other.original);
|
||||
self.zero_typo_subset.union(&other.zero_typo_subset);
|
||||
self.one_typo_subset.union(&other.one_typo_subset);
|
||||
self.two_typo_subset.union(&other.two_typo_subset);
|
||||
}
|
||||
pub fn intersect(&mut self, other: &Self) {
|
||||
assert!(self.original == other.original);
|
||||
self.zero_typo_subset.intersect(&other.zero_typo_subset);
|
||||
self.one_typo_subset.intersect(&other.one_typo_subset);
|
||||
self.two_typo_subset.intersect(&other.two_typo_subset);
|
||||
}
|
||||
|
||||
pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Word> {
|
||||
let original = ctx.term_interner.get(self.original);
|
||||
let Some(use_prefix_db) = original.zero_typo.use_prefix_db else {
|
||||
return None
|
||||
};
|
||||
let word = match &self.zero_typo_subset {
|
||||
NTypoTermSubset::All => Some(use_prefix_db),
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
// TODO: use a subset of prefix words instead
|
||||
if words.contains(&use_prefix_db) {
|
||||
Some(use_prefix_db)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
NTypoTermSubset::Nothing => None,
|
||||
};
|
||||
word.map(|word| {
|
||||
if original.ngram_words.is_some() {
|
||||
Word::Derived(word)
|
||||
} else {
|
||||
Word::Original(word)
|
||||
}
|
||||
})
|
||||
}
|
||||
pub fn all_single_words_except_prefix_db(
|
||||
&self,
|
||||
ctx: &mut SearchContext,
|
||||
) -> Result<BTreeSet<Word>> {
|
||||
let mut result = BTreeSet::default();
|
||||
// TODO: a compute_partially funtion
|
||||
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
|
||||
self.original.compute_fully_if_needed(ctx)?;
|
||||
}
|
||||
|
||||
let original = ctx.term_interner.get_mut(self.original);
|
||||
match &self.zero_typo_subset {
|
||||
NTypoTermSubset::All => {
|
||||
let ZeroTypoTerm {
|
||||
phrase: _,
|
||||
exact: zero_typo,
|
||||
prefix_of,
|
||||
synonyms: _,
|
||||
use_prefix_db: _,
|
||||
} = &original.zero_typo;
|
||||
result.extend(zero_typo.iter().copied().map(|w| {
|
||||
if original.ngram_words.is_some() {
|
||||
Word::Derived(w)
|
||||
} else {
|
||||
Word::Original(w)
|
||||
}
|
||||
}));
|
||||
result.extend(prefix_of.iter().copied().map(|w| {
|
||||
if original.ngram_words.is_some() {
|
||||
Word::Derived(w)
|
||||
} else {
|
||||
Word::Original(w)
|
||||
}
|
||||
}));
|
||||
}
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
let ZeroTypoTerm {
|
||||
phrase: _,
|
||||
exact: zero_typo,
|
||||
prefix_of,
|
||||
synonyms: _,
|
||||
use_prefix_db: _,
|
||||
} = &original.zero_typo;
|
||||
if let Some(zero_typo) = zero_typo {
|
||||
if words.contains(zero_typo) {
|
||||
if original.ngram_words.is_some() {
|
||||
result.insert(Word::Derived(*zero_typo));
|
||||
} else {
|
||||
result.insert(Word::Original(*zero_typo));
|
||||
}
|
||||
}
|
||||
}
|
||||
result.extend(prefix_of.intersection(words).copied().map(|w| {
|
||||
if original.ngram_words.is_some() {
|
||||
Word::Derived(w)
|
||||
} else {
|
||||
Word::Original(w)
|
||||
}
|
||||
}));
|
||||
}
|
||||
NTypoTermSubset::Nothing => {}
|
||||
}
|
||||
|
||||
match &self.one_typo_subset {
|
||||
NTypoTermSubset::All => {
|
||||
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(one_typo.iter().copied().map(Word::Derived))
|
||||
}
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(one_typo.intersection(words).copied().map(Word::Derived));
|
||||
}
|
||||
NTypoTermSubset::Nothing => {}
|
||||
};
|
||||
|
||||
match &self.two_typo_subset {
|
||||
NTypoTermSubset::All => {
|
||||
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(two_typos.iter().copied().map(Word::Derived));
|
||||
}
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(two_typos.intersection(words).copied().map(Word::Derived));
|
||||
}
|
||||
NTypoTermSubset::Nothing => {}
|
||||
};
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result<BTreeSet<Interned<Phrase>>> {
|
||||
let mut result = BTreeSet::default();
|
||||
|
||||
if !self.one_typo_subset.is_empty() {
|
||||
// TODO: compute less than fully if possible
|
||||
self.original.compute_fully_if_needed(ctx)?;
|
||||
}
|
||||
let original = ctx.term_interner.get_mut(self.original);
|
||||
|
||||
let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } =
|
||||
&original.zero_typo;
|
||||
result.extend(phrase.iter().copied());
|
||||
result.extend(synonyms.iter().copied());
|
||||
|
||||
match &self.one_typo_subset {
|
||||
NTypoTermSubset::All => {
|
||||
let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
|
||||
panic!();
|
||||
};
|
||||
result.extend(split_words.iter().copied());
|
||||
}
|
||||
NTypoTermSubset::Subset { phrases, .. } => {
|
||||
let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
|
||||
panic!();
|
||||
};
|
||||
if let Some(split_words) = split_words {
|
||||
if phrases.contains(split_words) {
|
||||
result.insert(*split_words);
|
||||
}
|
||||
}
|
||||
}
|
||||
NTypoTermSubset::Nothing => {}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn original_phrase(&self, ctx: &SearchContext) -> Option<Interned<Phrase>> {
|
||||
let t = ctx.term_interner.get(self.original);
|
||||
if let Some(p) = t.zero_typo.phrase {
|
||||
if self.zero_typo_subset.contains_phrase(p) {
|
||||
return Some(p);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 {
|
||||
let t = ctx.term_interner.get(self.original);
|
||||
match t.max_nbr_typos {
|
||||
0 => 0,
|
||||
1 => {
|
||||
if self.one_typo_subset.is_empty() {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
}
|
||||
}
|
||||
2 => {
|
||||
if self.two_typo_subset.is_empty() {
|
||||
if self.one_typo_subset.is_empty() {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
}
|
||||
} else {
|
||||
2
|
||||
}
|
||||
}
|
||||
_ => panic!(),
|
||||
}
|
||||
}
|
||||
pub fn keep_only_exact_term(&mut self, ctx: &SearchContext) {
|
||||
if let Some(term) = self.exact_term(ctx) {
|
||||
match term {
|
||||
ExactTerm::Phrase(p) => {
|
||||
self.zero_typo_subset = NTypoTermSubset::Subset {
|
||||
words: BTreeSet::new(),
|
||||
phrases: BTreeSet::from_iter([p]),
|
||||
};
|
||||
self.clear_one_typo_subset();
|
||||
self.clear_two_typo_subset();
|
||||
}
|
||||
ExactTerm::Word(w) => {
|
||||
self.zero_typo_subset = NTypoTermSubset::Subset {
|
||||
words: BTreeSet::from_iter([w]),
|
||||
phrases: BTreeSet::new(),
|
||||
};
|
||||
self.clear_one_typo_subset();
|
||||
self.clear_two_typo_subset();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn clear_zero_typo_subset(&mut self) {
|
||||
self.zero_typo_subset = NTypoTermSubset::Nothing;
|
||||
}
|
||||
pub fn clear_one_typo_subset(&mut self) {
|
||||
self.one_typo_subset = NTypoTermSubset::Nothing;
|
||||
}
|
||||
pub fn clear_two_typo_subset(&mut self) {
|
||||
self.two_typo_subset = NTypoTermSubset::Nothing;
|
||||
}
|
||||
pub fn description(&self, ctx: &SearchContext) -> String {
|
||||
let t = ctx.term_interner.get(self.original);
|
||||
ctx.word_interner.get(t.original).to_owned()
|
||||
}
|
||||
}
|
||||
|
||||
impl ZeroTypoTerm {
|
||||
fn is_empty(&self) -> bool {
|
||||
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self;
|
||||
phrase.is_none()
|
||||
&& zero_typo.is_none()
|
||||
&& prefix_of.is_empty()
|
||||
&& synonyms.is_empty()
|
||||
&& use_prefix_db.is_none()
|
||||
}
|
||||
}
|
||||
impl OneTypoTerm {
|
||||
fn is_empty(&self) -> bool {
|
||||
let OneTypoTerm { split_words, one_typo } = self;
|
||||
one_typo.is_empty() && split_words.is_none()
|
||||
}
|
||||
}
|
||||
impl TwoTypoTerm {
|
||||
fn is_empty(&self) -> bool {
|
||||
let TwoTypoTerm { two_typos } = self;
|
||||
two_typos.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl QueryTerm {
|
||||
fn is_empty(&self) -> bool {
|
||||
let Lazy::Init(one_typo) = &self.one_typo else {
|
||||
return false;
|
||||
};
|
||||
let Lazy::Init(two_typo) = &self.two_typo else {
|
||||
return false;
|
||||
};
|
||||
|
||||
self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl Interned<QueryTerm> {
|
||||
/// Return the original word from the given query term
|
||||
fn original_single_word(self, ctx: &SearchContext) -> Option<Interned<String>> {
|
||||
let self_ = ctx.term_interner.get(self);
|
||||
if self_.ngram_words.is_some() {
|
||||
None
|
||||
} else {
|
||||
Some(self_.original)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A query term coupled with its position in the user's search query.
|
||||
#[derive(Clone)]
|
||||
pub struct LocatedQueryTerm {
|
||||
pub value: Interned<QueryTerm>,
|
||||
pub positions: RangeInclusive<u16>,
|
||||
}
|
||||
|
||||
impl LocatedQueryTerm {
|
||||
/// Return `true` iff the term is empty
|
||||
pub fn is_empty(&self, interner: &DedupInterner<QueryTerm>) -> bool {
|
||||
interner.get(self.value).is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl QueryTerm {
|
||||
pub fn is_cached_prefix(&self) -> bool {
|
||||
self.zero_typo.use_prefix_db.is_some()
|
||||
}
|
||||
pub fn original_word(&self, ctx: &SearchContext) -> String {
|
||||
ctx.word_interner.get(self.original).clone()
|
||||
}
|
||||
pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) {
|
||||
let mut words = BTreeSet::new();
|
||||
let mut phrases = BTreeSet::new();
|
||||
|
||||
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } =
|
||||
&self.zero_typo;
|
||||
words.extend(zero_typo.iter().copied());
|
||||
words.extend(prefix_of.iter().copied());
|
||||
phrases.extend(phrase.iter().copied());
|
||||
phrases.extend(synonyms.iter().copied());
|
||||
|
||||
if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = &self.one_typo {
|
||||
words.extend(one_typo.iter().copied());
|
||||
phrases.extend(split_words.iter().copied());
|
||||
};
|
||||
|
||||
if let Lazy::Init(TwoTypoTerm { two_typos }) = &self.two_typo {
|
||||
words.extend(two_typos.iter().copied());
|
||||
};
|
||||
|
||||
(words.into_iter().collect(), phrases.into_iter().collect())
|
||||
}
|
||||
}
|
||||
79
milli/src/search/new/query_term/ntypo_subset.rs
Normal file
79
milli/src/search/new/query_term/ntypo_subset.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use super::Phrase;
|
||||
use crate::search::new::interner::Interned;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub enum NTypoTermSubset {
|
||||
All,
|
||||
Subset {
|
||||
words: BTreeSet<Interned<String>>,
|
||||
phrases: BTreeSet<Interned<Phrase>>,
|
||||
// TODO: prefixes: BTreeSet<Interned<String>>,
|
||||
},
|
||||
Nothing,
|
||||
}
|
||||
|
||||
impl NTypoTermSubset {
|
||||
pub fn contains_word(&self, word: Interned<String>) -> bool {
|
||||
match self {
|
||||
NTypoTermSubset::All => true,
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word),
|
||||
NTypoTermSubset::Nothing => false,
|
||||
}
|
||||
}
|
||||
pub fn contains_phrase(&self, phrase: Interned<Phrase>) -> bool {
|
||||
match self {
|
||||
NTypoTermSubset::All => true,
|
||||
NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase),
|
||||
NTypoTermSubset::Nothing => false,
|
||||
}
|
||||
}
|
||||
pub fn is_empty(&self) -> bool {
|
||||
match self {
|
||||
NTypoTermSubset::All => false,
|
||||
NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(),
|
||||
NTypoTermSubset::Nothing => true,
|
||||
}
|
||||
}
|
||||
pub fn union(&mut self, other: &Self) {
|
||||
match self {
|
||||
Self::All => {}
|
||||
Self::Subset { words, phrases } => match other {
|
||||
Self::All => {
|
||||
*self = Self::All;
|
||||
}
|
||||
Self::Subset { words: w2, phrases: p2 } => {
|
||||
words.extend(w2);
|
||||
phrases.extend(p2);
|
||||
}
|
||||
Self::Nothing => {}
|
||||
},
|
||||
Self::Nothing => {
|
||||
*self = other.clone();
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn intersect(&mut self, other: &Self) {
|
||||
match self {
|
||||
Self::All => *self = other.clone(),
|
||||
Self::Subset { words, phrases } => match other {
|
||||
Self::All => {}
|
||||
Self::Subset { words: w2, phrases: p2 } => {
|
||||
let mut ws = BTreeSet::new();
|
||||
for w in words.intersection(w2) {
|
||||
ws.insert(*w);
|
||||
}
|
||||
let mut ps = BTreeSet::new();
|
||||
for p in phrases.intersection(p2) {
|
||||
ps.insert(*p);
|
||||
}
|
||||
*words = ws;
|
||||
*phrases = ps;
|
||||
}
|
||||
Self::Nothing => *self = Self::Nothing,
|
||||
},
|
||||
Self::Nothing => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
290
milli/src/search/new/query_term/parse_query.rs
Normal file
290
milli/src/search/new/query_term/parse_query.rs
Normal file
@@ -0,0 +1,290 @@
|
||||
use charabia::normalizer::NormalizedTokenIter;
|
||||
use charabia::{SeparatorKind, TokenKind};
|
||||
|
||||
use super::*;
|
||||
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
|
||||
|
||||
/// Convert the tokenised search query into a list of located query terms.
|
||||
pub fn located_query_terms_from_tokens(
|
||||
ctx: &mut SearchContext,
|
||||
query: NormalizedTokenIter<&[u8]>,
|
||||
words_limit: Option<usize>,
|
||||
) -> Result<Vec<LocatedQueryTerm>> {
|
||||
let nbr_typos = number_of_typos_allowed(ctx)?;
|
||||
|
||||
let mut located_terms = Vec::new();
|
||||
|
||||
let mut phrase: Option<PhraseBuilder> = None;
|
||||
|
||||
let parts_limit = words_limit.unwrap_or(usize::MAX);
|
||||
|
||||
// start with the last position as we will wrap around to position 0 at the beginning of the loop below.
|
||||
let mut position = u16::MAX;
|
||||
|
||||
let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable();
|
||||
while let Some(token) = peekable.next() {
|
||||
if token.lemma().is_empty() {
|
||||
continue;
|
||||
}
|
||||
// early return if word limit is exceeded
|
||||
if located_terms.len() >= parts_limit {
|
||||
return Ok(located_terms);
|
||||
}
|
||||
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord => {
|
||||
// On first loop, goes from u16::MAX to 0, then normal increment.
|
||||
position = position.wrapping_add(1);
|
||||
|
||||
// 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
|
||||
// 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
|
||||
// 3. if the word is the last token of the query we push it as a prefix word.
|
||||
if let Some(phrase) = &mut phrase {
|
||||
phrase.push_word(ctx, &token, position)
|
||||
} else if peekable.peek().is_some() {
|
||||
match token.kind {
|
||||
TokenKind::Word => {
|
||||
let word = token.lemma();
|
||||
let term = partially_initialized_term_from_word(
|
||||
ctx,
|
||||
word,
|
||||
nbr_typos(word),
|
||||
false,
|
||||
false,
|
||||
)?;
|
||||
let located_term = LocatedQueryTerm {
|
||||
value: ctx.term_interner.push(term),
|
||||
positions: position..=position,
|
||||
};
|
||||
located_terms.push(located_term);
|
||||
}
|
||||
TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {}
|
||||
}
|
||||
} else {
|
||||
let word = token.lemma();
|
||||
let term = partially_initialized_term_from_word(
|
||||
ctx,
|
||||
word,
|
||||
nbr_typos(word),
|
||||
true,
|
||||
false,
|
||||
)?;
|
||||
let located_term = LocatedQueryTerm {
|
||||
value: ctx.term_interner.push(term),
|
||||
positions: position..=position,
|
||||
};
|
||||
located_terms.push(located_term);
|
||||
}
|
||||
}
|
||||
TokenKind::Separator(separator_kind) => {
|
||||
match separator_kind {
|
||||
SeparatorKind::Hard => {
|
||||
position += 1;
|
||||
}
|
||||
SeparatorKind::Soft => {
|
||||
position += 0;
|
||||
}
|
||||
}
|
||||
|
||||
phrase = 'phrase: {
|
||||
let phrase = phrase.take();
|
||||
|
||||
// If we have a hard separator inside a phrase, we immediately start a new phrase
|
||||
let phrase = if separator_kind == SeparatorKind::Hard {
|
||||
if let Some(phrase) = phrase {
|
||||
if let Some(located_query_term) = phrase.build(ctx) {
|
||||
located_terms.push(located_query_term)
|
||||
}
|
||||
Some(PhraseBuilder::empty())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
phrase
|
||||
};
|
||||
|
||||
// We close and start a new phrase depending on the number of double quotes
|
||||
let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
|
||||
if quote_count == 0 {
|
||||
break 'phrase phrase;
|
||||
}
|
||||
|
||||
// Consume the closing quote and the phrase
|
||||
if let Some(phrase) = phrase {
|
||||
// Per the check above, quote_count > 0
|
||||
quote_count -= 1;
|
||||
if let Some(located_query_term) = phrase.build(ctx) {
|
||||
located_terms.push(located_query_term)
|
||||
}
|
||||
}
|
||||
|
||||
// Start new phrase if the token ends with an opening quote
|
||||
(quote_count % 2 == 1).then_some(PhraseBuilder::empty())
|
||||
};
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
// If a quote is never closed, we consider all of the end of the query as a phrase.
|
||||
if let Some(phrase) = phrase.take() {
|
||||
if let Some(located_query_term) = phrase.build(ctx) {
|
||||
located_terms.push(located_query_term);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(located_terms)
|
||||
}
|
||||
|
||||
pub fn number_of_typos_allowed<'ctx>(
|
||||
ctx: &SearchContext<'ctx>,
|
||||
) -> Result<impl Fn(&str) -> u8 + 'ctx> {
|
||||
let authorize_typos = ctx.index.authorize_typos(ctx.txn)?;
|
||||
let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
|
||||
let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
|
||||
|
||||
// TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms?
|
||||
let exact_words = ctx.index.exact_words(ctx.txn)?;
|
||||
|
||||
Ok(Box::new(move |word: &str| {
|
||||
if !authorize_typos
|
||||
|| word.len() < min_len_one_typo as usize
|
||||
|| exact_words.as_ref().map_or(false, |fst| fst.contains(word))
|
||||
{
|
||||
0
|
||||
} else if word.len() < min_len_two_typos as usize {
|
||||
1
|
||||
} else {
|
||||
2
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn make_ngram(
|
||||
ctx: &mut SearchContext,
|
||||
terms: &[LocatedQueryTerm],
|
||||
number_of_typos_allowed: &impl Fn(&str) -> u8,
|
||||
) -> Result<Option<LocatedQueryTerm>> {
|
||||
assert!(!terms.is_empty());
|
||||
for t in terms {
|
||||
if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() {
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
for ts in terms.windows(2) {
|
||||
let [t1, t2] = ts else { panic!() };
|
||||
if *t1.positions.end() != t2.positions.start() - 1 {
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
let mut words_interned = vec![];
|
||||
for term in terms {
|
||||
if let Some(original_term_word) = term.value.original_single_word(ctx) {
|
||||
words_interned.push(original_term_word);
|
||||
} else {
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
let words =
|
||||
words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::<Vec<_>>();
|
||||
|
||||
let start = *terms.first().as_ref().unwrap().positions.start();
|
||||
let end = *terms.last().as_ref().unwrap().positions.end();
|
||||
let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix;
|
||||
let ngram_str = words.join("");
|
||||
if ngram_str.len() > MAX_WORD_LENGTH {
|
||||
return Ok(None);
|
||||
}
|
||||
let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone());
|
||||
|
||||
let max_nbr_typos =
|
||||
number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);
|
||||
|
||||
let mut term =
|
||||
partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix, true)?;
|
||||
|
||||
// Now add the synonyms
|
||||
let index_synonyms = ctx.index.synonyms(ctx.txn)?;
|
||||
|
||||
term.zero_typo.synonyms.extend(
|
||||
index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| {
|
||||
let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
|
||||
ctx.phrase_interner.insert(Phrase { words })
|
||||
}),
|
||||
);
|
||||
|
||||
let term = QueryTerm {
|
||||
original: ngram_str_interned,
|
||||
ngram_words: Some(words_interned),
|
||||
is_prefix,
|
||||
max_nbr_typos,
|
||||
zero_typo: term.zero_typo,
|
||||
one_typo: Lazy::Uninit,
|
||||
two_typo: Lazy::Uninit,
|
||||
};
|
||||
|
||||
let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end };
|
||||
|
||||
Ok(Some(term))
|
||||
}
|
||||
|
||||
struct PhraseBuilder {
|
||||
words: Vec<Option<Interned<String>>>,
|
||||
start: u16,
|
||||
end: u16,
|
||||
}
|
||||
|
||||
impl PhraseBuilder {
|
||||
fn empty() -> Self {
|
||||
Self { words: Default::default(), start: u16::MAX, end: u16::MAX }
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.words.is_empty() || self.words.iter().all(Option::is_none)
|
||||
}
|
||||
|
||||
// precondition: token has kind Word or StopWord
|
||||
fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) {
|
||||
if self.is_empty() {
|
||||
self.start = position;
|
||||
}
|
||||
self.end = position;
|
||||
if let TokenKind::StopWord = token.kind {
|
||||
self.words.push(None);
|
||||
} else {
|
||||
// token has kind Word
|
||||
let word = ctx.word_interner.insert(token.lemma().to_string());
|
||||
// TODO: in a phrase, check that every word exists
|
||||
// otherwise return an empty term
|
||||
self.words.push(Some(word));
|
||||
}
|
||||
}
|
||||
|
||||
fn build(self, ctx: &mut SearchContext) -> Option<LocatedQueryTerm> {
|
||||
if self.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Some(LocatedQueryTerm {
|
||||
value: ctx.term_interner.push({
|
||||
let phrase = ctx.phrase_interner.insert(Phrase { words: self.words });
|
||||
let phrase_desc = phrase.description(ctx);
|
||||
QueryTerm {
|
||||
original: ctx.word_interner.insert(phrase_desc),
|
||||
ngram_words: None,
|
||||
max_nbr_typos: 0,
|
||||
is_prefix: false,
|
||||
zero_typo: ZeroTypoTerm {
|
||||
phrase: Some(phrase),
|
||||
exact: None,
|
||||
prefix_of: BTreeSet::default(),
|
||||
synonyms: BTreeSet::default(),
|
||||
use_prefix_db: None,
|
||||
},
|
||||
one_typo: Lazy::Uninit,
|
||||
two_typo: Lazy::Uninit,
|
||||
}
|
||||
}),
|
||||
positions: self.start..=self.end,
|
||||
})
|
||||
}
|
||||
}
|
||||
21
milli/src/search/new/query_term/phrase.rs
Normal file
21
milli/src/search/new/query_term/phrase.rs
Normal file
@@ -0,0 +1,21 @@
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::search::new::interner::Interned;
|
||||
use crate::SearchContext;
|
||||
|
||||
/// A phrase in the user's search query, consisting of several words
|
||||
/// that must appear side-by-side in the search results.
|
||||
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct Phrase {
|
||||
pub words: Vec<Option<Interned<String>>>,
|
||||
}
|
||||
impl Interned<Phrase> {
|
||||
pub fn description(self, ctx: &SearchContext) -> String {
|
||||
let p = ctx.phrase_interner.get(self);
|
||||
p.words.iter().flatten().map(|w| ctx.word_interner.get(*w)).join(" ")
|
||||
}
|
||||
pub fn words(self, ctx: &SearchContext) -> Vec<Option<Interned<String>>> {
|
||||
let p = ctx.phrase_interner.get(self);
|
||||
p.words.clone()
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user