Compare commits


170 Commits

Author SHA1 Message Date
620fee35f9 Fix benches 2023-11-06 11:56:46 +01:00
cbaa54cafd Fix clippy issues 2023-11-06 11:19:31 +01:00
1bccf2079e Correctly mark non-tests as non-tests 2023-11-06 11:03:56 +01:00
1b2ea6cf19 REVERT ME: ignore prefix pair databases tests 2023-11-06 10:46:22 +01:00
1ad1fcc8c8 Remove all warnings 2023-11-06 10:31:14 +01:00
87610a5f98 Don't try to delete a document that is not in the database 2023-11-02 16:49:03 +01:00
2544bc1416 Merge pull request #4160 from meilisearch/diff-indexing-vector-points
Diff Indexing for the vector points
2023-11-02 16:01:51 +01:00
ff522c919d Fix the vector extractions for the diff indexing 2023-11-02 15:58:08 +01:00
1c39459cf4 Merge pull request #4179 from meilisearch/diff-indexing-fix-nested-primary-key
Diff indexing fix nested primary key
2023-11-02 15:39:50 +01:00
bf0651f23c Implement iter method on ExternalDocumentsIds 2023-11-02 15:38:00 +01:00
5b20e625f3 fix merge 2023-11-02 15:31:37 +01:00
bc51d6157a Fix transform reindexing path 2023-11-02 15:26:20 +01:00
1b4ff991c0 update typed chunks 2023-11-02 15:26:20 +01:00
4b64c33aa2 update vector extractor 2023-11-02 15:26:20 +01:00
12323d610e Change the original document sorter key from the internal docid to a concatenation of the internal and the external docid 2023-11-02 15:26:20 +01:00
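A minimal std-only sketch of the idea behind the commit above, using hypothetical names rather than milli's actual types: the sorter key becomes the big-endian internal docid followed by the external docid bytes, so entries still sort by internal docid first while carrying the external id along.

```rust
/// Hypothetical sketch: build a sorter key that concatenates the internal
/// docid (big-endian, so lexicographic order matches numeric order) with
/// the external docid, as the commit above describes.
fn sorter_key(internal_docid: u32, external_docid: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(4 + external_docid.len());
    key.extend_from_slice(&internal_docid.to_be_bytes());
    key.extend_from_slice(external_docid.as_bytes());
    key
}

fn main() {
    let key = sorter_key(42, "movie-42");
    // The first 4 bytes are the internal docid, the rest is the external docid.
    assert_eq!(&key[..4], 42u32.to_be_bytes().as_slice());
    assert_eq!(&key[4..], b"movie-42".as_slice());
}
```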
44e9033b3a Merge pull request #4181 from meilisearch/diff-indexing-parallel-transform
Use rayon to sort entries in parallel
2023-11-02 15:16:10 +01:00
4d864f0702 Always sort internal Sorter entries in parallel 2023-11-02 14:47:43 +01:00
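The "sort entries in parallel" commits above boil down to swapping a sequential sort for rayon's parallel slice sort. A minimal sketch assuming the rayon crate, not the actual milli Sorter code:

```rust
// Illustrative only: sort sorter-style (key, value) entries on all cores.
use rayon::prelude::*;

fn main() {
    let mut entries: Vec<(Vec<u8>, Vec<u8>)> = (0..1_000u32)
        .rev()
        .map(|i| (i.to_be_bytes().to_vec(), b"value".to_vec()))
        .collect();

    // Parallel, unstable sort by key instead of a single-threaded sort.
    entries.par_sort_unstable_by(|(a, _), (b, _)| a.cmp(b));

    assert!(entries.windows(2).all(|w| w[0].0 <= w[1].0));
}
```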
b10c060bf7 Cleanup TOML 2023-11-01 14:03:04 +01:00
e507ef5932 Slow the logging down 2023-11-01 13:49:32 +01:00
c71b1d33ae Sort entries using rayon in the transform sorters 2023-11-01 11:07:16 +01:00
0fc446c62f Add more timing logs to the Transform 2023-11-01 11:07:16 +01:00
0fb6acefc3 Add snapshots for facets 2023-10-31 17:11:08 +01:00
b1d1355b69 remove tests on soft-deleted 2023-10-31 16:36:27 +01:00
f19332466e Extract field value as values instead of Option<Value> 2023-10-31 16:36:27 +01:00
03ddb4f310 use deladd in facet update tests 2023-10-31 16:36:27 +01:00
c855cc2721 Remove unused test 2023-10-31 16:36:27 +01:00
da0503ef80 Fix document count 2023-10-31 16:36:27 +01:00
94206b0055 Update tests 2023-10-31 13:48:47 +01:00
b40253bf18 update snapshots 2023-10-31 10:30:48 +01:00
d8bf3f3fc2 Remove unused snapshots 2023-10-31 10:12:49 +01:00
9d59e8011a fix some tests 2023-10-31 10:08:36 +01:00
dad78cbf8d Bulk facet remove deletes keys from DB when value empty 2023-10-31 09:53:55 +01:00
4e91707a06 Rename test 2023-10-31 09:41:17 +01:00
de10f20732 Fix field distribution again 2023-10-30 17:47:22 +01:00
be395c7944 Change order of arguments to tokenizer_builder 2023-10-30 16:26:29 +01:00
9fedd8101a Fix tests 2023-10-30 15:11:07 +01:00
54d07a8da3 Update field distribution taking into account both deletions and additions 2023-10-30 14:47:51 +01:00
58690dfb19 Fix tests compilation after changes to ExternalDocumentsIds API 2023-10-30 13:34:07 +01:00
abf424ebfc Remove unused FromIterator 2023-10-30 11:41:56 +01:00
dfab6293c9 Use an LMDB database to store the external documents ids 2023-10-30 11:41:23 +01:00
fdf3f7f627 Fix facet distribution test 2023-10-30 11:41:23 +01:00
6260cff65f Actually delete documents from DB when the merge function says so 2023-10-30 11:41:22 +01:00
8e0d9c9a5e Recover delete_documents tests that were too eagerly deleted 2023-10-30 11:41:22 +01:00
ae4ec8ea55 Add delete_document_using_wtxn to TempIndex 2023-10-30 11:41:22 +01:00
652ac3052d use new iterator in batch 2023-10-30 11:41:22 +01:00
9a2dccc3bc Add iterator to find external ids of a bitmap of internal ids 2023-10-30 11:41:22 +01:00
a35988550c Fix some snapshots 2023-10-30 11:41:22 +01:00
e78281785c Actually execute the transform even if there are only documents to delete 2023-10-30 11:41:22 +01:00
3c15881818 Add simple delete test 2023-10-30 11:41:22 +01:00
73c06d31d9 Snapshots always display stuff in a consistent order 2023-10-30 11:41:22 +01:00
290e773d23 remove more warnings and fix some tests 2023-10-30 11:41:22 +01:00
fa6c7f65ca Add TmpIndex::delete_documents 2023-10-30 11:41:22 +01:00
113527f466 Remove soft-deleted related methods from Index 2023-10-30 11:41:22 +01:00
c534a1b687 Stop using delete documents pipeline in batch runner 2023-10-30 11:41:22 +01:00
2263dff02b Stop using removed delete pipelines almost everywhere 2023-10-30 11:41:22 +01:00
d651b3ef01 Remove delete documents files 2023-10-30 11:41:20 +01:00
762b0b47e6 Use deladd merging function in chunks mergers 2023-10-30 11:40:20 +01:00
01d5eedf2f Remove some warnings 2023-10-30 11:40:20 +01:00
073f89db79 Fix facet tests 2023-10-30 11:40:20 +01:00
8370fbc92b Fix snaps 2023-10-30 11:40:20 +01:00
85f42fbc03 Handle external to internal id mapping from TypedChunk::Documents 2023-10-30 11:40:20 +01:00
c6b3c18c85 WIP: Comment out document deletion in other pipelines than update
TODO: fix calls to DELETE route
2023-10-30 11:40:20 +01:00
bafeb892a7 Modify Index after changes to ExternalDocumentsIds 2023-10-30 11:40:20 +01:00
8fb221dae3 Refactor ExternalDocumentsIds
- Remove soft deleted
- Add apply method that takes a list of operations to encapsulate modifications to the external -> internal mapping
2023-10-30 11:40:20 +01:00
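A rough sketch of the `apply` idea described in the refactor above, with a hypothetical in-memory map standing in for the LMDB database milli actually uses, and illustrative operation names rather than the real API:

```rust
use std::collections::BTreeMap;

/// Hypothetical operation type: either set an external id -> internal id
/// mapping or delete it (names are illustrative, not milli's exact API).
enum DocumentOperation {
    Set { external_id: String, internal_id: u32 },
    Delete { external_id: String },
}

/// In-memory stand-in for the external -> internal documents ids mapping.
#[derive(Default)]
struct ExternalDocumentsIds(BTreeMap<String, u32>);

impl ExternalDocumentsIds {
    /// Apply a batch of operations, so every modification to the mapping
    /// goes through one encapsulated entry point, as the refactor describes.
    fn apply(&mut self, operations: Vec<DocumentOperation>) {
        for op in operations {
            match op {
                DocumentOperation::Set { external_id, internal_id } => {
                    self.0.insert(external_id, internal_id);
                }
                DocumentOperation::Delete { external_id } => {
                    self.0.remove(&external_id);
                }
            }
        }
    }
}

fn main() {
    let mut ids = ExternalDocumentsIds::default();
    ids.apply(vec![
        DocumentOperation::Set { external_id: "doc-1".into(), internal_id: 0 },
        DocumentOperation::Set { external_id: "doc-2".into(), internal_id: 1 },
        DocumentOperation::Delete { external_id: "doc-1".into() },
    ]);
    assert_eq!(ids.0.get("doc-2"), Some(&1));
    assert!(ids.0.get("doc-1").is_none());
}
```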
5be569e3e2 Update obkv 2023-10-30 11:40:20 +01:00
946c762d28 WIP: reset documents in TypedChunk::Documents 2023-10-30 11:40:20 +01:00
cda6ca1ee6 Remove TypedChunk::NewDocumentIds 2023-10-30 11:40:18 +01:00
696fcf4d18 Fix document insertion into LMDB 2023-10-30 11:39:31 +01:00
476e4d3dbe Use value buffer instead of the initial value when writing the final result in the sorter 2023-10-30 11:39:31 +01:00
576fa9c6da Remove useless comment 2023-10-30 11:39:31 +01:00
77dcbff6b2 Remove and Insert the DelAdd geo points 2023-10-30 11:39:31 +01:00
544440c363 Ignore geo fields when the Del and Add content is the same 2023-10-30 11:39:31 +01:00
a3dae4db9b Extract the geo fields DelAdd and generate a new DelAdd obkv with it 2023-10-30 11:39:31 +01:00
ba90a5ec0e update extract fid word count docids 2023-10-30 11:39:31 +01:00
b26dc9aabe Explanatory code comment 2023-10-30 11:39:31 +01:00
66abac9364 Use specialized KvReaderDelAdd type
Co-authored-by: Clément Renault <clement@meilisearch.com>
2023-10-30 11:39:31 +01:00
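Many of the surrounding commits rewrite the extractors around "DelAdd" key-value objects that carry both the deleted and the added version of a value. A std-only sketch of the concept, not the actual obkv-based types milli uses:

```rust
use std::collections::BTreeMap;

/// The two sides of a diff-indexing entry (illustrative enum; milli stores
/// these as two keys of an obkv value rather than a Rust enum).
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum DelAdd {
    Deletion,
    Addition,
}

/// A value that may have a deleted side, an added side, or both.
type DelAddValue = BTreeMap<DelAdd, Vec<u8>>;

/// Sketch of "only serialize the Add side": keep the addition, drop the rest.
fn addition_only(value: &DelAddValue) -> Option<&[u8]> {
    value.get(&DelAdd::Addition).map(Vec::as_slice)
}

fn main() {
    let mut value = DelAddValue::new();
    value.insert(DelAdd::Deletion, b"old title".to_vec());
    value.insert(DelAdd::Addition, b"new title".to_vec());
    assert_eq!(addition_only(&value), Some(&b"new title"[..]));
}
```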
59f88c14b3 Simplify facet update after removing Index::faceted_documents_ids 2023-10-30 11:39:29 +01:00
14832cb324 Remove Index::faceted_documents_ids 2023-10-30 11:37:32 +01:00
04ec293024 Facet Incremental update 2023-10-30 11:37:30 +01:00
f67ff3a738 Facets Bulk update 2023-10-30 11:36:40 +01:00
560e8f5613 Introduce the CboRoaringBitmapCodec merge_deladd_into and use it 2023-10-30 11:34:55 +01:00
2d3f15f82c Introduce a function to only serialize the Add side of a DelAdd obkv 2023-10-30 11:34:55 +01:00
40186bf403 Rename FieldIdWordCountDocids correctly 2023-10-30 11:34:50 +01:00
87e3d27878 update extract word pair proximity to support deladd obkvs 2023-10-30 11:34:02 +01:00
6bcf8b4f8c update extract word position docids 2023-10-30 11:34:02 +01:00
46aa75abdb update extract word docids 2023-10-30 11:34:02 +01:00
2597bbd107 Make the script language docids map take a tuple of roaring bitmaps expressing the deletions and the additions 2023-10-30 11:34:00 +01:00
e2bc054604 Update extract_facet_string_docids to support deladd obkvs 2023-10-30 11:32:36 +01:00
fcd3a1434d Update extract_facet_number_docids to support deladd obkvs 2023-10-30 11:31:04 +01:00
a82dee21e0 Rename docid_fid into fid_docid 2023-10-30 11:31:02 +01:00
bc45c1206d Implement all the facet extraction paths and simplify them 2023-10-30 11:29:08 +01:00
6ae4100f07 Generate the DelAdd for is_null, is_empty, and exists 2023-10-30 11:29:08 +01:00
0c47defeee Work on fid docid facet values rewrite 2023-10-30 11:29:06 +01:00
313b16bec2 Support diff indexing on extract_docid_word_positions 2023-10-30 11:24:19 +01:00
1dd97578a8 Make the transform struct return diff-based documents obkvs 2023-10-30 11:22:07 +01:00
f5ef69293b deactivate prefix dbs 2023-10-30 11:22:07 +01:00
1c5705c164 clean PR warnings 2023-10-30 11:22:05 +01:00
66c2c82a18 Split wpp in several sorters 2023-10-30 11:15:02 +01:00
28a8d0ccda Fix word pair proximity 2023-10-30 11:15:02 +01:00
96be85396d Use a VecDeque in the wpp (word pair proximity) database 2023-10-30 11:15:02 +01:00
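A sketch of how a `VecDeque` can serve as the sliding window when extracting word pairs within a maximum proximity. This is illustrative only, with a hypothetical `MAX_DISTANCE` value, not milli's exact extractor:

```rust
use std::collections::VecDeque;

// Hypothetical constant for the sketch; the real value lives in milli.
const MAX_DISTANCE: u32 = 4;

/// Emit (left, right, proximity) triples for words whose positions are
/// closer than MAX_DISTANCE, using a VecDeque as the sliding window.
fn word_pairs(words: &[(String, u32)]) -> Vec<(String, String, u32)> {
    let mut window: VecDeque<(String, u32)> = VecDeque::new();
    let mut pairs = Vec::new();
    for (word, position) in words.iter().cloned() {
        // Drop the words that fell out of the proximity window.
        while window.front().map_or(false, |(_, p)| position - p >= MAX_DISTANCE) {
            window.pop_front();
        }
        for (left, left_pos) in &window {
            pairs.push((left.clone(), word.clone(), position - left_pos));
        }
        window.push_back((word, position));
    }
    pairs
}

fn main() {
    let words: Vec<(String, u32)> = vec![
        ("the".to_string(), 0),
        ("quick".to_string(), 1),
        ("brown".to_string(), 2),
        ("fox".to_string(), 5),
    ];
    for (l, r, prox) in word_pairs(&words) {
        println!("{l} .. {r} (proximity {prox})");
    }
}
```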
df9e5c8651 Generalize usage of CboRoaringBitmap codec to ease the use 2023-10-30 11:15:02 +01:00
b541d48847 Add a buffer to the obkv writer 2023-10-30 11:15:02 +01:00
8ccf32d1a0 Compute word_fid_docids before word_docids and exact_word_docids 2023-10-30 11:15:02 +01:00
db1ca21231 Add puffin profiling to the sorter-into-reader function 2023-10-30 11:15:00 +01:00
11ea5acff9 Fix 2023-10-30 11:13:10 +01:00
8d77736a67 Fix fid_word_docids 2023-10-30 11:13:10 +01:00
748b333161 Add a useful debug assert before key insertion in the database 2023-10-30 11:13:10 +01:00
17b647dfe5 Wip 2023-10-30 11:13:08 +01:00
62ea81bef6 Merge #4132
4132: Extract the creation and last updated timestamp from v2 dumps r=irevoire a=vivek-26

# Pull Request

## Related issue
Fixes #2989

## What does this PR do?
This PR - 
- extracts the `created_at` and `updated_at` dates from v2 dumps.
- updates the unit tests.

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Vivek Kumar <vivek.26@outlook.com>
2023-10-24 08:50:57 +00:00
f28f09ae2f update tests for v2 dumps 2023-10-24 14:10:46 +05:30
eae9eab181 Merge #4126
4126: Make the experimental route /metrics activable via HTTP r=dureuill a=braddotcoffee

# Pull Request

## Related issue
Closes #4086

## What does this PR do?
- [x] Make `/metrics` available via HTTP as described in #4086 
- [x] The users can still launch Meilisearch using the `--experimental-enable-metrics` flag.
- [x] If the flag `--experimental-enable-metrics` is activated, a call to the `GET /experimental-features` route right after the launch will show `"metrics": true` even if the user has not called the `PATCH /experimental-features` route yet.
- [x] Even if the --experimental-enable-metrics flag is present at launch, calling the `PATCH /experimental-features` route with `"metrics": false` disables the experimental feature.
- [x] Update the spec
    - I was unable to find docs in this repository to update about the `/experimental-features` endpoint. I'll happily update if you point me in the right direction!

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Co-authored-by: bwbonanno <bradfordbonanno@gmail.com>
Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2023-10-23 08:51:37 +00:00
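A std-only sketch of the runtime-toggle behavior described in the checklist above: the CLI flag only seeds the initial state, and a later PATCH to `/experimental-features` overwrites the in-memory value. Names and structure are hypothetical, not the actual index-scheduler code:

```rust
use std::sync::RwLock;

/// Hypothetical runtime feature flags: never persisted to the db,
/// seeded from the CLI, mutable over HTTP afterwards.
struct FeatureFlags {
    metrics: RwLock<bool>,
}

impl FeatureFlags {
    fn new(cli_enable_metrics: bool) -> Self {
        Self { metrics: RwLock::new(cli_enable_metrics) }
    }

    /// What a PATCH /experimental-features {"metrics": ...} call would do.
    fn set_metrics(&self, enabled: bool) {
        *self.metrics.write().unwrap() = enabled;
    }

    /// What GET /experimental-features and the /metrics guard would read.
    fn metrics(&self) -> bool {
        *self.metrics.read().unwrap()
    }
}

fn main() {
    // Launched with --experimental-enable-metrics: reported as enabled...
    let features = FeatureFlags::new(true);
    assert!(features.metrics());
    // ...until a PATCH with "metrics": false disables it at runtime.
    features.set_metrics(false);
    assert!(!features.metrics());
}
```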
cf8dad1ca0 index_scheduler.features() is no longer fallible 2023-10-23 10:38:56 +02:00
dd619913da Use RwLock to never persist cli state to db 2023-10-19 12:45:57 -07:00
9b55ff16e9 Merge #4134
4134: Bump rustix from 0.36.15 to 0.36.16 r=Kerollmops a=dependabot[bot]

Bumps [rustix](https://github.com/bytecodealliance/rustix) from 0.36.15 to 0.36.16.
<details>
<summary>Commits</summary>
<ul>
<li><a href="6534992521"><code>6534992</code></a> chore: Release rustix version 0.36.16</li>
<li><a href="4928cf7a38"><code>4928cf7</code></a> Disable riscv64 testing.</li>
<li><a href="8cc159c4c3"><code>8cc159c</code></a> Fix the <code>test_ttyname_ok</code> test when /dev/stdin is inaccessable. (<a href="https://redirect.github.com/bytecodealliance/rustix/issues/821">#821</a>)</li>
<li><a href="6dc7ba9478"><code>6dc7ba9</code></a> Downgrade dependencies and disable tests to compile under Rust 1.48.</li>
<li><a href="ded8986e7e"><code>ded8986</code></a> Disable MIPS in CI. (<a href="https://redirect.github.com/bytecodealliance/rustix/issues/793">#793</a>)</li>
<li><a href="739f9c3ba0"><code>739f9c3</code></a> Fixes for <code>Dir</code> on macOS, FreeBSD, and WASI.</li>
<li><a href="87481a97f4"><code>87481a9</code></a> Merge pull request from GHSA-c827-hfw6-qwvm</li>
<li>See full diff in <a href="https://github.com/bytecodealliance/rustix/compare/v0.36.15...v0.36.16">compare view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=rustix&package-manager=cargo&previous-version=0.36.15&new-version=0.36.16)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting ``@dependabot` rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- ``@dependabot` rebase` will rebase this PR
- ``@dependabot` recreate` will recreate this PR, overwriting any edits that have been made to it
- ``@dependabot` merge` will merge this PR after your CI passes on it
- ``@dependabot` squash and merge` will squash and merge this PR after your CI passes on it
- ``@dependabot` cancel merge` will cancel a previously requested merge and block automerging
- ``@dependabot` reopen` will reopen this PR if it is closed
- ``@dependabot` close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- ``@dependabot` show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- ``@dependabot` ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/meilisearch/meilisearch/network/alerts).

</details>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-10-19 08:01:36 +00:00
e761db582f Bump rustix from 0.36.15 to 0.36.16
Bumps [rustix](https://github.com/bytecodealliance/rustix) from 0.36.15 to 0.36.16.
- [Release notes](https://github.com/bytecodealliance/rustix/releases)
- [Commits](https://github.com/bytecodealliance/rustix/compare/v0.36.15...v0.36.16)

---
updated-dependencies:
- dependency-name: rustix
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-10-18 18:42:12 +00:00
d8c649b3cd Return recoverable error if we fail to retrieve metrics state 2023-10-18 08:28:24 -07:00
5e0485d8dd Merge #4131
4131: Reduce proximity range from 7 to 3 r=Kerollmops a=ManyTheFish

## Summary
This PR aims to reduce the impact of the proximity databases on the indexing time and on the database size by reducing the maximum distance between two words to be indexed in the proximity database.

## Stats

### Impact on database size and indexing time
![Impact on datasets](https://github.com/meilisearch/meilisearch/assets/6482087/28ed3d96-bdde-41c1-bdac-e90c1b1dbb23)

### Impact on search relevancy

<details>

| dataset_name | host_name        | Relevancy rate (Precision) | completion_rate  25.00% | completion_rate 50.00% | completion_rate 75.00% | completion_rate 100.00% |
|--------------|------------------|------------------------------------|-----------------|-----------------|-----------------|-----------------|
| FBIS         | 1_4_0            | percentile-10 |           0.00% |           0.00% |           0.00% |           0.00% |
| FBIS         | 1_4_0            | percentile-25 |           0.00% |           0.00% |           0.00% |           0.00% |
| FBIS         | 1_4_0            | percentile-50 |           0.00% |           0.00% |           5.00% |           5.56% |
| FBIS         | 1_4_0            | percentile-75 |           0.00% |          12.50% |          35.00% |          45.00% |
| FBIS         | 1_4_0            | percentile-90 |          20.00% |          40.00% |                 |         100.00% |
| FBIS         | 1_4_0            | average       |           5.78% |          11.16% |          21.90% |          26.29% |
| FBIS         | reduce_proximity | percentile-10 |           0.00% |           0.00% |           0.00% |           0.00% |
| FBIS         | reduce_proximity | percentile-25 |           0.00% |           0.00% |           0.00% |           0.00% |
| FBIS         | reduce_proximity | percentile-50 |           0.00% |           0.00% |           5.00% |           5.56% |
| FBIS         | reduce_proximity | percentile-75 |           0.00% |          15.00% |          35.00% |          40.00% |
| FBIS         | reduce_proximity | percentile-90 |          20.00% |          40.00% |          85.00% |         100.00% |
| FBIS         | reduce_proximity | average       |           5.55% |          11.34% |          21.75% |          26.14% |
| FR94         | 1_4_0            | percentile-10 |           0.00% |           0.00% |           0.00% |           0.00% |
| FR94         | 1_4_0            | percentile-25 |           0.00% |           0.00% |           0.00% |           0.00% |
| FR94         | 1_4_0            | percentile-50 |           0.00% |           0.00% |           0.00% |           0.00% |
| FR94         | 1_4_0            | percentile-75 |           0.00% |           5.00% |          15.00% |          42.11% |
| FR94         | 1_4_0            | percentile-90 |          15.00% |          54.55% |         100.00% |         100.00% |
| FR94         | 1_4_0            | average       |           5.95% |          12.07% |          18.70% |          25.57% |
| FR94         | reduce_proximity | percentile-10 |           0.00% |           0.00% |           0.00% |           0.00% |
| FR94         | reduce_proximity | percentile-25 |           0.00% |           0.00% |           0.00% |           0.00% |
| FR94         | reduce_proximity | percentile-50 |           0.00% |           0.00% |           0.00% |           0.00% |
| FR94         | reduce_proximity | percentile-75 |           0.00% |           5.00% |          15.00% |          42.11% |
| FR94         | reduce_proximity | percentile-90 |          15.00% |          54.55% |         100.00% |         100.00% |
| FR94         | reduce_proximity | average       |           5.79% |          12.00% |          18.70% |          25.53% |
| FT           | 1_4_0            | percentile-10 |           0.00% |           0.00% |           0.00% |           0.00% |
| FT           | 1_4_0            | percentile-25 |           0.00% |           0.00% |           0.00% |           0.00% |
| FT           | 1_4_0            | percentile-50 |           0.00% |           0.00% |           5.00% |          10.00% |
| FT           | 1_4_0            | percentile-75 |           0.00% |          15.00% |          30.00% |          40.00% |
| FT           | 1_4_0            | percentile-90 |          20.00% |          50.00% |          65.00% |         100.00% |
| FT           | 1_4_0            | average       |           5.08% |          12.58% |          20.00% |          25.49% |
| FT           | reduce_proximity | percentile-10 |           0.00% |           0.00% |           0.00% |           0.00% |
| FT           | reduce_proximity | percentile-25 |           0.00% |           0.00% |           0.00% |           0.00% |
| FT           | reduce_proximity | percentile-50 |           0.00% |           0.00% |           5.00% |          10.00% |
| FT           | reduce_proximity | percentile-75 |           0.00% |          15.00% |          30.00% |          40.00% |
| FT           | reduce_proximity | percentile-90 |          10.00% |          45.00% |          60.00% |         100.00% |
| FT           | reduce_proximity | average       |           5.01% |          12.64% |          20.10% |          25.53% |
| LAT          | 1_4_0            | percentile-10 |           0.00% |           0.00% |           0.00% |           0.00% |
| LAT          | 1_4_0            | percentile-25 |           0.00% |           0.00% |           0.00% |           0.00% |
| LAT          | 1_4_0            | percentile-50 |           0.00% |           0.00% |           5.00% |           5.00% |
| LAT          | 1_4_0            | percentile-75 |           5.00% |          15.00% |          30.00% |          30.00% |
| LAT          | 1_4_0            | percentile-90 |          15.00% |          45.00% |          60.00% |          80.00% |
| LAT          | 1_4_0            | average       |           4.80% |          11.80% |          17.88% |          21.62% |
| LAT          | reduce_proximity | percentile-10 |           0.00% |           0.00% |           0.00% |           0.00% |
| LAT          | reduce_proximity | percentile-25 |           0.00% |           0.00% |           0.00% |           0.00% |
| LAT          | reduce_proximity | percentile-50 |           0.00% |           0.00% |           5.00% |           5.00% |
| LAT          | reduce_proximity | percentile-75 |           0.00% |          11.11% |          25.00% |          35.00% |
| LAT          | reduce_proximity | percentile-90 |          15.00% |          45.00% |          55.00% |          80.00% |
| LAT          | reduce_proximity | average       |           4.43% |          11.23% |          17.32% |          21.45% |

</details>

### Impact on Search time

| dataset_name | host_name        |      25.00% |      50.00% |      75.00% |     100.00% | Average     |
|--------------|------------------|------------:|------------:|------------:|------------:|-------------|
| FBIS         | 1_4_0            |        3.45 | 7.446666667 | 9.773489933 | 9.620300752 | 7.572614338 |
| FBIS         | reduce_proximity | 2.983333333 | 5.316666667 | 6.911073826 | 7.637218045 | 5.712072968 |
| FR94         | 1_4_0            | 2.236666667 |        4.45 | 5.523489933 | 4.560150376 | 4.192576744 |
| FR94         | reduce_proximity |        2.09 | 3.991666667 | 4.981543624 | 4.266917293 | 3.832531896 |
| FT           | 1_4_0            | 5.956666667 | 9.656666667 | 13.86912752 | 10.83270677 |  10.0787919 |
| FT           | reduce_proximity |        4.51 | 5.981666667 | 7.701342282 | 6.766917293 |  6.23998156 |
| LAT          | 1_4_0            | 5.856666667 | 9.233333333 | 12.98322148 | 10.78759398 | 9.715203865 |
| LAT          | reduce_proximity |        6.91 | 6.706666667 | 8.463087248 | 8.265037594 | 7.586197877 |

## Technical approach

- Ensure the MAX_DISTANCE constant is used everywhere needed
- Reduce the MAX_DISTANCE from 8 to 4

## Related

TBD

Co-authored-by: ManyTheFish <many@meilisearch.com>
2023-10-18 14:56:08 +00:00
27eec21415 Fix tests 2023-10-18 16:03:22 +02:00
62cc97ba70 update tests to include created_at and updated-at in v2 dumps 2023-10-18 13:31:39 +05:30
fed59cc1d5 extract created_at and updated_at dates from v2 dumps 2023-10-18 13:30:24 +05:30
2b3adef796 Use index_scheduler from configured app_data in middleware 2023-10-17 08:17:13 -07:00
956cfc5487 Add runtime check to metrics middleware 2023-10-16 13:48:57 -07:00
12fc878640 Merge remote-tracking branch 'origin/main' into enable-metrics-http 2023-10-16 13:48:01 -07:00
0a2e8b92a9 Merge #4129
4129: Add webinar banner in README r=curquiza a=curquiza



Co-authored-by: curquiza <clementine@meilisearch.com>
2023-10-16 17:35:48 +00:00
c7a3f80de6 Merge #4073
4073: Simplify Puffin report exports r=ManyTheFish a=Kerollmops

This PR changes how we export Puffin reports by directly writing them to disk when the `exportPuffinReports` [experimental feature is enabled](https://www.meilisearch.com/docs/learn/experimental/overview) on the `/experimental-features` route. It also adds more puffin logging to the deletion phase and the grenad helpers. The Puffin reports are identified by the date and time at which they are exported.

## Todo List
 - [x] Change the CLI flag to be an API experimental option.
 - [x] Create [a PRD for this experimental feature (private)](https://www.notion.so/meilisearch/Export-Puffin-Reports-091df151e71c4edfb7d72f4bf995b3ea).
 - [x] Create and complete [a product discussion](https://github.com/meilisearch/product/discussions/693) (copy/paste PROFILING markdown?).
 - [x] Update the _PROFILING.md_ markdown file instructions.
 - [x] Change the debug logs of the processing operation (visible in puffin viewer).

Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
2023-10-16 15:48:15 +00:00
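For context on what a Puffin "frame" per `IndexScheduler::tick` means, a minimal sketch using the puffin crate's scope and frame APIs. This is illustrative only and assumes the `puffin` crate; the actual export-to-disk wiring added by this PR lives in the index scheduler:

```rust
// Frame-based profiling: scopes are recorded between calls to new_frame().
fn tick() {
    puffin::profile_function!(); // records a scope covering this whole call
    {
        puffin::profile_scope!("process_batch");
        // ... batch processing work would happen here ...
    }
}

fn main() {
    puffin::set_scopes_on(true); // scopes are a no-op unless enabled
    for _ in 0..3 {
        tick();
        // One "frame" is recorded per scheduler tick.
        puffin::GlobalProfiler::lock().new_frame();
    }
}
```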
029d4de043 Add webinar banner in README 2023-10-16 14:38:10 +02:00
549f1bcccf Merge #4125
4125: Rename benchmark CI file to find it easily in the manifest list r=Kerollmops a=curquiza



Co-authored-by: curquiza <clementine@meilisearch.com>
2023-10-16 11:38:28 +00:00
689ec7c7ad Make the experimental route /metrics activable via HTTP 2023-10-13 22:12:54 +00:00
3655d4bdca Move the puffin file export logic into the run function 2023-10-13 13:11:30 +02:00
055ca3935b Update index-scheduler/src/batch.rs
Co-authored-by: Tamo <tamo@meilisearch.com>
2023-10-13 13:11:30 +02:00
1b8871a585 Make cargo insta happy 2023-10-13 13:11:30 +02:00
bf8fac6676 Fix the tests 2023-10-13 13:11:30 +02:00
f2a9e1ebbb Improve the debugging experience in the puffin reports 2023-10-13 13:11:30 +02:00
c45c6cf54c Update the PROFILING.md file 2023-10-13 13:11:30 +02:00
513e61e9a3 Remove the experimental CLI flag 2023-10-13 13:11:29 +02:00
90a626bf80 Use the runtime feature to enable puffin report exporting 2023-10-13 13:11:29 +02:00
0d4acf2daa Fix the metrics product URL 2023-10-13 13:11:29 +02:00
58db8d85ec Add the exportPuffinReports option to the runtime features route 2023-10-13 13:11:29 +02:00
62dfd09dc6 Add more puffin logs to the deletion functions 2023-10-13 13:11:09 +02:00
656dadabea Expose an experimental flag to write the puffin reports to disk 2023-10-13 13:11:09 +02:00
c5f7893fbb Remove the puffin http dependency 2023-10-13 13:11:08 +02:00
8cf2ccf168 Rename benchmark CI file to find it easily in the manifest list 2023-10-12 18:41:26 +02:00
0913373a5e Merge #4122
4122: Bring back changes from `release-v1.4.1` into `main` r=Kerollmops a=curquiza



Co-authored-by: curquiza <curquiza@users.noreply.github.com>
Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com>
Co-authored-by: Tamo <tamo@meilisearch.com>
Co-authored-by: Vivek Kumar <vivek.26@outlook.com>
Co-authored-by: Clément Renault <clement@meilisearch.com>
2023-10-12 15:57:47 +00:00
1a7f1282af Fix test to use new common Value type 2023-10-12 17:37:04 +02:00
bc747aac3a Cut the first 8 characters 2023-10-12 15:04:37 +02:00
be92376ab3 Fix originating commit branch 2023-10-12 13:51:41 +02:00
cf7e355735 Fix originating commit command 2023-10-12 13:12:53 +02:00
5f09d89ad1 Fetch the whole git history when cloning 2023-10-12 12:25:26 +02:00
6ecb26a3f8 Add more info on the commenting CI command 2023-10-12 11:54:56 +02:00
76c6f554d6 Merge #4101
4101: Bump webpki from 0.22.1 to 0.22.2 r=curquiza a=dependabot[bot]

Bumps [webpki](https://github.com/briansmith/webpki) from 0.22.1 to 0.22.2.
<details>
<summary>Commits</summary>
<ul>
<li>See full diff in <a href="https://github.com/briansmith/webpki/commits">compare view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=webpki&package-manager=cargo&previous-version=0.22.1&new-version=0.22.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting ``@dependabot` rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- ``@dependabot` rebase` will rebase this PR
- ``@dependabot` recreate` will recreate this PR, overwriting any edits that have been made to it
- ``@dependabot` merge` will merge this PR after your CI passes on it
- ``@dependabot` squash and merge` will squash and merge this PR after your CI passes on it
- ``@dependabot` cancel merge` will cancel a previously requested merge and block automerging
- ``@dependabot` reopen` will reopen this PR if it is closed
- ``@dependabot` close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- ``@dependabot` show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- ``@dependabot` ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/meilisearch/meilisearch/network/alerts).

</details>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-10-12 08:46:04 +00:00
f343ef5f2f Merge #4108
4108: Fix bug where search with distinct attribute and no ranking, returns offset+limit hits r=curquiza a=vivek-26

# Pull Request

## Related issue
Fixes #4078 

## What does this PR do?
This PR - 
- Fixes a bug where a search with a distinct attribute and no ranking returns offset+limit hits.
- Adds unit and integration tests.

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Vivek Kumar <vivek.26@outlook.com>
2023-10-12 07:51:29 +00:00
96982a768a Triggers for every type of issue_comment 2023-10-11 23:18:29 +02:00
fca78fbc46 Merge #4082
4082: Update sprint_issue.md r=curquiza a=curquiza

Following internal recent discussions

Co-authored-by: Clémentine U. - curqui <clementine@meilisearch.com>
2023-10-11 15:12:38 +00:00
67a678cfb6 Merge #4089
4089: Use a bufreader and bufwriter every time there is a grenad<file> r=curquiza a=irevoire

# Pull Request
Wrap all the files we give to a grenad in a `BufReader` or `BufWriter`.

The dump import I tried in the issue went from 2h to 10 minutes on my machine.

I also ran a bunch of benchmarks on my machine, and we're faster by a few seconds everywhere but nothing huge.

-----

The one thing I'm worried about is code that used to take the inner file out of a grenad and then read from it right away, without seeking back to the beginning of the file or reopening it.
Since we now use a BufReader, such a read would return bytes one buffer later and probably completely corrupt what we were supposed to read.

From what I see, it looks like it works, but I may have missed something, I don't know much about this part of the codebase.

This issue should not arise with the BufWriter, though: if we are unable to write the content of the buffer, I ensured that the BufWriter's `into_inner` returns an internal error.

## Related issue
Fixes #4087


Co-authored-by: Tamo <tamo@meilisearch.com>
2023-10-11 14:27:00 +00:00
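A std-only sketch of the BufWriter concern described above: `into_inner` flushes the buffer and surfaces a failed write, so it can be turned into a plain I/O error instead of silently truncating the file. The error mapping and file name are hypothetical, not the exact milli code:

```rust
use std::fs::File;
use std::io::{self, BufWriter, Write};

/// Write a grenad-style payload through a BufWriter and recover the File,
/// turning a failed flush into an io::Error (sketch of the PR's approach).
fn write_payload(path: &str, payload: &[u8]) -> io::Result<File> {
    let file = File::create(path)?;
    let mut writer = BufWriter::new(file);
    writer.write_all(payload)?;
    // `into_inner` flushes the buffer; if that fails we get the error back
    // instead of losing the tail of the file.
    writer.into_inner().map_err(|e| e.into_error())
}

fn main() -> io::Result<()> {
    let file = write_payload("payload.grenad", b"some entries")?;
    drop(file);
    std::fs::remove_file("payload.grenad")
}
```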
d1331d8abf add integration test for distinct search with no ranking 2023-10-11 19:12:56 +05:30
19ba129165 add unit test for distinct search with no ranking 2023-10-11 19:02:27 +05:30
d4da06ff47 fix bug where distinct search with no ranking returns offset+limit hits 2023-10-11 19:02:16 +05:30
3e0471edae Only trigger CI on created or edited comments 2023-10-11 15:15:15 +02:00
432df03c4c Use the correct base filename in the comment bench CI 2023-10-11 14:57:03 +02:00
11958016dd Force a small if to avoid triggering the CI every time 2023-10-11 14:27:51 +02:00
63c250a04d Do not use the GITHUB_REF variable 2023-10-11 13:05:54 +02:00
06d8cd5b72 Make sure that we checkout on the right branch 2023-10-11 12:02:44 +02:00
c0f2724c2d Get rid of the newly introduced error code in favor of an io::Error 2023-10-10 15:12:23 +02:00
d772073dfa Use a bufreader every time there is a grenad<file> 2023-10-10 15:00:30 +02:00
8fe8ddea79 Merge #4112
4112: Update version for the next release (v1.4.1) in Cargo.toml r=curquiza a=meili-bot

⚠️ This PR is automatically generated. Check the new version is the expected one and Cargo.lock has been updated before merging.

Co-authored-by: curquiza <curquiza@users.noreply.github.com>
2023-10-10 09:05:10 +00:00
8a95bf28e5 Update version for the next release (v1.4.1) in Cargo.toml 2023-10-10 09:01:45 +00:00
43989fe2e4 Reduce proximity range from 7 to 3 2023-10-03 12:16:48 +02:00
c668a29ed5 Bump webpki from 0.22.1 to 0.22.2
Bumps [webpki](https://github.com/briansmith/webpki) from 0.22.1 to 0.22.2.
- [Commits](https://github.com/briansmith/webpki/commits)

---
updated-dependencies:
- dependency-name: webpki
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-10-02 21:53:45 +00:00
b10eeb0e41 Update .github/ISSUE_TEMPLATE/sprint_issue.md 2023-09-26 16:47:04 +02:00
4a8515e9fc Update sprint_issue.md 2023-09-26 16:46:18 +02:00
206 changed files with 3022 additions and 5577 deletions

.github/ISSUE_TEMPLATE/sprint_issue.md

@ -7,19 +7,17 @@ assignees: ''
---
Related product team resources: [roadmap card]() (_internal only_) and [PRD]() (_internal only_)
Related product team resources: [PRD]() (_internal only_)
Related product discussion:
Related spec: WIP
## Motivation
<!---Copy/paste the information in the roadmap resources or briefly detail the product motivation. Ask product team if any hesitation.-->
<!---Copy/paste the information in PRD or briefly detail the product motivation. Ask product team if any hesitation.-->
## Usage
<!---Write a quick description of the usage if the usage has already been defined-->
Refer to the final spec to know the details and the final decisions about the usage.
<!---Link to the public part of the PRD, or to the related product discussion for experimental features-->
## TODO


@ -8,11 +8,11 @@ env:
jobs:
run-benchmarks-on-comment:
if: startsWith(github.event.comment.body, '/benchmark')
name: Run and upload benchmarks
runs-on: benchmarks
timeout-minutes: 4320 # 72h
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
with:
profile: minimal
@ -27,14 +27,25 @@ jobs:
reaction-type: "eyes"
repo-token: ${{ env.GH_TOKEN }}
- uses: xt0rted/pull-request-comment-branch@v2
id: comment-branch
with:
repo_token: ${{ env.GH_TOKEN }}
- uses: actions/checkout@v3
if: success()
with:
fetch-depth: 0 # fetch full history to be able to get main commit sha
ref: ${{ steps.comment-branch.outputs.head_ref }}
# Set variables
- name: Set current branch name
shell: bash
run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT
run: echo "name=$(git rev-parse --abbrev-ref HEAD)" >> $GITHUB_OUTPUT
id: current_branch
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
shell: bash
run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT
run: echo "name=$(git rev-parse --abbrev-ref HEAD | tr '/' '_')" >> $GITHUB_OUTPUT
id: normalized_current_branch
- name: Set shorter commit SHA
shell: bash
@ -76,9 +87,11 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }}
run: |
export base=$(git log --pretty=%p -n 1)
set -x
export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8)
export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json)
echo 'Here are your benchmarks diff 👊' >> body.txt
echo '```' >> body.txt
./benchmarks/scripts/compare.sh $base ${{ steps.file.outputs.basename }}.json >> body.txt
./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt
echo '```' >> body.txt
gh pr comment ${GITHUB_REF#refs/heads/} --body-file body.txt
gh pr comment ${{ steps.current_branch.outputs.name }} --body-file body.txt

Cargo.lock (generated, 60 changed lines)

@ -468,7 +468,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
[[package]]
name = "benchmarks"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"anyhow",
"bytes",
@ -1206,7 +1206,7 @@ dependencies = [
[[package]]
name = "dump"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"anyhow",
"big_s",
@ -1417,7 +1417,7 @@ dependencies = [
[[package]]
name = "file-store"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"faux",
"tempfile",
@ -1439,7 +1439,7 @@ dependencies = [
[[package]]
name = "filter-parser"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"insta",
"nom",
@ -1459,7 +1459,7 @@ dependencies = [
[[package]]
name = "flatten-serde-json"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"criterion",
"serde_json",
@ -1577,7 +1577,7 @@ dependencies = [
[[package]]
name = "fuzzers"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"arbitrary",
"clap",
@ -1663,12 +1663,13 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "grenad"
version = "0.4.4"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5232b2d157b7bf63d7abe1b12177039e58db2f29e377517c0cdee1578cca4c93"
checksum = "6a007932af5475ebb5c63bef8812bb1c36f317983bb4ca663e9d6dd58d6a0f8c"
dependencies = [
"bytemuck",
"byteorder",
"rayon",
"tempfile",
]
@ -1891,7 +1892,7 @@ dependencies = [
[[package]]
name = "index-scheduler"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"anyhow",
"big_s",
@ -2088,7 +2089,7 @@ dependencies = [
[[package]]
name = "json-depth-checker"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"criterion",
"serde_json",
@ -2500,7 +2501,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "meili-snap"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"insta",
"md5",
@ -2509,7 +2510,7 @@ dependencies = [
[[package]]
name = "meilisearch"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"actix-cors",
"actix-http",
@ -2564,7 +2565,6 @@ dependencies = [
"platform-dirs",
"prometheus",
"puffin",
"puffin_http",
"rand",
"rayon",
"regex",
@ -2600,7 +2600,7 @@ dependencies = [
[[package]]
name = "meilisearch-auth"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"base64 0.21.2",
"enum-iterator",
@ -2619,7 +2619,7 @@ dependencies = [
[[package]]
name = "meilisearch-types"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"actix-web",
"anyhow",
@ -2673,7 +2673,7 @@ dependencies = [
[[package]]
name = "milli"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"big_s",
"bimap",
@ -2867,9 +2867,9 @@ dependencies = [
[[package]]
name = "obkv"
version = "0.2.0"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385"
checksum = "6c459142426056c639ff88d053ebaaaeca0ee1411c94362892398ef4ccd81080"
[[package]]
name = "once_cell"
@ -2996,7 +2996,7 @@ checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94"
[[package]]
name = "permissive-json-pointer"
version = "1.4.0"
version = "1.4.1"
dependencies = [
"big_s",
"serde_json",
@ -3194,7 +3194,7 @@ dependencies = [
"byteorder",
"hex",
"lazy_static",
"rustix 0.36.15",
"rustix 0.36.16",
]
[[package]]
@ -3237,18 +3237,6 @@ dependencies = [
"serde",
]
[[package]]
name = "puffin_http"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13bffc600c35913d282ae1e96a6ffcdf36dc7a7cdb9310e0ba15914d258c8193"
dependencies = [
"anyhow",
"crossbeam-channel",
"log",
"puffin",
]
[[package]]
name = "quote"
version = "1.0.32"
@ -3479,9 +3467,9 @@ dependencies = [
[[package]]
name = "rustix"
version = "0.36.15"
version = "0.36.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c37f1bd5ef1b5422177b7646cba67430579cfe2ace80f284fee876bca52ad941"
checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
dependencies = [
"bitflags 1.3.2",
"errno",
@ -4444,9 +4432,9 @@ dependencies = [
[[package]]
name = "webpki"
version = "0.22.1"
version = "0.22.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0e74f82d49d545ad128049b7e88f6576df2da6b02e9ce565c6f533be576957e"
checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f"
dependencies = [
"ring",
"untrusted",

Cargo.toml

@ -18,7 +18,7 @@ members = [
]
[workspace.package]
version = "1.4.0"
version = "1.4.1"
authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
description = "Meilisearch HTTP server"
homepage = "https://meilisearch.com"

PROFILING.md

@ -1,14 +1,14 @@
# Profiling Meilisearch
Search engine technologies are complex pieces of software that require thorough profiling tools. We chose to use [Puffin](https://github.com/EmbarkStudios/puffin), which the Rust gaming industry uses extensively. You can export and import the profiling reports using the top bar's _File_ menu options.
Search engine technologies are complex pieces of software that require thorough profiling tools. We chose to use [Puffin](https://github.com/EmbarkStudios/puffin), which the Rust gaming industry uses extensively. You can export and import the profiling reports using the top bar's _File_ menu options [in Puffin Viewer](https://github.com/embarkstudios/puffin#ui).
![An example profiling with Puffin viewer](assets/profiling-example.png)
## Profiling the Indexing Process
When you enable the `profile-with-puffin` feature of Meilisearch, a Puffin HTTP server will run on Meilisearch and listen on the default _0.0.0.0:8585_ address. This server will record a "frame" whenever it executes the `IndexScheduler::tick` method.
When you enable [the `exportPuffinReports` experimental feature](https://www.meilisearch.com/docs/learn/experimental/overview) of Meilisearch, Puffin reports with the `.puffin` extension will be automatically exported to disk. When this option is enabled, the engine will automatically create a "frame" whenever it executes the `IndexScheduler::tick` method.
Once your Meilisearch is running and awaits new indexation operations, you must [install and run the `puffin_viewer` tool](https://github.com/EmbarkStudios/puffin/tree/main/puffin_viewer) to see the profiling results. I advise you to run the viewer with the `RUST_LOG=puffin_http::client=debug` environment variable to see the client trying to connect to your server.
[Puffin Viewer](https://github.com/EmbarkStudios/puffin/tree/main/puffin_viewer) is used to analyze the reports. Those reports show areas where Meilisearch spent time during indexing.
Another piece of advice on the Puffin viewer UI interface is to consider the _Merge children with same ID_ option. It can hide the exact actual timings at which events were sent. Please turn it off when you see strange gaps on the Flamegraph. It can help.

README.md

@ -25,6 +25,12 @@
<p align="center">⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍</p>
---
### 🔥 On November 2nd, we are hosting our first-ever live demo and product updates for [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Make sure to [register here](https://us06web.zoom.us/meeting/register/tZMlc-mqrjIsH912-HTRe-AaT-pp41bDe81a#/registration) and bring your questions for live Q&A!
---
Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.
<p align="center" name="demo">


@ -6,9 +6,7 @@ use std::path::Path;
use criterion::{criterion_group, criterion_main, Criterion};
use milli::heed::{EnvOpenOptions, RwTxn};
use milli::update::{
DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
};
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use milli::Index;
use rand::seq::SliceRandom;
use rand_chacha::rand_core::SeedableRng;
@ -266,17 +264,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) {
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
let mut wtxn = index.write_txn().unwrap();
for ids in document_ids_to_delete {
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
builder.delete_documents(&ids);
builder.execute().unwrap();
}
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
delete_documents_from_ids(index, document_ids_to_delete)
},
)
});
@ -613,17 +601,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) {
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
let mut wtxn = index.write_txn().unwrap();
for ids in document_ids_to_delete {
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
builder.delete_documents(&ids);
builder.execute().unwrap();
}
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
delete_documents_from_ids(index, document_ids_to_delete)
},
)
});
@ -875,22 +853,41 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
let mut wtxn = index.write_txn().unwrap();
for ids in document_ids_to_delete {
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
builder.delete_documents(&ids);
builder.execute().unwrap();
}
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
delete_documents_from_ids(index, document_ids_to_delete)
},
)
});
}
fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBitmap>) {
let mut wtxn = index.write_txn().unwrap();
let indexer_config = IndexerConfig::default();
for ids in document_ids_to_delete {
let external_documents_ids = index.external_documents_ids();
// FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
// Since what we have is an iterator, it would be better to delete in chunks
let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
external_documents_ids
.find_external_id_of(&wtxn, ids)
.unwrap()
.only_external_ids()
.collect();
let ids = external_to_internal.unwrap();
let config = IndexDocumentsConfig::default();
let mut builder =
IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false)
.unwrap();
(builder, _) = builder.remove_documents(ids).unwrap();
builder.execute().unwrap();
}
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
}
fn indexing_movies_in_three_batches(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
@ -1112,17 +1109,7 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
let mut wtxn = index.write_txn().unwrap();
for ids in document_ids_to_delete {
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
builder.delete_documents(&ids);
builder.execute().unwrap();
}
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
delete_documents_from_ids(index, document_ids_to_delete)
},
)
});
@ -1338,17 +1325,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
(index, document_ids_to_delete)
},
move |(index, document_ids_to_delete)| {
let mut wtxn = index.write_txn().unwrap();
for ids in document_ids_to_delete {
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
builder.delete_documents(&ids);
builder.execute().unwrap();
}
wtxn.commit().unwrap();
index.prepare_for_closing().wait();
delete_documents_from_ids(index, document_ids_to_delete)
},
)
});


@ -526,12 +526,12 @@ pub(crate) mod test {
assert!(indexes.is_empty());
// products
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
insta::assert_json_snapshot!(products.metadata(), @r###"
{
"uid": "products",
"primaryKey": "sku",
"createdAt": "[now]",
"updatedAt": "[now]"
"createdAt": "2022-10-09T20:27:22.688964637Z",
"updatedAt": "2022-10-09T20:27:23.951017769Z"
}
"###);
@ -541,12 +541,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
// movies
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
insta::assert_json_snapshot!(movies.metadata(), @r###"
{
"uid": "movies",
"primaryKey": "id",
"createdAt": "[now]",
"updatedAt": "[now]"
"createdAt": "2022-10-09T20:27:22.197788495Z",
"updatedAt": "2022-10-09T20:28:01.93111053Z"
}
"###);
@ -571,12 +571,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
// spells
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
insta::assert_json_snapshot!(spells.metadata(), @r###"
{
"uid": "dnd_spells",
"primaryKey": "index",
"createdAt": "[now]",
"updatedAt": "[now]"
"createdAt": "2022-10-09T20:27:24.242683494Z",
"updatedAt": "2022-10-09T20:27:24.312809641Z"
}
"###);
@ -617,12 +617,12 @@ pub(crate) mod test {
assert!(indexes.is_empty());
// products
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
insta::assert_json_snapshot!(products.metadata(), @r###"
{
"uid": "products",
"primaryKey": "sku",
"createdAt": "[now]",
"updatedAt": "[now]"
"createdAt": "2023-01-30T16:25:56.595257Z",
"updatedAt": "2023-01-30T16:25:58.70348Z"
}
"###);
@ -632,12 +632,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
// movies
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
insta::assert_json_snapshot!(movies.metadata(), @r###"
{
"uid": "movies",
"primaryKey": "id",
"createdAt": "[now]",
"updatedAt": "[now]"
"createdAt": "2023-01-30T16:25:56.192178Z",
"updatedAt": "2023-01-30T16:25:56.455714Z"
}
"###);
@ -647,12 +647,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
// spells
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
insta::assert_json_snapshot!(spells.metadata(), @r###"
{
"uid": "dnd_spells",
"primaryKey": "index",
"createdAt": "[now]",
"updatedAt": "[now]"
"createdAt": "2023-01-30T16:25:58.876405Z",
"updatedAt": "2023-01-30T16:25:59.079906Z"
}
"###);


@ -1,24 +0,0 @@
---
source: dump/src/reader/mod.rs
expression: spells.settings().unwrap()
---
{
"displayedAttributes": [
"*"
],
"searchableAttributes": [
"*"
],
"filterableAttributes": [],
"sortableAttributes": [],
"rankingRules": [
"typo",
"words",
"proximity",
"attribute",
"exactness"
],
"stopWords": [],
"synonyms": {},
"distinctAttribute": null
}


@ -1,38 +0,0 @@
---
source: dump/src/reader/mod.rs
expression: products.settings().unwrap()
---
{
"displayedAttributes": [
"*"
],
"searchableAttributes": [
"*"
],
"filterableAttributes": [],
"sortableAttributes": [],
"rankingRules": [
"typo",
"words",
"proximity",
"attribute",
"exactness"
],
"stopWords": [],
"synonyms": {
"android": [
"phone",
"smartphone"
],
"iphone": [
"phone",
"smartphone"
],
"phone": [
"android",
"iphone",
"smartphone"
]
},
"distinctAttribute": null
}


@ -1,31 +0,0 @@
---
source: dump/src/reader/mod.rs
expression: movies.settings().unwrap()
---
{
"displayedAttributes": [
"*"
],
"searchableAttributes": [
"*"
],
"filterableAttributes": [
"genres",
"id"
],
"sortableAttributes": [
"genres",
"id"
],
"rankingRules": [
"typo",
"words",
"proximity",
"attribute",
"exactness",
"release_date:asc"
],
"stopWords": [],
"synonyms": {},
"distinctAttribute": null
}


@ -46,6 +46,7 @@ pub type Checked = settings::Checked;
pub type Unchecked = settings::Unchecked;
pub type Task = updates::UpdateEntry;
pub type Kind = updates::UpdateMeta;
// everything related to the errors
pub type ResponseError = errors::ResponseError;
@ -107,8 +108,11 @@ impl V2Reader {
pub fn indexes(&self) -> Result<impl Iterator<Item = Result<V2IndexReader>> + '_> {
Ok(self.index_uuid.iter().map(|index| -> Result<_> {
V2IndexReader::new(
index.uid.clone(),
&self.dump.path().join("indexes").join(format!("index-{}", index.uuid)),
index,
BufReader::new(
File::open(self.dump.path().join("updates").join("data.jsonl")).unwrap(),
),
)
}))
}
@ -143,16 +147,41 @@ pub struct V2IndexReader {
}
impl V2IndexReader {
pub fn new(name: String, path: &Path) -> Result<Self> {
pub fn new(path: &Path, index_uuid: &IndexUuid, tasks: BufReader<File>) -> Result<Self> {
let meta = File::open(path.join("meta.json"))?;
let meta: DumpMeta = serde_json::from_reader(meta)?;
let mut created_at = None;
let mut updated_at = None;
for line in tasks.lines() {
let task: Task = serde_json::from_str(&line?)?;
if !(task.uuid == index_uuid.uuid && task.is_finished()) {
continue;
}
let new_created_at = match task.update.meta() {
Kind::DocumentsAddition { .. } | Kind::Settings(_) => task.update.finished_at(),
_ => None,
};
let new_updated_at = task.update.finished_at();
if created_at.is_none() || created_at > new_created_at {
created_at = new_created_at;
}
if updated_at.is_none() || updated_at < new_updated_at {
updated_at = new_updated_at;
}
}
let current_time = OffsetDateTime::now_utc();
let metadata = IndexMetadata {
uid: name,
uid: index_uuid.uid.clone(),
primary_key: meta.primary_key,
// FIXME: Iterate over the whole task queue to find the creation and last update date.
created_at: OffsetDateTime::now_utc(),
updated_at: OffsetDateTime::now_utc(),
created_at: created_at.unwrap_or(current_time),
updated_at: updated_at.unwrap_or(current_time),
};
let ret = V2IndexReader {
@ -248,12 +277,12 @@ pub(crate) mod test {
assert!(indexes.is_empty());
// products
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
insta::assert_json_snapshot!(products.metadata(), @r###"
{
"uid": "products",
"primaryKey": "sku",
"createdAt": "[now]",
"updatedAt": "[now]"
"createdAt": "2022-10-09T20:27:22.688964637Z",
"updatedAt": "2022-10-09T20:27:23.951017769Z"
}
"###);
@ -263,12 +292,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
// movies
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
insta::assert_json_snapshot!(movies.metadata(), @r###"
{
"uid": "movies",
"primaryKey": "id",
"createdAt": "[now]",
"updatedAt": "[now]"
"createdAt": "2022-10-09T20:27:22.197788495Z",
"updatedAt": "2022-10-09T20:28:01.93111053Z"
}
"###);
@ -293,12 +322,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
// spells
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
insta::assert_json_snapshot!(spells.metadata(), @r###"
{
"uid": "dnd_spells",
"primaryKey": "index",
"createdAt": "[now]",
"updatedAt": "[now]"
"createdAt": "2022-10-09T20:27:24.242683494Z",
"updatedAt": "2022-10-09T20:27:24.312809641Z"
}
"###);
@ -340,12 +369,12 @@ pub(crate) mod test {
assert!(indexes.is_empty());
// products
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
insta::assert_json_snapshot!(products.metadata(), @r###"
{
"uid": "products",
"primaryKey": "sku",
"createdAt": "[now]",
"updatedAt": "[now]"
"createdAt": "2023-01-30T16:25:56.595257Z",
"updatedAt": "2023-01-30T16:25:58.70348Z"
}
"###);
@ -355,12 +384,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
// movies
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
insta::assert_json_snapshot!(movies.metadata(), @r###"
{
"uid": "movies",
"primaryKey": "id",
"createdAt": "[now]",
"updatedAt": "[now]"
"createdAt": "2023-01-30T16:25:56.192178Z",
"updatedAt": "2023-01-30T16:25:56.455714Z"
}
"###);
@ -370,12 +399,12 @@ pub(crate) mod test {
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
// spells
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
insta::assert_json_snapshot!(spells.metadata(), @r###"
{
"uid": "dnd_spells",
"primaryKey": "index",
"createdAt": "[now]",
"updatedAt": "[now]"
"createdAt": "2023-01-30T16:25:58.876405Z",
"updatedAt": "2023-01-30T16:25:59.079906Z"
}
"###);


@ -227,4 +227,14 @@ impl UpdateStatus {
_ => None,
}
}
pub fn finished_at(&self) -> Option<OffsetDateTime> {
match self {
UpdateStatus::Processing(_) => None,
UpdateStatus::Enqueued(_) => None,
UpdateStatus::Processed(u) => Some(u.processed_at),
UpdateStatus::Aborted(_) => None,
UpdateStatus::Failed(u) => Some(u.failed_at),
}
}
}
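The new finished_at accessor is what feeds the created_at/updated_at fold in V2IndexReader::new above. As a reference, here is a minimal standalone sketch of that fold: the earliest finished creation-like task becomes created_at and the latest finished task becomes updated_at. Plain u64 timestamps and a boolean flag stand in for OffsetDateTime and the Kind check; both are simplifications, not the real types.

// Sketch only: `is_creation` stands in for the DocumentsAddition/Settings check,
// and u64 timestamps stand in for `OffsetDateTime`.
fn creation_and_update_dates(
    finished_tasks: impl IntoIterator<Item = (bool, u64)>,
) -> (Option<u64>, Option<u64>) {
    let (mut created_at, mut updated_at) = (None, None);
    for (is_creation, finished_at) in finished_tasks {
        // the earliest creation-like task determines `created_at`
        if is_creation && created_at.map_or(true, |c| finished_at < c) {
            created_at = Some(finished_at);
        }
        // the latest finished task determines `updated_at`
        if updated_at.map_or(true, |u| finished_at > u) {
            updated_at = Some(finished_at);
        }
    }
    (created_at, updated_at)
}

fn main() {
    let tasks = [(true, 10), (false, 25), (true, 12)];
    assert_eq!(creation_and_update_dates(tasks), (Some(10), Some(25)));
}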


@ -19,18 +19,18 @@ one indexing operation.
use std::collections::{BTreeSet, HashSet};
use std::ffi::OsStr;
use std::fmt;
use std::fs::{self, File};
use std::io::BufWriter;
use dump::IndexMetadata;
use log::{debug, error, info};
use log::{debug, error, info, trace};
use meilisearch_types::error::Code;
use meilisearch_types::heed::{RoTxn, RwTxn};
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
use meilisearch_types::milli::heed::CompactionOption;
use meilisearch_types::milli::update::{
DeleteDocuments, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod,
Settings as MilliSettings,
IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
};
use meilisearch_types::milli::{self, Filter, BEU32};
use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
@ -43,7 +43,7 @@ use uuid::Uuid;
use crate::autobatcher::{self, BatchKind};
use crate::utils::{self, swap_index_uid_in_task};
use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId};
use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId};
/// Represents a combination of tasks that can all be processed at the same time.
///
@ -199,6 +199,29 @@ impl Batch {
}
}
impl fmt::Display for Batch {
/// A human-readable label used when debugging the profiling reports.
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let index_uid = self.index_uid();
let tasks = self.ids();
match self {
Batch::TaskCancelation { .. } => f.write_str("TaskCancelation")?,
Batch::TaskDeletion(_) => f.write_str("TaskDeletion")?,
Batch::SnapshotCreation(_) => f.write_str("SnapshotCreation")?,
Batch::Dump(_) => f.write_str("Dump")?,
Batch::IndexOperation { op, .. } => write!(f, "{op}")?,
Batch::IndexCreation { .. } => f.write_str("IndexCreation")?,
Batch::IndexUpdate { .. } => f.write_str("IndexUpdate")?,
Batch::IndexDeletion { .. } => f.write_str("IndexDeletion")?,
Batch::IndexSwap { .. } => f.write_str("IndexSwap")?,
};
match index_uid {
Some(name) => f.write_fmt(format_args!(" on {name:?} from tasks: {tasks:?}")),
None => f.write_fmt(format_args!(" from tasks: {tasks:?}")),
}
}
}
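Below is a tiny self-contained sketch of the same idea: a Display implementation that builds the human-readable scope label later handed to puffin::profile_function!(batch.to_string()). The enum and its variants are illustrative placeholders, not the real Batch type.

use std::fmt;

// Illustrative stand-in for `Batch`, only to show the labelling pattern.
enum BatchLabel {
    IndexCreation { index_uid: String, task_ids: Vec<u32> },
    Dump { task_ids: Vec<u32> },
}

impl fmt::Display for BatchLabel {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            BatchLabel::IndexCreation { index_uid, task_ids } => {
                write!(f, "IndexCreation on {index_uid:?} from tasks: {task_ids:?}")
            }
            BatchLabel::Dump { task_ids } => write!(f, "Dump from tasks: {task_ids:?}"),
        }
    }
}

fn main() {
    let batch = BatchLabel::IndexCreation { index_uid: "movies".into(), task_ids: vec![0, 1] };
    // This string is what names the profiling scope.
    assert_eq!(batch.to_string(), "IndexCreation on \"movies\" from tasks: [0, 1]");
}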
impl IndexOperation {
pub fn index_uid(&self) -> &str {
match self {
@ -213,6 +236,30 @@ impl IndexOperation {
}
}
impl fmt::Display for IndexOperation {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
IndexOperation::DocumentOperation { .. } => {
f.write_str("IndexOperation::DocumentOperation")
}
IndexOperation::DocumentDeletion { .. } => {
f.write_str("IndexOperation::DocumentDeletion")
}
IndexOperation::IndexDocumentDeletionByFilter { .. } => {
f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
}
IndexOperation::DocumentClear { .. } => f.write_str("IndexOperation::DocumentClear"),
IndexOperation::Settings { .. } => f.write_str("IndexOperation::Settings"),
IndexOperation::DocumentClearAndSetting { .. } => {
f.write_str("IndexOperation::DocumentClearAndSetting")
}
IndexOperation::SettingsAndDocumentOperation { .. } => {
f.write_str("IndexOperation::SettingsAndDocumentOperation")
}
}
}
}
impl IndexScheduler {
/// Convert an [`BatchKind`](crate::autobatcher::BatchKind) into a [`Batch`].
///
@ -581,7 +628,7 @@ impl IndexScheduler {
self.breakpoint(crate::Breakpoint::InsideProcessBatch);
}
puffin::profile_function!(format!("{:?}", batch));
puffin::profile_function!(batch.to_string());
match batch {
Batch::TaskCancelation { mut task, previous_started_at, previous_processing_tasks } => {
@ -848,7 +895,7 @@ impl IndexScheduler {
})?;
// 4. Dump experimental feature settings
let features = self.features()?.runtime_features();
let features = self.features().runtime_features();
dump.create_experimental_features(features)?;
let dump_uid = started_at.format(format_description!(
@ -1143,7 +1190,7 @@ impl IndexScheduler {
index,
indexer_config,
config,
|indexing_step| debug!("update: {:?}", indexing_step),
|indexing_step| trace!("update: {:?}", indexing_step),
|| must_stop_processing.get(),
)?;
@ -1190,7 +1237,8 @@ impl IndexScheduler {
let (new_builder, user_result) =
builder.remove_documents(document_ids)?;
builder = new_builder;
// Invariant: remove_documents always returns Ok for the inner result
let count = user_result.unwrap();
let provided_ids =
if let Some(Details::DocumentDeletion { provided_ids, .. }) =
task.details
@ -1201,23 +1249,11 @@ impl IndexScheduler {
unreachable!();
};
match user_result {
Ok(count) => {
task.status = Status::Succeeded;
task.details = Some(Details::DocumentDeletion {
provided_ids,
deleted_documents: Some(count),
});
}
Err(e) => {
task.status = Status::Failed;
task.details = Some(Details::DocumentDeletion {
provided_ids,
deleted_documents: Some(0),
});
task.error = Some(milli::Error::from(e).into());
}
}
task.status = Status::Succeeded;
task.details = Some(Details::DocumentDeletion {
provided_ids,
deleted_documents: Some(count),
});
}
}
}
@ -1232,7 +1268,7 @@ impl IndexScheduler {
milli::update::Settings::new(index_wtxn, index, indexer_config);
builder.reset_primary_key();
builder.execute(
|indexing_step| debug!("update: {:?}", indexing_step),
|indexing_step| trace!("update: {:?}", indexing_step),
|| must_stop_processing.clone().get(),
)?;
}
@ -1240,21 +1276,42 @@ impl IndexScheduler {
Ok(tasks)
}
IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => {
let mut builder = milli::update::DeleteDocuments::new(index_wtxn, index)?;
documents.iter().flatten().for_each(|id| {
builder.delete_external_id(id);
});
let indexer_config = self.index_mapper.indexer_config();
let config = IndexDocumentsConfig {
update_method: IndexDocumentsMethod::ReplaceDocuments,
..Default::default()
};
let must_stop_processing = self.must_stop_processing.clone();
let DocumentDeletionResult { deleted_documents, .. } = builder.execute()?;
let mut builder = milli::update::IndexDocuments::new(
index_wtxn,
index,
indexer_config,
config,
|indexing_step| trace!("update: {:?}", indexing_step),
|| must_stop_processing.get(),
)?;
let document_ids = documents.iter().flatten().cloned().collect();
let (new_builder, user_result) = builder.remove_documents(document_ids)?;
builder = new_builder;
// Invariant: remove_documents always returns Ok for the inner result
let count = user_result.unwrap();
for (task, documents) in tasks.iter_mut().zip(documents) {
task.status = Status::Succeeded;
task.details = Some(Details::DocumentDeletion {
provided_ids: documents.len(),
deleted_documents: Some(deleted_documents.min(documents.len() as u64)),
deleted_documents: Some(count.min(documents.len() as u64)),
});
}
if !tasks.iter().all(|res| res.error.is_some()) {
let addition = builder.execute()?;
info!("document deletion done: {:?}", addition);
}
Ok(tasks)
}
IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
@ -1266,7 +1323,13 @@ impl IndexScheduler {
} else {
unreachable!()
};
let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
let deleted_documents = delete_document_by_filter(
index_wtxn,
filter,
self.index_mapper.indexer_config(),
self.must_stop_processing.clone(),
index,
);
let original_filter = if let Some(Details::DocumentDeletionByFilter {
original_filter,
deleted_documents: _,
@ -1500,6 +1563,8 @@ impl IndexScheduler {
fn delete_document_by_filter<'a>(
wtxn: &mut RwTxn<'a, '_>,
filter: &serde_json::Value,
indexer_config: &IndexerConfig,
must_stop_processing: MustStopProcessing,
index: &'a Index,
) -> Result<u64> {
let filter = Filter::from_json(filter)?;
@ -1510,9 +1575,41 @@ fn delete_document_by_filter<'a>(
}
e => e.into(),
})?;
let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
delete_operation.delete_documents(&candidates);
delete_operation.execute().map(|result| result.deleted_documents)?
let external_documents_ids = index.external_documents_ids();
// FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
// Since what we have is an iterator, it would be better to delete in chunks
let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
external_documents_ids
.find_external_id_of(wtxn, candidates)?
.only_external_ids()
.collect();
let document_ids = match external_to_internal {
Ok(external_ids) => external_ids,
Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids),
};
let config = IndexDocumentsConfig {
update_method: IndexDocumentsMethod::ReplaceDocuments,
..Default::default()
};
let mut builder = milli::update::IndexDocuments::new(
wtxn,
index,
indexer_config,
config,
|indexing_step| debug!("update: {:?}", indexing_step),
|| must_stop_processing.get(),
)?;
let (new_builder, user_result) = builder.remove_documents(document_ids)?;
builder = new_builder;
// Invariant: remove_documents always returns Ok for the inner result
let count = user_result.unwrap();
let _ = builder.execute()?;
count
} else {
0
})


@ -1,6 +1,8 @@
use std::sync::{Arc, RwLock};
use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures};
use meilisearch_types::heed::types::{SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn};
use meilisearch_types::heed::{Database, Env, RwTxn};
use crate::error::FeatureNotEnabledError;
use crate::Result;
@ -9,20 +11,19 @@ const EXPERIMENTAL_FEATURES: &str = "experimental-features";
#[derive(Clone)]
pub(crate) struct FeatureData {
runtime: Database<Str, SerdeJson<RuntimeTogglableFeatures>>,
instance: InstanceTogglableFeatures,
persisted: Database<Str, SerdeJson<RuntimeTogglableFeatures>>,
runtime: Arc<RwLock<RuntimeTogglableFeatures>>,
}
#[derive(Debug, Clone, Copy)]
pub struct RoFeatures {
runtime: RuntimeTogglableFeatures,
instance: InstanceTogglableFeatures,
}
impl RoFeatures {
fn new(txn: RoTxn<'_>, data: &FeatureData) -> Result<Self> {
let runtime = data.runtime_features(txn)?;
Ok(Self { runtime, instance: data.instance })
fn new(data: &FeatureData) -> Self {
let runtime = data.runtime_features();
Self { runtime }
}
pub fn runtime_features(&self) -> RuntimeTogglableFeatures {
@ -43,13 +44,13 @@ impl RoFeatures {
}
pub fn check_metrics(&self) -> Result<()> {
if self.instance.metrics {
if self.runtime.metrics {
Ok(())
} else {
Err(FeatureNotEnabledError {
disabled_action: "Getting metrics",
feature: "metrics",
issue_link: "https://github.com/meilisearch/meilisearch/discussions/3518",
issue_link: "https://github.com/meilisearch/product/discussions/625",
}
.into())
}
@ -67,15 +68,36 @@ impl RoFeatures {
.into())
}
}
pub fn check_puffin(&self) -> Result<()> {
if self.runtime.export_puffin_reports {
Ok(())
} else {
Err(FeatureNotEnabledError {
disabled_action: "Outputting Puffin reports to disk",
feature: "export puffin reports",
issue_link: "https://github.com/meilisearch/product/discussions/693",
}
.into())
}
}
}
impl FeatureData {
pub fn new(env: &Env, instance_features: InstanceTogglableFeatures) -> Result<Self> {
let mut wtxn = env.write_txn()?;
let runtime_features = env.create_database(&mut wtxn, Some(EXPERIMENTAL_FEATURES))?;
let runtime_features_db = env.create_database(&mut wtxn, Some(EXPERIMENTAL_FEATURES))?;
wtxn.commit()?;
Ok(Self { runtime: runtime_features, instance: instance_features })
let txn = env.read_txn()?;
let persisted_features: RuntimeTogglableFeatures =
runtime_features_db.get(&txn, EXPERIMENTAL_FEATURES)?.unwrap_or_default();
let runtime = Arc::new(RwLock::new(RuntimeTogglableFeatures {
metrics: instance_features.metrics || persisted_features.metrics,
..persisted_features
}));
Ok(Self { persisted: runtime_features_db, runtime })
}
pub fn put_runtime_features(
@ -83,16 +105,25 @@ impl FeatureData {
mut wtxn: RwTxn,
features: RuntimeTogglableFeatures,
) -> Result<()> {
self.runtime.put(&mut wtxn, EXPERIMENTAL_FEATURES, &features)?;
self.persisted.put(&mut wtxn, EXPERIMENTAL_FEATURES, &features)?;
wtxn.commit()?;
// safe to unwrap, the lock will only fail if:
// 1. requested by the same thread concurrently -> it is acquired and released in methods that don't call each other
// 2. there's a panic while the lock is held -> it is only used for an assignment here.
let mut toggled_features = self.runtime.write().unwrap();
*toggled_features = features;
Ok(())
}
fn runtime_features(&self, txn: RoTxn) -> Result<RuntimeTogglableFeatures> {
Ok(self.runtime.get(&txn, EXPERIMENTAL_FEATURES)?.unwrap_or_default())
fn runtime_features(&self) -> RuntimeTogglableFeatures {
// sound to unwrap, the lock will only fail if:
// 1. requested by the same thread concurrently -> it is acquired and released in methods that don't call each other
// 2. there's a panic while the lock is held -> it is only used for copying the data here
*self.runtime.read().unwrap()
}
pub fn features(&self, txn: RoTxn) -> Result<RoFeatures> {
RoFeatures::new(txn, self)
pub fn features(&self) -> RoFeatures {
RoFeatures::new(self)
}
}
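A stripped-down sketch of the caching pattern introduced here: the runtime feature flags are kept behind an Arc<RwLock<_>> so reads no longer need an LMDB transaction, and writes refresh the in-memory copy after persisting. The persist closure below stands in for the heed put-and-commit, an assumption made only to keep the sketch self-contained.

use std::sync::{Arc, RwLock};

#[derive(Default, Clone, Copy)]
struct Features {
    metrics: bool,
    export_puffin_reports: bool,
}

#[derive(Clone)]
struct FeatureCache {
    runtime: Arc<RwLock<Features>>,
}

impl FeatureCache {
    // `persisted` would be read from the database at startup; `instance_metrics`
    // mirrors the `--experimental-enable-metrics` instance flag.
    fn new(persisted: Features, instance_metrics: bool) -> Self {
        let runtime = Features { metrics: instance_metrics || persisted.metrics, ..persisted };
        Self { runtime: Arc::new(RwLock::new(runtime)) }
    }

    fn put_runtime_features(&self, features: Features, persist: impl FnOnce(Features)) {
        persist(features); // write to the database first, then refresh the cache
        *self.runtime.write().unwrap() = features;
    }

    fn runtime_features(&self) -> Features {
        *self.runtime.read().unwrap()
    }
}

fn main() {
    let cache = FeatureCache::new(Features::default(), true);
    assert!(cache.runtime_features().metrics);
    cache.put_runtime_features(Features { metrics: false, export_puffin_reports: true }, |_| {});
    assert!(!cache.runtime_features().metrics);
}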


@ -30,6 +30,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
index_mapper,
features: _,
max_number_of_tasks: _,
puffin_frame: _,
wake_up: _,
dumps_path: _,
snapshots_path: _,


@ -33,6 +33,7 @@ pub type Result<T> = std::result::Result<T, Error>;
pub type TaskId = u32;
use std::collections::{BTreeMap, HashMap};
use std::fs::File;
use std::ops::{Bound, RangeBounds};
use std::path::{Path, PathBuf};
use std::sync::atomic::AtomicBool;
@ -52,6 +53,7 @@ use meilisearch_types::milli::documents::DocumentsBatchBuilder;
use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};
use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
use puffin::FrameView;
use roaring::RoaringBitmap;
use synchronoise::SignalEvent;
use time::format_description::well_known::Rfc3339;
@ -314,6 +316,9 @@ pub struct IndexScheduler {
/// the finished tasks automatically.
pub(crate) max_number_of_tasks: usize,
/// A frame to output the indexation profiling files to disk.
pub(crate) puffin_frame: Arc<puffin::GlobalFrameView>,
/// The path used to create the dumps.
pub(crate) dumps_path: PathBuf,
@ -364,6 +369,7 @@ impl IndexScheduler {
wake_up: self.wake_up.clone(),
autobatching_enabled: self.autobatching_enabled,
max_number_of_tasks: self.max_number_of_tasks,
puffin_frame: self.puffin_frame.clone(),
snapshots_path: self.snapshots_path.clone(),
dumps_path: self.dumps_path.clone(),
auth_path: self.auth_path.clone(),
@ -457,6 +463,7 @@ impl IndexScheduler {
env,
// we want to start the loop right away in case meilisearch was ctrl+Ced while processing things
wake_up: Arc::new(SignalEvent::auto(true)),
puffin_frame: Arc::new(puffin::GlobalFrameView::default()),
autobatching_enabled: options.autobatching_enabled,
max_number_of_tasks: options.max_number_of_tasks,
dumps_path: options.dumps_path,
@ -572,17 +579,46 @@ impl IndexScheduler {
run.wake_up.wait();
loop {
let puffin_enabled = run.features().check_puffin().is_ok();
puffin::set_scopes_on(puffin_enabled);
puffin::GlobalProfiler::lock().new_frame();
match run.tick() {
Ok(TickOutcome::TickAgain(_)) => (),
Ok(TickOutcome::WaitForSignal) => run.wake_up.wait(),
Err(e) => {
log::error!("{}", e);
log::error!("{e}");
// Wait one second when an irrecoverable error occurs.
if !e.is_recoverable() {
std::thread::sleep(Duration::from_secs(1));
}
}
}
// Let's write the previous frame to disk but only if
// the user wanted to profile with puffin.
if puffin_enabled {
let mut frame_view = run.puffin_frame.lock();
if !frame_view.is_empty() {
let now = OffsetDateTime::now_utc();
let mut file = match File::create(format!("{}.puffin", now)) {
Ok(file) => file,
Err(e) => {
log::error!("{e}");
continue;
}
};
if let Err(e) = frame_view.save_to_writer(&mut file) {
log::error!("{e}");
}
if let Err(e) = file.sync_all() {
log::error!("{e}");
}
// We erase this frame view as it is no longer useful. We want to
// measure the new frames now that we exported the previous ones.
*frame_view = FrameView::default();
}
}
}
})
.unwrap();
@ -1062,8 +1098,6 @@ impl IndexScheduler {
self.breakpoint(Breakpoint::Start);
}
puffin::GlobalProfiler::lock().new_frame();
self.cleanup_task_queue()?;
let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?;
@ -1259,9 +1293,8 @@ impl IndexScheduler {
Ok(IndexStats { is_indexing, inner_stats: index_stats })
}
pub fn features(&self) -> Result<RoFeatures> {
let rtxn = self.read_txn()?;
self.features.features(rtxn)
pub fn features(&self) -> RoFeatures {
self.features.features()
}
pub fn put_runtime_features(&self, features: RuntimeTogglableFeatures) -> Result<()> {


@ -324,7 +324,6 @@ impl ErrorCode for milli::Error {
UserError::SerdeJson(_)
| UserError::InvalidLmdbOpenOptions
| UserError::DocumentLimitReached
| UserError::AccessingSoftDeletedDocument { .. }
| UserError::UnknownInternalDocumentId { .. } => Code::Internal,
UserError::InvalidStoreFile => Code::InvalidStoreFile,
UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice,


@ -5,6 +5,8 @@ use serde::{Deserialize, Serialize};
pub struct RuntimeTogglableFeatures {
pub score_details: bool,
pub vector_store: bool,
pub metrics: bool,
pub export_puffin_reports: bool,
}
#[derive(Default, Debug, Clone, Copy)]


@ -69,8 +69,7 @@ permissive-json-pointer = { path = "../permissive-json-pointer" }
pin-project-lite = "0.2.9"
platform-dirs = "0.3.0"
prometheus = { version = "0.13.3", features = ["process"] }
puffin = "0.16.0"
puffin_http = { version = "0.13.0", optional = true }
puffin = { version = "0.16.0", features = ["serialization"] }
rand = "0.8.5"
rayon = "1.7.0"
regex = "1.7.3"
@ -135,7 +134,6 @@ zip = { version = "0.6.4", optional = true }
[features]
default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"]
analytics = ["segment"]
profile-with-puffin = ["dep:puffin_http"]
mini-dashboard = [
"actix-web-static-files",
"static-files",


@ -114,10 +114,7 @@ pub fn create_app(
.configure(routes::configure)
.configure(|s| dashboard(s, enable_dashboard));
let app = app.wrap(actix_web::middleware::Condition::new(
opt.experimental_enable_metrics,
middleware::RouteMetrics,
));
let app = app.wrap(middleware::RouteMetrics);
app.wrap(
Cors::default()
.send_wildcard()
@ -365,7 +362,7 @@ fn import_dump(
update_method: IndexDocumentsMethod::ReplaceDocuments,
..Default::default()
},
|indexing_step| log::debug!("update: {:?}", indexing_step),
|indexing_step| log::trace!("update: {:?}", indexing_step),
|| false,
)?;


@ -30,10 +30,6 @@ fn setup(opt: &Opt) -> anyhow::Result<()> {
async fn main() -> anyhow::Result<()> {
let (opt, config_read_from) = Opt::try_build()?;
#[cfg(feature = "profile-with-puffin")]
let _server = puffin_http::Server::new(&format!("0.0.0.0:{}", puffin_http::DEFAULT_PORT))?;
puffin::set_scopes_on(cfg!(feature = "profile-with-puffin"));
anyhow::ensure!(
!(cfg!(windows) && opt.experimental_reduce_indexing_memory_usage),
"The `experimental-reduce-indexing-memory-usage` flag is not supported on Windows"


@ -3,8 +3,10 @@
use std::future::{ready, Ready};
use actix_web::dev::{self, Service, ServiceRequest, ServiceResponse, Transform};
use actix_web::web::Data;
use actix_web::Error;
use futures_util::future::LocalBoxFuture;
use index_scheduler::IndexScheduler;
use prometheus::HistogramTimer;
pub struct RouteMetrics;
@ -47,19 +49,27 @@ where
fn call(&self, req: ServiceRequest) -> Self::Future {
let mut histogram_timer: Option<HistogramTimer> = None;
let request_path = req.path();
let is_registered_resource = req.resource_map().has_resource(request_path);
if is_registered_resource {
let request_method = req.method().to_string();
histogram_timer = Some(
crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
// calling unwrap here is safe because the index scheduler is added to the app data while creating the actix app.
// also, the tests will fail if it is not present.
let index_scheduler = req.app_data::<Data<IndexScheduler>>().unwrap();
let features = index_scheduler.features();
if features.check_metrics().is_ok() {
let request_path = req.path();
let is_registered_resource = req.resource_map().has_resource(request_path);
if is_registered_resource {
let request_method = req.method().to_string();
histogram_timer = Some(
crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
.with_label_values(&[&request_method, request_path])
.start_timer(),
);
crate::metrics::MEILISEARCH_HTTP_REQUESTS_TOTAL
.with_label_values(&[&request_method, request_path])
.start_timer(),
);
crate::metrics::MEILISEARCH_HTTP_REQUESTS_TOTAL
.with_label_values(&[&request_method, request_path])
.inc();
}
.inc();
}
};
let fut = self.service.call(req);


@ -29,12 +29,12 @@ async fn get_features(
>,
req: HttpRequest,
analytics: Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let features = index_scheduler.features()?;
) -> HttpResponse {
let features = index_scheduler.features();
analytics.publish("Experimental features Seen".to_string(), json!(null), Some(&req));
debug!("returns: {:?}", features.runtime_features());
Ok(HttpResponse::Ok().json(features.runtime_features()))
HttpResponse::Ok().json(features.runtime_features())
}
#[derive(Debug, Deserr)]
@ -44,6 +44,10 @@ pub struct RuntimeTogglableFeatures {
pub score_details: Option<bool>,
#[deserr(default)]
pub vector_store: Option<bool>,
#[deserr(default)]
pub metrics: Option<bool>,
#[deserr(default)]
pub export_puffin_reports: Option<bool>,
}
async fn patch_features(
@ -55,26 +59,36 @@ async fn patch_features(
req: HttpRequest,
analytics: Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let features = index_scheduler.features()?;
let features = index_scheduler.features();
let old_features = features.runtime_features();
let new_features = meilisearch_types::features::RuntimeTogglableFeatures {
score_details: new_features.0.score_details.unwrap_or(old_features.score_details),
vector_store: new_features.0.vector_store.unwrap_or(old_features.vector_store),
metrics: new_features.0.metrics.unwrap_or(old_features.metrics),
export_puffin_reports: new_features
.0
.export_puffin_reports
.unwrap_or(old_features.export_puffin_reports),
};
// explicitly destructure for analytics rather than using the `Serialize` implementation, because
// it renames fields to camelCase, which we don't want for analytics.
// **Do not** ignore fields with `..` or `_` here, because we want to add more fields in the future.
let meilisearch_types::features::RuntimeTogglableFeatures { score_details, vector_store } =
new_features;
let meilisearch_types::features::RuntimeTogglableFeatures {
score_details,
vector_store,
metrics,
export_puffin_reports,
} = new_features;
analytics.publish(
"Experimental features Updated".to_string(),
json!({
"score_details": score_details,
"vector_store": vector_store,
"metrics": metrics,
"export_puffin_reports": export_puffin_reports,
}),
Some(&req),
);


@ -612,8 +612,8 @@ fn retrieve_document<S: AsRef<str>>(
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let internal_id = index
.external_documents_ids(&txn)?
.get(doc_id.as_bytes())
.external_documents_ids()
.get(&txn, doc_id)?
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
let document = index


@ -68,7 +68,7 @@ pub async fn search(
}
let index = index_scheduler.index(&index_uid)?;
let features = index_scheduler.features()?;
let features = index_scheduler.features();
let search_result = tokio::task::spawn_blocking(move || {
perform_facet_search(&index, search_query, facet_query, facet_name, features)
})


@ -157,7 +157,7 @@ pub async fn search_with_url_query(
let mut aggregate = SearchAggregator::from_query(&query, &req);
let index = index_scheduler.index(&index_uid)?;
let features = index_scheduler.features()?;
let features = index_scheduler.features();
let search_result =
tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
if let Ok(ref search_result) = search_result {
@ -192,7 +192,7 @@ pub async fn search_with_post(
let index = index_scheduler.index(&index_uid)?;
let features = index_scheduler.features()?;
let features = index_scheduler.features();
let search_result =
tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
if let Ok(ref search_result) = search_result {


@ -19,7 +19,7 @@ pub async fn get_metrics(
index_scheduler: GuardedData<ActionPolicy<{ actions::METRICS_GET }>, Data<IndexScheduler>>,
auth_controller: Data<AuthController>,
) -> Result<HttpResponse, ResponseError> {
index_scheduler.features()?.check_metrics()?;
index_scheduler.features().check_metrics()?;
let auth_filters = index_scheduler.filters();
if !auth_filters.all_indexes_authorized() {
let mut error = ResponseError::from(AuthenticationError::InvalidToken);


@ -41,7 +41,7 @@ pub async fn multi_search_with_post(
let queries = params.into_inner().queries;
let mut multi_aggregate = MultiSearchAggregator::from_queries(&queries, &req);
let features = index_scheduler.features()?;
let features = index_scheduler.features();
// Explicitly expect a `(ResponseError, usize)` for the error type rather than `ResponseError` only,
// so that `?` doesn't work if it doesn't use `with_index`, ensuring that it is not forgotten in case of code


@ -2,10 +2,12 @@ use std::collections::{HashMap, HashSet};
use ::time::format_description::well_known::Rfc3339;
use maplit::{hashmap, hashset};
use meilisearch::Opt;
use once_cell::sync::Lazy;
use tempfile::TempDir;
use time::{Duration, OffsetDateTime};
use crate::common::{Server, Value};
use crate::common::{default_settings, Server, Value};
use crate::json;
pub static AUTHORIZATIONS: Lazy<HashMap<(&'static str, &'static str), HashSet<&'static str>>> =
@ -195,7 +197,9 @@ async fn access_authorized_master_key() {
#[actix_rt::test]
async fn access_authorized_restricted_index() {
let mut server = Server::new_auth().await;
let dir = TempDir::new().unwrap();
let enable_metrics = Opt { experimental_enable_metrics: true, ..default_settings(dir.path()) };
let mut server = Server::new_auth_with_options(enable_metrics, dir).await;
for ((method, route), actions) in AUTHORIZATIONS.iter() {
for action in actions {
// create a new API key letting only the needed action.


@ -202,6 +202,10 @@ impl Server {
pub async fn set_features(&self, value: Value) -> (Value, StatusCode) {
self.service.patch("/experimental-features", value).await
}
pub async fn get_metrics(&self) -> (Value, StatusCode) {
self.service.get("/metrics").await
}
}
pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
@ -221,7 +225,7 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
skip_index_budget: true,
..Parser::parse_from(None as Option<&str>)
},
experimental_enable_metrics: true,
experimental_enable_metrics: false,
..Parser::parse_from(None as Option<&str>)
}
}


@ -397,7 +397,7 @@ async fn delete_document_by_complex_filter() {
"canceledBy": null,
"details": {
"providedIds": 0,
"deletedDocuments": 4,
"deletedDocuments": 2,
"originalFilter": "[[\"color = green\",\"color NOT EXISTS\"]]"
},
"error": null,


@ -1,4 +1,7 @@
use crate::common::Server;
use meilisearch::Opt;
use tempfile::TempDir;
use crate::common::{default_settings, Server};
use crate::json;
/// Feature name to test against.
@ -16,7 +19,9 @@ async fn experimental_features() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": false
"vectorStore": false,
"metrics": false,
"exportPuffinReports": false
}
"###);
@ -26,7 +31,9 @@ async fn experimental_features() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": true
"vectorStore": true,
"metrics": false,
"exportPuffinReports": false
}
"###);
@ -36,7 +43,9 @@ async fn experimental_features() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": true
"vectorStore": true,
"metrics": false,
"exportPuffinReports": false
}
"###);
@ -47,7 +56,9 @@ async fn experimental_features() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": true
"vectorStore": true,
"metrics": false,
"exportPuffinReports": false
}
"###);
@ -58,11 +69,73 @@ async fn experimental_features() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": true
"vectorStore": true,
"metrics": false,
"exportPuffinReports": false
}
"###);
}
#[actix_rt::test]
async fn experimental_feature_metrics() {
// instance flag for metrics enables metrics at startup
let dir = TempDir::new().unwrap();
let enable_metrics = Opt { experimental_enable_metrics: true, ..default_settings(dir.path()) };
let server = Server::new_with_options(enable_metrics).await.unwrap();
let (response, code) = server.get_features().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": false,
"metrics": true,
"exportPuffinReports": false
}
"###);
let (response, code) = server.get_metrics().await;
meili_snap::snapshot!(code, @"200 OK");
// metrics are not returned in json format
// so the test server will return null
meili_snap::snapshot!(response, @"null");
// disabling metrics results in invalid request
let (response, code) = server.set_features(json!({"metrics": false})).await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(response["metrics"], @"false");
let (response, code) = server.get_metrics().await;
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Getting metrics requires enabling the `metrics` experimental feature. See https://github.com/meilisearch/product/discussions/625",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
// enabling metrics via HTTP results in valid request
let (response, code) = server.set_features(json!({"metrics": true})).await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(response["metrics"], @"true");
let (response, code) = server.get_metrics().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(response, @"null");
// startup without flag respects persisted metrics value
let disable_metrics =
Opt { experimental_enable_metrics: false, ..default_settings(dir.path()) };
let server_no_flag = Server::new_with_options(disable_metrics).await.unwrap();
let (response, code) = server_no_flag.get_metrics().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(response, @"null");
}
#[actix_rt::test]
async fn errors() {
let server = Server::new().await;
@ -73,7 +146,7 @@ async fn errors() {
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Unknown field `NotAFeature`: expected one of `scoreDetails`, `vectorStore`",
"message": "Unknown field `NotAFeature`: expected one of `scoreDetails`, `vectorStore`, `metrics`, `exportPuffinReports`",
"code": "bad_request",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#bad_request"


@ -0,0 +1,63 @@
use meili_snap::snapshot;
use once_cell::sync::Lazy;
use crate::common::{Server, Value};
use crate::json;
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{"productId": 1, "shopId": 1},
{"productId": 2, "shopId": 1},
{"productId": 3, "shopId": 2},
{"productId": 4, "shopId": 2},
{"productId": 5, "shopId": 3},
{"productId": 6, "shopId": 3},
{"productId": 7, "shopId": 4},
{"productId": 8, "shopId": 4},
{"productId": 9, "shopId": 5},
{"productId": 10, "shopId": 5}
])
});
pub(self) static DOCUMENT_PRIMARY_KEY: &str = "productId";
pub(self) static DOCUMENT_DISTINCT_KEY: &str = "shopId";
/// testing: https://github.com/meilisearch/meilisearch/issues/4078
#[actix_rt::test]
async fn distinct_search_with_offset_no_ranking() {
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
index.wait_task(1).await;
fn get_hits(Value(response): Value) -> Vec<i64> {
let hits_array = response["hits"].as_array().unwrap();
hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_i64().unwrap()).collect::<Vec<_>>()
}
let (response, code) = index.search_post(json!({"limit": 2, "offset": 0})).await;
let hits = get_hits(response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @"[1, 2]");
let (response, code) = index.search_post(json!({"limit": 2, "offset": 2})).await;
let hits = get_hits(response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @"[3, 4]");
let (response, code) = index.search_post(json!({"limit": 10, "offset": 4})).await;
let hits = get_hits(response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"1");
snapshot!(format!("{:?}", hits), @"[5]");
let (response, code) = index.search_post(json!({"limit": 10, "offset": 5})).await;
let hits = get_hits(response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"0");
}


@ -1,6 +1,7 @@
// This modules contains all the test concerning search. Each particular feature of the search
// should be tested in its own module to isolate tests and keep the tests readable.
mod distinct;
mod errors;
mod facet_search;
mod formatted;
@ -816,7 +817,7 @@ async fn experimental_feature_score_details() {
},
"proximity": {
"order": 2,
"score": 0.875
"score": 0.75
},
"attribute": {
"order": 3,


@ -26,8 +26,8 @@ flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7"
fxhash = "0.2.1"
geoutils = "0.5.1"
grenad = { version = "0.4.4", default-features = false, features = [
"tempfile",
grenad = { version = "0.4.5", default-features = false, features = [
"rayon", "tempfile"
] }
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [
"lmdb", "read-txn-no-tls"


@ -1,4 +1,5 @@
use std::fs::File;
use std::io::BufReader;
use std::{io, str};
use obkv::KvReader;
@ -19,14 +20,14 @@ use crate::FieldId;
pub struct EnrichedDocumentsBatchReader<R> {
documents: DocumentsBatchReader<R>,
primary_key: String,
external_ids: grenad::ReaderCursor<File>,
external_ids: grenad::ReaderCursor<BufReader<File>>,
}
impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
pub fn new(
documents: DocumentsBatchReader<R>,
primary_key: String,
external_ids: grenad::Reader<File>,
external_ids: grenad::Reader<BufReader<File>>,
) -> Result<Self, Error> {
if documents.documents_count() as u64 == external_ids.len() {
Ok(EnrichedDocumentsBatchReader {
@ -75,7 +76,7 @@ pub struct EnrichedDocument<'a> {
pub struct EnrichedDocumentsBatchCursor<R> {
documents: DocumentsBatchCursor<R>,
primary_key: String,
external_ids: grenad::ReaderCursor<File>,
external_ids: grenad::ReaderCursor<BufReader<File>>,
}
impl<R> EnrichedDocumentsBatchCursor<R> {


@ -89,8 +89,6 @@ pub enum FieldIdMapMissingEntry {
#[derive(Error, Debug)]
pub enum UserError {
#[error("A soft deleted internal document id have been used: `{document_id}`.")]
AccessingSoftDeletedDocument { document_id: DocumentId },
#[error("A document cannot contain more than 65,535 fields.")]
AttributeLimitReached,
#[error(transparent)]


@ -1,159 +1,146 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::convert::TryInto;
use std::{fmt, str};
use fst::map::IndexedValue;
use fst::{IntoStreamer, Streamer};
use heed::types::{OwnedType, Str};
use heed::{Database, RoIter, RoTxn, RwTxn};
use roaring::RoaringBitmap;
const DELETED_ID: u64 = u64::MAX;
use crate::{DocumentId, BEU32};
pub struct ExternalDocumentsIds<'a> {
pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
soft_deleted_docids: RoaringBitmap,
pub enum DocumentOperationKind {
Create,
Delete,
}
impl<'a> ExternalDocumentsIds<'a> {
pub fn new(
hard: fst::Map<Cow<'a, [u8]>>,
soft: fst::Map<Cow<'a, [u8]>>,
soft_deleted_docids: RoaringBitmap,
) -> ExternalDocumentsIds<'a> {
ExternalDocumentsIds { hard, soft, soft_deleted_docids }
}
pub struct DocumentOperation {
pub external_id: String,
pub internal_id: DocumentId,
pub kind: DocumentOperationKind,
}
pub fn into_static(self) -> ExternalDocumentsIds<'static> {
ExternalDocumentsIds {
hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
soft_deleted_docids: self.soft_deleted_docids,
}
pub struct ExternalDocumentsIds(Database<Str, OwnedType<BEU32>>);
impl ExternalDocumentsIds {
pub fn new(db: Database<Str, OwnedType<BEU32>>) -> ExternalDocumentsIds {
ExternalDocumentsIds(db)
}
/// Returns `true` if hard and soft external documents lists are empty.
pub fn is_empty(&self) -> bool {
self.hard.is_empty() && self.soft.is_empty()
pub fn is_empty(&self, rtxn: &RoTxn) -> heed::Result<bool> {
self.0.is_empty(rtxn).map_err(Into::into)
}
pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
let external_id = external_id.as_ref();
match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => {
Some(id.try_into().unwrap())
}
_otherwise => None,
}
}
/// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
/// don't contain any soft deleted document id.
pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
let mut new_hard_builder = fst::MapBuilder::memory();
let union_op = self.hard.op().add(&self.soft).r#union();
let mut iter = union_op.into_stream();
while let Some((external_id, docids)) = iter.next() {
// prefer selecting the ids from soft, always
let id = indexed_last_value(docids).unwrap();
if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) {
new_hard_builder.insert(external_id, id)?;
}
}
drop(iter);
// Delete soft map completely
self.soft = fst::Map::default().map_data(Cow::Owned)?;
// We save the new map as the new hard map.
self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
Ok(())
}
pub fn insert_ids<A: AsRef<[u8]>>(&mut self, other: &fst::Map<A>) -> fst::Result<()> {
let union_op = self.soft.op().add(other).r#union();
let mut new_soft_builder = fst::MapBuilder::memory();
let mut iter = union_op.into_stream();
while let Some((external_id, marked_docids)) = iter.next() {
let id = indexed_last_value(marked_docids).unwrap();
new_soft_builder.insert(external_id, id)?;
}
drop(iter);
// We save the new map as the new soft map.
self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
self.merge_soft_into_hard()
pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<u32>> {
Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get()))
}
/// An helper function to debug this type, returns an `HashMap` of both,
/// soft and hard fst maps, combined.
pub fn to_hash_map(&self) -> HashMap<String, u32> {
let mut map = HashMap::new();
let union_op = self.hard.op().add(&self.soft).r#union();
let mut iter = union_op.into_stream();
while let Some((external_id, marked_docids)) = iter.next() {
let id = indexed_last_value(marked_docids).unwrap();
if id != DELETED_ID {
let external_id = str::from_utf8(external_id).unwrap();
map.insert(external_id.to_owned(), id.try_into().unwrap());
}
pub fn to_hash_map(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, u32>> {
let mut map = HashMap::default();
for result in self.0.iter(rtxn)? {
let (external, internal) = result?;
map.insert(external.to_owned(), internal.get());
}
map
Ok(map)
}
/// Return an fst of the combined hard and soft deleted ID.
pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
if self.soft.is_empty() {
return Ok(Cow::Borrowed(&self.hard));
}
let union_op = self.hard.op().add(&self.soft).r#union();
let mut iter = union_op.into_stream();
let mut new_hard_builder = fst::MapBuilder::memory();
while let Some((external_id, marked_docids)) = iter.next() {
let value = indexed_last_value(marked_docids).unwrap();
if value != DELETED_ID {
new_hard_builder.insert(external_id, value)?;
}
}
drop(iter);
Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
/// Looks for the internal ids in the passed bitmap, and returns an iterator over the mapping between
/// these internal ids and their external id.
///
/// The returned iterator has `Result<(String, DocumentId), RoaringBitmap>` as `Item`,
/// where the returned values can be:
/// - `Ok((external_id, internal_id))`: if a mapping was found
/// - `Err(remaining_ids)`: if the external ids for some of the requested internal ids weren't found.
/// In that case the returned bitmap contains the internal ids whose external ids were not found after traversing
/// the entire fst.
pub fn find_external_id_of<'t>(
&self,
rtxn: &'t RoTxn,
internal_ids: RoaringBitmap,
) -> heed::Result<ExternalToInternalOwnedIterator<'t>> {
self.0.iter(rtxn).map(|iter| ExternalToInternalOwnedIterator { iter, internal_ids })
}
fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
if self.soft.len() >= self.hard.len() / 2 {
self.hard = self.to_fst()?.into_owned();
self.soft = fst::Map::default().map_data(Cow::Owned)?;
/// Applies the list of operations passed as argument, modifying the current external to internal id mapping.
///
/// If the list contains multiple operations on the same external id, then the result is unspecified.
///
/// # Panics
///
/// - If attempting to delete a document that doesn't exist
/// - If attempting to create a document that already exists
pub fn apply(&self, wtxn: &mut RwTxn, operations: Vec<DocumentOperation>) -> heed::Result<()> {
for DocumentOperation { external_id, internal_id, kind } in operations {
match kind {
DocumentOperationKind::Create => {
self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?;
}
DocumentOperationKind::Delete => {
if !self.0.delete(wtxn, &external_id)? {
panic!("Attempting to delete a non-existing document")
}
}
}
}
Ok(())
}
}
impl fmt::Debug for ExternalDocumentsIds<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish()
/// Returns an iterator over all the external ids.
pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<RoIter<'t, Str, OwnedType<BEU32>>> {
self.0.iter(rtxn)
}
}
impl Default for ExternalDocumentsIds<'static> {
fn default() -> Self {
ExternalDocumentsIds {
hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
soft_deleted_docids: RoaringBitmap::new(),
/// An iterator over mappings between requested internal ids and external ids.
///
/// See [`ExternalDocumentsIds::find_external_id_of`] for details.
pub struct ExternalToInternalOwnedIterator<'t> {
iter: RoIter<'t, Str, OwnedType<BEU32>>,
internal_ids: RoaringBitmap,
}
impl<'t> Iterator for ExternalToInternalOwnedIterator<'t> {
/// A result indicating if a mapping was found, or if the stream was exhausted without finding all internal ids.
type Item = Result<(&'t str, DocumentId), RoaringBitmap>;
fn next(&mut self) -> Option<Self::Item> {
// if all requested ids were found, we won't find any other, so short-circuit
if self.internal_ids.is_empty() {
return None;
}
loop {
let (external, internal) = match self.iter.next() {
Some(Ok((external, internal))) => (external, internal),
// TODO manage this better, remove panic
Some(Err(e)) => panic!("{}", e),
_ => {
// we exhausted the stream but we still have some internal ids to find
let remaining_ids = std::mem::take(&mut self.internal_ids);
return Some(Err(remaining_ids));
// note: next calls to `next` will return `None` since we replaced the internal_ids
// with the default empty bitmap
}
};
let internal = internal.get();
let was_contained = self.internal_ids.remove(internal);
if was_contained {
return Some(Ok((external, internal)));
}
}
}
}
/// Returns the value of the `IndexedValue` with the highest _index_.
fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
impl<'t> ExternalToInternalOwnedIterator<'t> {
/// Returns the bitmap of internal ids whose external ids are yet to be found
pub fn remaining_internal_ids(&self) -> &RoaringBitmap {
&self.internal_ids
}
/// Consumes this iterator and returns an iterator over only the external ids, ignoring the internal ids.
///
/// Use this when you don't need the mapping between the external and the internal ids.
pub fn only_external_ids(self) -> impl Iterator<Item = Result<String, RoaringBitmap>> + 't {
self.map(|res| res.map(|(external, _internal)| external.to_owned()))
}
}
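A condensed sketch of the lookup pattern behind find_external_id_of and only_external_ids: scan the external-to-internal mapping once, stop as soon as every requested internal id is resolved, and report the ids that were never found. A BTreeMap and a BTreeSet stand in for the heed database and the RoaringBitmap, purely to keep the sketch self-contained.

use std::collections::{BTreeMap, BTreeSet};

// Sketch only: a BTreeMap stands in for the LMDB `Database<Str, OwnedType<BEU32>>`,
// and a BTreeSet stands in for the RoaringBitmap of requested internal ids.
fn find_external_ids_of(
    db: &BTreeMap<String, u32>,
    mut internal_ids: BTreeSet<u32>,
) -> Result<Vec<(String, u32)>, BTreeSet<u32>> {
    let mut found = Vec::new();
    for (external, &internal) in db {
        if internal_ids.is_empty() {
            break; // all requested ids were resolved, stop scanning early
        }
        if internal_ids.remove(&internal) {
            found.push((external.clone(), internal));
        }
    }
    if internal_ids.is_empty() {
        Ok(found)
    } else {
        // some internal ids had no external id: report them, like the iterator above does
        Err(internal_ids)
    }
}

fn main() {
    let db = BTreeMap::from([("doc-a".to_string(), 0), ("doc-b".to_string(), 1)]);
    assert_eq!(
        find_external_ids_of(&db, BTreeSet::from([1])),
        Ok(vec![("doc-b".to_string(), 1)])
    );
    assert_eq!(find_external_ids_of(&db, BTreeSet::from([7])), Err(BTreeSet::from([7])));
}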


@ -6,6 +6,7 @@ use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use roaring::RoaringBitmap;
use crate::heed_codec::BytesDecodeOwned;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
/// This is the limit where using a byteorder became less size efficient
/// than using a direct roaring encoding, it is also the point where we are able
@ -99,6 +100,33 @@ impl CboRoaringBitmapCodec {
Ok(())
}
/// Merges a DelAdd delta into a CboRoaringBitmap.
pub fn merge_deladd_into<'a>(
deladd: KvReaderDelAdd<'_>,
previous: &[u8],
buffer: &'a mut Vec<u8>,
) -> io::Result<Option<&'a [u8]>> {
// Deserialize the bitmap that is already there
let mut previous = Self::deserialize_from(previous)?;
// Remove the integers we no longer want from the previous bitmap
if let Some(value) = deladd.get(DelAdd::Deletion) {
previous -= Self::deserialize_from(value)?;
}
// Insert the new integers we want in the previous bitmap
if let Some(value) = deladd.get(DelAdd::Addition) {
previous |= Self::deserialize_from(value)?;
}
if previous.is_empty() {
return Ok(None);
}
Self::serialize_into(&previous, buffer);
Ok(Some(&buffer[..]))
}
}
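The same del/add merge, sketched over plain sets to make the semantics explicit: start from the previous value, subtract the deletions, union in the additions, and return None when the result is empty so the caller can delete the key from the database. HashSet stands in for the roaring bitmaps and their serialized buffers.

use std::collections::HashSet;

// Sketch of the DelAdd merge semantics: `None` means "delete the entry from the DB".
fn merge_deladd(
    previous: &HashSet<u32>,
    deletions: Option<&HashSet<u32>>,
    additions: Option<&HashSet<u32>>,
) -> Option<HashSet<u32>> {
    let mut merged = previous.clone();
    if let Some(del) = deletions {
        merged.retain(|x| !del.contains(x)); // remove the integers we no longer want
    }
    if let Some(add) = additions {
        merged.extend(add.iter().copied()); // insert the new integers
    }
    if merged.is_empty() {
        None
    } else {
        Some(merged)
    }
}

fn main() {
    let previous: HashSet<u32> = HashSet::from([1, 2, 3]);
    let del = HashSet::from([1, 2, 3]);
    let add = HashSet::from([4]);
    assert_eq!(merge_deladd(&previous, Some(&del), Some(&add)), Some(HashSet::from([4])));
    assert_eq!(merge_deladd(&previous, Some(&del), None), None);
}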
impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {

File diff suppressed because it is too large


@ -2,7 +2,7 @@ use std::cmp;
use crate::{relative_from_absolute_position, Position};
pub const MAX_DISTANCE: u32 = 8;
pub const MAX_DISTANCE: u32 = 4;
pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
if lhs <= rhs {


@ -13,7 +13,7 @@ use crate::heed_codec::ByteSliceRefCodec;
/// The documents returned by the iterator are grouped by the facet values that
/// determined their rank. For example, given the documents:
///
/// ```ignore
/// ```text
/// 0: { "colour": ["blue", "green"] }
/// 1: { "colour": ["blue", "red"] }
/// 2: { "colour": ["orange", "red"] }
@ -22,7 +22,7 @@ use crate::heed_codec::ByteSliceRefCodec;
/// ```
/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator
/// over the following elements:
/// ```ignore
/// ```text
/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue"
/// [3] // same for "green"
/// [2] // same for "orange"


@ -223,12 +223,9 @@ impl<'a> Filter<'a> {
impl<'a> Filter<'a> {
pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time
let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?;
let filterable_fields = index.filterable_fields(rtxn)?;
// and finally we delete all the soft_deleted_documents, again, only once at the very end
self.inner_evaluate(rtxn, index, &filterable_fields)
.map(|result| result - soft_deleted_documents)
}
fn evaluate_operator(


@ -46,18 +46,27 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
if let Some(distinct_fid) = distinct_fid {
let mut excluded = RoaringBitmap::new();
let mut results = vec![];
let mut skip = 0;
for docid in universe.iter() {
if results.len() >= from + length {
if results.len() >= length {
break;
}
if excluded.contains(docid) {
continue;
}
distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
skip += 1;
if skip <= from {
continue;
}
results.push(docid);
}
let mut all_candidates = universe - excluded;
all_candidates.extend(results.iter().copied());
return Ok(BucketSortOutput {
scores: vec![Default::default(); results.len()],
docids: results,


@ -1,6 +1,7 @@
#![allow(clippy::too_many_arguments)]
use super::ProximityCondition;
use crate::proximity::MAX_DISTANCE;
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::SearchContext;
@ -35,7 +36,7 @@ pub fn build_edges(
}
let mut conditions = vec![];
for cost in right_ngram_max..(7 + right_ngram_max) {
for cost in right_ngram_max..(((MAX_DISTANCE as usize) - 1) + right_ngram_max) {
conditions.push((
cost as u32,
conditions_interner.insert(ProximityCondition::Uninit {
@ -47,7 +48,7 @@ pub fn build_edges(
}
conditions.push((
(7 + right_ngram_max) as u32,
((MAX_DISTANCE - 1) + (right_ngram_max as u32)),
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
));


@ -273,7 +273,7 @@ fn test_proximity_simple() {
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("the quick brown fox jumps over the lazy dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 4, 7, 6, 5, 2, 3, 0, 1]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 4, 7, 6, 2, 3, 5, 1, 0]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
@ -282,11 +282,11 @@ fn test_proximity_simple() {
"\"the quickbrown fox jumps over the lazy dog\"",
"\"the really quick brown fox jumps over the lazy dog\"",
"\"the really quick brown fox jumps over the very lazy dog\"",
"\"brown quick fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the lazy. dog\"",
"\"dog the quick brown fox jumps over the lazy\"",
"\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"",
"\"brown quick fox jumps over the lazy dog\"",
"\"the. quick brown fox jumps over the lazy. dog\"",
"\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"",
]
"###);
}
@ -371,7 +371,7 @@ fn test_proximity_prefix_db() {
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
s.query("best s");
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11, 15]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
@ -382,9 +382,9 @@ fn test_proximity_prefix_db() {
"\"summer best\"",
"\"this is the best meal of summer\"",
"\"summer x best\"",
"\"this is the best meal of the summer\"",
"\"this is the best meal I have ever had in such a beautiful summer day\"",
"\"this is the best cooked meal of the summer\"",
"\"this is the best meal of the summer\"",
"\"summer x y best\"",
"\"this is the best meal I have ever had in such a beautiful winter day\"",
]
@ -396,7 +396,7 @@ fn test_proximity_prefix_db() {
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
s.query("best su");
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 11, 7, 6, 15]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
@ -406,10 +406,10 @@ fn test_proximity_prefix_db() {
"\"summer best\"",
"\"this is the best meal of summer\"",
"\"summer x best\"",
"\"this is the best meal I have ever had in such a beautiful summer day\"",
"\"this is the best cooked meal of the summer\"",
"\"this is the best meal of the summer\"",
"\"summer x y best\"",
"\"this is the best cooked meal of the summer\"",
"\"this is the best meal I have ever had in such a beautiful summer day\"",
"\"this is the best meal I have ever had in such a beautiful winter day\"",
]
"###);
@ -423,20 +423,20 @@ fn test_proximity_prefix_db() {
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
s.query("best win");
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"this is the best winter meal\"",
"\"winter best\"",
"\"this is the best meal of winter\"",
"\"winter x best\"",
"\"this is the best meal I have ever had in such a beautiful winter day\"",
"\"this is the best cooked meal of the winter\"",
"\"this is the best meal of the winter\"",
"\"this is the best meal of winter\"",
"\"this is the best winter meal\"",
"\"winter x y best\"",
"\"winter x best\"",
"\"winter best\"",
]
"###);
@ -447,7 +447,7 @@ fn test_proximity_prefix_db() {
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
s.query("best wint");
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 20, 16, 15]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
@ -457,10 +457,10 @@ fn test_proximity_prefix_db() {
"\"winter best\"",
"\"this is the best meal of winter\"",
"\"winter x best\"",
"\"this is the best meal I have ever had in such a beautiful winter day\"",
"\"this is the best cooked meal of the winter\"",
"\"this is the best meal of the winter\"",
"\"winter x y best\"",
"\"this is the best cooked meal of the winter\"",
"\"this is the best meal I have ever had in such a beautiful winter day\"",
]
"###);
@ -471,7 +471,7 @@ fn test_proximity_prefix_db() {
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
s.query("best wi");
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 15, 16, 20]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
@ -481,9 +481,9 @@ fn test_proximity_prefix_db() {
"\"winter best\"",
"\"this is the best meal of winter\"",
"\"winter x best\"",
"\"this is the best meal of the winter\"",
"\"this is the best meal I have ever had in such a beautiful winter day\"",
"\"this is the best cooked meal of the winter\"",
"\"this is the best meal of the winter\"",
"\"winter x y best\"",
]
"###);


@ -68,8 +68,8 @@ fn test_trap_basic() {
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
Typo(
@ -82,8 +82,8 @@ fn test_trap_basic() {
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
Typo(


@ -23,8 +23,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 35,
max_rank: 57,
rank: 9,
max_rank: 25,
},
),
],
@ -49,8 +49,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 35,
max_rank: 57,
rank: 9,
max_rank: 25,
},
),
],
@ -75,8 +75,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 35,
max_rank: 57,
rank: 9,
max_rank: 25,
},
),
],
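Note on the rank/max_rank pairs rewritten in these proximity snapshots: they all shrink by the same pattern, which is consistent with the maximum per-word-pair proximity cost dropping from 7 to 3 while the rank formula stays `pairs * max_cost + 1`. That formula is an inference from the snapshot numbers alone, not something stated in the diff; the hypothetical helper below only checks the arithmetic against a few of the values seen here.

// Hypothetical helper, not part of milli: assumes max_rank = pairs * max_cost + 1.
fn max_rank(word_pairs: u32, max_cost: u32) -> u32 {
    word_pairs * max_cost + 1
}

fn main() {
    // (old snapshot value with max_cost = 7, new snapshot value with max_cost = 3)
    assert_eq!((max_rank(1, 7), max_rank(1, 3)), (8, 4));
    assert_eq!((max_rank(3, 7), max_rank(3, 3)), (22, 10));
    assert_eq!((max_rank(8, 7), max_rank(8, 3)), (57, 25));
}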


@ -23,8 +23,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 57,
max_rank: 57,
rank: 25,
max_rank: 25,
},
),
],
@ -49,8 +49,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 56,
max_rank: 57,
rank: 24,
max_rank: 25,
},
),
],
@ -75,8 +75,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 35,
max_rank: 57,
rank: 9,
max_rank: 25,
},
),
],
@ -101,8 +101,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 22,
rank: 10,
max_rank: 10,
},
),
],
@ -127,8 +127,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 22,
rank: 10,
max_rank: 10,
},
),
],
@ -153,8 +153,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 22,
rank: 10,
max_rank: 10,
},
),
],
@ -179,8 +179,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 21,
max_rank: 22,
rank: 9,
max_rank: 10,
},
),
],
@ -205,8 +205,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 17,
max_rank: 22,
rank: 5,
max_rank: 10,
},
),
],
@ -231,8 +231,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 17,
max_rank: 22,
rank: 5,
max_rank: 10,
},
),
],


@ -3,59 +3,35 @@ source: milli/src/search/new/tests/proximity.rs
expression: "format!(\"{document_scores:#?}\")"
---
[
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 7,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 8,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 3,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
},
),
],
@ -63,7 +39,31 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],


@ -6,40 +6,32 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 7,
max_rank: 8,
rank: 3,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
rank: 2,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
rank: 2,
max_rank: 4,
},
),
],
@ -47,7 +39,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
@ -55,7 +47,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
@ -63,7 +55,15 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],


@ -6,40 +6,32 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 7,
max_rank: 8,
rank: 3,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
rank: 2,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
rank: 2,
max_rank: 4,
},
),
],
@ -47,7 +39,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
@ -55,7 +47,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
@ -63,7 +55,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
@ -71,7 +63,15 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],


@ -3,59 +3,35 @@ source: milli/src/search/new/tests/proximity.rs
expression: "format!(\"{document_scores:#?}\")"
---
[
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 7,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 8,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 3,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
},
),
],
@ -63,7 +39,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
@ -71,7 +47,31 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],


@ -6,8 +6,32 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 1,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 3,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
},
),
],
@ -15,7 +39,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
@ -23,7 +47,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
@ -31,7 +55,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
@ -39,31 +63,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],


@ -6,24 +6,24 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
],
@ -31,7 +31,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],
@ -39,7 +39,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],


@ -6,16 +6,16 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
],
@ -23,7 +23,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],


@ -6,16 +6,16 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
],
@ -23,7 +23,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 8,
max_rank: 4,
},
),
],


@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
],
@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 5,
max_rank: 8,
rank: 1,
max_rank: 4,
},
),
],
@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
],
@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 8,
rank: 3,
max_rank: 4,
},
),
],


@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 15,
max_rank: 15,
rank: 7,
max_rank: 7,
},
),
],
@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 8,
max_rank: 15,
rank: 4,
max_rank: 7,
},
),
],


@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 50,
max_rank: 50,
rank: 22,
max_rank: 22,
},
),
],
@ -24,132 +24,6 @@ expression: "format!(\"{document_scores:#?}\")"
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 50,
max_rank: 50,
},
),
],
[
Words(
Words {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 49,
max_rank: 50,
},
),
],
[
Words(
Words {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 49,
max_rank: 50,
},
),
],
[
Words(
Words {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 48,
max_rank: 50,
},
),
],
[
Words(
Words {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 41,
max_rank: 50,
},
),
],
[
Words(
Words {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 40,
max_rank: 50,
},
),
],
[
Words(
Words {
matching_words: 8,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 43,
max_rank: 43,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 36,
max_rank: 36,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 31,
max_rank: 36,
},
),
],
[
Words(
Words {
matching_words: 5,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 22,
@ -160,14 +34,126 @@ expression: "format!(\"{document_scores:#?}\")"
[
Words(
Words {
matching_words: 4,
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 15,
max_rank: 15,
rank: 21,
max_rank: 22,
},
),
],
[
Words(
Words {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 21,
max_rank: 22,
},
),
],
[
Words(
Words {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 20,
max_rank: 22,
},
),
],
[
Words(
Words {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 17,
max_rank: 22,
},
),
],
[
Words(
Words {
matching_words: 9,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 16,
max_rank: 22,
},
),
],
[
Words(
Words {
matching_words: 8,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 19,
max_rank: 19,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 16,
max_rank: 16,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 13,
max_rank: 16,
},
),
],
[
Words(
Words {
matching_words: 5,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
},
),
],
@ -180,8 +166,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 15,
max_rank: 15,
rank: 7,
max_rank: 7,
},
),
],
@ -194,8 +180,22 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 15,
max_rank: 15,
rank: 7,
max_rank: 7,
},
),
],
[
Words(
Words {
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 7,
max_rank: 7,
},
),
],
@ -208,8 +208,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 8,
max_rank: 8,
rank: 4,
max_rank: 4,
},
),
],


@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 43,
max_rank: 43,
rank: 19,
max_rank: 19,
},
),
],
@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 43,
max_rank: 43,
rank: 19,
max_rank: 19,
},
),
],
@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 42,
max_rank: 43,
rank: 18,
max_rank: 19,
},
),
],
@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 42,
max_rank: 43,
rank: 18,
max_rank: 19,
},
),
],
@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 41,
max_rank: 43,
rank: 17,
max_rank: 19,
},
),
],
@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 34,
max_rank: 43,
rank: 14,
max_rank: 19,
},
),
],
@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 33,
max_rank: 43,
rank: 13,
max_rank: 19,
},
),
],
@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 36,
max_rank: 36,
rank: 16,
max_rank: 16,
},
),
],
@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 29,
max_rank: 29,
rank: 13,
max_rank: 13,
},
),
],
@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 24,
max_rank: 29,
rank: 10,
max_rank: 13,
},
),
],
@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 15,
max_rank: 15,
rank: 7,
max_rank: 7,
},
),
],


@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 57,
max_rank: 57,
rank: 25,
max_rank: 25,
},
),
],
@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 57,
max_rank: 57,
rank: 25,
max_rank: 25,
},
),
],
@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 56,
max_rank: 57,
rank: 24,
max_rank: 25,
},
),
],
@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 56,
max_rank: 57,
rank: 24,
max_rank: 25,
},
),
],
@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 55,
max_rank: 57,
rank: 23,
max_rank: 25,
},
),
],
@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 54,
max_rank: 57,
rank: 22,
max_rank: 25,
},
),
],
@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 53,
max_rank: 57,
rank: 21,
max_rank: 25,
},
),
],
@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 52,
max_rank: 57,
rank: 20,
max_rank: 25,
},
),
],
@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 51,
max_rank: 57,
rank: 20,
max_rank: 25,
},
),
],
@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 48,
max_rank: 57,
rank: 19,
max_rank: 25,
},
),
],
@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 47,
max_rank: 57,
rank: 19,
max_rank: 25,
},
),
],
@ -167,7 +167,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 57,
max_rank: 25,
},
),
],
@ -180,8 +180,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 50,
max_rank: 50,
rank: 22,
max_rank: 22,
},
),
],
@ -194,8 +194,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 43,
max_rank: 43,
rank: 19,
max_rank: 19,
},
),
],
@ -208,8 +208,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 38,
max_rank: 43,
rank: 16,
max_rank: 19,
},
),
],
@ -222,8 +222,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 29,
max_rank: 29,
rank: 13,
max_rank: 13,
},
),
],
@ -236,8 +236,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 22,
rank: 10,
max_rank: 10,
},
),
],
@ -250,8 +250,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 22,
rank: 10,
max_rank: 10,
},
),
],
@ -264,8 +264,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 22,
rank: 10,
max_rank: 10,
},
),
],
@ -278,8 +278,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 15,
max_rank: 15,
rank: 7,
max_rank: 7,
},
),
],


@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 57,
max_rank: 57,
rank: 25,
max_rank: 25,
},
),
],
@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 56,
max_rank: 57,
rank: 24,
max_rank: 25,
},
),
],
@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 55,
max_rank: 57,
rank: 23,
max_rank: 25,
},
),
],
@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 54,
max_rank: 57,
rank: 22,
max_rank: 25,
},
),
],
@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 54,
max_rank: 57,
rank: 22,
max_rank: 25,
},
),
],
@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 54,
max_rank: 57,
rank: 22,
max_rank: 25,
},
),
],
@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 53,
max_rank: 57,
rank: 21,
max_rank: 25,
},
),
],
@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 53,
max_rank: 57,
rank: 21,
max_rank: 25,
},
),
],
@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 52,
max_rank: 57,
rank: 20,
max_rank: 25,
},
),
],
@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 47,
max_rank: 57,
rank: 18,
max_rank: 25,
},
),
],
@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 45,
max_rank: 57,
rank: 18,
max_rank: 25,
},
),
],
@ -167,7 +167,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 57,
max_rank: 25,
},
),
],
@ -180,8 +180,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 47,
max_rank: 50,
rank: 19,
max_rank: 22,
},
),
],
@ -194,8 +194,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 40,
max_rank: 43,
rank: 16,
max_rank: 19,
},
),
],
@ -208,8 +208,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 35,
max_rank: 43,
rank: 13,
max_rank: 19,
},
),
],
@ -222,8 +222,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 26,
max_rank: 29,
rank: 10,
max_rank: 13,
},
),
],
@ -236,8 +236,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 19,
max_rank: 22,
rank: 7,
max_rank: 10,
},
),
],
@ -250,8 +250,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 19,
max_rank: 22,
rank: 7,
max_rank: 10,
},
),
],
@ -264,8 +264,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 19,
max_rank: 22,
rank: 7,
max_rank: 10,
},
),
],
@ -278,8 +278,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 13,
max_rank: 15,
rank: 5,
max_rank: 7,
},
),
],

View File

@ -6,88 +6,88 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 57,
max_rank: 57,
rank: 25,
max_rank: 25,
},
),
],
[
Proximity(
Rank {
rank: 57,
max_rank: 57,
rank: 25,
max_rank: 25,
},
),
],
[
Proximity(
Rank {
rank: 56,
max_rank: 57,
rank: 24,
max_rank: 25,
},
),
],
[
Proximity(
Rank {
rank: 56,
max_rank: 57,
rank: 24,
max_rank: 25,
},
),
],
[
Proximity(
Rank {
rank: 55,
max_rank: 57,
rank: 23,
max_rank: 25,
},
),
],
[
Proximity(
Rank {
rank: 54,
max_rank: 57,
rank: 22,
max_rank: 25,
},
),
],
[
Proximity(
Rank {
rank: 53,
max_rank: 57,
rank: 21,
max_rank: 25,
},
),
],
[
Proximity(
Rank {
rank: 52,
max_rank: 57,
rank: 20,
max_rank: 25,
},
),
],
[
Proximity(
Rank {
rank: 51,
max_rank: 57,
rank: 20,
max_rank: 25,
},
),
],
[
Proximity(
Rank {
rank: 48,
max_rank: 57,
rank: 19,
max_rank: 25,
},
),
],
[
Proximity(
Rank {
rank: 47,
max_rank: 57,
rank: 19,
max_rank: 25,
},
),
],
@ -95,7 +95,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 57,
max_rank: 25,
},
),
],

View File

@ -259,8 +259,8 @@ fn test_ignore_stop_words() {
),
Proximity(
Rank {
rank: 7,
max_rank: 8,
rank: 3,
max_rank: 4,
},
),
Fid(
@ -411,8 +411,8 @@ fn test_stop_words_in_phrase() {
),
Proximity(
Rank {
rank: 6,
max_rank: 8,
rank: 2,
max_rank: 4,
},
),
Fid(


@ -277,7 +277,7 @@ fn test_words_proximity_tms_last_simple() {
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
// 7 is better than 6 because of the proximity between "the" and its surrounding terms
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 16, 19, 15, 20, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
@ -289,10 +289,10 @@ fn test_words_proximity_tms_last_simple() {
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over the\"",
@ -312,7 +312,7 @@ fn test_words_proximity_tms_last_simple() {
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
// 10 is better than 9 because of the proximity between "quick" and "brown"
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 15, 16, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
@ -326,8 +326,8 @@ fn test_words_proximity_tms_last_simple() {
"\"the great quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the really lazy dog\"",
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over the\"",
@ -427,7 +427,7 @@ fn test_words_tms_all() {
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 16, 19, 15, 20, 22]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
@ -439,10 +439,10 @@ fn test_words_tms_all() {
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
]
"###);


@ -4,9 +4,8 @@ use std::path::Path;
use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
use crate::{make_db_snap_from_iter, obkv_to_json, ExternalDocumentsIds, Index};
use crate::{make_db_snap_from_iter, obkv_to_json, Index};
#[track_caller]
pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) {
@ -98,7 +97,6 @@ Create a snapshot test of the given database.
- `facet_id_string_docids`
- `documents_ids`
- `stop_words`
- `soft_deleted_documents_ids`
- `field_distribution`
- `fields_ids_map`
- `geo_faceted_documents_ids`
@ -308,12 +306,6 @@ pub fn snap_stop_words(index: &Index) -> String {
let snap = format!("{stop_words:?}");
snap
}
pub fn snap_soft_deleted_documents_ids(index: &Index) -> String {
let rtxn = index.read_txn().unwrap();
let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap();
display_bitmap(&soft_deleted_documents_ids)
}
pub fn snap_field_distributions(index: &Index) -> String {
let rtxn = index.read_txn().unwrap();
let mut snap = String::new();
@ -340,50 +332,21 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String {
}
pub fn snap_external_documents_ids(index: &Index) -> String {
let rtxn = index.read_txn().unwrap();
let ExternalDocumentsIds { soft, hard, .. } = index.external_documents_ids(&rtxn).unwrap();
let external_ids = index.external_documents_ids().to_hash_map(&rtxn).unwrap();
// ensure fixed order (not guaranteed by hashmap)
let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect();
external_ids.sort_by(|(l, _), (r, _)| l.cmp(r));
let mut snap = String::new();
writeln!(&mut snap, "soft:").unwrap();
let stream_soft = soft.stream();
let soft_external_ids = stream_soft.into_str_vec().unwrap();
for (key, id) in soft_external_ids {
writeln!(&mut snap, "{key:<24} {id}").unwrap();
}
writeln!(&mut snap, "hard:").unwrap();
let stream_hard = hard.stream();
let hard_external_ids = stream_hard.into_str_vec().unwrap();
for (key, id) in hard_external_ids {
writeln!(&mut snap, "docids:").unwrap();
for (key, id) in external_ids {
writeln!(&mut snap, "{key:<24} {id}").unwrap();
}
snap
}
pub fn snap_number_faceted_documents_ids(index: &Index) -> String {
let rtxn = index.read_txn().unwrap();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let mut snap = String::new();
for field_id in fields_ids_map.ids() {
let number_faceted_documents_ids =
index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap();
writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids))
.unwrap();
}
snap
}
pub fn snap_string_faceted_documents_ids(index: &Index) -> String {
let rtxn = index.read_txn().unwrap();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let mut snap = String::new();
for field_id in fields_ids_map.ids() {
let string_faceted_documents_ids =
index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap();
writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids))
.unwrap();
}
snap
}
pub fn snap_words_fst(index: &Index) -> String {
let rtxn = index.read_txn().unwrap();
let words_fst = index.words_fst(&rtxn).unwrap();
@ -516,9 +479,6 @@ macro_rules! full_snap_of_db {
($index:ident, stop_words) => {{
$crate::snapshot_tests::snap_stop_words(&$index)
}};
($index:ident, soft_deleted_documents_ids) => {{
$crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index)
}};
($index:ident, field_distribution) => {{
$crate::snapshot_tests::snap_field_distributions(&$index)
}};
@ -531,12 +491,6 @@ macro_rules! full_snap_of_db {
($index:ident, external_documents_ids) => {{
$crate::snapshot_tests::snap_external_documents_ids(&$index)
}};
($index:ident, number_faceted_documents_ids) => {{
$crate::snapshot_tests::snap_number_faceted_documents_ids(&$index)
}};
($index:ident, string_faceted_documents_ids) => {{
$crate::snapshot_tests::snap_string_faceted_documents_ids(&$index)
}};
($index:ident, words_fst) => {{
$crate::snapshot_tests::snap_words_fst(&$index)
}};


@ -8,16 +8,11 @@ pub struct AvailableDocumentsIds {
}
impl AvailableDocumentsIds {
pub fn from_documents_ids(
docids: &RoaringBitmap,
soft_deleted_docids: &RoaringBitmap,
) -> AvailableDocumentsIds {
let used_docids = docids | soft_deleted_docids;
match used_docids.max() {
pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds {
match docids.max() {
Some(last_id) => {
let mut available = RoaringBitmap::from_iter(0..last_id);
available -= used_docids;
available -= docids;
let iter = match last_id.checked_add(1) {
Some(id) => id..=u32::max_value(),
@ -50,7 +45,7 @@ mod tests {
#[test]
fn empty() {
let base = RoaringBitmap::new();
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
let left = AvailableDocumentsIds::from_documents_ids(&base);
let right = 0..=u32::max_value();
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
}
@ -63,28 +58,8 @@ mod tests {
base.insert(100);
base.insert(405);
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
let left = AvailableDocumentsIds::from_documents_ids(&base);
let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
}
#[test]
fn soft_deleted() {
let mut base = RoaringBitmap::new();
base.insert(0);
base.insert(10);
base.insert(100);
base.insert(405);
let mut soft_deleted = RoaringBitmap::new();
soft_deleted.insert(1);
soft_deleted.insert(11);
soft_deleted.insert(101);
soft_deleted.insert(406);
let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted);
let right =
(0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n));
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
}
}
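With soft deletion removed, the iterator above only has to skip ids that are still present in the documents bitmap. A minimal standalone sketch of that behaviour, assuming only the `roaring` crate (it does not reproduce the real type's explicit chaining of `last_id + 1..=u32::MAX`, only the observable result):

use roaring::RoaringBitmap;

// Standalone sketch: yields every docid not present in `docids`, smallest first.
fn available_ids(docids: &RoaringBitmap) -> impl Iterator<Item = u32> + '_ {
    (0..=u32::MAX).filter(move |id| !docids.contains(*id))
}

fn main() {
    let mut used = RoaringBitmap::new();
    used.insert(0);
    used.insert(10);
    let mut free = available_ids(&used);
    assert_eq!(free.next(), Some(1)); // 0 is taken, so 1 is the first free id
    assert_eq!(free.next(), Some(2));
}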


@ -1,8 +1,7 @@
use roaring::RoaringBitmap;
use time::OffsetDateTime;
use crate::facet::FacetType;
use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result};
use crate::{FieldDistribution, Index, Result};
pub struct ClearDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -21,6 +20,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
let Index {
env: _env,
main: _main,
external_documents_ids,
word_docids,
exact_word_docids,
word_prefix_docids,
@ -51,36 +51,18 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
// We retrieve the number of documents ids that we are deleting.
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
// We clean some of the main engine datastructures.
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?;
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
self.index.delete_vector_hnsw(self.wtxn)?;
// We clean all the faceted documents ids.
for field_id in faceted_fields {
self.index.put_faceted_documents_ids(
self.wtxn,
field_id,
FacetType::Number,
&empty_roaring,
)?;
self.index.put_faceted_documents_ids(
self.wtxn,
field_id,
FacetType::String,
&empty_roaring,
)?;
}
// Clear the other databases.
external_documents_ids.clear(self.wtxn)?;
word_docids.clear(self.wtxn)?;
exact_word_docids.clear(self.wtxn)?;
word_prefix_docids.clear(self.wtxn)?;
@ -140,7 +122,7 @@ mod tests {
assert!(index.words_fst(&rtxn).unwrap().is_empty());
assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
assert!(index.external_documents_ids(&rtxn).unwrap().is_empty());
assert!(index.external_documents_ids().is_empty(&rtxn).unwrap());
assert!(index.documents_ids(&rtxn).unwrap().is_empty());
assert!(index.field_distribution(&rtxn).unwrap().is_empty());
assert!(index.geo_rtree(&rtxn).unwrap().is_none());

File diff suppressed because it is too large


@ -1,9 +1,9 @@
use std::borrow::Cow;
use std::fs::File;
use std::io::BufReader;
use grenad::CompressionType;
use heed::types::ByteSlice;
use heed::{BytesEncode, Error, RoTxn, RwTxn};
use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn};
use roaring::RoaringBitmap;
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
@ -12,17 +12,15 @@ use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
};
use crate::heed_codec::ByteSliceRefCodec;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
/// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases
/// by rebuilding the database "from scratch".
///
/// First, the new elements are inserted into the level 0 of the database. Then, the
/// higher levels are cleared and recomputed from the content of level 0.
///
/// Finally, the `faceted_documents_ids` value in the main database of `Index`
/// is updated to contain the new set of faceted documents.
pub struct FacetsUpdateBulk<'i> {
index: &'i Index,
group_size: u8,
@ -30,7 +28,7 @@ pub struct FacetsUpdateBulk<'i> {
facet_type: FacetType,
field_ids: Vec<FieldId>,
// None if level 0 does not need to be updated
new_data: Option<grenad::Reader<File>>,
delta_data: Option<grenad::Reader<BufReader<File>>>,
}
impl<'i> FacetsUpdateBulk<'i> {
@ -38,7 +36,7 @@ impl<'i> FacetsUpdateBulk<'i> {
index: &'i Index,
field_ids: Vec<FieldId>,
facet_type: FacetType,
new_data: grenad::Reader<File>,
delta_data: grenad::Reader<BufReader<File>>,
group_size: u8,
min_level_size: u8,
) -> FacetsUpdateBulk<'i> {
@ -48,7 +46,7 @@ impl<'i> FacetsUpdateBulk<'i> {
group_size,
min_level_size,
facet_type,
new_data: Some(new_data),
delta_data: Some(delta_data),
}
}
@ -63,13 +61,13 @@ impl<'i> FacetsUpdateBulk<'i> {
group_size: FACET_GROUP_SIZE,
min_level_size: FACET_MIN_LEVEL_SIZE,
facet_type,
new_data: None,
delta_data: None,
}
}
#[logging_timer::time("FacetsUpdateBulk::{}")]
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self;
let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self;
let db = match facet_type {
FacetType::String => index
@ -80,12 +78,9 @@ impl<'i> FacetsUpdateBulk<'i> {
}
};
let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size };
let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {
index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?;
Ok(())
})?;
inner.update(wtxn, &field_ids)?;
Ok(())
}
@ -94,26 +89,19 @@ impl<'i> FacetsUpdateBulk<'i> {
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
pub new_data: Option<grenad::Reader<R>>,
pub delta_data: Option<grenad::Reader<R>>,
pub group_size: u8,
pub min_level_size: u8,
}
impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
pub fn update(
mut self,
wtxn: &mut RwTxn,
field_ids: &[u16],
mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>,
) -> Result<()> {
pub fn update(mut self, wtxn: &mut RwTxn, field_ids: &[u16]) -> Result<()> {
self.update_level0(wtxn)?;
for &field_id in field_ids.iter() {
self.clear_levels(wtxn, field_id)?;
}
for &field_id in field_ids.iter() {
let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?;
handle_all_docids(wtxn, field_id, all_docids)?;
let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?;
for level_reader in level_readers {
let mut cursor = level_reader.into_cursor()?;
@ -133,20 +121,26 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
Ok(())
}
// TODO the new_data is a Reader<Obkv<Key, Obkv<DelAdd, RoaringBitmap>>>
fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
let new_data = match self.new_data.take() {
let delta_data = match self.delta_data.take() {
Some(x) => x,
None => return Ok(()),
};
if self.db.is_empty(wtxn)? {
let mut buffer = Vec::new();
let mut database = self.db.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
let mut cursor = new_data.into_cursor()?;
let mut cursor = delta_data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
if !valid_lmdb_key(key) {
continue;
}
let value = KvReaderDelAdd::new(value);
// DB is empty, it is safe to ignore Del operations
let Some(value) = value.get(DelAdd::Addition) else {
continue;
};
buffer.clear();
// the group size for level 0
buffer.push(1);
@ -158,11 +152,14 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
let mut buffer = Vec::new();
let database = self.db.remap_types::<ByteSlice, ByteSlice>();
let mut cursor = new_data.into_cursor()?;
let mut cursor = delta_data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
if !valid_lmdb_key(key) {
continue;
}
let value = KvReaderDelAdd::new(value);
// the value is a CboRoaringBitmap, but I still need to prepend the
// group size for level 0 (= 1) to it
buffer.clear();
@ -170,17 +167,27 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
// then we extend the buffer with the docids bitmap
match database.get(wtxn, key)? {
Some(prev_value) => {
// prev_value is the group size for level 0, followed by the previous bitmap.
let old_bitmap = &prev_value[1..];
CboRoaringBitmapCodec::merge_into(
&[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)],
&mut buffer,
)?;
CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
}
None => {
// it is safe to ignore the del in that case.
let Some(value) = value.get(DelAdd::Addition) else {
// won't put the key in DB as the value would be empty
continue;
};
buffer.extend_from_slice(value);
}
};
database.put(wtxn, key, &buffer)?;
let new_bitmap = &buffer[1..];
// if the new bitmap is empty, let's remove it
if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 {
database.delete(wtxn, key)?;
} else {
database.put(wtxn, key, &buffer)?;
}
}
}
Ok(())
@ -189,16 +196,10 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
&self,
field_id: FieldId,
txn: &RoTxn,
) -> Result<(Vec<grenad::Reader<File>>, RoaringBitmap)> {
let mut all_docids = RoaringBitmap::new();
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
for bitmap in bitmaps {
all_docids |= bitmap;
}
Ok(())
})?;
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?;
Ok((subwriters, all_docids))
Ok(subwriters)
}
#[allow(clippy::type_complexity)]
fn read_level_0<'t>(
@ -261,7 +262,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
field_id: u16,
level: u8,
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
) -> Result<Vec<grenad::Reader<File>>> {
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
if level == 0 {
self.read_level_0(rtxn, field_id, handle_group)?;
// Level 0 is already in the database
@ -492,7 +493,6 @@ mod tests {
index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521");
}
#[test]
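The level-0 writer above now consumes del/add obkv values instead of plain bitmaps: deletions are subtracted, additions are unioned, and a key whose resulting bitmap ends up empty is removed from the database instead of being stored. A hedged sketch of that per-key merge, written against decoded `RoaringBitmap`s rather than the Cbo-encoded buffers and the real `merge_deladd_into` helper:

use roaring::RoaringBitmap;

// Returns the new bitmap for a facet key, or None when the key should be
// deleted because nothing references it anymore.
fn merge_deladd(
    previous: Option<RoaringBitmap>,
    deletion: Option<&RoaringBitmap>,
    addition: Option<&RoaringBitmap>,
) -> Option<RoaringBitmap> {
    let mut bitmap = previous.unwrap_or_default();
    if let Some(del) = deletion {
        bitmap -= del;
    }
    if let Some(add) = addition {
        bitmap |= add;
    }
    if bitmap.is_empty() {
        None
    } else {
        Some(bitmap)
    }
}

On an empty database the deletion side is a no-op, which is why the code above can skip keys that only carry a Del entry when level 0 is written from scratch.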


@ -1,360 +0,0 @@
use std::collections::{HashMap, HashSet};
use heed::RwTxn;
use log::debug;
use roaring::RoaringBitmap;
use time::OffsetDateTime;
use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec;
use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner};
use crate::{FieldId, Index, Result};
/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases.
///
/// Depending on the number of removed elements and the existing size of the database, we use either
/// a bulk delete method or an incremental delete method.
pub struct FacetsDelete<'i, 'b> {
index: &'i Index,
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
facet_type: FacetType,
affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
docids_to_delete: &'b RoaringBitmap,
group_size: u8,
max_group_size: u8,
min_level_size: u8,
}
impl<'i, 'b> FacetsDelete<'i, 'b> {
pub fn new(
index: &'i Index,
facet_type: FacetType,
affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
docids_to_delete: &'b RoaringBitmap,
) -> Self {
let database = match facet_type {
FacetType::String => index
.facet_id_string_docids
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
FacetType::Number => {
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>()
}
};
Self {
index,
database,
facet_type,
affected_facet_values,
docids_to_delete,
group_size: FACET_GROUP_SIZE,
max_group_size: FACET_MAX_GROUP_SIZE,
min_level_size: FACET_MIN_LEVEL_SIZE,
}
}
pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> {
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
for (field_id, affected_facet_values) in self.affected_facet_values {
// This is an incorrect condition, since we assume that the length of the database is equal
// to the number of facet values for the given field_id. It means that in some cases, we might
// wrongly choose the incremental indexer over the bulk indexer. But the only case where that could
// really be a performance problem is when we fully delete a large ratio of all facet values for
// each field id. This would almost never happen. Still, to be overly cautious, I have added a
// 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance
// penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead.
if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) {
// Bulk delete
let mut modified = false;
for facet_value in affected_facet_values {
let key =
FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() };
let mut old = self.database.get(wtxn, &key)?.unwrap();
let previous_len = old.bitmap.len();
old.bitmap -= self.docids_to_delete;
if old.bitmap.is_empty() {
modified = true;
self.database.delete(wtxn, &key)?;
} else if old.bitmap.len() != previous_len {
modified = true;
self.database.put(wtxn, &key, &old)?;
}
}
if modified {
let builder = FacetsUpdateBulk::new_not_updating_level_0(
self.index,
vec![field_id],
self.facet_type,
);
builder.execute(wtxn)?;
}
} else {
// Incremental
let inc = FacetsUpdateIncrementalInner {
db: self.database,
group_size: self.group_size,
min_level_size: self.min_level_size,
max_group_size: self.max_group_size,
};
for facet_value in affected_facet_values {
inc.delete(wtxn, field_id, facet_value.as_slice(), self.docids_to_delete)?;
}
}
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::iter::FromIterator;
use big_s::S;
use maplit::hashset;
use rand::seq::SliceRandom;
use rand::SeedableRng;
use roaring::RoaringBitmap;
use crate::db_snap;
use crate::documents::documents_batch_reader_from_objects;
use crate::index::tests::TempIndex;
use crate::update::facet::test_helpers::ordered_string;
use crate::update::{DeleteDocuments, DeletionStrategy};
#[test]
fn delete_mixed_incremental_and_bulk() {
// The point of this test is to create an index populated with documents
// containing different filterable attributes. Then, we delete a bunch of documents
// such that a mix of the incremental and bulk indexer is used (depending on the field id)
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
index
.update_settings(|settings| {
settings.set_filterable_fields(
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
);
})
.unwrap();
let mut documents = vec![];
for i in 0..1000 {
documents.push(
serde_json::json! {
{
"id": i,
"label": i / 10,
"colour": i / 100,
"timestamp": i / 2,
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576");
db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf");
let mut wtxn = index.env.write_txn().unwrap();
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
builder.strategy(DeletionStrategy::AlwaysHard);
builder.delete_documents(&RoaringBitmap::from_iter(0..100));
// by deleting the first 100 documents, we expect that:
// - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
// - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
// - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
// - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
// This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
builder.execute().unwrap();
wtxn.commit().unwrap();
db_snap!(index, soft_deleted_documents_ids, @"[]");
db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6");
db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56");
}
// Same test as above but working with string values for the facets
#[test]
fn delete_mixed_incremental_and_bulk_string() {
// The point of this test is to create an index populated with documents
// containing different filterable attributes. Then, we delete a bunch of documents
// such that a mix of the incremental and bulk indexer is used (depending on the field id)
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
index
.update_settings(|settings| {
settings.set_filterable_fields(
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
);
})
.unwrap();
let mut documents = vec![];
for i in 0..1000 {
documents.push(
serde_json::json! {
{
"id": i,
"label": ordered_string(i / 10),
"colour": ordered_string(i / 100),
"timestamp": ordered_string(i / 2),
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
let mut wtxn = index.env.write_txn().unwrap();
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
builder.strategy(DeletionStrategy::AlwaysHard);
builder.delete_documents(&RoaringBitmap::from_iter(0..100));
// by deleting the first 100 documents, we expect that:
// - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
// - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
// - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
// - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
// This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
builder.execute().unwrap();
wtxn.commit().unwrap();
db_snap!(index, soft_deleted_documents_ids, @"[]");
db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc");
db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f");
}
#[test]
fn delete_almost_all_incrementally_string() {
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
index
.update_settings(|settings| {
settings.set_filterable_fields(
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
);
})
.unwrap();
let mut documents = vec![];
for i in 0..1000 {
documents.push(
serde_json::json! {
{
"id": i,
"label": ordered_string(i / 10),
"colour": ordered_string(i / 100),
"timestamp": ordered_string(i / 2),
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
let mut docids_to_delete = (0..1000).collect::<Vec<_>>();
docids_to_delete.shuffle(&mut rng);
for docid in docids_to_delete.into_iter().take(990) {
let mut wtxn = index.env.write_txn().unwrap();
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
builder.strategy(DeletionStrategy::AlwaysHard);
builder.delete_documents(&RoaringBitmap::from_iter([docid]));
builder.execute().unwrap();
wtxn.commit().unwrap();
}
db_snap!(index, soft_deleted_documents_ids, @"[]");
db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d");
db_snap!(index, string_faceted_documents_ids, 2, @r###"
0 []
1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
2 [292, 324, 358, 381, 493, 839, 852, ]
3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
"###);
}
}
#[allow(unused)]
#[cfg(test)]
mod comparison_bench {
use std::iter::once;
use rand::Rng;
use roaring::RoaringBitmap;
use crate::heed_codec::facet::OrderedF64Codec;
use crate::update::facet::test_helpers::FacetIndex;
// This is a simple test to get an intuition on the relative speed
// of the incremental vs. bulk indexer.
//
// The benchmark shows the worst-case scenario for the incremental indexer, since
// each facet value contains only one document ID.
//
// In that scenario, it appears that the incremental indexer is about 70 times slower than the
// bulk indexer.
// #[test]
fn benchmark_facet_indexing_delete() {
let mut r = rand::thread_rng();
for i in 1..=20 {
let size = 50_000 * i;
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
let mut txn = index.env.write_txn().unwrap();
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
for i in 0..size {
// field id = 0, left_bound = i, docids = [i]
elements.push(((0, i as f64), once(i).collect()));
}
let timer = std::time::Instant::now();
index.bulk_insert(&mut txn, &[0], elements.iter());
let time_spent = timer.elapsed().as_millis();
println!("bulk {size} : {time_spent}ms");
txn.commit().unwrap();
for nbr_doc in [1, 100, 1000, 10_000] {
let mut txn = index.env.write_txn().unwrap();
let timer = std::time::Instant::now();
//
// delete one document
//
for _ in 0..nbr_doc {
let deleted_u32 = r.gen::<u32>() % size;
let deleted_f64 = deleted_u32 as f64;
index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32)
}
let time_spent = timer.elapsed().as_millis();
println!(" delete {nbr_doc} : {time_spent}ms");
txn.abort().unwrap();
}
}
}
}


@ -1,8 +1,9 @@
use std::collections::HashMap;
use std::fs::File;
use std::io::BufReader;
use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesDecode, Error, RoTxn, RwTxn};
use obkv::KvReader;
use roaring::RoaringBitmap;
use crate::facet::FacetType;
@ -11,8 +12,9 @@ use crate::heed_codec::facet::{
};
use crate::heed_codec::ByteSliceRefCodec;
use crate::search::facet::get_highest_level;
use crate::update::del_add::DelAdd;
use crate::update::index_documents::valid_lmdb_key;
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
use crate::{CboRoaringBitmapCodec, Index, Result};
enum InsertionResult {
InPlace,
@ -27,27 +29,21 @@ enum DeletionResult {
/// Algorithm to incrementally insert and delete elements into the
/// `facet_id_(string/f64)_docids` databases.
///
/// The `faceted_documents_ids` value in the main database of `Index`
/// is also updated to contain the new set of faceted documents.
pub struct FacetsUpdateIncremental<'i> {
index: &'i Index,
pub struct FacetsUpdateIncremental {
inner: FacetsUpdateIncrementalInner,
facet_type: FacetType,
new_data: grenad::Reader<File>,
delta_data: grenad::Reader<BufReader<File>>,
}
impl<'i> FacetsUpdateIncremental<'i> {
impl FacetsUpdateIncremental {
pub fn new(
index: &'i Index,
index: &Index,
facet_type: FacetType,
new_data: grenad::Reader<File>,
delta_data: grenad::Reader<BufReader<File>>,
group_size: u8,
min_level_size: u8,
max_group_size: u8,
) -> Self {
FacetsUpdateIncremental {
index,
inner: FacetsUpdateIncrementalInner {
db: match facet_type {
FacetType::String => index
@ -61,31 +57,41 @@ impl<'i> FacetsUpdateIncremental<'i> {
max_group_size,
min_level_size,
},
facet_type,
new_data,
delta_data,
}
}
pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
let mut cursor = self.new_data.into_cursor()?;
pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> {
let mut cursor = self.delta_data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
if !valid_lmdb_key(key) {
continue;
}
let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key)
.ok_or(heed::Error::Encoding)?;
let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?;
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
let value = KvReader::new(value);
let docids_to_delete = value
.get(DelAdd::Deletion)
.map(CboRoaringBitmapCodec::bytes_decode)
.map(|o| o.ok_or(heed::Error::Encoding));
let docids_to_add = value
.get(DelAdd::Addition)
.map(CboRoaringBitmapCodec::bytes_decode)
.map(|o| o.ok_or(heed::Error::Encoding));
if let Some(docids_to_delete) = docids_to_delete {
let docids_to_delete = docids_to_delete?;
self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?;
}
if let Some(docids_to_add) = docids_to_add {
let docids_to_add = docids_to_add?;
self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?;
}
}
for (field_id, new_docids) in new_faceted_docids {
let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?;
docids |= new_docids;
self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?;
}
Ok(())
}
}
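Each `delta_data` entry above may carry a Deletion bitmap, an Addition bitmap, or both, and `execute` applies the deletion before the addition. A small sketch of what that ordering implies, using plain `roaring` bitmaps instead of the incremental tree updater:

use roaring::RoaringBitmap;

fn main() {
    let mut stored: RoaringBitmap = (0..3).collect(); // {0, 1, 2}
    let deletion: RoaringBitmap = [1u32, 2].iter().copied().collect();
    let addition: RoaringBitmap = [2u32, 7].iter().copied().collect();

    stored -= &deletion; // {0}
    stored |= &addition; // {0, 2, 7}

    assert!(stored.contains(2));  // present on both sides: the addition wins
    assert!(!stored.contains(1)); // only deleted: gone
}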


@ -14,7 +14,7 @@ The databases must be able to return results for queries such as:
The algorithms that implement these queries are found in the `src/search/facet` folder.
To make these queries fast to compute, the database adopts a tree structure:
```ignore
```text
┌───────────────────────────────┬───────────────────────────────┬───────────────┐
┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │
│Level 2│ │ │ │ │
@ -41,7 +41,7 @@ These documents all contain a facet value that is contained within `ab .. gaf`.
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
[`FacetGroupValue`], which have the following format:
```ignore
```text
FacetGroupKey:
- field id : u16
- level : u8
@ -78,6 +78,7 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
use std::collections::BTreeSet;
use std::fs::File;
use std::io::BufReader;
use std::iter::FromIterator;
use charabia::normalizer::{Normalize, NormalizerOption};
@ -97,7 +98,6 @@ use crate::update::merge_btreeset_string;
use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH};
pub mod bulk;
pub mod delete;
pub mod incremental;
/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
@ -108,14 +108,17 @@ pub struct FacetsUpdate<'i> {
index: &'i Index,
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
facet_type: FacetType,
new_data: grenad::Reader<File>,
delta_data: grenad::Reader<BufReader<File>>,
group_size: u8,
max_group_size: u8,
min_level_size: u8,
}
impl<'i> FacetsUpdate<'i> {
// TODO grenad::Reader<Key, Obkv<DelAdd, RoaringBitmap>>
pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
pub fn new(
index: &'i Index,
facet_type: FacetType,
delta_data: grenad::Reader<BufReader<File>>,
) -> Self {
let database = match facet_type {
FacetType::String => index
.facet_id_string_docids
@ -131,26 +134,26 @@ impl<'i> FacetsUpdate<'i> {
max_group_size: FACET_MAX_GROUP_SIZE,
min_level_size: FACET_MIN_LEVEL_SIZE,
facet_type,
new_data,
delta_data,
}
}
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
if self.new_data.is_empty() {
if self.delta_data.is_empty() {
return Ok(());
}
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
// See self::comparison_bench::benchmark_facet_indexing
if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
if self.delta_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
let field_ids =
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
let bulk_update = FacetsUpdateBulk::new(
self.index,
field_ids,
self.facet_type,
self.new_data,
self.delta_data,
self.group_size,
self.min_level_size,
);
@ -159,7 +162,7 @@ impl<'i> FacetsUpdate<'i> {
let incremental_update = FacetsUpdateIncremental::new(
self.index,
self.facet_type,
self.new_data,
self.delta_data,
self.group_size,
self.min_level_size,
self.max_group_size,
@ -275,6 +278,7 @@ pub(crate) mod test_helpers {
use crate::heed_codec::ByteSliceRefCodec;
use crate::search::facet::get_highest_level;
use crate::snapshot_tests::display_bitmap;
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::FacetsUpdateIncrementalInner;
use crate::CboRoaringBitmapCodec;
@ -451,20 +455,22 @@ pub(crate) mod test_helpers {
let key: FacetGroupKey<&[u8]> =
FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes };
let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_encode(&key).unwrap();
let mut inner_writer = KvWriterDelAdd::memory();
let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap();
writer.insert(&key, &value).unwrap();
inner_writer.insert(DelAdd::Addition, value).unwrap();
writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap();
}
writer.finish().unwrap();
let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
let update = FacetsUpdateBulkInner {
db: self.content,
new_data: Some(reader),
delta_data: Some(reader),
group_size: self.group_size.get(),
min_level_size: self.min_level_size.get(),
};
update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap();
update.update(wtxn, field_ids).unwrap();
}
pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) {
@ -552,101 +558,6 @@ pub(crate) mod test_helpers {
}
}
#[cfg(test)]
mod tests {
use big_s::S;
use maplit::hashset;
use crate::db_snap;
use crate::documents::documents_batch_reader_from_objects;
use crate::index::tests::TempIndex;
use crate::update::DeletionStrategy;
#[test]
fn replace_all_identical_soft_deletion_then_hard_deletion() {
let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100);
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
index
.update_settings(|settings| {
settings.set_primary_key("id".to_owned());
settings.set_filterable_fields(hashset! { S("size") });
})
.unwrap();
let mut documents = vec![];
for i in 0..1000 {
documents.push(
serde_json::json! {
{
"id": i,
"size": i % 250,
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b");
db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9");
db_snap!(index, soft_deleted_documents_ids, "initial", @"[]");
let mut documents = vec![];
for i in 0..999 {
documents.push(
serde_json::json! {
{
"id": i,
"size": i % 250,
"other": 0,
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f");
db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06");
db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123");
// Then replace the last document while disabling soft_deletion
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
let mut documents = vec![];
for i in 999..1000 {
documents.push(
serde_json::json! {
{
"id": i,
"size": i % 250,
"other": 0,
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6");
db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028");
db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]");
}
}
#[allow(unused)]
#[cfg(test)]
mod comparison_bench {

View File

@ -1,4 +1,4 @@
use std::io::{Read, Seek};
use std::io::{BufWriter, Read, Seek};
use std::result::Result as StdResult;
use std::{fmt, iter};
@ -35,7 +35,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?;
let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
// The primary key *field id* that has already been set for this index or the one

View File

@ -1,6 +1,7 @@
use std::collections::{HashMap, HashSet};
use std::convert::TryInto;
use std::fs::File;
use std::io::BufReader;
use std::{io, mem, str};
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
@ -29,7 +30,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
allowed_separators: Option<&[&str]>,
dictionary: Option<&[&str]>,
max_positions_per_attributes: Option<u32>,
) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
puffin::profile_function!();
let max_positions_per_attributes = max_positions_per_attributes
@ -55,7 +56,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
let mut value_buffer = Vec::new();
// initialize tokenizer.
let mut builder = tokenizer_builder(stop_words, dictionary, allowed_separators, None);
let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None);
let tokenizer = builder.build();
// iterate over documents.
@ -114,6 +115,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
let (add_obkv, add_script_language_word_count) = add?;
// merge deletions and additions.
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
value_buffer.clear();
del_add_from_two_obkvs(
KvReader::<FieldId>::new(del_obkv),
@ -121,8 +123,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
&mut value_buffer,
)?;
// write them into the sorter.
let obkv = KvReader::<FieldId>::new(value);
// write each KV<DelAdd, KV<u16, String>> into the sorter, field by field.
let obkv = KvReader::<FieldId>::new(&value_buffer);
for (field_id, value) in obkv.iter() {
key_buffer.truncate(mem::size_of::<u32>());
key_buffer.extend_from_slice(&field_id.to_be_bytes());
@ -150,8 +152,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
}
}
// the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
sorter_into_reader(docid_word_positions_sorter, indexer)
.map(|reader| (documents_ids, reader, script_language_docids))
.map(|reader| (reader, script_language_docids))
}
/// Check if any searchable fields of a document changed.
@ -195,7 +198,7 @@ fn tokenizer_builder<'a>(
}
if let Some(script_language) = script_language {
tokenizer_builder.allow_list(&script_language);
tokenizer_builder.allow_list(script_language);
}
tokenizer_builder
@ -203,6 +206,7 @@ fn tokenizer_builder<'a>(
/// Extract the words of a document mapped with their positions,
/// ensuring no Language detection mistakes were made.
#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct
fn lang_safe_tokens_from_document<'a>(
obkv: &KvReader<FieldId>,
searchable_fields: &Option<HashSet<FieldId>>,
@ -217,9 +221,9 @@ fn lang_safe_tokens_from_document<'a>(
let mut script_language_word_count = HashMap::new();
tokens_from_document(
&obkv,
obkv,
searchable_fields,
&tokenizer,
tokenizer,
max_positions_per_attributes,
del_add,
buffers,
@ -244,8 +248,8 @@ fn lang_safe_tokens_from_document<'a>(
// build a new temporary tokenizer including the allow list.
let mut builder = tokenizer_builder(
stop_words,
dictionary,
allowed_separators,
dictionary,
Some(&script_language),
);
let tokenizer = builder.build();
@ -254,7 +258,7 @@ fn lang_safe_tokens_from_document<'a>(
// rerun the extraction.
tokens_from_document(
&obkv,
obkv,
searchable_fields,
&tokenizer,
max_positions_per_attributes,
@ -265,6 +269,7 @@ fn lang_safe_tokens_from_document<'a>(
}
}
// returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>)
Ok((&buffers.obkv_buffer, script_language_word_count))
}
@ -330,6 +335,7 @@ fn tokens_from_document<'a>(
}
}
// returns a KV<FieldId, KV<u16, String>>
Ok(document_writer.into_inner().map(|v| v.as_slice())?)
}

View File

@ -1,5 +1,5 @@
use std::fs::File;
use std::io;
use std::io::{self, BufReader};
use heed::{BytesDecode, BytesEncode};
@ -20,7 +20,7 @@ use crate::Result;
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
fid_docid_facet_number: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();

View File

@ -1,4 +1,5 @@
use std::fs::File;
use std::io::BufReader;
use std::{io, str};
use heed::BytesEncode;
@ -18,7 +19,7 @@ use crate::{FieldId, Result};
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();

View File

@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::collections::{BTreeMap, HashSet};
use std::convert::TryInto;
use std::fs::File;
use std::io;
use std::io::{self, BufReader};
use std::mem::size_of;
use std::result::Result as StdResult;
@ -29,11 +29,11 @@ const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
/// The extracted facet values stored in grenad files by type.
pub struct ExtractedFacetValues {
pub fid_docid_facet_numbers_chunk: grenad::Reader<File>,
pub fid_docid_facet_strings_chunk: grenad::Reader<File>,
pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
pub fid_docid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
pub fid_docid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
}
/// Extracts the facet values of each faceted field of each document.
@ -102,11 +102,11 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
let del_add_obkv = obkv::KvReader::new(field_bytes);
let del_value = match del_add_obkv.get(DelAdd::Deletion) {
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
None => None,
};
let add_value = match del_add_obkv.get(DelAdd::Addition) {
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
None => None,
};

View File

@ -1,14 +1,15 @@
use std::fs::File;
use std::io;
use std::io::{self, BufReader};
use obkv::KvReaderU16;
use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
GrenadParameters,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::Result;
const MAX_COUNTED_WORDS: usize = 30;
@ -22,14 +23,14 @@ const MAX_COUNTED_WORDS: usize = 30;
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut fid_word_count_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
@ -37,18 +38,52 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
);
let mut key_buffer = Vec::new();
let mut value_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);
let word_count = KvReaderU16::new(&value).iter().take(MAX_COUNTED_WORDS + 1).count();
if word_count <= MAX_COUNTED_WORDS {
key_buffer.clear();
key_buffer.extend_from_slice(fid_bytes);
key_buffer.push(word_count as u8);
fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
let del_add_reader = KvReaderDelAdd::new(value);
let deletion = del_add_reader
// get deleted words
.get(DelAdd::Deletion)
// count deleted words
.map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count())
// keep the count if under or equal to MAX_COUNTED_WORDS
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
let addition = del_add_reader
// get added words
.get(DelAdd::Addition)
// count added words
.map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count())
// keep the count if under or equal to MAX_COUNTED_WORDS
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
if deletion != addition {
// Insert the deleted word count in the sorter if it exists.
if let Some(word_count) = deletion {
value_buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
key_buffer.clear();
key_buffer.extend_from_slice(fid_bytes);
key_buffer.push(word_count as u8);
fid_word_count_docids_sorter
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
}
// Insert the added word count in the sorter if it exists.
if let Some(word_count) = addition {
value_buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
key_buffer.clear();
key_buffer.extend_from_slice(fid_bytes);
key_buffer.push(word_count as u8);
fid_word_count_docids_sorter
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
}
}
}
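The extractor above only touches the sorter when the capped word count actually changes between the deleted and the added version of a field. A standalone sketch of that rule (the `capped_count` helper is illustrative and not code from the diff):
```rust
// Illustrative only: replicate the capped word-count rule from the extractor above.
const MAX_COUNTED_WORDS: usize = 30;

// Hypothetical helper: count words, discarding counts above the cap.
fn capped_count(words: &[&str]) -> Option<usize> {
    Some(words.iter().take(MAX_COUNTED_WORDS + 1).count())
        .filter(|&count| count <= MAX_COUNTED_WORDS)
}

fn main() {
    let deletion = capped_count(&["the", "quick", "fox"]); // Some(3)
    let addition = capped_count(&["the", "quick", "brown", "fox"]); // Some(4)
    if deletion != addition {
        // The extractor would write a DelAdd::Deletion entry under (fid, 3)
        // and a DelAdd::Addition entry under (fid, 4); equal counts write nothing.
        println!("del = {deletion:?}, add = {addition:?}");
    }
}
```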

View File

@ -1,11 +1,12 @@
use std::fs::File;
use std::io;
use std::io::{self, BufReader};
use concat_arrays::concat_arrays;
use serde_json::Value;
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::error::GeoError;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::extract_finite_float_from_value;
use crate::{FieldId, InternalError, Result};
@ -18,7 +19,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
indexer: GrenadParameters,
primary_key_id: FieldId,
(lat_fid, lng_fid): (FieldId, FieldId),
) -> Result<grenad::Reader<File>> {
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let mut writer = create_writer(
@ -30,39 +31,71 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
let mut cursor = obkv_documents.into_cursor()?;
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
let obkv = obkv::KvReader::new(value);
// since we only needs the primary key when we throw an error we create this getter to
// lazily get it when needed
// since we only need the primary key when we throw an error
// we create this getter to lazily get it when needed
let document_id = || -> Value {
let document_id = obkv.get(primary_key_id).unwrap();
serde_json::from_slice(document_id).unwrap()
};
// first we get the two fields
let lat = obkv.get(lat_fid);
let lng = obkv.get(lng_fid);
match (obkv.get(lat_fid), obkv.get(lng_fid)) {
(Some(lat), Some(lng)) => {
let deladd_lat_obkv = KvReaderDelAdd::new(lat);
let deladd_lng_obkv = KvReaderDelAdd::new(lng);
if let Some((lat, lng)) = lat.zip(lng) {
// then we extract the values
let lat = extract_finite_float_from_value(
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
)
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
// then we extract the values
let del_lat_lng = deladd_lat_obkv
.get(DelAdd::Deletion)
.zip(deladd_lng_obkv.get(DelAdd::Deletion))
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
.transpose()?;
let add_lat_lng = deladd_lat_obkv
.get(DelAdd::Addition)
.zip(deladd_lng_obkv.get(DelAdd::Addition))
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
.transpose()?;
let lng = extract_finite_float_from_value(
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
)
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
writer.insert(docid_bytes, bytes)?;
} else if lat.is_none() && lng.is_some() {
return Err(GeoError::MissingLatitude { document_id: document_id() })?;
} else if lat.is_some() && lng.is_none() {
return Err(GeoError::MissingLongitude { document_id: document_id() })?;
if del_lat_lng != add_lat_lng {
let mut obkv = KvWriterDelAdd::memory();
if let Some([lat, lng]) = del_lat_lng {
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
obkv.insert(DelAdd::Deletion, bytes)?;
}
if let Some([lat, lng]) = add_lat_lng {
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
obkv.insert(DelAdd::Addition, bytes)?;
}
let bytes = obkv.into_inner()?;
writer.insert(docid_bytes, bytes)?;
}
}
(None, Some(_)) => {
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
}
(Some(_), None) => {
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
}
(None, None) => (),
}
// else => the _geo object was `null`, there is nothing to do
}
writer_into_reader(writer)
}
/// Extract the finite floats lat and lng from two byte slices.
fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> {
let lat = extract_finite_float_from_value(
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
)
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
let lng = extract_finite_float_from_value(
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
)
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
Ok([lat, lng])
}
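Both branches above pack the coordinates into the same 16-byte payload. A standalone sketch of that packing, using `concat_arrays` the same way the extractor does (illustrative, not code from the diff):
```rust
// Illustrative only: the 16-byte geo payload is the native-endian f64 latitude
// followed by the native-endian f64 longitude.
use concat_arrays::concat_arrays;

fn main() {
    let (lat, lng) = (48.8566_f64, 2.3522_f64);
    #[allow(clippy::drop_non_drop)]
    let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
    assert_eq!(f64::from_ne_bytes(bytes[..8].try_into().unwrap()), lat);
    assert_eq!(f64::from_ne_bytes(bytes[8..].try_into().unwrap()), lng);
}
```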

View File

@ -1,13 +1,24 @@
use std::cmp::Ordering;
use std::convert::TryFrom;
use std::fs::File;
use std::io;
use std::io::{self, BufReader, BufWriter};
use std::mem::size_of;
use std::str::from_utf8;
use bytemuck::cast_slice;
use grenad::Writer;
use itertools::EitherOrBoth;
use ordered_float::OrderedFloat;
use serde_json::{from_slice, Value};
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::error::UserError;
use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::try_split_at;
use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors};
/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
/// Extracts the embedding vector contained in each document under the `_vectors` field.
///
@ -16,9 +27,8 @@ use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};
pub fn extract_vector_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
primary_key_id: FieldId,
vectors_fid: FieldId,
) -> Result<grenad::Reader<File>> {
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let mut writer = create_writer(
@ -27,43 +37,112 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
tempfile::tempfile()?,
);
let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?;
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
while let Some((key, value)) = cursor.move_on_next()? {
// this must always be serialized as (docid, external_docid);
let (docid_bytes, external_id_bytes) =
try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
debug_assert!(from_utf8(external_id_bytes).is_ok());
let obkv = obkv::KvReader::new(value);
key_buffer.clear();
key_buffer.extend_from_slice(docid_bytes);
// since we only needs the primary key when we throw an error we create this getter to
// lazily get it when needed
let document_id = || -> Value {
let document_id = obkv.get(primary_key_id).unwrap();
from_slice(document_id).unwrap()
};
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
// first we retrieve the _vectors field
if let Some(vectors) = obkv.get(vectors_fid) {
// extract the vectors
let vectors = match from_slice(vectors) {
Ok(vectors) => VectorOrArrayOfVectors::into_array_of_vectors(vectors),
Err(_) => {
return Err(UserError::InvalidVectorsType {
document_id: document_id(),
value: from_slice(vectors).map_err(InternalError::SerdeJson)?,
}
.into())
}
};
if let Some(value) = obkv.get(vectors_fid) {
let vectors_obkv = KvReaderDelAdd::new(value);
if let Some(vectors) = vectors {
for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
let index = u16::try_from(i).unwrap();
let mut key = docid_bytes.to_vec();
key.extend_from_slice(&index.to_be_bytes());
let bytes = cast_slice(&vector);
writer.insert(key, bytes)?;
}
}
// then we extract the values
let del_vectors = vectors_obkv
.get(DelAdd::Deletion)
.map(|vectors| extract_vectors(vectors, document_id))
.transpose()?
.flatten();
let add_vectors = vectors_obkv
.get(DelAdd::Addition)
.map(|vectors| extract_vectors(vectors, document_id))
.transpose()?
.flatten();
// and we finally push the unique vectors into the writer
push_vectors_diff(
&mut writer,
&mut key_buffer,
del_vectors.unwrap_or_default(),
add_vectors.unwrap_or_default(),
)?;
}
// else => the `_vectors` object was `null`, there is nothing to do
}
writer_into_reader(writer)
}
/// Computes the diff between the Del and Add vectors and
/// only writes the parts that differ into the writer.
fn push_vectors_diff(
writer: &mut Writer<BufWriter<File>>,
key_buffer: &mut Vec<u8>,
mut del_vectors: Vec<Vec<f32>>,
mut add_vectors: Vec<Vec<f32>>,
) -> Result<()> {
// We sort and dedup the vectors
del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
let merged_vectors_iter =
itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
// insert vectors into the writer
for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) {
// Generate the key by appending the unique index to it.
key_buffer.truncate(TRUNCATE_SIZE);
let index = u16::try_from(i).unwrap();
key_buffer.extend_from_slice(&index.to_be_bytes());
match eob {
EitherOrBoth::Both(_, _) => (), // no need to touch anything
EitherOrBoth::Left(vector) => {
// We insert only the Del part of the Obkv to inform
// that we only want to remove all those vectors.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
let bytes = obkv.into_inner()?;
writer.insert(&key_buffer, bytes)?;
}
EitherOrBoth::Right(vector) => {
// We insert only the Add part of the Obkv to inform
// that we only want to add all those vectors.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
let bytes = obkv.into_inner()?;
writer.insert(&key_buffer, bytes)?;
}
}
}
Ok(())
}
/// Compares two vectors by using the OrderedFloat helper.
fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
}
/// Extracts the vectors from a JSON value.
fn extract_vectors(value: &[u8], document_id: impl Fn() -> Value) -> Result<Option<Vec<Vec<f32>>>> {
match from_slice(value) {
Ok(vectors) => Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors)),
Err(_) => Err(UserError::InvalidVectorsType {
document_id: document_id(),
value: from_slice(value).map_err(InternalError::SerdeJson)?,
}
.into()),
}
}
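The Del/Add diffing in `push_vectors_diff` above reduces to a three-way merge over two sorted, deduplicated lists: entries present on both sides are skipped, left-only entries become `Deletion` obkvs, and right-only entries become `Addition` obkvs. A standalone sketch of that pattern on toy vectors, assuming only `itertools` and `ordered_float` (the milli-specific writer and key handling are left out):
```rust
// Illustrative only: same merge_join_by pattern as push_vectors_diff above.
use std::cmp::Ordering;

use itertools::{merge_join_by, EitherOrBoth};
use ordered_float::OrderedFloat;

fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
    a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
}

fn main() {
    // Both inputs must already be sorted and deduplicated, as in the extractor.
    let del = vec![vec![0.0f32, 1.0], vec![2.0, 3.0]];
    let add = vec![vec![2.0f32, 3.0], vec![4.0, 5.0]];
    for eob in merge_join_by(del, add, |d, a| compare_vectors(d, a)) {
        match eob {
            EitherOrBoth::Both(_, _) => (),                             // unchanged: nothing written
            EitherOrBoth::Left(v) => println!("Deletion entry {v:?}"),  // only in del
            EitherOrBoth::Right(v) => println!("Addition entry {v:?}"), // only in add
        }
    }
}
```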

View File

@ -1,6 +1,6 @@
use std::collections::{BTreeSet, HashSet};
use std::fs::File;
use std::io;
use std::io::{self, BufReader};
use heed::BytesDecode;
use obkv::KvReaderU16;
@ -28,7 +28,11 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
exact_attributes: &HashSet<FieldId>,
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
) -> Result<(
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
)> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
@ -53,17 +57,17 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
let document_id = u32::from_be_bytes(document_id_bytes);
let fid = u16::from_be_bytes(fid_bytes);
let del_add_reader = KvReaderDelAdd::new(&value);
let del_add_reader = KvReaderDelAdd::new(value);
// extract all unique words to remove.
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
for (_pos, word) in KvReaderU16::new(&deletion).iter() {
for (_pos, word) in KvReaderU16::new(deletion).iter() {
del_words.insert(word.to_vec());
}
}
// extract all unique additional words.
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
for (_pos, word) in KvReaderU16::new(&addition).iter() {
for (_pos, word) in KvReaderU16::new(addition).iter() {
add_words.insert(word.to_vec());
}
}
@ -118,9 +122,9 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
// every word contained in an attribute set to exact must be pushed in the exact_words list.
if exact_attributes.contains(&fid) {
exact_word_docids_sorter.insert(word.as_bytes(), &value)?;
exact_word_docids_sorter.insert(word.as_bytes(), value)?;
} else {
word_docids_sorter.insert(word.as_bytes(), &value)?;
word_docids_sorter.insert(word.as_bytes(), value)?;
}
}
@ -165,7 +169,7 @@ fn words_into_sorter(
};
key_buffer.clear();
key_buffer.extend_from_slice(&word_bytes);
key_buffer.extend_from_slice(word_bytes);
key_buffer.push(0);
key_buffer.extend_from_slice(&fid.to_be_bytes());
word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;

View File

@ -1,5 +1,6 @@
use std::collections::{BTreeMap, VecDeque};
use std::fs::File;
use std::io::BufReader;
use std::{cmp, io};
use obkv::KvReaderU16;
@ -22,13 +23,12 @@ use crate::{DocumentId, Result};
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
.into_iter()
.map(|_| {
create_sorter(
grenad::SortAlgorithm::Unstable,
@ -74,7 +74,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
let (del, add): (Result<_>, Result<_>) = rayon::join(
|| {
// deletions
if let Some(deletion) = KvReaderDelAdd::new(&value).get(DelAdd::Deletion) {
if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
for (position, word) in KvReaderU16::new(deletion).iter() {
// drain the proximity window until the head word is considered close to the word we are inserting.
while del_word_positions.get(0).map_or(false, |(_w, p)| {
@ -103,7 +103,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
},
|| {
// additions
if let Some(addition) = KvReaderDelAdd::new(&value).get(DelAdd::Addition) {
if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
for (position, word) in KvReaderU16::new(addition).iter() {
// drain the proximity window until the head word is considered close to the word we are inserting.
while add_word_positions.get(0).map_or(false, |(_w, p)| {
@ -169,7 +169,7 @@ fn document_word_positions_into_sorter(
document_id: DocumentId,
del_word_pair_proximity: &BTreeMap<(String, String), u8>,
add_word_pair_proximity: &BTreeMap<(String, String), u8>,
word_pair_proximity_docids_sorters: &mut Vec<grenad::Sorter<MergeFn>>,
word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeFn>],
) -> Result<()> {
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
@ -200,7 +200,7 @@ fn document_word_positions_into_sorter(
};
key_buffer.clear();
key_buffer.push(*prox as u8);
key_buffer.push(*prox);
key_buffer.extend_from_slice(w1.as_bytes());
key_buffer.push(0);
key_buffer.extend_from_slice(w2.as_bytes());

View File

@ -1,6 +1,6 @@
use std::collections::BTreeSet;
use std::fs::File;
use std::io;
use std::io::{self, BufReader};
use obkv::KvReaderU16;
@ -22,7 +22,7 @@ use crate::{bucketed_position, DocumentId, Result};
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
@ -60,7 +60,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
current_document_id = Some(document_id);
let del_add_reader = KvReaderDelAdd::new(&value);
let del_add_reader = KvReaderDelAdd::new(value);
// extract all unique words to remove.
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
for (position, word_bytes) in KvReaderU16::new(deletion).iter() {

View File

@ -11,6 +11,7 @@ mod extract_word_position_docids;
use std::collections::HashSet;
use std::fs::File;
use std::io::BufReader;
use crossbeam_channel::Sender;
use log::debug;
@ -27,8 +28,8 @@ use self::extract_word_docids::extract_word_docids;
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{
as_cloneable_grenad, merge_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
MergeableReader,
as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters,
MergeFn, MergeableReader,
};
use super::{helpers, TypedChunk};
use crate::{FieldId, Result};
@ -37,8 +38,8 @@ use crate::{FieldId, Result};
/// Send data in grenad file over provided Sender.
#[allow(clippy::too_many_arguments)]
pub(crate) fn data_from_obkv_documents(
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
searchable_fields: Option<HashSet<FieldId>>,
@ -62,7 +63,6 @@ pub(crate) fn data_from_obkv_documents(
indexer,
lmdb_writer_sx.clone(),
vectors_field_id,
primary_key_id,
)
})
.collect::<Result<()>>()?;
@ -107,7 +107,7 @@ pub(crate) fn data_from_obkv_documents(
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
debug!("merge {} database", "facet-id-exists-docids");
match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
Ok(reader) => {
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
}
@ -123,7 +123,7 @@ pub(crate) fn data_from_obkv_documents(
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
debug!("merge {} database", "facet-id-is-null-docids");
match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
Ok(reader) => {
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
}
@ -139,7 +139,7 @@ pub(crate) fn data_from_obkv_documents(
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
debug!("merge {} database", "facet-id-is-empty-docids");
match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
Ok(reader) => {
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
}
@ -150,36 +150,40 @@ pub(crate) fn data_from_obkv_documents(
});
}
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_word_pair_proximity_docids,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
TypedChunk::WordPairProximityDocids,
"word-pair-proximity-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_fid_word_count_docids,
merge_cbo_roaring_bitmaps,
TypedChunk::FieldIdWordcountDocids,
merge_deladd_cbo_roaring_bitmaps,
TypedChunk::FieldIdWordCountDocids,
"field-id-wordcount-docids",
);
spawn_extraction_task::<
_,
_,
Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)>,
Vec<(
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
)>,
>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
TypedChunk::WordDocids {
word_docids_reader,
@ -190,32 +194,32 @@ pub(crate) fn data_from_obkv_documents(
"word-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_word_position_docids,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
TypedChunk::WordPositionDocids,
"word-position-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
fid_docid_facet_strings_chunks,
indexer,
lmdb_writer_sx.clone(),
extract_facet_string_docids,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
TypedChunk::FieldIdFacetStringDocids,
"field-id-facet-string-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
fid_docid_facet_numbers_chunks,
indexer,
lmdb_writer_sx,
extract_facet_number_docids,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
TypedChunk::FieldIdFacetNumberDocids,
"field-id-facet-number-docids",
);
@ -265,11 +269,10 @@ fn spawn_extraction_task<FE, FS, M>(
/// Extract chunked data and send it into lmdb_writer_sx sender:
/// - documents
fn send_original_documents_data(
original_documents_chunk: Result<grenad::Reader<File>>,
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
vectors_field_id: Option<FieldId>,
primary_key_id: FieldId,
) -> Result<()> {
let original_documents_chunk =
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
@ -278,12 +281,7 @@ fn send_original_documents_data(
let documents_chunk_cloned = original_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
rayon::spawn(move || {
let result = extract_vector_points(
documents_chunk_cloned,
indexer,
primary_key_id,
vectors_field_id,
);
let result = extract_vector_points(documents_chunk_cloned, indexer, vectors_field_id);
let _ = match result {
Ok(vector_points) => {
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
@ -307,7 +305,7 @@ fn send_original_documents_data(
#[allow(clippy::too_many_arguments)]
#[allow(clippy::type_complexity)]
fn send_and_extract_flattened_documents_data(
flattened_documents_chunk: Result<grenad::Reader<File>>,
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
searchable_fields: &Option<HashSet<FieldId>>,
@ -324,7 +322,10 @@ fn send_and_extract_flattened_documents_data(
grenad::Reader<CursorClonableMmap>,
(
grenad::Reader<CursorClonableMmap>,
(grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)),
(
grenad::Reader<BufReader<File>>,
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
),
),
),
)> {
@ -347,7 +348,7 @@ fn send_and_extract_flattened_documents_data(
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
rayon::join(
|| {
let (documents_ids, docid_word_positions_chunk, script_language_pair) =
let (docid_word_positions_chunk, script_language_pair) =
extract_docid_word_positions(
flattened_documents_chunk.clone(),
indexer,
@ -358,9 +359,6 @@ fn send_and_extract_flattened_documents_data(
max_positions_per_attributes,
)?;
// send documents_ids to DB writer
let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids)));
// send docid_word_positions_chunk to DB writer
let docid_word_positions_chunk =
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };

View File

@ -1,6 +1,6 @@
use std::borrow::Cow;
use std::fs::File;
use std::io::{self, Seek};
use std::io::{self, BufReader, BufWriter, Seek};
use std::time::Instant;
use grenad::{CompressionType, Sorter};
@ -17,13 +17,13 @@ pub fn create_writer<R: io::Write>(
typ: grenad::CompressionType,
level: Option<u32>,
file: R,
) -> grenad::Writer<R> {
) -> grenad::Writer<BufWriter<R>> {
let mut builder = grenad::Writer::builder();
builder.compression_type(typ);
if let Some(level) = level {
builder.compression_level(level);
}
builder.build(file)
builder.build(BufWriter::new(file))
}
pub fn create_sorter(
@ -47,13 +47,14 @@ pub fn create_sorter(
builder.allow_realloc(false);
}
builder.sort_algorithm(sort_algorithm);
builder.sort_in_parallel(true);
builder.build()
}
pub fn sorter_into_reader(
sorter: grenad::Sorter<MergeFn>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let mut writer = create_writer(
indexer.chunk_compression_type,
@ -65,16 +66,18 @@ pub fn sorter_into_reader(
writer_into_reader(writer)
}
pub fn writer_into_reader(writer: grenad::Writer<File>) -> Result<grenad::Reader<File>> {
let mut file = writer.into_inner()?;
pub fn writer_into_reader(
writer: grenad::Writer<BufWriter<File>>,
) -> Result<grenad::Reader<BufReader<File>>> {
let mut file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
file.rewind()?;
grenad::Reader::new(file).map_err(Into::into)
grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
}
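With `create_writer` now wrapping the file in a `BufWriter`, `writer_into_reader` has to flush and unwrap that buffer before re-reading the file. A standalone sketch of that flush-and-unwrap step, assuming `tempfile` as used elsewhere in this module (illustrative, not code from the diff):
```rust
// Illustrative only: into_inner() flushes the BufWriter and hands back the File;
// the IntoInnerError must be converted back into a plain io::Error.
use std::fs::File;
use std::io::{self, BufReader, BufWriter, Seek, Write};

fn buffered_roundtrip() -> io::Result<BufReader<File>> {
    let mut writer = BufWriter::new(tempfile::tempfile()?);
    writer.write_all(b"some grenad bytes")?;
    let mut file = writer.into_inner().map_err(|err| err.into_error())?;
    file.rewind()?;
    Ok(BufReader::new(file))
}
```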
pub unsafe fn as_cloneable_grenad(
reader: &grenad::Reader<File>,
reader: &grenad::Reader<BufReader<File>>,
) -> Result<grenad::Reader<CursorClonableMmap>> {
let file = reader.get_ref();
let file = reader.get_ref().get_ref();
let mmap = memmap2::Mmap::map(file)?;
let cursor = io::Cursor::new(ClonableMmap::from(mmap));
let reader = grenad::Reader::new(cursor)?;
@ -90,8 +93,8 @@ where
fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
}
impl MergeableReader for Vec<grenad::Reader<File>> {
type Output = grenad::Reader<File>;
impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
type Output = grenad::Reader<BufReader<File>>;
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut merger = MergerBuilder::new(merge_fn);
@ -100,8 +103,8 @@ impl MergeableReader for Vec<grenad::Reader<File>> {
}
}
impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
type Output = (grenad::Reader<File>, grenad::Reader<File>);
impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
type Output = (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>);
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut m1 = MergerBuilder::new(merge_fn);
@ -114,8 +117,18 @@ impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
}
}
impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
type Output = (grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>);
impl MergeableReader
for Vec<(
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
)>
{
type Output = (
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
);
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut m1 = MergerBuilder::new(merge_fn);
@ -142,7 +155,7 @@ impl<R: io::Read + io::Seek> MergerBuilder<R> {
Ok(())
}
fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<File>> {
fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<BufReader<File>>> {
let merger = self.0.build();
let mut writer = create_writer(
params.chunk_compression_type,
@ -193,7 +206,7 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
reader: grenad::Reader<R>,
indexer: GrenadParameters,
documents_chunk_size: usize,
) -> Result<impl Iterator<Item = Result<grenad::Reader<File>>>> {
) -> Result<impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>>> {
let mut continue_reading = true;
let mut cursor = reader.into_cursor()?;
@ -210,11 +223,13 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
);
while let Some((document_id, obkv)) = cursor.move_on_next()? {
obkv_documents.insert(document_id, obkv)?;
current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
if !obkv.is_empty() {
obkv_documents.insert(document_id, obkv)?;
current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
if current_chunk_size >= documents_chunk_size as u64 {
return writer_into_reader(obkv_documents).map(Some);
if current_chunk_size >= documents_chunk_size as u64 {
return writer_into_reader(obkv_documents).map(Some);
}
}
}

View File

@ -157,7 +157,7 @@ fn inner_merge_del_add_obkvs<'a>(
let mut acc = newest[1..].to_vec();
let mut buffer = Vec::new();
// reverse iter from the most recent to the oldest.
for current in obkvs.into_iter().rev() {
for current in obkvs.iter().rev() {
// if in the previous iteration there was a complete deletion,
// stop the merge process.
if acc_operation_type == Operation::Deletion as u8 {

View File

@ -35,8 +35,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids,
WordPrefixIntegerDocids, WordsPrefixesFst,
};
use crate::{CboRoaringBitmapCodec, Index, Result};
@ -89,7 +89,6 @@ pub struct IndexDocumentsConfig {
pub words_positions_level_group_size: Option<NonZeroU32>,
pub words_positions_min_level_size: Option<NonZeroU32>,
pub update_method: IndexDocumentsMethod,
pub deletion_strategy: DeletionStrategy,
pub autogenerate_docids: bool,
}
@ -181,6 +180,7 @@ where
// Early return when there is no document to add
if to_delete.is_empty() {
// Maintains Invariant: remove documents actually always returns Ok for the inner result
return Ok((self, Ok(0)));
}
@ -193,6 +193,7 @@ where
self.deleted_documents += deleted_documents;
// Maintains Invariant: remove documents actually always returns Ok for the inner result
Ok((self, Ok(deleted_documents)))
}
@ -200,7 +201,7 @@ where
pub fn execute(mut self) -> Result<DocumentAdditionResult> {
puffin::profile_function!();
if self.added_documents == 0 {
if self.added_documents == 0 && self.deleted_documents == 0 {
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
}
@ -244,9 +245,6 @@ where
primary_key,
fields_ids_map,
field_distribution,
new_external_documents_ids,
new_documents_ids,
replaced_documents_ids,
documents_count,
original_documents,
flattened_documents,
@ -370,29 +368,12 @@ where
let _ = lmdb_writer_sx.send(Err(e));
}
// needs to be droped to avoid channel waiting lock.
// needs to be dropped to avoid channel waiting lock.
drop(lmdb_writer_sx)
});
// We delete the documents that this document addition replaces. This way we are
// able to simply insert all the documents even if they already exist in the database.
if !replaced_documents_ids.is_empty() {
let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?;
deletion_builder.strategy(self.config.deletion_strategy);
debug!("documents to delete {:?}", replaced_documents_ids);
deletion_builder.delete_documents(&replaced_documents_ids);
let deleted_documents_result = deletion_builder.execute_inner()?;
debug!("{} documents actually deleted", deleted_documents_result.deleted_documents);
}
let index_documents_ids = self.index.documents_ids(self.wtxn)?;
let index_is_empty = index_documents_ids.is_empty();
let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0;
let mut final_documents_ids = RoaringBitmap::new();
let mut word_pair_proximity_docids = None;
let mut word_position_docids = None;
let mut word_fid_docids = None;
let mut word_docids = None;
let mut exact_word_docids = None;
let mut databases_seen = 0;
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
@ -405,38 +386,9 @@ where
return Err(Error::InternalError(InternalError::AbortedIndexation));
}
let typed_chunk = match result? {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
} => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
word_docids = Some(cloneable_chunk);
let cloneable_chunk =
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
exact_word_docids = Some(cloneable_chunk);
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
word_fid_docids = Some(cloneable_chunk);
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
}
}
TypedChunk::WordPairProximityDocids(chunk) => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
word_pair_proximity_docids = Some(cloneable_chunk);
TypedChunk::WordPairProximityDocids(chunk)
}
TypedChunk::WordPositionDocids(chunk) => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
word_position_docids = Some(cloneable_chunk);
TypedChunk::WordPositionDocids(chunk)
}
otherwise => otherwise,
};
let typed_chunk = result?;
// FIXME: return newly added as well as newly deleted documents
let (docids, is_merged_database) =
write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?;
if !docids.is_empty() {
@ -466,15 +418,6 @@ where
// We write the primary key field id into the main database
self.index.put_primary_key(self.wtxn, &primary_key)?;
// We write the external documents ids into the main database.
let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
external_documents_ids.insert_ids(&new_external_documents_ids)?;
let external_documents_ids = external_documents_ids.into_static();
self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
let all_documents_ids = index_documents_ids | new_documents_ids;
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
// TODO: reactivate prefix DB with diff-indexing
// self.execute_prefix_databases(
// word_docids,
@ -484,7 +427,7 @@ where
// word_fid_docids,
// )?;
Ok(all_documents_ids.len())
self.index.number_of_documents(self.wtxn)
}
#[logging_timer::time("IndexDocuments::{}")]
@ -718,14 +661,15 @@ fn execute_word_prefix_docids(
#[cfg(test)]
mod tests {
use big_s::S;
use fst::IntoStreamer;
use heed::RwTxn;
use maplit::hashset;
use super::*;
use crate::documents::documents_batch_reader_from_objects;
use crate::index::tests::TempIndex;
use crate::search::TermsMatchingStrategy;
use crate::update::DeleteDocuments;
use crate::{db_snap, BEU16};
use crate::{db_snap, Filter, Search, BEU16};
#[test]
fn simple_document_replacement() {
@ -816,11 +760,10 @@ mod tests {
assert_eq!(count, 1);
// Check that we get only one document from the database.
// Since the document has been deleted and re-inserted, its internal docid has been incremented to 1
let docs = index.documents(&rtxn, Some(1)).unwrap();
let docs = index.documents(&rtxn, Some(0)).unwrap();
assert_eq!(docs.len(), 1);
let (id, doc) = docs[0];
assert_eq!(id, 1);
assert_eq!(id, 0);
// Check that this document is equal to the last one sent.
let mut doc_iter = doc.iter();
@ -881,7 +824,7 @@ mod tests {
assert_eq!(count, 3);
// the document 0 has been deleted and reinserted with the id 3
let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap();
let docs = index.documents(&rtxn, vec![1, 2, 0]).unwrap();
let kevin_position =
docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
assert_eq!(kevin_position, 2);
@ -1027,7 +970,6 @@ mod tests {
assert_eq!(count, 6);
db_snap!(index, word_docids, "updated");
db_snap!(index, soft_deleted_documents_ids, "updated", @"[0, 1, 4, ]");
drop(rtxn);
}
@ -1130,17 +1072,15 @@ mod tests {
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
]))
.unwrap();
let mut wtxn = index.write_txn().unwrap();
assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId"));
// Delete some of the documents, but not all of them.
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
builder.delete_external_id("30");
builder.execute().unwrap();
index.delete_document("30");
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
assert!(external_documents_ids.get("30").is_none());
wtxn.commit().unwrap();
let txn = index.read_txn().unwrap();
assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId"));
let external_documents_ids = index.external_documents_ids();
assert!(external_documents_ids.get(&txn, "30").unwrap().is_none());
index
.add_documents(documents!([
@ -1149,8 +1089,8 @@ mod tests {
.unwrap();
let wtxn = index.write_txn().unwrap();
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
assert!(external_documents_ids.get("30").is_some());
let external_documents_ids = index.external_documents_ids();
assert!(external_documents_ids.get(&wtxn, "30").unwrap().is_some());
wtxn.commit().unwrap();
index
@ -1444,8 +1384,10 @@ mod tests {
index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap();
let rtxn = index.read_txn().unwrap();
let external_documents_ids = index.external_documents_ids(&rtxn).unwrap();
assert!(external_documents_ids.get("1").is_some());
let all_documents_count = index.all_documents(&rtxn).unwrap().count();
assert_eq!(all_documents_count, 1);
let external_documents_ids = index.external_documents_ids();
assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some());
}
#[test]
@ -1499,12 +1441,6 @@ mod tests {
3 2 second second
3 3 third third
"###);
db_snap!(index, string_faceted_documents_ids, @r###"
0 []
1 []
2 []
3 [0, 1, 2, 3, ]
"###);
let rtxn = index.read_txn().unwrap();
@ -1528,12 +1464,6 @@ mod tests {
db_snap!(index, facet_id_string_docids, @"");
db_snap!(index, field_id_docid_facet_strings, @"");
db_snap!(index, string_faceted_documents_ids, @r###"
0 []
1 []
2 []
3 [0, 1, 2, 3, ]
"###);
let rtxn = index.read_txn().unwrap();
@ -1560,12 +1490,6 @@ mod tests {
3 2 second second
3 3 third third
"###);
db_snap!(index, string_faceted_documents_ids, @r###"
0 []
1 []
2 []
3 [0, 1, 2, 3, ]
"###);
let rtxn = index.read_txn().unwrap();
@ -1728,7 +1652,7 @@ mod tests {
let wtxn = index.read_txn().unwrap();
let map = index.external_documents_ids(&wtxn).unwrap().to_hash_map();
let map = index.external_documents_ids().to_hash_map(&wtxn).unwrap();
let ids = map.values().collect::<HashSet<_>>();
assert_eq!(ids.len(), map.len());
@ -2540,17 +2464,8 @@ mod tests {
db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");
let mut wtxn = index.write_txn().unwrap();
// Delete some of the documents, but not all of them.
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
builder.strategy(DeletionStrategy::AlwaysHard);
builder.delete_external_id("0");
builder.delete_external_id("3");
let result = builder.execute().unwrap();
println!("{result:?}");
wtxn.commit().unwrap();
index.delete_documents(vec!["0".into(), "3".into()]);
db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
@ -2605,8 +2520,7 @@ mod tests {
),
]
*/
let mut index = TempIndex::new();
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
let index = TempIndex::new();
// START OF BATCH
@ -2646,8 +2560,7 @@ mod tests {
{"id":1,"doggo":"bernese"}
"###);
db_snap!(index, external_documents_ids, @r###"
soft:
hard:
docids:
1 0
"###);
@ -2692,13 +2605,10 @@ mod tests {
"###);
db_snap!(index, external_documents_ids, @r###"
soft:
hard:
docids:
0 1
"###);
db_snap!(index, soft_deleted_documents_ids, @"[]");
// BATCH 3
println!("--- ENTERING BATCH 3");
@ -2740,4 +2650,537 @@ mod tests {
let res = index.search(&rtxn).execute().unwrap();
index.documents(&rtxn, res.documents_ids).unwrap();
}
fn delete_documents<'t>(
wtxn: &mut RwTxn<'t, '_>,
index: &'t TempIndex,
external_ids: &[&str],
) -> Vec<u32> {
let external_document_ids = index.external_documents_ids();
let ids_to_delete: Vec<u32> = external_ids
.iter()
.map(|id| external_document_ids.get(wtxn, id).unwrap().unwrap())
.collect();
// Delete some documents.
index.delete_documents_using_wtxn(
wtxn,
external_ids.iter().map(ToString::to_string).collect(),
);
ids_to_delete
}
#[test]
fn delete_documents_with_numbers_as_primary_key() {
let index = TempIndex::new();
let mut wtxn = index.write_txn().unwrap();
index
.add_documents_using_wtxn(
&mut wtxn,
documents!([
{ "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
]),
)
.unwrap();
// delete those documents; their ids are sequential, therefore 0, 1, and 2.
index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1"), S("2")]);
wtxn.commit().unwrap();
// All these snapshots should be empty since the database was cleared
db_snap!(index, documents_ids);
db_snap!(index, word_docids);
db_snap!(index, word_pair_proximity_docids);
db_snap!(index, facet_id_exists_docids);
let rtxn = index.read_txn().unwrap();
assert!(index.field_distribution(&rtxn).unwrap().is_empty());
}
#[test]
fn delete_documents_with_strange_primary_key() {
let index = TempIndex::new();
index
.update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()]))
.unwrap();
let mut wtxn = index.write_txn().unwrap();
index
.add_documents_using_wtxn(
&mut wtxn,
documents!([
{ "mysuperid": 0, "name": "kevin" },
{ "mysuperid": 1, "name": "kevina" },
{ "mysuperid": 2, "name": "benoit" }
]),
)
.unwrap();
wtxn.commit().unwrap();
let mut wtxn = index.write_txn().unwrap();
// Delete some of the documents, but not all of them.
index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1")]);
wtxn.commit().unwrap();
db_snap!(index, documents_ids);
db_snap!(index, word_docids);
db_snap!(index, word_pair_proximity_docids);
}
#[test]
fn filtered_placeholder_search_should_not_return_deleted_documents() {
let index = TempIndex::new();
let mut wtxn = index.write_txn().unwrap();
index
.update_settings_using_wtxn(&mut wtxn, |settings| {
settings.set_primary_key(S("docid"));
settings.set_filterable_fields(hashset! { S("label"), S("label2") });
})
.unwrap();
index
.add_documents_using_wtxn(
&mut wtxn,
documents!([
{ "docid": "1_4", "label": ["sign"] },
{ "docid": "1_5", "label": ["letter"] },
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
{ "docid": "1_39", "label": ["abstract"] },
{ "docid": "1_40", "label": ["cartoon"] },
{ "docid": "1_41", "label": ["art","drawing"] },
{ "docid": "1_42", "label": ["art","pattern"] },
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
{ "docid": "1_44", "label": ["drawing"] },
{ "docid": "1_45", "label": ["art"] },
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
{ "docid": "1_47", "label": ["abstract","pattern"] },
{ "docid": "1_52", "label": ["abstract","cartoon"] },
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
{ "docid": "1_68", "label": ["design"] },
{ "docid": "1_69", "label": ["geometry"] },
{ "docid": "1_70", "label2": ["geometry", 1.2] },
{ "docid": "1_71", "label2": ["design", 2.2] },
{ "docid": "1_72", "label2": ["geometry", 1.2] }
]),
)
.unwrap();
delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]);
// Placeholder search with filter
let filter = Filter::from_str("label = sign").unwrap().unwrap();
let results = index.search(&wtxn).filter(filter).execute().unwrap();
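// Expected empty: "1_4" was the only document labeled "sign", and it was just deleted.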
assert!(results.documents_ids.is_empty());
wtxn.commit().unwrap();
db_snap!(index, word_docids);
db_snap!(index, facet_id_f64_docids);
db_snap!(index, word_pair_proximity_docids);
db_snap!(index, facet_id_exists_docids);
db_snap!(index, facet_id_string_docids);
}
#[test]
fn placeholder_search_should_not_return_deleted_documents() {
let index = TempIndex::new();
let mut wtxn = index.write_txn().unwrap();
index
.update_settings_using_wtxn(&mut wtxn, |settings| {
settings.set_primary_key(S("docid"));
})
.unwrap();
index
.add_documents_using_wtxn(
&mut wtxn,
documents!([
{ "docid": "1_4", "label": ["sign"] },
{ "docid": "1_5", "label": ["letter"] },
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
{ "docid": "1_39", "label": ["abstract"] },
{ "docid": "1_40", "label": ["cartoon"] },
{ "docid": "1_41", "label": ["art","drawing"] },
{ "docid": "1_42", "label": ["art","pattern"] },
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
{ "docid": "1_44", "label": ["drawing"] },
{ "docid": "1_45", "label": ["art"] },
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
{ "docid": "1_47", "label": ["abstract","pattern"] },
{ "docid": "1_52", "label": ["abstract","cartoon"] },
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
{ "docid": "1_68", "label": ["design"] },
{ "docid": "1_69", "label": ["geometry"] },
{ "docid": "1_70", "label2": ["geometry", 1.2] },
{ "docid": "1_71", "label2": ["design", 2.2] },
{ "docid": "1_72", "label2": ["geometry", 1.2] }
]),
)
.unwrap();
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]);
// Placeholder search
let results = index.search(&wtxn).execute().unwrap();
assert!(!results.documents_ids.is_empty());
for id in results.documents_ids.iter() {
assert!(
!deleted_internal_ids.contains(id),
"The document {} was supposed to be deleted",
id
);
}
wtxn.commit().unwrap();
}
#[test]
fn search_should_not_return_deleted_documents() {
let index = TempIndex::new();
let mut wtxn = index.write_txn().unwrap();
index
.update_settings_using_wtxn(&mut wtxn, |settings| {
settings.set_primary_key(S("docid"));
})
.unwrap();
index
.add_documents_using_wtxn(
&mut wtxn,
documents!([
{ "docid": "1_4", "label": ["sign"] },
{ "docid": "1_5", "label": ["letter"] },
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
{ "docid": "1_39", "label": ["abstract"] },
{ "docid": "1_40", "label": ["cartoon"] },
{ "docid": "1_41", "label": ["art","drawing"] },
{ "docid": "1_42", "label": ["art","pattern"] },
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
{ "docid": "1_44", "label": ["drawing"] },
{ "docid": "1_45", "label": ["art"] },
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
{ "docid": "1_47", "label": ["abstract","pattern"] },
{ "docid": "1_52", "label": ["abstract","cartoon"] },
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
{ "docid": "1_68", "label": ["design"] },
{ "docid": "1_69", "label": ["geometry"] },
{ "docid": "1_70", "label2": ["geometry", 1.2] },
{ "docid": "1_71", "label2": ["design", 2.2] },
{ "docid": "1_72", "label2": ["geometry", 1.2] }
]),
)
.unwrap();
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
// search for abstract
let results = index.search(&wtxn).query("abstract").execute().unwrap();
assert!(!results.documents_ids.is_empty());
for id in results.documents_ids.iter() {
assert!(
!deleted_internal_ids.contains(id),
"The document {} was supposed to be deleted",
id
);
}
wtxn.commit().unwrap();
}
#[test]
fn geo_filtered_placeholder_search_should_not_return_deleted_documents() {
let index = TempIndex::new();
let mut wtxn = index.write_txn().unwrap();
index
.update_settings_using_wtxn(&mut wtxn, |settings| {
settings.set_primary_key(S("id"));
settings.set_filterable_fields(hashset!(S("_geo")));
settings.set_sortable_fields(hashset!(S("_geo")));
})
.unwrap();
index.add_documents_using_wtxn(&mut wtxn, documents!([
{ "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } },
{ "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } },
{ "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } },
{ "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } },
{ "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } },
{ "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } },
{ "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } },
{ "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } },
{ "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } },
{ "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } },
{ "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } },
{ "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } },
{ "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } },
{ "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } },
{ "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } },
{ "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } },
{ "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } },
{ "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } },
{ "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } },
{ "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } }
])).unwrap();
let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete);
// Placeholder search with geo filter
let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap();
let results = index.search(&wtxn).filter(filter).execute().unwrap();
assert!(!results.documents_ids.is_empty());
for id in results.documents_ids.iter() {
assert!(
!deleted_internal_ids.contains(id),
"The document {} was supposed to be deleted",
id
);
}
wtxn.commit().unwrap();
db_snap!(index, facet_id_f64_docids);
db_snap!(index, facet_id_string_docids);
}
#[test]
fn get_documents_should_not_return_deleted_documents() {
let index = TempIndex::new();
let mut wtxn = index.write_txn().unwrap();
index
.update_settings_using_wtxn(&mut wtxn, |settings| {
settings.set_primary_key(S("docid"));
})
.unwrap();
index
.add_documents_using_wtxn(
&mut wtxn,
documents!([
{ "docid": "1_4", "label": ["sign"] },
{ "docid": "1_5", "label": ["letter"] },
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
{ "docid": "1_39", "label": ["abstract"] },
{ "docid": "1_40", "label": ["cartoon"] },
{ "docid": "1_41", "label": ["art","drawing"] },
{ "docid": "1_42", "label": ["art","pattern"] },
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
{ "docid": "1_44", "label": ["drawing"] },
{ "docid": "1_45", "label": ["art"] },
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
{ "docid": "1_47", "label": ["abstract","pattern"] },
{ "docid": "1_52", "label": ["abstract","cartoon"] },
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
{ "docid": "1_68", "label": ["design"] },
{ "docid": "1_69", "label": ["geometry"] },
{ "docid": "1_70", "label2": ["geometry", 1.2] },
{ "docid": "1_71", "label2": ["design", 2.2] },
{ "docid": "1_72", "label2": ["geometry", 1.2] }
]),
)
.unwrap();
let deleted_external_ids = ["1_7", "1_52"];
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids);
// list all documents
let results = index.all_documents(&wtxn).unwrap();
for result in results {
let (id, _) = result.unwrap();
assert!(
!deleted_internal_ids.contains(&id),
"The document {} was supposed to be deleted",
id
);
}
// list internal document ids
let results = index.documents_ids(&wtxn).unwrap();
for id in results {
assert!(
!deleted_internal_ids.contains(&id),
"The document {} was supposed to be deleted",
id
);
}
wtxn.commit().unwrap();
let rtxn = index.read_txn().unwrap();
// get internal docids from deleted external document ids
let results = index.external_documents_ids();
for id in deleted_external_ids {
assert!(
results.get(&rtxn, id).unwrap().is_none(),
"The document {} was supposed to be deleted",
id
);
}
drop(rtxn);
}
#[test]
fn stats_should_not_return_deleted_documents() {
let index = TempIndex::new();
let mut wtxn = index.write_txn().unwrap();
index
.update_settings_using_wtxn(&mut wtxn, |settings| {
settings.set_primary_key(S("docid"));
})
.unwrap();
index.add_documents_using_wtxn(&mut wtxn, documents!([
{ "docid": "1_4", "label": ["sign"]},
{ "docid": "1_5", "label": ["letter"]},
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"},
{ "docid": "1_36", "label": ["drawing","painting","pattern"]},
{ "docid": "1_37", "label": ["art","drawing","outdoor"]},
{ "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"},
{ "docid": "1_39", "label": ["abstract"]},
{ "docid": "1_40", "label": ["cartoon"]},
{ "docid": "1_41", "label": ["art","drawing"]},
{ "docid": "1_42", "label": ["art","pattern"]},
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32},
{ "docid": "1_44", "label": ["drawing"], "number": 44i32},
{ "docid": "1_45", "label": ["art"]},
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"]},
{ "docid": "1_47", "label": ["abstract","pattern"]},
{ "docid": "1_52", "label": ["abstract","cartoon"]},
{ "docid": "1_57", "label": ["abstract","drawing","pattern"]},
{ "docid": "1_58", "label": ["abstract","art","cartoon"]},
{ "docid": "1_68", "label": ["design"]},
{ "docid": "1_69", "label": ["geometry"]}
])).unwrap();
delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
// count internal documents
let results = index.number_of_documents(&wtxn).unwrap();
assert_eq!(18, results);
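// Field distribution after deleting "1_7" and "1_52": "label" was on all 20 documents (18 remain), "title" only on "1_7" and "1_38" (1 remains), "number" only on "1_43" and "1_44" (both remain).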
// count field distribution
let results = index.field_distribution(&wtxn).unwrap();
assert_eq!(Some(&18), results.get("label"));
assert_eq!(Some(&1), results.get("title"));
assert_eq!(Some(&2), results.get("number"));
wtxn.commit().unwrap();
}
#[test]
fn stored_detected_script_and_language_should_not_return_deleted_documents() {
use charabia::{Language, Script};
let index = TempIndex::new();
let mut wtxn = index.write_txn().unwrap();
index
.add_documents_using_wtxn(
&mut wtxn,
documents!([
{ "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
{ "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
{ "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
{ "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
{ "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
{ "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
]))
.unwrap();
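// Documents "1" and "5" both contain Chinese text, so they are the ones expected under (Script::Cj, Language::Cmn).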
let key_cmn = (Script::Cj, Language::Cmn);
let cj_cmn_docs =
index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default();
let mut expected_cj_cmn_docids = RoaringBitmap::new();
expected_cj_cmn_docids.push(1);
expected_cj_cmn_docids.push(5);
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
delete_documents(&mut wtxn, &index, &["1"]);
wtxn.commit().unwrap();
let rtxn = index.read_txn().unwrap();
let cj_cmn_docs =
index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default();
let mut expected_cj_cmn_docids = RoaringBitmap::new();
expected_cj_cmn_docids.push(5);
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
}
#[test]
fn delete_words_exact_attributes() {
let index = TempIndex::new();
index
.update_settings(|settings| {
settings.set_primary_key(S("id"));
settings.set_searchable_fields(vec![S("text"), S("exact")]);
settings.set_exact_attributes(vec![S("exact")].into_iter().collect());
})
.unwrap();
index
.add_documents(documents!([
{ "id": 0, "text": "hello" },
{ "id": 1, "exact": "hello"}
]))
.unwrap();
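// The same word lands in two databases: "hello" from the searchable "text" field goes to word_docids (doc 0), while "hello" from the exact attribute goes to exact_word_docids (doc 1).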
db_snap!(index, word_docids, 1, @r###"
hello [0, ]
"###);
db_snap!(index, exact_word_docids, 1, @r###"
hello [1, ]
"###);
db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
let mut wtxn = index.write_txn().unwrap();
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1"]);
wtxn.commit().unwrap();
db_snap!(index, word_docids, 2, @r###"
hello [0, ]
"###);
db_snap!(index, exact_word_docids, 2, @"");
db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
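// exact_word_docids is now empty, but "hello" stays in the words FST because document 0 still contains it in its regular field.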
insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
let txn = index.read_txn().unwrap();
let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);
let mut s = Search::new(&txn, &index);
s.query("hello");
let crate::SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
}
}

View File

@@ -0,0 +1,4 @@
---
source: milli/src/update/index_documents/mod.rs
---
[]

View File

@@ -0,0 +1,4 @@
---
source: milli/src/update/index_documents/mod.rs
---

View File

@@ -0,0 +1,4 @@
---
source: milli/src/update/index_documents/mod.rs
---

View File

@@ -0,0 +1,4 @@
---
source: milli/src/update/index_documents/mod.rs
---

View File

@@ -0,0 +1,4 @@
---
source: milli/src/update/index_documents/mod.rs
---
[2, ]

View File

@@ -0,0 +1,5 @@
---
source: milli/src/update/index_documents/mod.rs
---
benoit [2, ]

View File

@@ -0,0 +1,4 @@
---
source: milli/src/update/index_documents/mod.rs
---

View File

@@ -1,5 +1,5 @@
---
source: milli/src/update/delete_documents.rs
source: milli/src/update/index_documents/mod.rs
---
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
2 [21, ]

View File

@@ -0,0 +1,5 @@
---
source: milli/src/update/index_documents/mod.rs
---
2 0 2.2 1 [21, ]

View File

@@ -1,5 +1,5 @@
---
source: milli/src/update/delete_documents.rs
source: milli/src/update/index_documents/mod.rs
---
1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ]
1 0 aquarium 1 [5, ]

Some files were not shown because too many files have changed in this diff.