Compare commits

...

76 Commits

Author SHA1 Message Date
Clément Renault
02a40645e2 Add TODOs to the prefix functions 2023-10-19 17:58:52 +02:00
Many the fish
066221fd2b Merge pull request #4141 from meilisearch/diff-indexing-searchable
Diff indexing searchable
2023-10-19 15:00:16 +02:00
ManyTheFish
b8fed737ef update extract word pair proximity to support deladd obkvs 2023-10-19 14:18:14 +02:00
ManyTheFish
c63ff5298b update extract word position docids 2023-10-19 13:27:07 +02:00
ManyTheFish
d50408d670 update extract word docids 2023-10-19 11:59:12 +02:00
ManyTheFish
e0dc413521 Make script language docids map taking a tuple of roaring bitmaps expressing the deletions and the additions 2023-10-19 11:59:12 +02:00
Clément Renault
0f6a0b1ab8 Merge pull request #4136 from meilisearch/diff-indexing-facet-values
Diff Indexing on the facet values extractors
2023-10-19 11:22:34 +02:00
Clément Renault
061f490204 Update extract_facet_string_docids to support deladd obkvs 2023-10-19 10:08:15 +02:00
Clément Renault
5c43ff72c1 Update extract_facet_number_docids to support deladd obkvs 2023-10-18 17:40:13 +02:00
Clément Renault
c445e9daec Rename docid_fid into fid_docid 2023-10-18 13:53:58 +02:00
Clément Renault
178a9802fa Implement all the facet extraction paths and simplify them 2023-10-18 11:02:19 +02:00
Clément Renault
7d546b9c22 Generate the DelAdd for is_null, is_empty, and exists 2023-10-18 11:02:19 +02:00
Clément Renault
c829feb40b Work on fid docid facet values rewrite 2023-10-18 11:02:19 +02:00
ManyTheFish
b88fd7994c Support diff indexing on extract_docid_word_positions 2023-10-16 14:58:11 +02:00
ManyTheFish
096d7705c7 Make the transform struct return diff-based documents obkvs 2023-10-12 11:46:56 +02:00
ManyTheFish
e8f8730467 deactivate prefix dbs 2023-10-10 16:17:03 +02:00
ManyTheFish
26ef0b3a07 clean PR warnings 2023-10-10 14:04:56 +02:00
ManyTheFish
20394fda04 Split wpp in several sorters 2023-10-10 14:04:56 +02:00
ManyTheFish
27161bcd05 Fix word pair proximity 2023-10-10 14:04:56 +02:00
ManyTheFish
04fd44b5e2 Use a vecDeque in wpp database 2023-10-10 14:04:56 +02:00
ManyTheFish
9078e60024 Generalize usage of CboRoaringBitmap codec to ease the use 2023-10-10 14:04:56 +02:00
ManyTheFish
8fb96b8274 Add buffer to the obkv writter 2023-10-10 14:04:56 +02:00
ManyTheFish
50ba751244 Compute word_fid_docids before word_docids and exact_word_docids 2023-10-10 14:04:56 +02:00
ManyTheFish
f36c36e368 add puffin in sorter into reeder function 2023-10-10 14:04:56 +02:00
ManyTheFish
c2dcd66d32 Fix 2023-10-10 14:04:56 +02:00
ManyTheFish
d4594306d3 Fix fid_word_docids 2023-10-10 14:04:56 +02:00
ManyTheFish
93d0680903 Add usefull debug assert before key insertion in database 2023-10-10 14:04:56 +02:00
ManyTheFish
01101d55ac Wip 2023-10-10 14:04:56 +02:00
Clément Renault
c0fd3dffb8 Setup a Github Token env var 2023-10-09 18:04:49 +02:00
Clément Renault
c42fd5375f Fix the git commands again 2023-10-09 17:36:19 +02:00
Clément Renault
b418c3a756 Use the PAT token instead 2023-10-09 16:52:04 +02:00
Clément Renault
1cde455758 Fix workflow CI 2023-10-09 16:30:46 +02:00
Clément Renault
ca19bae72f Prefer using a action to manage commands 2023-10-09 14:56:41 +02:00
meili-bors[bot]
705878ff59 Merge #4102
4102: Introduce the first bot that shows benchmarks results r=curquiza a=Kerollmops

TBD

Co-authored-by: Kerollmops <clement@meilisearch.com>
Co-authored-by: Clément Renault <clement@meilisearch.com>
2023-10-05 10:11:06 +00:00
Clément Renault
92c280d1c8 Update .github/workflows/trigger-benchmarks-on-message.yml
Co-authored-by: Clémentine U. - curqui <clementine@meilisearch.com>
2023-10-05 12:09:52 +02:00
Kerollmops
181e7a1e53 Introduce the first bot that triggers benchmarks 2023-10-05 12:05:38 +02:00
meili-bors[bot]
2e5abb4d2c Merge #4098
4098: Bump docker/metadata-action from 4 to 5 r=curquiza a=dependabot[bot]

Bumps [docker/metadata-action](https://github.com/docker/metadata-action) from 4 to 5.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/docker/metadata-action/releases">docker/metadata-action's releases</a>.</em></p>
<blockquote>
<h2>v5.0.0</h2>
<ul>
<li>Node 20 as default runtime (requires <a href="https://github.com/actions/runner/releases/tag/v2.308.0">Actions Runner v2.308.0</a> or later) by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/metadata-action/pull/328">docker/metadata-action#328</a></li>
<li>Bump <code>`@​actions/core</code>` from 1.10.0 to 1.10.1 in <a href="https://redirect.github.com/docker/metadata-action/pull/333">docker/metadata-action#333</a></li>
<li>Bump csv-parse from 5.4.0 to 5.5.0 in <a href="https://redirect.github.com/docker/metadata-action/pull/320">docker/metadata-action#320</a></li>
<li>Bump semver from 7.5.1 to 7.5.2 in <a href="https://redirect.github.com/docker/metadata-action/pull/304">docker/metadata-action#304</a></li>
<li>Bump handlebars from 4.7.7 to 4.7.8 in <a href="https://redirect.github.com/docker/metadata-action/pull/315">docker/metadata-action#315</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/metadata-action/compare/v4.6.0...v5.0.0">https://github.com/docker/metadata-action/compare/v4.6.0...v5.0.0</a></p>
<h2>v4.6.0</h2>
<ul>
<li>Dedup and sort labels by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/metadata-action/pull/301">docker/metadata-action#301</a></li>
<li>Bump <code>`@​docker/actions-toolkit</code>` from 0.3.0 to 0.5.0 in <a href="https://redirect.github.com/docker/metadata-action/pull/302">docker/metadata-action#302</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/metadata-action/compare/v4.5.0...v4.6.0">https://github.com/docker/metadata-action/compare/v4.5.0...v4.6.0</a></p>
<h2>v4.5.0</h2>
<ul>
<li>Bump <code>`@​docker/actions-toolkit</code>` from 0.1.0 to 0.3.0 in <a href="https://redirect.github.com/docker/metadata-action/pull/296">docker/metadata-action#296</a></li>
<li>Bump csv-parse from 5.3.8 to 5.4.0 in <a href="https://redirect.github.com/docker/metadata-action/pull/294">docker/metadata-action#294</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/metadata-action/compare/v4.4.0...v4.5.0">https://github.com/docker/metadata-action/compare/v4.4.0...v4.5.0</a></p>
<h2>v4.4.0</h2>
<ul>
<li>Add <code>context</code> input to define the metadata provider by <a href="https://github.com/neilime"><code>`@​neilime</code></a>` in <a href="https://redirect.github.com/docker/metadata-action/pull/248">docker/metadata-action#248</a></li>
<li>Switch to actions-toolkit implementation by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/metadata-action/pull/266">docker/metadata-action#266</a> <a href="https://redirect.github.com/docker/metadata-action/pull/273">docker/metadata-action#273</a> <a href="https://redirect.github.com/docker/metadata-action/pull/284">docker/metadata-action#284</a></li>
<li>Bump csv-parse from 5.3.3 to 5.3.8 in <a href="https://redirect.github.com/docker/metadata-action/pull/271">docker/metadata-action#271</a> <a href="https://redirect.github.com/docker/metadata-action/pull/286">docker/metadata-action#286</a></li>
<li>Bump moment-timezone from 0.5.40 to 0.5.43 in <a href="https://redirect.github.com/docker/metadata-action/pull/268">docker/metadata-action#268</a> <a href="https://redirect.github.com/docker/metadata-action/pull/278">docker/metadata-action#278</a> <a href="https://redirect.github.com/docker/metadata-action/pull/281">docker/metadata-action#281</a></li>
<li>Bump semver from 7.4.0 to 7.5.0 in <a href="https://redirect.github.com/docker/metadata-action/pull/285">docker/metadata-action#285</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/metadata-action/compare/v4.3.0...v4.4.0">https://github.com/docker/metadata-action/compare/v4.3.0...v4.4.0</a></p>
<h2>v4.3.0</h2>
<ul>
<li>Provide outputs as env vars by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` (<a href="https://redirect.github.com/docker/metadata-action/issues/257">#257</a>)</li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/metadata-action/compare/v4.2.0...v4.3.0">https://github.com/docker/metadata-action/compare/v4.2.0...v4.3.0</a></p>
<h2>v4.2.0</h2>
<ul>
<li>Add <code>tz</code> attribute to handlebar date function by <a href="https://github.com/chroju"><code>`@​chroju</code></a>` (<a href="https://redirect.github.com/docker/metadata-action/issues/251">#251</a>)</li>
<li>Bump minimatch from 3.0.4 to 3.1.2 (<a href="https://redirect.github.com/docker/metadata-action/issues/242">#242</a>)</li>
<li>Bump csv-parse from 5.3.1 to 5.3.3 (<a href="https://redirect.github.com/docker/metadata-action/issues/245">#245</a>)</li>
<li>Bump json5 from 2.2.0 to 2.2.3 (<a href="https://redirect.github.com/docker/metadata-action/issues/252">#252</a>)</li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/metadata-action/compare/v4.1.1...v4.2.0">https://github.com/docker/metadata-action/compare/v4.1.1...v4.2.0</a></p>
<h2>v4.1.1</h2>
<ul>
<li>Revert changes to set associated head sha on pull request event by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` (<a href="https://redirect.github.com/docker/metadata-action/issues/239">#239</a>)
<ul>
<li>User can still set associated head sha on PR by setting the env var <code>DOCKER_METADATA_PR_HEAD_SHA=true</code></li>
</ul>
</li>
<li>Bump csv-parse from 5.3.0 to 5.3.1 (<a href="https://redirect.github.com/docker/metadata-action/issues/237">#237</a>)</li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/metadata-action/compare/v4.1.0...v4.1.1">https://github.com/docker/metadata-action/compare/v4.1.0...v4.1.1</a></p>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Upgrade guide</summary>
<p><em>Sourced from <a href="https://github.com/docker/metadata-action/blob/master/UPGRADE.md">docker/metadata-action's upgrade guide</a>.</em></p>
<blockquote>
<h1>Upgrade notes</h1>
<h2>v2 to v3</h2>
<ul>
<li>Repository has been moved to docker org. Replace <code>crazy-max/ghaction-docker-meta@v2</code>
with <code>docker/metadata-action@v5</code></li>
<li>The default bake target has been changed: <code>ghaction-docker-meta</code> &gt; <code>docker-metadata-action</code></li>
</ul>
<h2>v1 to v2</h2>
<ul>
<li><a href="https://github.com/docker/metadata-action/blob/master/#inputs">inputs</a>
<ul>
<li><a href="https://github.com/docker/metadata-action/blob/master/#tag-sha"><code>tag-sha</code></a></li>
<li><a href="https://github.com/docker/metadata-action/blob/master/#tag-edge--tag-edge-branch"><code>tag-edge</code> / <code>tag-edge-branch</code></a></li>
<li><a href="https://github.com/docker/metadata-action/blob/master/#tag-semver"><code>tag-semver</code></a></li>
<li><a href="https://github.com/docker/metadata-action/blob/master/#tag-match--tag-match-group"><code>tag-match</code> / <code>tag-match-group</code></a></li>
<li><a href="https://github.com/docker/metadata-action/blob/master/#tag-latest"><code>tag-latest</code></a></li>
<li><a href="https://github.com/docker/metadata-action/blob/master/#tag-schedule"><code>tag-schedule</code></a></li>
<li><a href="https://github.com/docker/metadata-action/blob/master/#tag-custom--tag-custom-only"><code>tag-custom</code> / <code>tag-custom-only</code></a></li>
<li><a href="https://github.com/docker/metadata-action/blob/master/#label-custom"><code>label-custom</code></a></li>
</ul>
</li>
<li><a href="https://github.com/docker/metadata-action/blob/master/#basic-workflow">Basic workflow</a></li>
<li><a href="https://github.com/docker/metadata-action/blob/master/#semver-workflow">Semver workflow</a></li>
</ul>
<h3>inputs</h3>
<table>
<thead>
<tr>
<th>New</th>
<th>Unchanged</th>
<th>Removed</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>tags</code></td>
<td><code>images</code></td>
<td><code>tag-sha</code></td>
</tr>
<tr>
<td><code>flavor</code></td>
<td><code>sep-tags</code></td>
<td><code>tag-edge</code></td>
</tr>
<tr>
<td><code>labels</code></td>
<td><code>sep-labels</code></td>
<td><code>tag-edge-branch</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-semver</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-match</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-match-group</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-latest</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-schedule</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-custom</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>tag-custom-only</code></td>
</tr>
<tr>
<td></td>
<td></td>
<td><code>label-custom</code></td>
</tr>
</tbody>
</table>
<h4><code>tag-sha</code></h4>
<pre lang="yaml"><code>tags: |
  type=sha
</code></pre>
<h4><code>tag-edge</code> / <code>tag-edge-branch</code></h4>
<pre lang="yaml"><code>tags: |
  # default branch
&lt;/tr&gt;&lt;/table&gt; 
</code></pre>
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="96383f4557"><code>96383f4</code></a> Merge pull request <a href="https://redirect.github.com/docker/metadata-action/issues/320">#320</a> from docker/dependabot/npm_and_yarn/csv-parse-5.5.0</li>
<li><a href="f138b9677b"><code>f138b96</code></a> chore: update generated content</li>
<li><a href="9cf7015b15"><code>9cf7015</code></a> Bump csv-parse from 5.4.0 to 5.5.0</li>
<li><a href="5a8a5ff8df"><code>5a8a5ff</code></a> Merge pull request <a href="https://redirect.github.com/docker/metadata-action/issues/315">#315</a> from docker/dependabot/npm_and_yarn/handlebars-4.7.8</li>
<li><a href="2279d9af58"><code>2279d9a</code></a> chore: update generated content</li>
<li><a href="c659933213"><code>c659933</code></a> Bump handlebars from 4.7.7 to 4.7.8</li>
<li><a href="48d23ccc05"><code>48d23cc</code></a> Merge pull request <a href="https://redirect.github.com/docker/metadata-action/issues/333">#333</a> from docker/dependabot/npm_and_yarn/actions/core-1.10.1</li>
<li><a href="b83ffb48d6"><code>b83ffb4</code></a> chore: update generated content</li>
<li><a href="3207f2405f"><code>3207f24</code></a> Bump <code>`@​actions/core</code>` from 1.10.0 to 1.10.1</li>
<li><a href="63f4a263e5"><code>63f4a26</code></a> Merge pull request <a href="https://redirect.github.com/docker/metadata-action/issues/328">#328</a> from crazy-max/update-node20</li>
<li>Additional commits viewable in <a href="https://github.com/docker/metadata-action/compare/v4...v5">compare view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=docker/metadata-action&package-manager=github_actions&previous-version=4&new-version=5)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

You can trigger a rebase of this PR by commenting ``@dependabot` rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- ``@dependabot` rebase` will rebase this PR
- ``@dependabot` recreate` will recreate this PR, overwriting any edits that have been made to it
- ``@dependabot` merge` will merge this PR after your CI passes on it
- ``@dependabot` squash and merge` will squash and merge this PR after your CI passes on it
- ``@dependabot` cancel merge` will cancel a previously requested merge and block automerging
- ``@dependabot` reopen` will reopen this PR if it is closed
- ``@dependabot` close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- ``@dependabot` show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- ``@dependabot` ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)


</details>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-10-04 08:40:29 +00:00
dependabot[bot]
44aaf5d9e3 Bump docker/metadata-action from 4 to 5
Bumps [docker/metadata-action](https://github.com/docker/metadata-action) from 4 to 5.
- [Release notes](https://github.com/docker/metadata-action/releases)
- [Upgrade guide](https://github.com/docker/metadata-action/blob/master/UPGRADE.md)
- [Commits](https://github.com/docker/metadata-action/compare/v4...v5)

---
updated-dependencies:
- dependency-name: docker/metadata-action
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-10-03 23:09:04 +00:00
meili-bors[bot]
ff0ababf65 Merge #4097
4097: Bump docker/setup-buildx-action from 2 to 3 r=curquiza a=dependabot[bot]

Bumps [docker/setup-buildx-action](https://github.com/docker/setup-buildx-action) from 2 to 3.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/docker/setup-buildx-action/releases">docker/setup-buildx-action's releases</a>.</em></p>
<blockquote>
<h2>v3.0.0</h2>
<ul>
<li>Node 20 as default runtime (requires <a href="https://github.com/actions/runner/releases/tag/v2.308.0">Actions Runner v2.308.0</a> or later) by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/264">docker/setup-buildx-action#264</a></li>
<li>Bump <code>`@​actions/core</code>` from 1.10.0 to 1.10.1 in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/267">docker/setup-buildx-action#267</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/setup-buildx-action/compare/v2.10.0...v3.0.0">https://github.com/docker/setup-buildx-action/compare/v2.10.0...v3.0.0</a></p>
<h2>v2.10.0</h2>
<h2>What's Changed</h2>
<ul>
<li>Bump <code>`@​docker/actions-toolkit</code>` from 0.7.1 to 0.10.0 by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/258">docker/setup-buildx-action#258</a></li>
<li>Bump word-wrap from 1.2.3 to 1.2.5 in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/253">docker/setup-buildx-action#253</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/setup-buildx-action/compare/v2.9.1...v2.10.0">https://github.com/docker/setup-buildx-action/compare/v2.9.1...v2.10.0</a></p>
<h2>v2.9.1</h2>
<ul>
<li>Bump <code>`@​docker/actions-toolkit</code>` from 0.7.0 to 0.7.1 in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/248">docker/setup-buildx-action#248</a>
<ul>
<li>Fixes an issue where building Buildx does not match the local platform (<a href="https://redirect.github.com/docker/actions-toolkit/pull/135">docker/actions-toolkit#135</a>)</li>
</ul>
</li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/setup-buildx-action/compare/v2.9.0...v2.9.1">https://github.com/docker/setup-buildx-action/compare/v2.9.0...v2.9.1</a></p>
<h2>v2.9.0</h2>
<ul>
<li>Bump <code>`@​docker/actions-toolkit</code>` from 0.6.0 to 0.7.0 in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/246">docker/setup-buildx-action#246</a>
<ul>
<li>Adds support to cache Buildx binary to hosted tool cache and GHA cache backend</li>
</ul>
</li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/setup-buildx-action/compare/v2.8.0...v2.9.0">https://github.com/docker/setup-buildx-action/compare/v2.8.0...v2.9.0</a></p>
<h2>v2.8.0</h2>
<ul>
<li>Only set specific flags for drivers supporting them by <a href="https://github.com/nicks"><code>`@​nicks</code></a>` in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/241">docker/setup-buildx-action#241</a></li>
<li>Bump <code>`@​docker/actions-toolkit</code>` from 0.5.0 to 0.6.0 in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/242">docker/setup-buildx-action#242</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/setup-buildx-action/compare/v2.7.0...v2.8.0">https://github.com/docker/setup-buildx-action/compare/v2.7.0...v2.8.0</a></p>
<h2>v2.7.0</h2>
<ul>
<li>Bump <code>`@​docker/actions-toolkit</code>` from 0.3.0 to 0.5.0 in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/237">docker/setup-buildx-action#237</a> <a href="https://redirect.github.com/docker/setup-buildx-action/pull/238">docker/setup-buildx-action#238</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/setup-buildx-action/compare/v2.6.0...v2.7.0">https://github.com/docker/setup-buildx-action/compare/v2.6.0...v2.7.0</a></p>
<h2>v2.6.0</h2>
<ul>
<li>Set node name for k8s driver when appending nodes by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/219">docker/setup-buildx-action#219</a></li>
<li>Bump <code>`@​docker/actions-toolkit</code>` from 0.1.0-beta.18 to 0.3.0 in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/220">docker/setup-buildx-action#220</a> <a href="https://redirect.github.com/docker/setup-buildx-action/pull/229">docker/setup-buildx-action#229</a> <a href="https://redirect.github.com/docker/setup-buildx-action/pull/231">docker/setup-buildx-action#231</a> <a href="https://redirect.github.com/docker/setup-buildx-action/pull/236">docker/setup-buildx-action#236</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/setup-buildx-action/compare/v2.5.0...v2.6.0">https://github.com/docker/setup-buildx-action/compare/v2.5.0...v2.6.0</a></p>
<h2>v2.5.0</h2>
<ul>
<li><code>cleanup</code> input to remove builder and temp files by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/213">docker/setup-buildx-action#213</a></li>
<li>do not remove builder using the <code>docker</code> driver by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/218">docker/setup-buildx-action#218</a></li>
<li>fix current context as builder name for <code>docker</code> driver by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/setup-buildx-action/pull/209">docker/setup-buildx-action#209</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/setup-buildx-action/compare/v2.4.1...v2.5.0">https://github.com/docker/setup-buildx-action/compare/v2.4.1...v2.5.0</a></p>
<h2>v2.4.1</h2>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="f95db51fdd"><code>f95db51</code></a> Merge pull request <a href="https://redirect.github.com/docker/setup-buildx-action/issues/267">#267</a> from docker/dependabot/npm_and_yarn/actions/core-1.10.1</li>
<li><a href="998a87c2c1"><code>998a87c</code></a> chore: update generated content</li>
<li><a href="28bae59336"><code>28bae59</code></a> build(deps): bump <code>`@​actions/core</code>` from 1.10.0 to 1.10.1</li>
<li><a href="c215341715"><code>c215341</code></a> Merge pull request <a href="https://redirect.github.com/docker/setup-buildx-action/issues/264">#264</a> from crazy-max/update-node20</li>
<li><a href="02e9319239"><code>02e9319</code></a> chore: node 20 as default runtime</li>
<li><a href="5c9160effc"><code>5c9160e</code></a> chore: update generated content</li>
<li><a href="1283140f57"><code>1283140</code></a> chore: fix author in package.json</li>
<li><a href="c6afe06e4a"><code>c6afe06</code></a> vendor: bump <code>`@​docker/actions-toolkit</code>` from 0.10.0 to 0.12.0</li>
<li><a href="f35e0d5a04"><code>f35e0d5</code></a> chore: update dev dependencies</li>
<li><a href="baeb468fb2"><code>baeb468</code></a> dev: remove unneeded binaries</li>
<li>Additional commits viewable in <a href="https://github.com/docker/setup-buildx-action/compare/v2...v3">compare view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=docker/setup-buildx-action&package-manager=github_actions&previous-version=2&new-version=3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

You can trigger a rebase of this PR by commenting ``@dependabot` rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- ``@dependabot` rebase` will rebase this PR
- ``@dependabot` recreate` will recreate this PR, overwriting any edits that have been made to it
- ``@dependabot` merge` will merge this PR after your CI passes on it
- ``@dependabot` squash and merge` will squash and merge this PR after your CI passes on it
- ``@dependabot` cancel merge` will cancel a previously requested merge and block automerging
- ``@dependabot` reopen` will reopen this PR if it is closed
- ``@dependabot` close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- ``@dependabot` show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- ``@dependabot` ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)


</details>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-10-03 17:14:34 +00:00
dependabot[bot]
c5336af1c5 Bump docker/setup-buildx-action from 2 to 3
Bumps [docker/setup-buildx-action](https://github.com/docker/setup-buildx-action) from 2 to 3.
- [Release notes](https://github.com/docker/setup-buildx-action/releases)
- [Commits](https://github.com/docker/setup-buildx-action/compare/v2...v3)

---
updated-dependencies:
- dependency-name: docker/setup-buildx-action
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-10-03 15:41:06 +00:00
meili-bors[bot]
1567758a56 Merge #4099
4099: Bump docker/build-push-action from 4 to 5 r=curquiza a=dependabot[bot]

Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 4 to 5.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/docker/build-push-action/releases">docker/build-push-action's releases</a>.</em></p>
<blockquote>
<h2>v5.0.0</h2>
<ul>
<li>Node 20 as default runtime (requires <a href="https://github.com/actions/runner/releases/tag/v2.308.0">Actions Runner v2.308.0</a> or later) by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/build-push-action/pull/954">docker/build-push-action#954</a></li>
<li>Bump <code>`@​actions/core</code>` from 1.10.0 to 1.10.1 in <a href="https://redirect.github.com/docker/build-push-action/pull/959">docker/build-push-action#959</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/build-push-action/compare/v4.2.1...v5.0.0">https://github.com/docker/build-push-action/compare/v4.2.1...v5.0.0</a></p>
<h2>v4.2.1</h2>
<blockquote>
<p><strong>Note</strong></p>
<p>Buildx v0.10 enables support for a minimal <a href="https://slsa.dev/provenance/">SLSA Provenance</a> attestation, which requires support for <a href="https://github.com/opencontainers/image-spec">OCI-compliant</a> multi-platform images. This may introduce issues with registry and runtime support (e.g. <a href="https://redirect.github.com/docker/buildx/issues/1533">Google Cloud Run and AWS Lambda</a>). You can optionally disable the default provenance attestation functionality using <code>provenance: false</code>.</p>
</blockquote>
<ul>
<li>warn if docker config can't be parsed by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/build-push-action/pull/957">docker/build-push-action#957</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/build-push-action/compare/v4.2.0...v4.2.1">https://github.com/docker/build-push-action/compare/v4.2.0...v4.2.1</a></p>
<h2>v4.2.0</h2>
<blockquote>
<p><strong>Note</strong></p>
<p>Buildx v0.10 enables support for a minimal <a href="https://slsa.dev/provenance/">SLSA Provenance</a> attestation, which requires support for <a href="https://github.com/opencontainers/image-spec">OCI-compliant</a> multi-platform images. This may introduce issues with registry and runtime support (e.g. <a href="https://redirect.github.com/docker/buildx/issues/1533">Google Cloud Run and AWS Lambda</a>). You can optionally disable the default provenance attestation functionality using <code>provenance: false</code>.</p>
</blockquote>
<ul>
<li>display proxy configuration  by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/build-push-action/pull/872">docker/build-push-action#872</a></li>
<li>chore(deps): Bump <code>`@​docker/actions-toolkit</code>` from 0.6.0 to 0.8.0 in <a href="https://redirect.github.com/docker/build-push-action/pull/930">docker/build-push-action#930</a></li>
<li>chore(deps): Bump word-wrap from 1.2.3 to 1.2.5 in <a href="https://redirect.github.com/docker/build-push-action/pull/925">docker/build-push-action#925</a></li>
<li>chore(deps): Bump semver from 6.3.0 to 6.3.1 in <a href="https://redirect.github.com/docker/build-push-action/pull/902">docker/build-push-action#902</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/build-push-action/compare/v4.1.1...v4.2.0">https://github.com/docker/build-push-action/compare/v4.1.1...v4.2.0</a></p>
<h2>v4.1.1</h2>
<blockquote>
<p><strong>Note</strong></p>
<p>Buildx v0.10 enables support for a minimal <a href="https://slsa.dev/provenance/">SLSA Provenance</a> attestation, which requires support for <a href="https://github.com/opencontainers/image-spec">OCI-compliant</a> multi-platform images. This may introduce issues with registry and runtime support (e.g. <a href="https://redirect.github.com/docker/buildx/issues/1533">Google Cloud Run and AWS Lambda</a>). You can optionally disable the default provenance attestation functionality using <code>provenance: false</code>.</p>
</blockquote>
<ul>
<li>Bump <code>`@​docker/actions-toolkit</code>` from 0.3.0 to 0.5.0 by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/build-push-action/pull/880">docker/build-push-action#880</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/build-push-action/compare/v4.1.0...v4.1.1">https://github.com/docker/build-push-action/compare/v4.1.0...v4.1.1</a></p>
<h2>v4.1.0</h2>
<blockquote>
<p><strong>Note</strong></p>
<p>Buildx v0.10 enables support for a minimal <a href="https://slsa.dev/provenance/">SLSA Provenance</a> attestation, which requires support for <a href="https://github.com/opencontainers/image-spec">OCI-compliant</a> multi-platform images. This may introduce issues with registry and runtime support (e.g. <a href="https://redirect.github.com/docker/buildx/issues/1533">Google Cloud Run and AWS Lambda</a>). You can optionally disable the default provenance attestation functionality using <code>provenance: false</code>.</p>
</blockquote>
<ul>
<li>Switch to actions-toolkit implementation by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/build-push-action/pull/811">docker/build-push-action#811</a>  <a href="https://redirect.github.com/docker/build-push-action/pull/838">docker/build-push-action#838</a> <a href="https://redirect.github.com/docker/build-push-action/pull/855">docker/build-push-action#855</a> <a href="https://redirect.github.com/docker/build-push-action/pull/860">docker/build-push-action#860</a> <a href="https://redirect.github.com/docker/build-push-action/pull/875">docker/build-push-action#875</a></li>
<li>e2e: quay.io by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/build-push-action/pull/799">docker/build-push-action#799</a> <a href="https://redirect.github.com/docker/build-push-action/pull/805">docker/build-push-action#805</a></li>
<li>e2e: local harbor and nexus by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/build-push-action/pull/800">docker/build-push-action#800</a></li>
<li>e2e: add artifactory container registry to test against by <a href="https://github.com/jedevc"><code>`@​jedevc</code></a>` in <a href="https://redirect.github.com/docker/build-push-action/pull/804">docker/build-push-action#804</a></li>
<li>e2e: add distribution tests by <a href="https://github.com/jedevc"><code>`@​jedevc</code></a>` in <a href="https://redirect.github.com/docker/build-push-action/pull/814">docker/build-push-action#814</a> <a href="https://redirect.github.com/docker/build-push-action/pull/815">docker/build-push-action#815</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/build-push-action/compare/v4.0.0...v4.1.0">https://github.com/docker/build-push-action/compare/v4.0.0...v4.1.0</a></p>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="0565240e2d"><code>0565240</code></a> Merge pull request <a href="https://redirect.github.com/docker/build-push-action/issues/959">#959</a> from docker/dependabot/npm_and_yarn/actions/core-1.10.1</li>
<li><a href="3ab07f8801"><code>3ab07f8</code></a> chore: update generated content</li>
<li><a href="b9e7e4daec"><code>b9e7e4d</code></a> chore(deps): Bump <code>`@​actions/core</code>` from 1.10.0 to 1.10.1</li>
<li><a href="04d1a3b049"><code>04d1a3b</code></a> Merge pull request <a href="https://redirect.github.com/docker/build-push-action/issues/954">#954</a> from crazy-max/update-node20</li>
<li><a href="1a4d1a13fb"><code>1a4d1a1</code></a> chore: node 20 as default runtime</li>
<li><a href="675965c0e1"><code>675965c</code></a> chore: update generated content</li>
<li><a href="58ee34cb6b"><code>58ee34c</code></a> chore: fix author in package.json</li>
<li><a href="c97c4060bd"><code>c97c406</code></a> fix ProxyConfig type when checking length</li>
<li><a href="47d5369e0b"><code>47d5369</code></a> vendor: bump <code>`@​docker/actions-toolkit</code>` from 0.8.0 to 0.12.0</li>
<li><a href="8895c7468f"><code>8895c74</code></a> chore: update dev dependencies</li>
<li>Additional commits viewable in <a href="https://github.com/docker/build-push-action/compare/v4...v5">compare view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=docker/build-push-action&package-manager=github_actions&previous-version=4&new-version=5)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

You can trigger a rebase of this PR by commenting ``@dependabot` rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- ``@dependabot` rebase` will rebase this PR
- ``@dependabot` recreate` will recreate this PR, overwriting any edits that have been made to it
- ``@dependabot` merge` will merge this PR after your CI passes on it
- ``@dependabot` squash and merge` will squash and merge this PR after your CI passes on it
- ``@dependabot` cancel merge` will cancel a previously requested merge and block automerging
- ``@dependabot` reopen` will reopen this PR if it is closed
- ``@dependabot` close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- ``@dependabot` show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- ``@dependabot` ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)


</details>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-10-03 11:55:31 +00:00
dependabot[bot]
37953afe1a Bump docker/build-push-action from 4 to 5
Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 4 to 5.
- [Release notes](https://github.com/docker/build-push-action/releases)
- [Commits](https://github.com/docker/build-push-action/compare/v4...v5)

---
updated-dependencies:
- dependency-name: docker/build-push-action
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-10-03 11:53:53 +00:00
meili-bors[bot]
de3f992ae4 Merge #4095
4095: Bump docker/setup-qemu-action from 2 to 3 r=curquiza a=dependabot[bot]

Bumps [docker/setup-qemu-action](https://github.com/docker/setup-qemu-action) from 2 to 3.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/docker/setup-qemu-action/releases">docker/setup-qemu-action's releases</a>.</em></p>
<blockquote>
<h2>v3.0.0</h2>
<ul>
<li>Node 20 as default runtime (requires <a href="https://github.com/actions/runner/releases/tag/v2.308.0">Actions Runner v2.308.0</a> or later) by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/setup-qemu-action/pull/102">docker/setup-qemu-action#102</a></li>
<li>Bump <code>`@​actions/core</code>` from 1.10.0 to 1.10.1 in <a href="https://redirect.github.com/docker/setup-qemu-action/pull/103">docker/setup-qemu-action#103</a></li>
<li>Bump semver from 6.3.0 to 6.3.1 in <a href="https://redirect.github.com/docker/setup-qemu-action/pull/89">docker/setup-qemu-action#89</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/setup-qemu-action/compare/v2.2.0...v3.0.0">https://github.com/docker/setup-qemu-action/compare/v2.2.0...v3.0.0</a></p>
<h2>v2.2.0</h2>
<ul>
<li>Trim off spaces in <code>platforms</code> input by <a href="https://github.com/Chocobo1"><code>`@​Chocobo1</code></a>` in <a href="https://redirect.github.com/docker/setup-qemu-action/pull/64">docker/setup-qemu-action#64</a></li>
<li>Switch to actions-toolkit implementation by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` in <a href="https://redirect.github.com/docker/setup-qemu-action/pull/70">docker/setup-qemu-action#70</a> <a href="https://redirect.github.com/docker/setup-qemu-action/pull/80">docker/setup-qemu-action#80</a> <a href="https://redirect.github.com/docker/setup-qemu-action/pull/83">docker/setup-qemu-action#83</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/setup-qemu-action/compare/v2.1.0...v2.2.0">https://github.com/docker/setup-qemu-action/compare/v2.1.0...v2.2.0</a></p>
<h2>v2.1.0</h2>
<ul>
<li>Use context for inputs by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` (<a href="https://redirect.github.com/docker/setup-qemu-action/issues/62">#62</a>)</li>
<li>Use built-in <code>getExecOutput</code> by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` (<a href="https://redirect.github.com/docker/setup-qemu-action/issues/61">#61</a>)</li>
<li>Remove workaround for <code>setOutput</code> by <a href="https://github.com/crazy-max"><code>`@​crazy-max</code></a>` (<a href="https://redirect.github.com/docker/setup-qemu-action/issues/63">#63</a>)</li>
<li>Bump <code>`@​actions/core</code>` from 1.6.0 to 1.10.0 (<a href="https://redirect.github.com/docker/setup-qemu-action/issues/54">#54</a> <a href="https://redirect.github.com/docker/setup-qemu-action/issues/58">#58</a> <a href="https://redirect.github.com/docker/setup-qemu-action/issues/59">#59</a>)</li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/docker/setup-qemu-action/compare/v2.0.0...v2.1.0">https://github.com/docker/setup-qemu-action/compare/v2.0.0...v2.1.0</a></p>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="68827325e0"><code>6882732</code></a> Merge pull request <a href="https://redirect.github.com/docker/setup-qemu-action/issues/103">#103</a> from docker/dependabot/npm_and_yarn/actions/core-1.10.1</li>
<li><a href="183f4af504"><code>183f4af</code></a> chore: update generated content</li>
<li><a href="f17493529e"><code>f174935</code></a> build(deps): bump <code>`@​actions/core</code>` from 1.10.0 to 1.10.1</li>
<li><a href="2e423eb500"><code>2e423eb</code></a> Merge pull request <a href="https://redirect.github.com/docker/setup-qemu-action/issues/89">#89</a> from docker/dependabot/npm_and_yarn/semver-6.3.1</li>
<li><a href="ecc406afa7"><code>ecc406a</code></a> Bump semver from 6.3.0 to 6.3.1</li>
<li><a href="12dec5e201"><code>12dec5e</code></a> Merge pull request <a href="https://redirect.github.com/docker/setup-qemu-action/issues/102">#102</a> from crazy-max/update-node20</li>
<li><a href="c29b312130"><code>c29b312</code></a> chore: node 20 as default runtime</li>
<li><a href="34ae628c8f"><code>34ae628</code></a> chore: update generated content</li>
<li><a href="1f3d2e1ac0"><code>1f3d2e1</code></a> chore: fix author in package.json</li>
<li><a href="277dbe8c9c"><code>277dbe8</code></a> vendor: bump <code>`@​docker/actions-toolkit</code>` from 0.3.0 to 0.12.0</li>
<li>Additional commits viewable in <a href="https://github.com/docker/setup-qemu-action/compare/v2...v3">compare view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=docker/setup-qemu-action&package-manager=github_actions&previous-version=2&new-version=3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

You can trigger a rebase of this PR by commenting ``@dependabot` rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- ``@dependabot` rebase` will rebase this PR
- ``@dependabot` recreate` will recreate this PR, overwriting any edits that have been made to it
- ``@dependabot` merge` will merge this PR after your CI passes on it
- ``@dependabot` squash and merge` will squash and merge this PR after your CI passes on it
- ``@dependabot` cancel merge` will cancel a previously requested merge and block automerging
- ``@dependabot` reopen` will reopen this PR if it is closed
- ``@dependabot` close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- ``@dependabot` show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- ``@dependabot` ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)


</details>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-10-03 08:20:50 +00:00
dependabot[bot]
98f0618065 Bump docker/setup-qemu-action from 2 to 3
Bumps [docker/setup-qemu-action](https://github.com/docker/setup-qemu-action) from 2 to 3.
- [Release notes](https://github.com/docker/setup-qemu-action/releases)
- [Commits](https://github.com/docker/setup-qemu-action/compare/v2...v3)

---
updated-dependencies:
- dependency-name: docker/setup-qemu-action
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-10-01 17:18:20 +00:00
meili-bors[bot]
86b314626d Merge #4080
4080: Bring back changes from v1.4.0 into main r=Kerollmops a=curquiza



Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com>
Co-authored-by: curquiza <curquiza@users.noreply.github.com>
Co-authored-by: Tamo <tamo@meilisearch.com>
Co-authored-by: curquiza <clementine@meilisearch.com>
Co-authored-by: Vivek Kumar <vivek.26@outlook.com>
Co-authored-by: dogukanakkaya <doguakkaya27@hotmail.com>
2023-09-26 08:13:49 +00:00
meili-bors[bot]
bb79bdb3f8 Merge #4074
4074: Enable analytics in debug builds r=Kerollmops a=irevoire

# Pull Request

## Related issue
Fixes https://github.com/meilisearch/meilisearch/issues/4072

## What does this PR do?
- Stop disabling the analytics if meilisearch has been compiled in debug mode

Co-authored-by: Tamo <tamo@meilisearch.com>
2023-09-21 15:54:41 +00:00
Tamo
d429e7da99 make clippy happy 2023-09-21 17:41:12 +02:00
Tamo
584b772248 enable metrics in debug builds 2023-09-21 17:01:05 +02:00
meili-bors[bot]
1806c04a9a Merge #4065
4065: Dependency issue every 6 months r=curquiza a=curquiza

To avoid spending too much time on it (1 every two sprints)

If you disagree `@Kerollmops,` for security or any reason, please close the PR

Co-authored-by: Clémentine U. - curqui <clementine@meilisearch.com>
2023-09-19 15:58:06 +00:00
Clémentine U. - curqui
3485e8f1c4 Update .github/workflows/dependency-issue.yml 2023-09-18 09:59:22 +02:00
Clémentine U. - curqui
fe697a6685 Dependency issue every 6 months 2023-09-18 09:57:58 +02:00
meili-bors[bot]
eb4135f8ae Merge #4044
4044: Add more integrations to SDK CI r=curquiza a=curquiza

For the integration scope management, but also to anticipate bugs and breaking changes for engine team, we need to add more SDKs tests into the CI

Co-authored-by: curquiza <clementine@meilisearch.com>
2023-09-13 14:05:41 +00:00
curquiza
ec4844c3a6 Add dart, swift, dotnet, and java test
Display docker image

Add strapi and firebase

Add rails and symfony tests

Remove strapi and firestore tests

Fix dotnet SDK CI

Use specific dart SDK version

Disable coverage for ruby SDK

Prevent pushing coverage information to codecov

Remove codecoverage token

Trigger Build

Trigger Build

Trigger Build

Trigger Build

Trigger Build
2023-09-12 17:31:17 +02:00
meili-bors[bot]
77c3787b78 Merge #4056
4056: Rewrite segment_analytics module with the destructuring syntax r=Kerollmops a=vivek-26

# Pull Request

## Related issue
Fixes #3928

## What does this PR do?
- This PR uses Rust Destructuring syntax in the `segment_analytics` module, such that adding or deleting fields causes an error at compile time.

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Vivek Kumar <vivek.26@outlook.com>
2023-09-12 08:07:50 +00:00
Vivek Kumar
4f902490b9 struct destructuring for DocumentsFetchAggregator 2023-09-12 10:39:28 +05:30
Vivek Kumar
1faee92748 struct destructuring for HealthAggregator 2023-09-12 10:39:28 +05:30
Vivek Kumar
5831466525 struct destructuring for DocumentsDeletionAggregator and TasksAggregator 2023-09-12 10:39:28 +05:30
Vivek Kumar
3cdb3e4eaf struct destructuring for DocumentsAggregator 2023-09-12 10:39:27 +05:30
Vivek Kumar
26f34ec7a2 struct destructuring for FacetSearchAggregator 2023-09-12 10:39:27 +05:30
Vivek Kumar
07d36180ad struct destructuring for MultiSearchAggregator 2023-09-12 10:39:27 +05:30
Vivek Kumar
4c641b79a2 use rust struct destructuring for SearchAggregator 2023-09-12 10:39:27 +05:30
meili-bors[bot]
ef31ab52a4 Merge #4051
4051: Implement the snapshots on demand r=Kerollmops a=irevoire

# Pull Request
Private link: [PRD available here](https://www.notion.so/meilisearch/On-demand-snapshots-5676e542b905459d96eec228da133b00#847ff0cafeb64fe09e8ee7150852b474)
Specification here: https://github.com/meilisearch/specifications/pull/258

## Prototype
A prototype is available under the name: `prototype-snapshot-on-demand-0`.

## Related issue
Fixes #4052
## What does this PR do?
- Introduce a new route, `POST /snapshots` to create snapshots on demand
- Introduce a new api-key action `snapshot.create`
- Introduce a new analytic `Snapshot Created` sent every time a snapshot is created.

## Notes for the team

I made a prototype so users can test the feature before the v1.5 comes out. But we can merge the PR as-is.

Co-authored-by: Tamo <tamo@meilisearch.com>
2023-09-11 15:16:08 +00:00
Tamo
791c5cd874 makes clippy happy 2023-09-11 17:02:01 +02:00
Tamo
5bea1092fb fix the flaky test 2023-09-11 16:56:26 +02:00
Tamo
056b2c387d refactor the tests suite slightly 2023-09-11 16:56:26 +02:00
meili-bors[bot]
b4c44603db Merge #4009
4009: Bump rustls-webpki from 0.100.1 to 0.100.2 r=Kerollmops a=dependabot[bot]

Bumps [rustls-webpki](https://github.com/rustls/webpki) from 0.100.1 to 0.100.2.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/rustls/webpki/releases">rustls-webpki's releases</a>.</em></p>
<blockquote>
<h2>v/0.100.2</h2>
<h2>Release notes</h2>
<ul>
<li>certificate path building and verification is now capped at 100 signature validation operations to avoid the risk of CPU usage denial-of-service attack when validating crafted certificate chains producing quadratic runtime. This risk affected both clients, as well as servers that verified client certificates.</li>
</ul>
<h2>What's Changed</h2>
<ul>
<li>v0.100.2 prep by <a href="https://github.com/cpu"><code>`@​cpu</code></a>` in <a href="https://redirect.github.com/rustls/webpki/pull/154">rustls/webpki#154</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/rustls/webpki/compare/v/0.100.1...v/0.100.2">https://github.com/rustls/webpki/compare/v/0.100.1...v/0.100.2</a></p>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="c8b821450b"><code>c8b8214</code></a> Bump MSRV to 1.60</li>
<li><a href="855752292e"><code>8557522</code></a> Avoid testing MSRV of dev-dependencies</li>
<li><a href="73a7f0c7d7"><code>73a7f0c</code></a> Cargo: version 0.100.1 -&gt; 0.100.2</li>
<li><a href="4ea052366f"><code>4ea0523</code></a> verify_cert: enforce maximum number of signatures.</li>
<li>See full diff in <a href="https://github.com/rustls/webpki/compare/v/0.100.1...v/0.100.2">compare view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=rustls-webpki&package-manager=cargo&previous-version=0.100.1&new-version=0.100.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting ``@dependabot` rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- ``@dependabot` rebase` will rebase this PR
- ``@dependabot` recreate` will recreate this PR, overwriting any edits that have been made to it
- ``@dependabot` merge` will merge this PR after your CI passes on it
- ``@dependabot` squash and merge` will squash and merge this PR after your CI passes on it
- ``@dependabot` cancel merge` will cancel a previously requested merge and block automerging
- ``@dependabot` reopen` will reopen this PR if it is closed
- ``@dependabot` close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- ``@dependabot` show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- ``@dependabot` ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/meilisearch/meilisearch/network/alerts).

</details>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-09-11 13:11:07 +00:00
Tamo
2c1d60f79b get rid of a warning 2023-09-11 14:40:22 +02:00
Tamo
08af69a33b improve a test to understand what's going on with the ci 2023-09-11 14:23:57 +02:00
Tamo
ddd34a488a update the api-key tests 2023-09-11 13:52:07 +02:00
meili-bors[bot]
526c2b3602 Merge #4050
4050: Bump webpki from 0.22.0 to 0.22.1 r=Kerollmops a=dependabot[bot]

Bumps [webpki](https://github.com/briansmith/webpki) from 0.22.0 to 0.22.1.
<details>
<summary>Commits</summary>
<ul>
<li>See full diff in <a href="https://github.com/briansmith/webpki/commits">compare view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=webpki&package-manager=cargo&previous-version=0.22.0&new-version=0.22.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting ``@dependabot` rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- ``@dependabot` rebase` will rebase this PR
- ``@dependabot` recreate` will recreate this PR, overwriting any edits that have been made to it
- ``@dependabot` merge` will merge this PR after your CI passes on it
- ``@dependabot` squash and merge` will squash and merge this PR after your CI passes on it
- ``@dependabot` cancel merge` will cancel a previously requested merge and block automerging
- ``@dependabot` reopen` will reopen this PR if it is closed
- ``@dependabot` close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- ``@dependabot` show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- ``@dependabot` ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/meilisearch/meilisearch/network/alerts).

</details>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-09-11 11:46:22 +00:00
Tamo
e8c9367686 implement the snapshots on demand 2023-09-11 12:35:57 +02:00
dependabot[bot]
9636c5f558 Bump webpki from 0.22.0 to 0.22.1
Bumps [webpki](https://github.com/briansmith/webpki) from 0.22.0 to 0.22.1.
- [Commits](https://github.com/briansmith/webpki/commits)

---
updated-dependencies:
- dependency-name: webpki
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-09-11 10:32:34 +00:00
Harsh Upadhyay
b310830b5d Improve test-suite.yml for CI failing when disabling tokenization (#4005)
* [Update] test-suite.yml

Added New run command for cargo tree without default features using if-then block

* [Updated] test-disabled-tokenization in test-suite.yml

* [Updated] test-suite.yml

* Update .github/workflows/test-suite.yml

---------

Co-authored-by: Clémentine U. - curqui <clementine@meilisearch.com>
2023-09-11 12:30:53 +02:00
meili-bors[bot]
e9b62aacb3 Merge #4025
4025: Bump Swatinem/rust-cache from 2.5.1 to 2.6.2 r=curquiza a=dependabot[bot]

Bumps [Swatinem/rust-cache](https://github.com/swatinem/rust-cache) from 2.5.1 to 2.6.2.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/swatinem/rust-cache/releases">Swatinem/rust-cache's releases</a>.</em></p>
<blockquote>
<h2>v2.6.2</h2>
<h2>What's Changed</h2>
<ul>
<li>dep: Use <code>smol-toml</code> instead of <code>toml</code> by <a href="https://github.com/NobodyXu"><code>`@​NobodyXu</code></a>` in <a href="https://redirect.github.com/Swatinem/rust-cache/pull/164">Swatinem/rust-cache#164</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/Swatinem/rust-cache/compare/v2...v2.6.2">https://github.com/Swatinem/rust-cache/compare/v2...v2.6.2</a></p>
<h2>v2.6.1</h2>
<ul>
<li>Fix hash contributions of <code>Cargo.lock</code>/<code>Cargo.toml</code> files.</li>
</ul>
<h2>v2.6.0</h2>
<h2>What's Changed</h2>
<ul>
<li>Add &quot;buildjet&quot; as a second <code>cache-provider</code> backend <a href="https://github.com/joroshiba"><code>`@​joroshiba</code></a>` in <a href="https://redirect.github.com/Swatinem/rust-cache/pull/154">Swatinem/rust-cache#154</a></li>
<li>Clean up sparse registry index.</li>
<li>Do not clean up src of <code>-sys</code> crates.</li>
<li>Remove <code>.cargo/credentials.toml</code> before saving.</li>
</ul>
<h2>New Contributors</h2>
<ul>
<li><a href="https://github.com/joroshiba"><code>`@​joroshiba</code></a>` made their first contribution in <a href="https://redirect.github.com/Swatinem/rust-cache/pull/154">Swatinem/rust-cache#154</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/Swatinem/rust-cache/compare/v2.5.1...v2.6.0">https://github.com/Swatinem/rust-cache/compare/v2.5.1...v2.6.0</a></p>
</blockquote>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a href="https://github.com/Swatinem/rust-cache/blob/master/CHANGELOG.md">Swatinem/rust-cache's changelog</a>.</em></p>
<blockquote>
<h2>2.6.2</h2>
<ul>
<li>Fix <code>toml</code> parsing.</li>
</ul>
<h2>2.6.1</h2>
<ul>
<li>Fix hash contributions of <code>Cargo.lock</code>/<code>Cargo.toml</code> files.</li>
</ul>
<h2>2.6.0</h2>
<ul>
<li>Add &quot;buildjet&quot; as a second <code>cache-provider</code> backend.</li>
<li>Clean up sparse registry index.</li>
<li>Do not clean up src of <code>-sys</code> crates.</li>
<li>Remove <code>.cargo/credentials.toml</code> before saving.</li>
</ul>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="e207df5d26"><code>e207df5</code></a> 2.6.2</li>
<li><a href="decb69d790"><code>decb69d</code></a> Update dependencies and add changelog</li>
<li><a href="ab6b2769d1"><code>ab6b276</code></a> dep: Use <code>smol-toml</code> instead of <code>toml</code> (<a href="https://redirect.github.com/swatinem/rust-cache/issues/164">#164</a>)</li>
<li><a href="578b235f6e"><code>578b235</code></a> 2.6.1</li>
<li><a href="5113490c3f"><code>5113490</code></a> prepare 2.6.1</li>
<li><a href="c0e052c18c"><code>c0e052c</code></a> Fix hashing of parsed <code>Cargo.toml</code> (<a href="https://redirect.github.com/swatinem/rust-cache/issues/160">#160</a>)</li>
<li><a href="4e0f4b19dd"><code>4e0f4b1</code></a> Fix typo in hashing parsed <code>Cargo.lock</code> (<a href="https://redirect.github.com/swatinem/rust-cache/issues/159">#159</a>)</li>
<li><a href="b919e1427f"><code>b919e14</code></a> feat: Add logging to <code>Cargo.lock</code>/<code>Cargo.toml</code> hashing (<a href="https://redirect.github.com/swatinem/rust-cache/issues/156">#156</a>)</li>
<li><a href="b8a6852b4f"><code>b8a6852</code></a> 2.6.0</li>
<li><a href="80c47cc945"><code>80c47cc</code></a> Clean up <code>credentials.toml</code></li>
<li>Additional commits viewable in <a href="https://github.com/swatinem/rust-cache/compare/v2.5.1...v2.6.2">compare view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Swatinem/rust-cache&package-manager=github_actions&previous-version=2.5.1&new-version=2.6.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

You can trigger a rebase of this PR by commenting ``@dependabot` rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- ``@dependabot` rebase` will rebase this PR
- ``@dependabot` recreate` will recreate this PR, overwriting any edits that have been made to it
- ``@dependabot` merge` will merge this PR after your CI passes on it
- ``@dependabot` squash and merge` will squash and merge this PR after your CI passes on it
- ``@dependabot` cancel merge` will cancel a previously requested merge and block automerging
- ``@dependabot` reopen` will reopen this PR if it is closed
- ``@dependabot` close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- ``@dependabot` show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- ``@dependabot` ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- ``@dependabot` ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)


</details>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-09-04 12:30:53 +00:00
dependabot[bot]
456960d2c7 Bump Swatinem/rust-cache from 2.5.1 to 2.6.2
Bumps [Swatinem/rust-cache](https://github.com/swatinem/rust-cache) from 2.5.1 to 2.6.2.
- [Release notes](https://github.com/swatinem/rust-cache/releases)
- [Changelog](https://github.com/Swatinem/rust-cache/blob/master/CHANGELOG.md)
- [Commits](https://github.com/swatinem/rust-cache/compare/v2.5.1...v2.6.2)

---
updated-dependencies:
- dependency-name: Swatinem/rust-cache
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-09-01 17:17:39 +00:00
dependabot[bot]
e59d7f238c Bump rustls-webpki from 0.100.1 to 0.100.2
Bumps [rustls-webpki](https://github.com/rustls/webpki) from 0.100.1 to 0.100.2.
- [Release notes](https://github.com/rustls/webpki/releases)
- [Commits](https://github.com/rustls/webpki/compare/v/0.100.1...v/0.100.2)

---
updated-dependencies:
- dependency-name: rustls-webpki
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-08-22 18:10:53 +00:00
81 changed files with 2731 additions and 1143 deletions

View File

@@ -74,4 +74,4 @@ jobs:
echo "${{ steps.file.outputs.basename }}.json has just been pushed."
echo 'How to compare this benchmark with another one?'
echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
echo " - Run the following command: ./benchmaks/scipts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
echo " - Run the following command: ./benchmaks/scripts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"

View File

@@ -2,8 +2,8 @@ name: Create issue to upgrade dependencies
on:
schedule:
# Run the first of the month, every 3 month
- cron: '0 0 1 */3 *'
# Run the first of the month, every 6 month
- cron: '0 0 1 */6 *'
workflow_dispatch:
jobs:

View File

@@ -57,10 +57,10 @@ jobs:
echo "date=$commit_date" >> $GITHUB_OUTPUT
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v2
@@ -70,7 +70,7 @@ jobs:
- name: Docker meta
id: meta
uses: docker/metadata-action@v4
uses: docker/metadata-action@v5
with:
images: getmeili/meilisearch
# Prevent `latest` to be updated for each new tag pushed.
@@ -83,7 +83,7 @@ jobs:
type=raw,value=latest,enable=${{ steps.check-tag-format.outputs.stable == 'true' && steps.check-tag-format.outputs.latest == 'true' }}
- name: Build and push
uses: docker/build-push-action@v4
uses: docker/build-push-action@v5
with:
push: true
platforms: linux/amd64,linux/arm64

View File

@@ -14,6 +14,7 @@ on:
env:
MEILI_MASTER_KEY: 'masterKey'
MEILI_NO_ANALYTICS: 'true'
DISABLE_COVERAGE: 'true'
jobs:
define-docker-image:
@@ -30,6 +31,117 @@ jobs:
if [[ $event == 'workflow_dispatch' ]]; then
echo "docker-image=${{ github.event.inputs.docker_image }}" >> $GITHUB_OUTPUT
fi
- name: Docker image is ${{ steps.define-image.outputs.docker-image }}
run: echo "Docker image is ${{ steps.define-image.outputs.docker-image }}"
##########
## SDKs ##
##########
meilisearch-dotnet-tests:
needs: define-docker-image
name: .NET SDK tests
runs-on: ubuntu-latest
env:
MEILISEARCH_VERSION: ${{ needs.define-docker-image.outputs.docker-image }}
steps:
- uses: actions/checkout@v3
with:
repository: meilisearch/meilisearch-dotnet
- name: Setup .NET Core
uses: actions/setup-dotnet@v3
with:
dotnet-version: "6.0.x"
- name: Install dependencies
run: dotnet restore
- name: Build
run: dotnet build --configuration Release --no-restore
- name: Meilisearch (latest version) setup with Docker
run: docker compose up -d
- name: Run tests
run: dotnet test --no-restore --verbosity normal
meilisearch-dart-tests:
needs: define-docker-image
name: Dart SDK tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
ports:
- '7700:7700'
steps:
- uses: actions/checkout@v3
with:
repository: meilisearch/meilisearch-dart
- uses: dart-lang/setup-dart@v1
with:
sdk: 3.1.1
- name: Install dependencies
run: dart pub get
- name: Run integration tests
run: dart test --concurrency=4
meilisearch-go-tests:
needs: define-docker-image
name: Go SDK tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
ports:
- '7700:7700'
steps:
- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: stable
- uses: actions/checkout@v3
with:
repository: meilisearch/meilisearch-go
- name: Get dependencies
run: |
go get -v -t -d ./...
if [ -f Gopkg.toml ]; then
curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh
dep ensure
fi
- name: Run integration tests
run: go test -v ./...
meilisearch-java-tests:
needs: define-docker-image
name: Java SDK tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
ports:
- '7700:7700'
steps:
- uses: actions/checkout@v3
with:
repository: meilisearch/meilisearch-java
- name: Set up Java
uses: actions/setup-java@v3
with:
java-version: 8
distribution: 'zulu'
cache: gradle
- name: Grant execute permission for gradlew
run: chmod +x gradlew
- name: Build and run unit and integration tests
run: ./gradlew build integrationTest
meilisearch-js-tests:
needs: define-docker-image
@@ -66,33 +178,6 @@ jobs:
- name: Run Browser env
run: yarn test:env:browser
instant-meilisearch-tests:
needs: define-docker-image
name: instant-meilisearch tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
ports:
- '7700:7700'
steps:
- uses: actions/checkout@v3
with:
repository: meilisearch/instant-meilisearch
- name: Setup node
uses: actions/setup-node@v3
with:
cache: yarn
- name: Install dependencies
run: yarn install
- name: Run tests
run: yarn test
- name: Build all the playgrounds and the packages
run: yarn build
meilisearch-php-tests:
needs: define-docker-image
name: PHP SDK tests
@@ -111,8 +196,6 @@ jobs:
repository: meilisearch/meilisearch-php
- name: Install PHP
uses: shivammathur/setup-php@v2
with:
coverage: none
- name: Validate composer.json and composer.lock
run: composer validate
- name: Install dependencies
@@ -149,36 +232,6 @@ jobs:
- name: Test with pytest
run: pipenv run pytest
meilisearch-go-tests:
needs: define-docker-image
name: Go SDK tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
ports:
- '7700:7700'
steps:
- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: stable
- uses: actions/checkout@v3
with:
repository: meilisearch/meilisearch-go
- name: Get dependencies
run: |
go get -v -t -d ./...
if [ -f Gopkg.toml ]; then
curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh
dep ensure
fi
- name: Run integration tests
run: go test -v ./...
meilisearch-ruby-tests:
needs: define-docker-image
name: Ruby SDK tests
@@ -224,3 +277,110 @@ jobs:
run: cargo build --verbose
- name: Run tests
run: cargo test --verbose
meilisearch-swift-tests:
needs: define-docker-image
name: Swift SDK tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
ports:
- '7700:7700'
steps:
- uses: actions/checkout@v3
with:
repository: meilisearch/meilisearch-swift
- name: Run tests
run: swift test
########################
## FRONT-END PLUGINS ##
########################
meilisearch-js-plugins-tests:
needs: define-docker-image
name: meilisearch-js-plugins tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
ports:
- '7700:7700'
steps:
- uses: actions/checkout@v3
with:
repository: meilisearch/meilisearch-js-plugins
- name: Setup node
uses: actions/setup-node@v3
with:
cache: yarn
- name: Install dependencies
run: yarn install
- name: Run tests
run: yarn test
- name: Build all the playgrounds and the packages
run: yarn build
########################
## BACK-END PLUGINS ###
########################
meilisearch-rails-tests:
needs: define-docker-image
name: meilisearch-rails tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
ports:
- '7700:7700'
steps:
- uses: actions/checkout@v3
with:
repository: meilisearch/meilisearch-rails
- name: Set up Ruby 3
uses: ruby/setup-ruby@v1
with:
ruby-version: 3
bundler-cache: true
- name: Run tests
run: bundle exec rspec
meilisearch-symfony-tests:
needs: define-docker-image
name: meilisearch-symfony tests
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
ports:
- '7700:7700'
steps:
- uses: actions/checkout@v3
with:
repository: meilisearch/meilisearch-symfony
- name: Install PHP
uses: shivammathur/setup-php@v2
with:
tools: composer:v2, flex
- name: Validate composer.json and composer.lock
run: composer validate
- name: Install dependencies
run: composer install --prefer-dist --no-progress --quiet
- name: Remove doctrine/annotations
run: composer remove --dev doctrine/annotations
- name: Run test suite
run: composer test:unit

View File

@@ -43,7 +43,7 @@ jobs:
toolchain: nightly
override: true
- name: Cache dependencies
uses: Swatinem/rust-cache@v2.5.1
uses: Swatinem/rust-cache@v2.6.2
- name: Run cargo check without any default features
uses: actions-rs/cargo@v1
with:
@@ -65,7 +65,7 @@ jobs:
steps:
- uses: actions/checkout@v3
- name: Cache dependencies
uses: Swatinem/rust-cache@v2.5.1
uses: Swatinem/rust-cache@v2.6.2
- name: Run cargo check without any default features
uses: actions-rs/cargo@v1
with:
@@ -123,7 +123,10 @@ jobs:
override: true
- name: Run cargo tree without default features and check lindera is not present
run: |
cargo tree -f '{p} {f}' -e normal --no-default-features | grep lindera -vqz
if cargo tree -f '{p} {f}' -e normal --no-default-features | grep -vqz lindera; then
echo "lindera has been found in the sources and it shouldn't"
exit 1
fi
- name: Run cargo tree with default features and check lindera is pressent
run: |
cargo tree -f '{p} {f}' -e normal | grep lindera -qz
@@ -146,7 +149,7 @@ jobs:
toolchain: stable
override: true
- name: Cache dependencies
uses: Swatinem/rust-cache@v2.5.1
uses: Swatinem/rust-cache@v2.6.2
- name: Run tests in debug
uses: actions-rs/cargo@v1
with:
@@ -165,7 +168,7 @@ jobs:
override: true
components: clippy
- name: Cache dependencies
uses: Swatinem/rust-cache@v2.5.1
uses: Swatinem/rust-cache@v2.6.2
- name: Run cargo clippy
uses: actions-rs/cargo@v1
with:
@@ -184,7 +187,7 @@ jobs:
override: true
components: rustfmt
- name: Cache dependencies
uses: Swatinem/rust-cache@v2.5.1
uses: Swatinem/rust-cache@v2.6.2
- name: Run cargo fmt
# Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate

View File

@@ -0,0 +1,84 @@
name: Benchmarks (PR)
on: issue_comment
permissions:
issues: write
env:
GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }}
jobs:
run-benchmarks-on-comment:
name: Run and upload benchmarks
runs-on: benchmarks
timeout-minutes: 4320 # 72h
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- name: Check for Command
id: command
uses: xt0rted/slash-command-action@v2
with:
command: benchmark
reaction-type: "eyes"
repo-token: ${{ env.GH_TOKEN }}
# Set variables
- name: Set current branch name
shell: bash
run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT
id: current_branch
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
shell: bash
run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT
id: normalized_current_branch
- name: Set shorter commit SHA
shell: bash
run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT
id: commit_sha
- name: Set file basename with format "dataset_branch_commitSHA"
shell: bash
run: echo "basename=$(echo ${{ steps.command.outputs.command-arguments }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> $GITHUB_OUTPUT
id: file
# Run benchmarks
- name: Run benchmarks - Dataset ${{ steps.command.outputs.command-arguments }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
run: |
cd benchmarks
cargo bench --bench ${{ steps.command.outputs.command-arguments }} -- --save-baseline ${{ steps.file.outputs.basename }}
# Generate critcmp files
- name: Install critcmp
uses: taiki-e/install-action@v2
with:
tool: critcmp
- name: Export cripcmp file
run: |
critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
# Upload benchmarks
- name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
uses: BetaHuhn/do-spaces-action@v2
with:
access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
source: ${{ steps.file.outputs.basename }}.json
out_dir: critcmp_results
# Compute the diff of the benchmarks and send a message on the GitHub PR
- name: Compute and send a message in the PR
env:
GITHUB_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }}
run: |
export base=$(git log --pretty=%p -n 1)
echo 'Here are your benchmarks diff 👊' >> body.txt
echo '```' >> body.txt
./benchmarks/scripts/compare.sh $base ${{ steps.file.outputs.basename }}.json >> body.txt
echo '```' >> body.txt
gh pr comment ${GITHUB_REF#refs/heads/} --body-file body.txt

13
Cargo.lock generated
View File

@@ -2704,6 +2704,7 @@ dependencies = [
"logging_timer",
"maplit",
"md5",
"meili-snap",
"memmap2",
"mimalloc",
"obkv",
@@ -3538,9 +3539,9 @@ dependencies = [
[[package]]
name = "rustls-webpki"
version = "0.100.1"
version = "0.100.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b"
checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab"
dependencies = [
"ring",
"untrusted",
@@ -4248,7 +4249,7 @@ dependencies = [
"log",
"once_cell",
"rustls 0.21.6",
"rustls-webpki 0.100.1",
"rustls-webpki 0.100.2",
"url",
"webpki-roots 0.23.1",
]
@@ -4443,9 +4444,9 @@ dependencies = [
[[package]]
name = "webpki"
version = "0.22.0"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f095d78192e208183081cc07bc5515ef55216397af48b873e5edcd72637fa1bd"
checksum = "f0e74f82d49d545ad128049b7e88f6576df2da6b02e9ce565c6f533be576957e"
dependencies = [
"ring",
"untrusted",
@@ -4466,7 +4467,7 @@ version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338"
dependencies = [
"rustls-webpki 0.100.1",
"rustls-webpki 0.100.2",
]
[[package]]

View File

@@ -129,6 +129,9 @@ impl HeedAuthStore {
Action::DumpsAll => {
actions.insert(Action::DumpsCreate);
}
Action::SnapshotsAll => {
actions.insert(Action::SnapshotsCreate);
}
Action::TasksAll => {
actions.extend([Action::TasksGet, Action::TasksDelete, Action::TasksCancel]);
}

View File

@@ -257,6 +257,12 @@ pub enum Action {
#[serde(rename = "dumps.create")]
#[deserr(rename = "dumps.create")]
DumpsCreate,
#[serde(rename = "snapshots.*")]
#[deserr(rename = "snapshots.*")]
SnapshotsAll,
#[serde(rename = "snapshots.create")]
#[deserr(rename = "snapshots.create")]
SnapshotsCreate,
#[serde(rename = "version")]
#[deserr(rename = "version")]
Version,
@@ -309,6 +315,7 @@ impl Action {
METRICS_GET => Some(Self::MetricsGet),
DUMPS_ALL => Some(Self::DumpsAll),
DUMPS_CREATE => Some(Self::DumpsCreate),
SNAPSHOTS_CREATE => Some(Self::SnapshotsCreate),
VERSION => Some(Self::Version),
KEYS_CREATE => Some(Self::KeysAdd),
KEYS_GET => Some(Self::KeysGet),
@@ -353,6 +360,7 @@ pub mod actions {
pub const METRICS_GET: u8 = MetricsGet.repr();
pub const DUMPS_ALL: u8 = DumpsAll.repr();
pub const DUMPS_CREATE: u8 = DumpsCreate.repr();
pub const SNAPSHOTS_CREATE: u8 = SnapshotsCreate.repr();
pub const VERSION: u8 = Version.repr();
pub const KEYS_CREATE: u8 = KeysAdd.repr();
pub const KEYS_GET: u8 = KeysGet.repr();

View File

@@ -1,6 +1,5 @@
mod mock_analytics;
// if we are in release mode and the feature analytics was enabled
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
mod segment_analytics;
use std::fs;
@@ -17,26 +16,25 @@ use serde_json::Value;
use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::tasks::TasksFilterQuery;
// if we are in debug mode OR the analytics feature is disabled
// if the analytics feature is disabled
// the `SegmentAnalytics` point to the mock instead of the real analytics
#[cfg(any(debug_assertions, not(feature = "analytics")))]
#[cfg(not(feature = "analytics"))]
pub type SegmentAnalytics = mock_analytics::MockAnalytics;
#[cfg(any(debug_assertions, not(feature = "analytics")))]
#[cfg(not(feature = "analytics"))]
pub type SearchAggregator = mock_analytics::SearchAggregator;
#[cfg(any(debug_assertions, not(feature = "analytics")))]
#[cfg(not(feature = "analytics"))]
pub type MultiSearchAggregator = mock_analytics::MultiSearchAggregator;
#[cfg(any(debug_assertions, not(feature = "analytics")))]
#[cfg(not(feature = "analytics"))]
pub type FacetSearchAggregator = mock_analytics::FacetSearchAggregator;
// if we are in release mode and the feature analytics was enabled
// we use the real analytics
#[cfg(all(not(debug_assertions), feature = "analytics"))]
// if the feature analytics is enabled we use the real analytics
#[cfg(feature = "analytics")]
pub type SegmentAnalytics = segment_analytics::SegmentAnalytics;
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
pub type SearchAggregator = segment_analytics::SearchAggregator;
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
pub type MultiSearchAggregator = segment_analytics::MultiSearchAggregator;
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
pub type FacetSearchAggregator = segment_analytics::FacetSearchAggregator;
/// The Meilisearch config dir:

File diff suppressed because it is too large Load Diff

View File

@@ -28,7 +28,7 @@ const MEILI_DB_PATH: &str = "MEILI_DB_PATH";
const MEILI_HTTP_ADDR: &str = "MEILI_HTTP_ADDR";
const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY";
const MEILI_ENV: &str = "MEILI_ENV";
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS";
const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT";
const MEILI_SSL_CERT_PATH: &str = "MEILI_SSL_CERT_PATH";
@@ -159,7 +159,7 @@ pub struct Opt {
/// Meilisearch automatically collects data from all instances that do not opt out using this flag.
/// All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted
/// at any time.
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
#[serde(default)] // we can't send true
#[clap(long, env = MEILI_NO_ANALYTICS)]
pub no_analytics: bool,
@@ -390,7 +390,7 @@ impl Opt {
ignore_missing_dump: _,
ignore_dump_if_db_exists: _,
config_file_path: _,
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
no_analytics,
experimental_enable_metrics: enable_metrics_route,
experimental_reduce_indexing_memory_usage: reduce_indexing_memory_usage,
@@ -401,7 +401,7 @@ impl Opt {
export_to_env_if_not_present(MEILI_MASTER_KEY, master_key);
}
export_to_env_if_not_present(MEILI_ENV, env);
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
{
export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string());
}

View File

@@ -24,6 +24,7 @@ pub mod features;
pub mod indexes;
mod metrics;
mod multi_search;
mod snapshot;
mod swap_indexes;
pub mod tasks;
@@ -32,6 +33,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
.service(web::resource("/health").route(web::get().to(get_health)))
.service(web::scope("/keys").configure(api_key::configure))
.service(web::scope("/dumps").configure(dump::configure))
.service(web::scope("/snapshots").configure(snapshot::configure))
.service(web::resource("/stats").route(web::get().to(get_stats)))
.service(web::resource("/version").route(web::get().to(get_version)))
.service(web::scope("/indexes").configure(indexes::configure))

View File

@@ -0,0 +1,32 @@
use actix_web::web::Data;
use actix_web::{web, HttpRequest, HttpResponse};
use index_scheduler::IndexScheduler;
use log::debug;
use meilisearch_types::error::ResponseError;
use meilisearch_types::tasks::KindWithContent;
use serde_json::json;
use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::SummarizedTaskView;
pub fn configure(cfg: &mut web::ServiceConfig) {
cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot))));
}
pub async fn create_snapshot(
index_scheduler: GuardedData<ActionPolicy<{ actions::SNAPSHOTS_CREATE }>, Data<IndexScheduler>>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req));
let task = KindWithContent::SnapshotCreation;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
debug!("returns: {:?}", task);
Ok(HttpResponse::Accepted().json(task))
}

View File

@@ -1,8 +1,7 @@
use std::{thread, time};
use serde_json::{json, Value};
use crate::common::Server;
use crate::common::{Server, Value};
use crate::json;
#[actix_rt::test]
async fn add_valid_api_key() {
@@ -162,7 +161,7 @@ async fn add_valid_api_key_null_description() {
server.use_api_key("MASTER_KEY");
let content = json!({
"description": Value::Null,
"description": json!(null),
"indexes": ["products"],
"actions": ["documents.add"],
"expiresAt": "2050-11-13T00:00:00"
@@ -365,7 +364,7 @@ async fn error_add_api_key_invalid_index_uids() {
server.use_api_key("MASTER_KEY");
let content = json!({
"description": Value::Null,
"description": json!(null),
"indexes": ["invalid index # / \\name with spaces"],
"actions": [
"documents.add"
@@ -422,7 +421,7 @@ async fn error_add_api_key_invalid_parameters_actions() {
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]" }), @r###"
{
"message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`",
"message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`",
"code": "invalid_api_key_actions",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_api_key_actions"
@@ -507,7 +506,7 @@ async fn error_add_api_key_invalid_parameters_uid() {
async fn error_add_api_key_parameters_uid_already_exist() {
let mut server = Server::new_auth().await;
server.use_api_key("MASTER_KEY");
let content = json!({
let content: Value = json!({
"uid": "4bc0887a-0e41-4f3b-935d-0c451dcee9c8",
"indexes": ["products"],
"actions": ["search"],
@@ -1146,7 +1145,7 @@ async fn patch_api_key_description() {
meili_snap::snapshot!(code, @"200 OK");
// Remove the description
let content = json!({ "description": serde_json::Value::Null });
let content = json!({ "description": null });
let (response, code) = server.patch_api_key(&uid, content).await;
meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]", ".uid" => "[ignored]", ".key" => "[ignored]" }), @r###"

View File

@@ -3,10 +3,10 @@ use std::collections::{HashMap, HashSet};
use ::time::format_description::well_known::Rfc3339;
use maplit::{hashmap, hashset};
use once_cell::sync::Lazy;
use serde_json::{json, Value};
use time::{Duration, OffsetDateTime};
use crate::common::Server;
use crate::common::{Server, Value};
use crate::json;
pub static AUTHORIZATIONS: Lazy<HashMap<(&'static str, &'static str), HashSet<&'static str>>> =
Lazy::new(|| {
@@ -54,6 +54,7 @@ pub static AUTHORIZATIONS: Lazy<HashMap<(&'static str, &'static str), HashSet<&'
("GET", "/indexes/products/stats") => hashset!{"stats.get", "stats.*", "*"},
("GET", "/stats") => hashset!{"stats.get", "stats.*", "*"},
("POST", "/dumps") => hashset!{"dumps.create", "dumps.*", "*"},
("POST", "/snapshots") => hashset!{"snapshots.create", "snapshots.*", "*"},
("GET", "/version") => hashset!{"version", "*"},
("GET", "/metrics") => hashset!{"metrics.get", "metrics.*", "*"},
("PATCH", "/keys/mykey/") => hashset!{"keys.update", "*"},

View File

@@ -1,8 +1,8 @@
use meili_snap::*;
use serde_json::json;
use uuid::Uuid;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn create_api_key_bad_description() {
@@ -90,7 +90,7 @@ async fn create_api_key_bad_actions() {
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`",
"message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`",
"code": "invalid_api_key_actions",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_api_key_actions"

View File

@@ -7,9 +7,9 @@ mod tenant_token;
mod tenant_token_multi_search;
use actix_web::http::StatusCode;
use serde_json::{json, Value};
use crate::common::Server;
use crate::common::{Server, Value};
use crate::json;
impl Server {
pub fn use_api_key(&mut self, api_key: impl AsRef<str>) {

View File

@@ -3,11 +3,11 @@ use std::collections::HashMap;
use ::time::format_description::well_known::Rfc3339;
use maplit::hashmap;
use once_cell::sync::Lazy;
use serde_json::{json, Value};
use time::{Duration, OffsetDateTime};
use super::authorization::{ALL_ACTIONS, AUTHORIZATIONS};
use crate::common::Server;
use crate::common::{Server, Value};
use crate::json;
fn generate_tenant_token(
parent_uid: impl AsRef<str>,
@@ -233,31 +233,31 @@ async fn search_authorized_simple_token() {
},
hashmap! {
"searchRules" => json!({"*": {}}),
"exp" => Value::Null
"exp" => json!(null)
},
hashmap! {
"searchRules" => json!({"*": Value::Null}),
"exp" => Value::Null
"searchRules" => json!({"*": null}),
"exp" => json!(null)
},
hashmap! {
"searchRules" => json!(["*"]),
"exp" => Value::Null
"exp" => json!(null)
},
hashmap! {
"searchRules" => json!({"sales": {}}),
"exp" => Value::Null
"exp" => json!(null)
},
hashmap! {
"searchRules" => json!({"sales": Value::Null}),
"exp" => Value::Null
"searchRules" => json!({"sales": null}),
"exp" => json!(null)
},
hashmap! {
"searchRules" => json!(["sales"]),
"exp" => Value::Null
"exp" => json!(null)
},
hashmap! {
"searchRules" => json!(["sa*"]),
"exp" => Value::Null
"exp" => json!(null)
},
];
@@ -386,7 +386,7 @@ async fn error_search_token_forbidden_parent_key() {
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
},
hashmap! {
"searchRules" => json!({"*": Value::Null}),
"searchRules" => json!({"*": null}),
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
},
hashmap! {
@@ -398,7 +398,7 @@ async fn error_search_token_forbidden_parent_key() {
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
},
hashmap! {
"searchRules" => json!({"sales": Value::Null}),
"searchRules" => json!({"sales": null}),
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
},
hashmap! {
@@ -428,15 +428,15 @@ async fn error_search_forbidden_token() {
},
hashmap! {
"searchRules" => json!({"products": {}}),
"exp" => Value::Null
"exp" => json!(null)
},
hashmap! {
"searchRules" => json!({"products": Value::Null}),
"exp" => Value::Null
"searchRules" => json!({"products": null}),
"exp" => json!(null)
},
hashmap! {
"searchRules" => json!(["products"]),
"exp" => Value::Null
"exp" => json!(null)
},
// expired token
hashmap! {
@@ -444,7 +444,7 @@ async fn error_search_forbidden_token() {
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
},
hashmap! {
"searchRules" => json!({"*": Value::Null}),
"searchRules" => json!({"*": null}),
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
},
hashmap! {
@@ -456,7 +456,7 @@ async fn error_search_forbidden_token() {
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
},
hashmap! {
"searchRules" => json!({"sales": Value::Null}),
"searchRules" => json!({"sales": null}),
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
},
hashmap! {

View File

@@ -3,11 +3,11 @@ use std::collections::HashMap;
use ::time::format_description::well_known::Rfc3339;
use maplit::hashmap;
use once_cell::sync::Lazy;
use serde_json::{json, Value};
use time::{Duration, OffsetDateTime};
use super::authorization::ALL_ACTIONS;
use crate::common::Server;
use crate::common::{Server, Value};
use crate::json;
fn generate_tenant_token(
parent_uid: impl AsRef<str>,
@@ -512,31 +512,31 @@ async fn single_search_authorized_simple_token() {
},
hashmap! {
"searchRules" => json!({"*": {}}),
"exp" => Value::Null
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!({"*": Value::Null}),
"exp" => Value::Null
"searchRules" => json!({"*": null}),
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!(["*"]),
"exp" => Value::Null
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!({"sales": {}}),
"exp" => Value::Null
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!({"sales": Value::Null}),
"exp" => Value::Null
"searchRules" => json!({"sales": null}),
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!(["sales"]),
"exp" => Value::Null
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!(["sa*"]),
"exp" => Value::Null
"exp" => json!(null),
},
];
@@ -564,31 +564,31 @@ async fn multi_search_authorized_simple_token() {
},
hashmap! {
"searchRules" => json!({"*": {}}),
"exp" => Value::Null
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!({"*": Value::Null}),
"exp" => Value::Null
"searchRules" => json!({"*": null}),
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!(["*"]),
"exp" => Value::Null
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!({"sales": {}, "products": {}}),
"exp" => Value::Null
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!({"sales": Value::Null, "products": Value::Null}),
"exp" => Value::Null
"searchRules" => json!({"sales": null, "products": null}),
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!(["sales", "products"]),
"exp" => Value::Null
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!(["sa*", "pro*"]),
"exp" => Value::Null
"exp" => json!(null),
},
];
@@ -823,7 +823,7 @@ async fn error_single_search_token_forbidden_parent_key() {
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
},
hashmap! {
"searchRules" => json!({"*": Value::Null}),
"searchRules" => json!({"*": null}),
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
},
hashmap! {
@@ -835,7 +835,7 @@ async fn error_single_search_token_forbidden_parent_key() {
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
},
hashmap! {
"searchRules" => json!({"sales": Value::Null}),
"searchRules" => json!({"sales": null}),
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
},
hashmap! {
@@ -864,7 +864,7 @@ async fn error_multi_search_token_forbidden_parent_key() {
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
},
hashmap! {
"searchRules" => json!({"*": Value::Null}),
"searchRules" => json!({"*": null}),
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
},
hashmap! {
@@ -876,7 +876,7 @@ async fn error_multi_search_token_forbidden_parent_key() {
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
},
hashmap! {
"searchRules" => json!({"sales": Value::Null, "products": Value::Null}),
"searchRules" => json!({"sales": null, "products": null}),
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
},
hashmap! {
@@ -919,15 +919,15 @@ async fn error_single_search_forbidden_token() {
},
hashmap! {
"searchRules" => json!({"products": {}}),
"exp" => Value::Null
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!({"products": Value::Null}),
"exp" => Value::Null
"searchRules" => json!({"products": null}),
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!(["products"]),
"exp" => Value::Null
"exp" => json!(null),
},
// expired token
hashmap! {
@@ -935,7 +935,7 @@ async fn error_single_search_forbidden_token() {
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
},
hashmap! {
"searchRules" => json!({"*": Value::Null}),
"searchRules" => json!({"*": null}),
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
},
hashmap! {
@@ -947,7 +947,7 @@ async fn error_single_search_forbidden_token() {
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
},
hashmap! {
"searchRules" => json!({"sales": Value::Null}),
"searchRules" => json!({"sales": null}),
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
},
hashmap! {
@@ -978,15 +978,15 @@ async fn error_multi_search_forbidden_token() {
},
hashmap! {
"searchRules" => json!({"products": {}}),
"exp" => Value::Null
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!({"products": Value::Null}),
"exp" => Value::Null
"searchRules" => json!({"products": null}),
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!(["products"]),
"exp" => Value::Null
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!({"sales": {}}),
@@ -998,15 +998,15 @@ async fn error_multi_search_forbidden_token() {
},
hashmap! {
"searchRules" => json!({"sales": {}}),
"exp" => Value::Null
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!({"sales": Value::Null}),
"exp" => Value::Null
"searchRules" => json!({"sales": null}),
"exp" => json!(null),
},
hashmap! {
"searchRules" => json!(["sales"]),
"exp" => Value::Null
"exp" => json!(null),
},
// expired token
hashmap! {
@@ -1014,7 +1014,7 @@ async fn error_multi_search_forbidden_token() {
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
},
hashmap! {
"searchRules" => json!({"*": Value::Null}),
"searchRules" => json!({"*": null}),
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
},
hashmap! {
@@ -1026,7 +1026,7 @@ async fn error_multi_search_forbidden_token() {
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
},
hashmap! {
"searchRules" => json!({"sales": Value::Null, "products": {}}),
"searchRules" => json!({"sales": null, "products": {}}),
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
},
hashmap! {

View File

@@ -3,12 +3,13 @@ use std::panic::{catch_unwind, resume_unwind, UnwindSafe};
use std::time::Duration;
use actix_web::http::StatusCode;
use serde_json::{json, Value};
use tokio::time::sleep;
use urlencoding::encode as urlencode;
use super::encoder::Encoder;
use super::service::Service;
use super::Value;
use crate::json;
pub struct Index<'a> {
pub uid: String,
@@ -242,7 +243,9 @@ impl Index<'_> {
pub async fn delete_batch(&self, ids: Vec<u64>) -> (Value, StatusCode) {
let url = format!("/indexes/{}/documents/delete-batch", urlencode(self.uid.as_ref()));
self.service.post_encoded(url, serde_json::to_value(&ids).unwrap(), self.encoder).await
self.service
.post_encoded(url, serde_json::to_value(&ids).unwrap().into(), self.encoder)
.await
}
pub async fn delete_batch_raw(&self, body: Value) -> (Value, StatusCode) {

View File

@@ -3,9 +3,83 @@ pub mod index;
pub mod server;
pub mod service;
use std::fmt::{self, Display};
pub use index::{GetAllDocumentsOptions, GetDocumentOptions};
use meili_snap::json_string;
use serde::{Deserialize, Serialize};
pub use server::{default_settings, Server};
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct Value(pub serde_json::Value);
impl Value {
pub fn uid(&self) -> u64 {
if let Some(uid) = self["uid"].as_u64() {
uid
} else if let Some(uid) = self["taskUid"].as_u64() {
uid
} else {
panic!("Didn't find any task id in: {self}");
}
}
}
impl From<serde_json::Value> for Value {
fn from(value: serde_json::Value) -> Self {
Value(value)
}
}
impl std::ops::Deref for Value {
type Target = serde_json::Value;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl PartialEq<serde_json::Value> for Value {
fn eq(&self, other: &serde_json::Value) -> bool {
&self.0 == other
}
}
impl PartialEq<Value> for serde_json::Value {
fn eq(&self, other: &Value) -> bool {
self == &other.0
}
}
impl PartialEq<&str> for Value {
fn eq(&self, other: &&str) -> bool {
self.0.eq(other)
}
}
impl Display for Value {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"{}",
json_string!(self, { ".enqueuedAt" => "[date]", ".processedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" })
)
}
}
impl From<Vec<Value>> for Value {
fn from(value: Vec<Value>) -> Self {
Self(value.into_iter().map(|value| value.0).collect::<serde_json::Value>())
}
}
#[macro_export]
macro_rules! json {
($($json:tt)+) => {
$crate::common::Value(serde_json::json!($($json)+))
};
}
/// Performs a search test on both post and get routes
#[macro_export]
macro_rules! test_post_get_search {

View File

@@ -11,13 +11,14 @@ use clap::Parser;
use meilisearch::option::{IndexerOpts, MaxMemory, Opt};
use meilisearch::{analytics, create_app, setup_meilisearch};
use once_cell::sync::Lazy;
use serde_json::{json, Value};
use tempfile::TempDir;
use tokio::time::sleep;
use super::index::Index;
use super::service::Service;
use crate::common::encoder::Encoder;
use crate::common::Value;
use crate::json;
pub struct Server {
pub service: Service,
@@ -156,6 +157,10 @@ impl Server {
self.service.post("/dumps", json!(null)).await
}
pub async fn create_snapshot(&self) -> (Value, StatusCode) {
self.service.post("/snapshots", json!(null)).await
}
pub async fn index_swap(&self, value: Value) -> (Value, StatusCode) {
self.service.post("/swap-indexes", value).await
}
@@ -204,7 +209,7 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
db_path: dir.as_ref().join("db"),
dump_dir: dir.as_ref().join("dumps"),
env: "development".to_owned(),
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
no_analytics: true,
max_index_size: Byte::from_unit(100.0, ByteUnit::MiB).unwrap(),
max_task_db_size: Byte::from_unit(1.0, ByteUnit::GiB).unwrap(),

View File

@@ -7,9 +7,9 @@ use actix_web::test::TestRequest;
use index_scheduler::IndexScheduler;
use meilisearch::{analytics, create_app, Opt};
use meilisearch_auth::AuthController;
use serde_json::Value;
use crate::common::encoder::Encoder;
use crate::common::Value;
pub struct Service {
pub index_scheduler: Arc<IndexScheduler>,

View File

@@ -3,9 +3,8 @@
mod common;
use actix_web::test;
use serde_json::{json, Value};
use crate::common::Server;
use crate::common::{Server, Value};
enum HttpVerb {
Put,

View File

@@ -1,11 +1,11 @@
use actix_web::test;
use meili_snap::{json_string, snapshot};
use serde_json::{json, Value};
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
use crate::common::encoder::Encoder;
use crate::common::{GetAllDocumentsOptions, Server};
use crate::common::{GetAllDocumentsOptions, Server, Value};
use crate::json;
/// This is the basic usage of our API and every other tests uses the content-type application/json
#[actix_rt::test]

View File

@@ -1,7 +1,7 @@
use meili_snap::{json_string, snapshot};
use serde_json::json;
use crate::common::{GetAllDocumentsOptions, Server};
use crate::json;
#[actix_rt::test]
async fn delete_one_document_unexisting_index() {

View File

@@ -1,8 +1,8 @@
use meili_snap::*;
use serde_json::json;
use urlencoding::encode;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn get_all_documents_bad_offset() {

View File

@@ -1,11 +1,11 @@
use actix_web::test;
use http::header::ACCEPT_ENCODING;
use meili_snap::*;
use serde_json::{json, Value};
use urlencoding::encode as urlencode;
use crate::common::encoder::Encoder;
use crate::common::{GetAllDocumentsOptions, GetDocumentOptions, Server};
use crate::common::{GetAllDocumentsOptions, GetDocumentOptions, Server, Value};
use crate::json;
// TODO: partial test since we are testing error, amd error is not yet fully implemented in
// transplant
@@ -40,7 +40,7 @@ async fn get_document() {
let server = Server::new().await;
let index = server.index("test");
index.create(None).await;
let documents = serde_json::json!([
let documents = json!([
{
"id": 0,
"nested": { "content": "foobar" },
@@ -53,7 +53,7 @@ async fn get_document() {
assert_eq!(code, 200);
assert_eq!(
response,
serde_json::json!({
json!({
"id": 0,
"nested": { "content": "foobar" },
})
@@ -64,7 +64,7 @@ async fn get_document() {
assert_eq!(code, 200);
assert_eq!(
response,
serde_json::json!({
json!({
"id": 0,
})
);
@@ -75,7 +75,7 @@ async fn get_document() {
assert_eq!(code, 200);
assert_eq!(
response,
serde_json::json!({
json!({
"nested": { "content": "foobar" },
})
);
@@ -122,7 +122,7 @@ async fn get_all_documents_no_options() {
assert_eq!(code, 200);
let arr = response["results"].as_array().unwrap();
assert_eq!(arr.len(), 20);
let first = serde_json::json!({
let first = json!({
"id":0,
"isActive":false,
"balance":"$2,668.55",

View File

@@ -1,7 +1,8 @@
use serde_json::json;
use meili_snap::snapshot;
use crate::common::encoder::Encoder;
use crate::common::{GetAllDocumentsOptions, Server};
use crate::json;
#[actix_rt::test]
async fn error_document_update_create_index_bad_uid() {
@@ -84,7 +85,13 @@ async fn update_document() {
let (response, code) = index.get_document(1, None).await;
assert_eq!(code, 200);
assert_eq!(response.to_string(), r##"{"doc_id":1,"content":"foo","other":"bar"}"##);
snapshot!(response, @r###"
{
"doc_id": 1,
"content": "foo",
"other": "bar"
}
"###);
}
#[actix_rt::test]
@@ -122,7 +129,13 @@ async fn update_document_gzip_encoded() {
let (response, code) = index.get_document(1, None).await;
assert_eq!(code, 200);
assert_eq!(response.to_string(), r##"{"doc_id":1,"content":"foo","other":"bar"}"##);
snapshot!(response, @r###"
{
"doc_id": 1,
"content": "foo",
"other": "bar"
}
"###);
}
#[actix_rt::test]

View File

@@ -2,10 +2,10 @@ mod data;
use meili_snap::{json_string, snapshot};
use meilisearch::Opt;
use serde_json::json;
use self::data::GetDump;
use crate::common::{default_settings, GetAllDocumentsOptions, Server};
use crate::json;
// all the following test are ignored on windows. See #2364
#[actix_rt::test]

View File

@@ -1,6 +1,5 @@
use serde_json::json;
use crate::common::Server;
use crate::json;
/// Feature name to test against.
/// This will have to be changed by a different one when that feature is stabilized.

View File

@@ -2,10 +2,10 @@ use actix_web::http::header::ContentType;
use actix_web::test;
use http::header::ACCEPT_ENCODING;
use meili_snap::{json_string, snapshot};
use serde_json::{json, Value};
use crate::common::encoder::Encoder;
use crate::common::Server;
use crate::common::{Server, Value};
use crate::json;
#[actix_rt::test]
async fn create_index_no_primary_key() {
@@ -21,7 +21,7 @@ async fn create_index_no_primary_key() {
assert_eq!(response["status"], "succeeded");
assert_eq!(response["type"], "indexCreation");
assert_eq!(response["details"]["primaryKey"], Value::Null);
assert_eq!(response["details"]["primaryKey"], json!(null));
}
#[actix_rt::test]
@@ -38,7 +38,7 @@ async fn create_index_with_gzip_encoded_request() {
assert_eq!(response["status"], "succeeded");
assert_eq!(response["type"], "indexCreation");
assert_eq!(response["details"]["primaryKey"], Value::Null);
assert_eq!(response["details"]["primaryKey"], json!(null));
}
#[actix_rt::test]
@@ -86,7 +86,7 @@ async fn create_index_with_zlib_encoded_request() {
assert_eq!(response["status"], "succeeded");
assert_eq!(response["type"], "indexCreation");
assert_eq!(response["details"]["primaryKey"], Value::Null);
assert_eq!(response["details"]["primaryKey"], json!(null));
}
#[actix_rt::test]
@@ -103,7 +103,7 @@ async fn create_index_with_brotli_encoded_request() {
assert_eq!(response["status"], "succeeded");
assert_eq!(response["type"], "indexCreation");
assert_eq!(response["details"]["primaryKey"], Value::Null);
assert_eq!(response["details"]["primaryKey"], json!(null));
}
#[actix_rt::test]
@@ -136,7 +136,7 @@ async fn create_index_with_invalid_primary_key() {
let (response, code) = index.get().await;
assert_eq!(code, 200);
assert_eq!(response["primaryKey"], Value::Null);
assert_eq!(response["primaryKey"], json!(null));
}
#[actix_rt::test]

View File

@@ -1,6 +1,5 @@
use serde_json::json;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn create_and_delete_index() {

View File

@@ -1,7 +1,7 @@
use meili_snap::*;
use serde_json::json;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn get_indexes_bad_offset() {

View File

@@ -1,6 +1,5 @@
use serde_json::json;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn stats() {

View File

@@ -1,9 +1,9 @@
use serde_json::json;
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
use crate::common::encoder::Encoder;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn update_primary_key() {

View File

@@ -1,8 +1,8 @@
use meili_snap::*;
use serde_json::json;
use super::DOCUMENTS;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn search_unexisting_index() {

View File

@@ -1,8 +1,8 @@
use meili_snap::snapshot;
use once_cell::sync::Lazy;
use serde_json::{json, Value};
use crate::common::Server;
use crate::common::{Server, Value};
use crate::json;
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([

View File

@@ -1,8 +1,8 @@
use insta::{allow_duplicates, assert_json_snapshot};
use serde_json::json;
use super::*;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn formatted_contain_wildcard() {

View File

@@ -1,8 +1,8 @@
use meili_snap::{json_string, snapshot};
use once_cell::sync::Lazy;
use serde_json::{json, Value};
use crate::common::Server;
use crate::common::{Server, Value};
use crate::json;
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([

View File

@@ -10,9 +10,9 @@ mod pagination;
mod restrict_searchable;
use once_cell::sync::Lazy;
use serde_json::{json, Value};
use crate::common::Server;
use crate::common::{Server, Value};
use crate::json;
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([

View File

@@ -1,8 +1,8 @@
use meili_snap::{json_string, snapshot};
use serde_json::json;
use super::{DOCUMENTS, NESTED_DOCUMENTS};
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn search_empty_list() {

View File

@@ -1,6 +1,5 @@
use serde_json::json;
use crate::common::Server;
use crate::json;
use crate::search::DOCUMENTS;
#[actix_rt::test]

View File

@@ -1,9 +1,9 @@
use meili_snap::{json_string, snapshot};
use once_cell::sync::Lazy;
use serde_json::{json, Value};
use crate::common::index::Index;
use crate::common::Server;
use crate::common::{Server, Value};
use crate::json;
async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> {
let index = server.index("test");

View File

@@ -1,6 +1,5 @@
use serde_json::json;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn set_and_reset_distinct_attribute() {

View File

@@ -1,7 +1,7 @@
use meili_snap::*;
use serde_json::json;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn settings_bad_displayed_attributes() {

View File

@@ -1,16 +1,16 @@
use std::collections::HashMap;
use once_cell::sync::Lazy;
use serde_json::{json, Value};
use crate::common::Server;
use crate::common::{Server, Value};
use crate::json;
static DEFAULT_SETTINGS_VALUES: Lazy<HashMap<&'static str, Value>> = Lazy::new(|| {
let mut map = HashMap::new();
map.insert("displayed_attributes", json!(["*"]));
map.insert("searchable_attributes", json!(["*"]));
map.insert("filterable_attributes", json!([]));
map.insert("distinct_attribute", json!(Value::Null));
map.insert("distinct_attribute", json!(null));
map.insert(
"ranking_rules",
json!(["words", "typo", "proximity", "attribute", "sort", "exactness"]),
@@ -229,7 +229,7 @@ macro_rules! test_setting_routes {
.chars()
.map(|c| if c == '_' { '-' } else { c })
.collect::<String>());
let (response, code) = server.service.$write_method(url, serde_json::Value::Null).await;
let (response, code) = server.service.$write_method(url, serde_json::Value::Null.into()).await;
assert_eq!(code, 202, "{}", response);
server.index("").wait_task(0).await;
let (response, code) = server.index("test").get().await;

View File

@@ -1,7 +1,7 @@
use meili_snap::{json_string, snapshot};
use serde_json::json;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn set_and_reset() {

View File

@@ -1,11 +1,13 @@
use std::time::Duration;
use actix_rt::time::sleep;
use meili_snap::{json_string, snapshot};
use meilisearch::option::ScheduleSnapshot;
use meilisearch::Opt;
use crate::common::server::default_settings;
use crate::common::{GetAllDocumentsOptions, Server};
use crate::json;
macro_rules! verify_snapshot {
(
@@ -44,7 +46,7 @@ async fn perform_snapshot() {
let index = server.index("test");
index
.update_settings(serde_json::json! ({
.update_settings(json! ({
"searchableAttributes": [],
}))
.await;
@@ -90,3 +92,95 @@ async fn perform_snapshot() {
server.index("test1").settings(),
);
}
#[actix_rt::test]
async fn perform_on_demand_snapshot() {
let temp = tempfile::tempdir().unwrap();
let snapshot_dir = tempfile::tempdir().unwrap();
let options =
Opt { snapshot_dir: snapshot_dir.path().to_owned(), ..default_settings(temp.path()) };
let server = Server::new_with_options(options).await.unwrap();
let index = server.index("catto");
index
.update_settings(json! ({
"searchableAttributes": [],
}))
.await;
index.load_test_set().await;
server.index("doggo").create(Some("bone")).await;
index.wait_task(2).await;
server.index("doggo").create(Some("bone")).await;
index.wait_task(2).await;
let (task, code) = server.create_snapshot().await;
snapshot!(code, @"202 Accepted");
snapshot!(json_string!(task, { ".enqueuedAt" => "[date]" }), @r###"
{
"taskUid": 4,
"indexUid": null,
"status": "enqueued",
"type": "snapshotCreation",
"enqueuedAt": "[date]"
}
"###);
let task = index.wait_task(task.uid()).await;
snapshot!(json_string!(task, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }), @r###"
{
"uid": 4,
"indexUid": null,
"status": "succeeded",
"type": "snapshotCreation",
"canceledBy": null,
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let temp = tempfile::tempdir().unwrap();
let snapshots: Vec<String> = std::fs::read_dir(&snapshot_dir)
.unwrap()
.map(|entry| entry.unwrap().path().file_name().unwrap().to_str().unwrap().to_string())
.collect();
meili_snap::snapshot!(format!("{snapshots:?}"), @r###"["db.snapshot"]"###);
let snapshot_path = snapshot_dir.path().to_owned().join("db.snapshot");
#[cfg_attr(windows, allow(unused))]
let snapshot_meta = std::fs::metadata(&snapshot_path).unwrap();
#[cfg(unix)]
{
use std::os::unix::fs::PermissionsExt;
let mode = snapshot_meta.permissions().mode();
// rwxrwxrwx
meili_snap::snapshot!(format!("{:b}", mode), @"1000000100100100");
}
let options = Opt { import_snapshot: Some(snapshot_path), ..default_settings(temp.path()) };
let snapshot_server = Server::new_with_options(options).await.unwrap();
verify_snapshot!(server, snapshot_server, |server| =>
server.list_indexes(None, None),
// for some reason the db sizes differ. this may be due to the compaction options we have
// set when performing the snapshot
//server.stats(),
// The original instance contains the snapshotCreation task, while the snapshotted-instance does not. For this reason we need to compare the task queue **after** the task 4
server.tasks_filter("?from=2"),
server.index("catto").get_all_documents(GetAllDocumentsOptions::default()),
server.index("catto").settings(),
server.index("doggo").get_all_documents(GetAllDocumentsOptions::default()),
server.index("doggo").settings(),
);
}

View File

@@ -1,8 +1,8 @@
use serde_json::json;
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn get_settings_unexisting_index() {

View File

@@ -1,7 +1,7 @@
use meili_snap::*;
use serde_json::json;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn swap_indexes_bad_format() {

View File

@@ -1,9 +1,9 @@
mod errors;
use meili_snap::{json_string, snapshot};
use serde_json::json;
use crate::common::{GetAllDocumentsOptions, Server};
use crate::json;
#[actix_rt::test]
async fn swap_indexes() {

View File

@@ -1,11 +1,11 @@
mod errors;
use meili_snap::insta::assert_json_snapshot;
use serde_json::json;
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
use crate::common::Server;
use crate::json;
#[actix_rt::test]
async fn error_get_unexisting_task_status() {
@@ -33,7 +33,7 @@ async fn get_task_status() {
index.create(None).await;
index
.add_documents(
serde_json::json!([{
json!([{
"id": 1,
"content": "foobar",
}]),

View File

@@ -79,6 +79,7 @@ big_s = "1.0.2"
insta = "1.29.0"
maplit = "1.0.2"
md5 = "0.7.0"
meili-snap = { path = "../meili-snap" }
rand = { version = "0.8.5", features = ["small_rng"] }
[features]

View File

@@ -60,12 +60,16 @@ impl CboRoaringBitmapCodec {
/// if the merged values length is under the threshold, values are directly
/// serialized in the buffer else a RoaringBitmap is created from the
/// values and is serialized in the buffer.
pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
where
I: IntoIterator<Item = A>,
A: AsRef<[u8]>,
{
let mut roaring = RoaringBitmap::new();
let mut vec = Vec::new();
for bytes in slices {
if bytes.len() <= THRESHOLD * size_of::<u32>() {
if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() {
let mut reader = bytes.as_ref();
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
vec.push(integer);
@@ -85,7 +89,7 @@ impl CboRoaringBitmapCodec {
}
} else {
// We can unwrap safely because the vector is sorted upper.
let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap();
let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
roaring.serialize_into(buffer)?;
}
} else {

View File

@@ -119,16 +119,16 @@ pub struct Index {
pub(crate) main: PolyDatabase,
/// A word and all the documents ids containing the word.
pub word_docids: Database<Str, RoaringBitmapCodec>,
pub word_docids: Database<Str, CboRoaringBitmapCodec>,
/// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
pub exact_word_docids: Database<Str, RoaringBitmapCodec>,
pub exact_word_docids: Database<Str, CboRoaringBitmapCodec>,
/// A prefix of word and all the documents ids containing this prefix.
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
pub word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
pub exact_word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
/// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,

View File

@@ -11,9 +11,7 @@ use super::interner::Interned;
use super::Word;
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
use crate::{
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
};
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext};
/// A cache storing pointers to values in the LMDB databases.
///
@@ -168,7 +166,7 @@ impl<'ctx> SearchContext<'ctx> {
merge_cbo_roaring_bitmaps,
)
}
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
self.word_interner.get(word).as_str(),
@@ -182,7 +180,7 @@ impl<'ctx> SearchContext<'ctx> {
&mut self,
word: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
self.word_interner.get(word).as_str(),
@@ -230,7 +228,7 @@ impl<'ctx> SearchContext<'ctx> {
merge_cbo_roaring_bitmaps,
)
}
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),
@@ -244,7 +242,7 @@ impl<'ctx> SearchContext<'ctx> {
&mut self,
prefix: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),

View File

@@ -13,6 +13,7 @@ This module tests the `sort` ranking rule:
use big_s::S;
use maplit::hashset;
use meili_snap::insta;
use crate::index::tests::TempIndex;
use crate::search::new::tests::collect_field_values;

104
milli/src/update/del_add.rs Normal file
View File

@@ -0,0 +1,104 @@
use obkv::Key;
pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>;
/// DelAdd defines the new value to add in the database and old value to delete from the database.
///
/// Its used in an OBKV to be serialized in grenad files.
#[repr(u8)]
#[derive(Clone, Copy, PartialOrd, PartialEq, Debug)]
pub enum DelAdd {
Deletion = 0,
Addition = 1,
}
impl Key for DelAdd {
const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>();
type BYTES = [u8; Self::BYTES_SIZE];
fn to_be_bytes(&self) -> Self::BYTES {
u8::to_be_bytes(*self as u8)
}
fn from_be_bytes(array: Self::BYTES) -> Self {
match u8::from_be_bytes(array) {
0 => Self::Deletion,
1 => Self::Addition,
otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise),
}
}
}
/// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value>
///
/// if deletion is `true`, the value will be inserted behind a DelAdd::Deletion key.
/// if addition is `true`, the value will be inserted behind a DelAdd::Addition key.
/// if both deletion and addition are `true, the value will be inserted in both keys.
pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
reader: obkv::KvReader<K>,
deletion: bool,
addition: bool,
buffer: &mut Vec<u8>,
) -> Result<(), std::io::Error> {
let mut writer = obkv::KvWriter::new(buffer);
let mut value_buffer = Vec::new();
for (key, value) in reader.iter() {
value_buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
if deletion {
value_writer.insert(DelAdd::Deletion, value)?;
}
if addition {
value_writer.insert(DelAdd::Addition, value)?;
}
value_writer.finish()?;
writer.insert(key, &value_buffer)?;
}
writer.finish()
}
/// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value>
///
/// putting each deletion obkv's keys under an DelAdd::Deletion
/// and putting each addition obkv's keys under an DelAdd::Addition
pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
deletion: obkv::KvReader<K>,
addition: obkv::KvReader<K>,
buffer: &mut Vec<u8>,
) -> Result<(), std::io::Error> {
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
let mut writer = obkv::KvWriter::new(buffer);
let mut value_buffer = Vec::new();
for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) {
value_buffer.clear();
match eob {
Left((k, v)) => {
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
value_writer.insert(DelAdd::Deletion, v).unwrap();
writer.insert(k, value_writer.into_inner()?).unwrap();
}
Right((k, v)) => {
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
value_writer.insert(DelAdd::Addition, v).unwrap();
writer.insert(k, value_writer.into_inner()?).unwrap();
}
Both((k, deletion), (_, addition)) => {
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
value_writer.insert(DelAdd::Addition, addition).unwrap();
writer.insert(k, value_writer.into_inner()?).unwrap();
}
}
}
writer.finish()
}
pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool {
del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
}

View File

@@ -16,9 +16,7 @@ use crate::facet::FacetType;
use crate::heed_codec::facet::FieldDocIdFacetCodec;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::index::Hnsw;
use crate::{
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32,
};
use crate::{ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, BEU32};
pub struct DeleteDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -495,7 +493,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
fn remove_from_word_prefix_docids(
txn: &mut heed::RwTxn,
db: &Database<Str, RoaringBitmapCodec>,
db: &Database<Str, CboRoaringBitmapCodec>,
to_remove: &RoaringBitmap,
) -> Result<fst::Set<Vec<u8>>> {
let mut prefixes_to_delete = fst::SetBuilder::memory();
@@ -523,7 +521,7 @@ fn remove_from_word_prefix_docids(
fn remove_from_word_docids(
txn: &mut heed::RwTxn,
db: &heed::Database<Str, RoaringBitmapCodec>,
db: &heed::Database<Str, CboRoaringBitmapCodec>,
to_remove: &RoaringBitmap,
words_to_keep: &mut BTreeSet<String>,
words_to_remove: &mut BTreeSet<String>,

View File

@@ -132,6 +132,8 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
self.db.delete_range(wtxn, &range).map(drop)?;
Ok(())
}
// TODO the new_data is an Reader<Obkv<Key, Obkv<DelAdd, RoaringBitmap>>>
fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
let new_data = match self.new_data.take() {
Some(x) => x,

View File

@@ -114,6 +114,7 @@ pub struct FacetsUpdate<'i> {
min_level_size: u8,
}
impl<'i> FacetsUpdate<'i> {
// TODO grenad::Reader<Key, Obkv<DelAdd, RoaringBitmap>>
pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
let database = match facet_type {
FacetType::String => index

View File

@@ -4,18 +4,16 @@ use std::fs::File;
use std::{io, mem, str};
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
use obkv::KvReader;
use obkv::{KvReader, KvWriterU16};
use roaring::RoaringBitmap;
use serde_json::Value;
use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
use crate::error::{InternalError, SerializationError};
use crate::update::index_documents::MergeFn;
use crate::{
absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
};
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
/// Extracts the word and positions where this word appear and
/// prefixes it by the document id.
@@ -38,18 +36,153 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
let max_memory = indexer.max_memory_by_thread();
// initialize destination values.
let mut documents_ids = RoaringBitmap::new();
let mut script_language_docids = HashMap::new();
let mut docid_word_positions_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
concat_u32s_array,
keep_latest_obkv,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);
let mut buffers = Buffers::default();
// initialize buffers.
let mut del_buffers = Buffers::default();
let mut add_buffers = Buffers::default();
let mut key_buffer = Vec::new();
let mut value_buffer = Vec::new();
// initialize tokenizer.
let mut builder = tokenizer_builder(stop_words, dictionary, allowed_separators, None);
let tokenizer = builder.build();
// iterate over documents.
let mut cursor = obkv_documents.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let document_id = key
.try_into()
.map(u32::from_be_bytes)
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
let obkv = KvReader::<FieldId>::new(value);
// if the searchable fields didn't change, skip the searchable indexing for this document.
if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) {
continue;
}
documents_ids.push(document_id);
// Update key buffer prefix.
key_buffer.clear();
key_buffer.extend_from_slice(&document_id.to_be_bytes());
// Tokenize deletions and additions in 2 diffferent threads.
let (del, add): (Result<_>, Result<_>) = rayon::join(
|| {
// deletions
lang_safe_tokens_from_document(
&obkv,
searchable_fields,
&tokenizer,
stop_words,
allowed_separators,
dictionary,
max_positions_per_attributes,
DelAdd::Deletion,
&mut del_buffers,
)
},
|| {
// additions
lang_safe_tokens_from_document(
&obkv,
searchable_fields,
&tokenizer,
stop_words,
allowed_separators,
dictionary,
max_positions_per_attributes,
DelAdd::Addition,
&mut add_buffers,
)
},
);
let (del_obkv, del_script_language_word_count) = del?;
let (add_obkv, add_script_language_word_count) = add?;
// merge deletions and additions.
value_buffer.clear();
del_add_from_two_obkvs(
KvReader::<FieldId>::new(del_obkv),
KvReader::<FieldId>::new(add_obkv),
&mut value_buffer,
)?;
// write them into the sorter.
let obkv = KvReader::<FieldId>::new(value);
for (field_id, value) in obkv.iter() {
key_buffer.truncate(mem::size_of::<u32>());
key_buffer.extend_from_slice(&field_id.to_be_bytes());
docid_word_positions_sorter.insert(&key_buffer, value)?;
}
// update script_language_docids deletions.
for (script, languages_frequency) in del_script_language_word_count {
for (language, _) in languages_frequency {
let entry = script_language_docids
.entry((script, language))
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
entry.0.push(document_id);
}
}
// update script_language_docids additions.
for (script, languages_frequency) in add_script_language_word_count {
for (language, _) in languages_frequency {
let entry = script_language_docids
.entry((script, language))
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
entry.1.push(document_id);
}
}
}
sorter_into_reader(docid_word_positions_sorter, indexer)
.map(|reader| (documents_ids, reader, script_language_docids))
}
/// Check if any searchable fields of a document changed.
fn searchable_fields_changed(
obkv: &KvReader<FieldId>,
searchable_fields: &Option<HashSet<FieldId>>,
) -> bool {
for (field_id, field_bytes) in obkv.iter() {
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
let del_add = KvReaderDelAdd::new(field_bytes);
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
// if both fields are None, check the next field.
(None, None) => (),
// if both contains a value and values are the same, check the next field.
(Some(del), Some(add)) if del == add => (),
// otherwise the fields are different, return true.
_otherwise => return true,
}
}
}
false
}
/// Factorize tokenizer building.
fn tokenizer_builder<'a>(
stop_words: Option<&'a fst::Set<&[u8]>>,
allowed_separators: Option<&'a [&str]>,
dictionary: Option<&'a [&str]>,
script_language: Option<&'a HashMap<Script, Vec<Language>>>,
) -> TokenizerBuilder<'a, &'a [u8]> {
let mut tokenizer_builder = TokenizerBuilder::new();
if let Some(stop_words) = stop_words {
tokenizer_builder.stop_words(stop_words);
@@ -60,130 +193,144 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
if let Some(separators) = allowed_separators {
tokenizer_builder.separators(separators);
}
let tokenizer = tokenizer_builder.build();
let mut cursor = obkv_documents.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let document_id = key
.try_into()
.map(u32::from_be_bytes)
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
let obkv = KvReader::<FieldId>::new(value);
if let Some(script_language) = script_language {
tokenizer_builder.allow_list(&script_language);
}
documents_ids.push(document_id);
buffers.key_buffer.clear();
buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes());
tokenizer_builder
}
let mut script_language_word_count = HashMap::new();
/// Extract words maped with their positions of a document,
/// ensuring no Language detection mistakes was made.
fn lang_safe_tokens_from_document<'a>(
obkv: &KvReader<FieldId>,
searchable_fields: &Option<HashSet<FieldId>>,
tokenizer: &Tokenizer,
stop_words: Option<&fst::Set<&[u8]>>,
allowed_separators: Option<&[&str]>,
dictionary: Option<&[&str]>,
max_positions_per_attributes: u32,
del_add: DelAdd,
buffers: &'a mut Buffers,
) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> {
let mut script_language_word_count = HashMap::new();
extract_tokens_from_document(
&obkv,
searchable_fields,
&tokenizer,
max_positions_per_attributes,
&mut buffers,
&mut script_language_word_count,
&mut docid_word_positions_sorter,
)?;
tokens_from_document(
&obkv,
searchable_fields,
&tokenizer,
max_positions_per_attributes,
del_add,
buffers,
&mut script_language_word_count,
)?;
// if we detect a potetial mistake in the language detection,
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
// context: https://github.com/meilisearch/meilisearch/issues/3565
if script_language_word_count
.values()
.map(Vec::as_slice)
.any(potential_language_detection_error)
{
// build an allow list with the most frequent detected languages in the document.
let script_language: HashMap<_, _> =
script_language_word_count.iter().filter_map(most_frequent_languages).collect();
// if we detect a potetial mistake in the language detection,
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
// context: https://github.com/meilisearch/meilisearch/issues/3565
if script_language_word_count
.values()
.map(Vec::as_slice)
.any(potential_language_detection_error)
{
// build an allow list with the most frequent detected languages in the document.
let script_language: HashMap<_, _> =
script_language_word_count.iter().filter_map(most_frequent_languages).collect();
// if the allow list is empty, meaning that no Language is considered frequent,
// then we don't rerun the extraction.
if !script_language.is_empty() {
// build a new temporary tokenizer including the allow list.
let mut tokenizer_builder = TokenizerBuilder::new();
if let Some(stop_words) = stop_words {
tokenizer_builder.stop_words(stop_words);
}
tokenizer_builder.allow_list(&script_language);
let tokenizer = tokenizer_builder.build();
// if the allow list is empty, meaning that no Language is considered frequent,
// then we don't rerun the extraction.
if !script_language.is_empty() {
// build a new temporary tokenizer including the allow list.
let mut builder = tokenizer_builder(
stop_words,
dictionary,
allowed_separators,
Some(&script_language),
);
let tokenizer = builder.build();
script_language_word_count.clear();
script_language_word_count.clear();
// rerun the extraction.
extract_tokens_from_document(
&obkv,
searchable_fields,
&tokenizer,
max_positions_per_attributes,
&mut buffers,
&mut script_language_word_count,
&mut docid_word_positions_sorter,
)?;
}
}
for (script, languages_frequency) in script_language_word_count {
for (language, _) in languages_frequency {
let entry = script_language_docids
.entry((script, language))
.or_insert_with(RoaringBitmap::new);
entry.push(document_id);
}
// rerun the extraction.
tokens_from_document(
&obkv,
searchable_fields,
&tokenizer,
max_positions_per_attributes,
del_add,
buffers,
&mut script_language_word_count,
)?;
}
}
sorter_into_reader(docid_word_positions_sorter, indexer)
.map(|reader| (documents_ids, reader, script_language_docids))
Ok((&buffers.obkv_buffer, script_language_word_count))
}
fn extract_tokens_from_document(
/// Extract words maped with their positions of a document.
fn tokens_from_document<'a>(
obkv: &KvReader<FieldId>,
searchable_fields: &Option<HashSet<FieldId>>,
tokenizer: &Tokenizer,
max_positions_per_attributes: u32,
buffers: &mut Buffers,
del_add: DelAdd,
buffers: &'a mut Buffers,
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
) -> Result<&'a [u8]> {
buffers.obkv_buffer.clear();
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
for (field_id, field_bytes) in obkv.iter() {
// if field is searchable.
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
buffers.field_buffer.clear();
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
let tokens = process_tokens(tokenizer.tokenize(field))
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
// extract deletion or addition only.
if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
// parse json.
let value =
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
for (index, token) in tokens {
// if a language has been detected for the token, we update the counter.
if let Some(language) = token.language {
let script = token.script;
let entry =
script_language_word_count.entry(script).or_insert_with(Vec::new);
match entry.iter_mut().find(|(l, _)| *l == language) {
Some((_, n)) => *n += 1,
None => entry.push((language, 1)),
// prepare writting destination.
buffers.obkv_positions_buffer.clear();
let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);
// convert json into an unique string.
buffers.field_buffer.clear();
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
// create an iterator of token with their positions.
let tokens = process_tokens(tokenizer.tokenize(field))
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
for (index, token) in tokens {
// if a language has been detected for the token, we update the counter.
if let Some(language) = token.language {
let script = token.script;
let entry =
script_language_word_count.entry(script).or_insert_with(Vec::new);
match entry.iter_mut().find(|(l, _)| *l == language) {
Some((_, n)) => *n += 1,
None => entry.push((language, 1)),
}
}
// keep a word only if it is not empty and fit in a LMDB key.
let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
let position: u16 = index
.try_into()
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
writer.insert(position, token.as_bytes())?;
}
}
let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
buffers.key_buffer.truncate(mem::size_of::<u32>());
buffers.key_buffer.extend_from_slice(token.as_bytes());
let position: u16 = index
.try_into()
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
let position = absolute_from_relative_position(field_id, position);
docid_word_positions_sorter
.insert(&buffers.key_buffer, position.to_ne_bytes())?;
}
// write positions into document.
let positions = writer.into_inner()?;
document_writer.insert(field_id, positions)?;
}
}
}
}
Ok(())
Ok(document_writer.into_inner().map(|v| v.as_slice())?)
}
/// Transform a JSON value into a string that can be indexed.
@@ -286,10 +433,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)
#[derive(Default)]
struct Buffers {
// the key buffer is the concatenation of the internal document id with the field id.
// The buffer has to be completelly cleared between documents,
// and the field id part must be cleared between each field.
key_buffer: Vec<u8>,
// the field buffer for each fields desserialization, and must be cleared between each field.
field_buffer: String,
// buffer used to store the value data containing an obkv.
obkv_buffer: Vec<u8>,
// buffer used to store the value data containing an obkv of tokens with their positions.
obkv_positions_buffer: Vec<u8>,
}

View File

@@ -4,11 +4,12 @@ use std::io;
use heed::{BytesDecode, BytesEncode};
use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
};
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
use crate::Result;
/// Extracts the facet number and the documents ids where this facet number appear.
@@ -17,7 +18,7 @@ use crate::Result;
/// documents ids from the given chunk of docid facet number positions.
#[logging_timer::time]
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
docid_fid_facet_number: grenad::Reader<R>,
fid_docid_facet_number: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
puffin::profile_function!();
@@ -26,21 +27,30 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
let mut facet_number_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);
let mut cursor = docid_fid_facet_number.into_cursor()?;
while let Some((key_bytes, _)) = cursor.move_on_next()? {
let mut buffer = Vec::new();
let mut cursor = fid_docid_facet_number.into_cursor()?;
while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? {
let (field_id, document_id, number) =
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
let key = FacetGroupKey { field_id, level: 0, left_bound: number };
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() {
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
}
obkv.finish()?;
facet_number_docids_sorter.insert(key_bytes, &buffer)?;
}
sorter_into_reader(facet_number_docids_sorter, indexer)

View File

@@ -1,13 +1,14 @@
use std::fs::File;
use std::io;
use std::{io, str};
use heed::BytesEncode;
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
use crate::heed_codec::StrRefCodec;
use crate::update::index_documents::merge_cbo_roaring_bitmaps;
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps;
use crate::{FieldId, Result};
/// Extracts the facet string and the documents ids where this facet string appear.
///
@@ -24,15 +25,16 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
let mut facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);
let mut buffer = Vec::new();
let mut cursor = docid_fid_facet_string.into_cursor()?;
while let Some((key, _original_value_bytes)) = cursor.move_on_next()? {
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes);
@@ -40,21 +42,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
try_split_array_at::<_, 4>(bytes).unwrap();
let document_id = u32::from_be_bytes(document_id_bytes);
let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?;
let normalised_truncated_value: String;
if normalised_value.len() > MAX_FACET_VALUE_LENGTH {
normalised_truncated_value = normalised_value
.char_indices()
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
normalised_value = normalised_truncated_value.as_str();
}
let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value };
let normalized_value = str::from_utf8(normalized_value_bytes)?;
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
// document id is encoded in native-endian because of the CBO roaring bitmap codec
facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() {
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
}
obkv.finish()?;
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
}
sorter_into_reader(facet_string_docids_sorter, indexer)

View File

@@ -1,24 +1,36 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, HashSet};
use std::convert::TryInto;
use std::fs::File;
use std::io;
use std::mem::size_of;
use std::result::Result as StdResult;
use grenad::Sorter;
use heed::zerocopy::AsBytes;
use heed::BytesEncode;
use itertools::EitherOrBoth;
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::{from_slice, Value};
use FilterableValues::{Empty, Null, Values};
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
use crate::error::InternalError;
use crate::facet::value_encoding::f64_into_bytes;
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::index_documents::{create_writer, writer_into_reader};
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
use crate::{
CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH,
};
/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
/// The extracted facet values stored in grenad files by type.
pub struct ExtractedFacetValues {
pub docid_fid_facet_numbers_chunk: grenad::Reader<File>,
pub docid_fid_facet_strings_chunk: grenad::Reader<File>,
pub fid_docid_facet_numbers_chunk: grenad::Reader<File>,
pub fid_docid_facet_strings_chunk: grenad::Reader<File>,
pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
@@ -58,71 +70,150 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
max_memory.map(|m| m / 2),
);
let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
// The tuples represents the Del and Add side for a bitmap
let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
// We create two buffer for mutable ref issues with closures.
let mut numbers_key_buffer = Vec::new();
let mut strings_key_buffer = Vec::new();
let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?;
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
let obkv = obkv::KvReader::new(value);
for (field_id, field_bytes) in obkv.iter() {
if faceted_fields.contains(&field_id) {
key_buffer.clear();
numbers_key_buffer.clear();
strings_key_buffer.clear();
// Set key to the field_id
// Note: this encoding is consistent with FieldIdCodec
key_buffer.extend_from_slice(&field_id.to_be_bytes());
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
// Here, we know already that the document must be added to the “field id exists” database
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
let document = BEU32::from(document).get();
facet_exists_docids.entry(field_id).or_default().insert(document);
// For the other extraction tasks, prefix the key with the field_id and the document_id
key_buffer.extend_from_slice(docid_bytes);
numbers_key_buffer.extend_from_slice(docid_bytes);
strings_key_buffer.extend_from_slice(docid_bytes);
let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
let del_add_obkv = obkv::KvReader::new(field_bytes);
let del_value = match del_add_obkv.get(DelAdd::Deletion) {
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
None => None,
};
let add_value = match del_add_obkv.get(DelAdd::Addition) {
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
None => None,
};
match extract_facet_values(
&value,
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng),
) {
FilterableValues::Null => {
facet_is_null_docids.entry(field_id).or_default().insert(document);
}
FilterableValues::Empty => {
facet_is_empty_docids.entry(field_id).or_default().insert(document);
}
FilterableValues::Values { numbers, strings } => {
// insert facet numbers in sorter
for number in numbers {
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
// We insert the document id on the Del and the Add side if the field exists.
let (ref mut del_exists, ref mut add_exists) =
facet_exists_docids.entry(field_id).or_default();
let (ref mut del_is_null, ref mut add_is_null) =
facet_is_null_docids.entry(field_id).or_default();
let (ref mut del_is_empty, ref mut add_is_empty) =
facet_is_empty_docids.entry(field_id).or_default();
fid_docid_facet_numbers_sorter
.insert(&key_buffer, ().as_bytes())?;
}
if del_value.is_some() {
del_exists.insert(document);
}
if add_value.is_some() {
add_exists.insert(document);
}
let geo_support =
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let del_filterable_values =
del_value.map(|value| extract_facet_values(&value, geo_support));
let add_filterable_values =
add_value.map(|value| extract_facet_values(&value, geo_support));
// Those closures are just here to simplify things a bit.
let mut insert_numbers_diff = |del_numbers, add_numbers| {
insert_numbers_diff(
&mut fid_docid_facet_numbers_sorter,
&mut numbers_key_buffer,
del_numbers,
add_numbers,
)
};
let mut insert_strings_diff = |del_strings, add_strings| {
insert_strings_diff(
&mut fid_docid_facet_strings_sorter,
&mut strings_key_buffer,
del_strings,
add_strings,
)
};
match (del_filterable_values, add_filterable_values) {
(None, None) => (),
(Some(del_filterable_values), None) => match del_filterable_values {
Null => {
del_is_null.insert(document);
}
// insert normalized and original facet string in sorter
for (normalized, original) in
strings.into_iter().filter(|(n, _)| !n.is_empty())
{
let normalized_truncated_value: String = normalized
.char_indices()
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
fid_docid_facet_strings_sorter
.insert(&key_buffer, original.as_bytes())?;
Empty => {
del_is_empty.insert(document);
}
Values { numbers, strings } => {
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
},
(None, Some(add_filterable_values)) => match add_filterable_values {
Null => {
add_is_null.insert(document);
}
Empty => {
add_is_empty.insert(document);
}
Values { numbers, strings } => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
}
},
(Some(del_filterable_values), Some(add_filterable_values)) => {
match (del_filterable_values, add_filterable_values) {
(Null, Null) | (Empty, Empty) => (),
(Null, Empty) => {
del_is_null.insert(document);
add_is_empty.insert(document);
}
(Empty, Null) => {
del_is_empty.insert(document);
add_is_null.insert(document);
}
(Null, Values { numbers, strings }) => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
del_is_null.insert(document);
}
(Empty, Values { numbers, strings }) => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
del_is_empty.insert(document);
}
(Values { numbers, strings }, Null) => {
add_is_null.insert(document);
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
(Values { numbers, strings }, Empty) => {
add_is_empty.insert(document);
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
(
Values { numbers: del_numbers, strings: del_strings },
Values { numbers: add_numbers, strings: add_strings },
) => {
insert_numbers_diff(del_numbers, add_numbers)?;
insert_strings_diff(del_strings, add_strings)?;
}
}
}
}
@@ -130,14 +221,15 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
}
}
let mut buffer = Vec::new();
let mut facet_exists_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_exists_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
}
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
@@ -146,9 +238,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_is_null_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
}
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
@@ -157,21 +249,156 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_is_empty_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
}
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
Ok(ExtractedFacetValues {
docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
})
}
/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps.
fn deladd_obkv_cbo_roaring_bitmaps(
buffer: &mut Vec<u8>,
del_bitmap: &RoaringBitmap,
add_bitmap: &RoaringBitmap,
) -> io::Result<()> {
buffer.clear();
let mut obkv = KvWriterDelAdd::new(buffer);
let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
obkv.finish()
}
/// Truncates a string to the biggest valid LMDB key size.
fn truncate_string(s: String) -> String {
s.char_indices()
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect()
}
/// Computes the diff between both Del and Add numbers and
/// only inserts the parts that differ in the sorter.
fn insert_numbers_diff<MF>(
fid_docid_facet_numbers_sorter: &mut Sorter<MF>,
key_buffer: &mut Vec<u8>,
mut del_numbers: Vec<f64>,
mut add_numbers: Vec<f64>,
) -> Result<()>
where
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
{
// We sort and dedup the float numbers
del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
del_numbers.dedup_by_key(|f| OrderedFloat(*f));
add_numbers.dedup_by_key(|f| OrderedFloat(*f));
let merged_numbers_iter = itertools::merge_join_by(
del_numbers.into_iter().map(OrderedFloat),
add_numbers.into_iter().map(OrderedFloat),
|del, add| del.cmp(add),
);
// insert facet numbers in sorter
for eob in merged_numbers_iter {
key_buffer.truncate(TRUNCATE_SIZE);
match eob {
EitherOrBoth::Both(_, _) => (), // no need to touch anything
EitherOrBoth::Left(OrderedFloat(number)) => {
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
// We insert only the Del part of the Obkv to inform
// that we only want to remove all those numbers.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, ().as_bytes())?;
let bytes = obkv.into_inner()?;
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
}
}
EitherOrBoth::Right(OrderedFloat(number)) => {
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
// We insert only the Del part of the Obkv to inform
// that we only want to remove all those numbers.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Addition, ().as_bytes())?;
let bytes = obkv.into_inner()?;
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
}
}
}
}
Ok(())
}
/// Computes the diff between both Del and Add strings and
/// only inserts the parts that differ in the sorter.
fn insert_strings_diff<MF>(
fid_docid_facet_strings_sorter: &mut Sorter<MF>,
key_buffer: &mut Vec<u8>,
mut del_strings: Vec<(String, String)>,
mut add_strings: Vec<(String, String)>,
) -> Result<()>
where
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
{
// We sort and dedup the normalized and original strings
del_strings.sort_unstable();
add_strings.sort_unstable();
del_strings.dedup();
add_strings.dedup();
let merged_strings_iter = itertools::merge_join_by(
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|del, add| del.cmp(add),
);
// insert normalized and original facet string in sorter
for eob in merged_strings_iter {
key_buffer.truncate(TRUNCATE_SIZE);
match eob {
EitherOrBoth::Both(_, _) => (), // no need to touch anything
EitherOrBoth::Left((normalized, original)) => {
let truncated = truncate_string(normalized);
key_buffer.extend_from_slice(truncated.as_bytes());
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, original)?;
let bytes = obkv.into_inner()?;
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
}
EitherOrBoth::Right((normalized, original)) => {
let truncated = truncate_string(normalized);
key_buffer.extend_from_slice(truncated.as_bytes());
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Addition, original)?;
let bytes = obkv.into_inner()?;
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
}
}
}
Ok(())
}
/// Represent what a document field contains.
enum FilterableValues {
/// Corresponds to the JSON `null` value.
@@ -182,6 +409,7 @@ enum FilterableValues {
Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
}
/// Extracts the facet values of a JSON field.
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
fn inner_extract_facet_values(
value: &Value,

View File

@@ -1,16 +1,17 @@
use std::collections::HashMap;
use std::fs::File;
use std::io;
use grenad::Sorter;
use obkv::KvReaderU16;
use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
try_split_array_at, GrenadParameters, MergeFn,
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
GrenadParameters,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
use crate::Result;
const MAX_COUNTED_WORDS: usize = 30;
/// Extracts the field id word count and the documents ids where
/// this field id with this amount of words appear.
@@ -35,63 +36,21 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
max_memory,
);
// This map is assumed to not consume a lot of memory.
let mut document_fid_wordcount = HashMap::new();
let mut current_document_id = None;
let mut key_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, _word_bytes) = try_split_array_at(key)
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);
let curr_document_id = *current_document_id.get_or_insert(document_id);
if curr_document_id != document_id {
drain_document_fid_wordcount_into_sorter(
&mut fid_word_count_docids_sorter,
&mut document_fid_wordcount,
curr_document_id,
)?;
current_document_id = Some(document_id);
}
for position in read_u32_ne_bytes(value) {
let (field_id, _) = relative_from_absolute_position(position);
let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
*value += 1;
}
}
if let Some(document_id) = current_document_id {
// We must make sure that don't lose the current document field id
// word count map if we break because we reached the end of the chunk.
drain_document_fid_wordcount_into_sorter(
&mut fid_word_count_docids_sorter,
&mut document_fid_wordcount,
document_id,
)?;
}
sorter_into_reader(fid_word_count_docids_sorter, indexer)
}
fn drain_document_fid_wordcount_into_sorter(
fid_word_count_docids_sorter: &mut Sorter<MergeFn>,
document_fid_wordcount: &mut HashMap<FieldId, u32>,
document_id: DocumentId,
) -> Result<()> {
let mut key_buffer = Vec::new();
for (fid, count) in document_fid_wordcount.drain() {
if count <= 30 {
let word_count = KvReaderU16::new(&value).iter().take(MAX_COUNTED_WORDS + 1).count();
if word_count <= MAX_COUNTED_WORDS {
key_buffer.clear();
key_buffer.extend_from_slice(&fid.to_be_bytes());
key_buffer.push(count as u8);
key_buffer.extend_from_slice(fid_bytes);
key_buffer.push(word_count as u8);
fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
}
}
Ok(())
sorter_into_reader(fid_word_count_docids_sorter, indexer)
}

View File

@@ -1,18 +1,20 @@
use std::collections::HashSet;
use std::collections::{BTreeSet, HashSet};
use std::fs::File;
use std::io;
use std::iter::FromIterator;
use roaring::RoaringBitmap;
use heed::BytesDecode;
use obkv::KvReaderU16;
use super::helpers::{
create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader,
try_split_array_at, GrenadParameters,
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
try_split_array_at, writer_into_reader, GrenadParameters,
};
use crate::error::SerializationError;
use crate::heed_codec::StrBEU16Codec;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::update::index_documents::helpers::read_u32_ne_bytes;
use crate::{relative_from_absolute_position, FieldId, Result};
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::MergeFn;
use crate::{DocumentId, FieldId, Result};
/// Extracts the word and the documents ids where this word appear.
///
@@ -26,65 +28,148 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
exact_attributes: &HashSet<FieldId>,
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut word_docids_sorter = create_sorter(
let mut word_fid_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|x| x / 2),
max_memory.map(|x| x / 3),
);
let mut key_buffer = Vec::new();
let mut del_words = BTreeSet::new();
let mut add_words = BTreeSet::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let (fid_bytes, _) = try_split_array_at(fid_bytes)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);
let fid = u16::from_be_bytes(fid_bytes);
let del_add_reader = KvReaderDelAdd::new(&value);
// extract all unique words to remove.
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
for (_pos, word) in KvReaderU16::new(&deletion).iter() {
del_words.insert(word.to_vec());
}
}
// extract all unique additional words.
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
for (_pos, word) in KvReaderU16::new(&addition).iter() {
add_words.insert(word.to_vec());
}
}
words_into_sorter(
document_id,
fid,
&mut key_buffer,
&del_words,
&add_words,
&mut word_fid_docids_sorter,
)?;
del_words.clear();
add_words.clear();
}
let mut word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|x| x / 3),
);
let mut exact_word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|x| x / 2),
max_memory.map(|x| x / 3),
);
let mut value_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, positions)) = cursor.move_on_next()? {
let (document_id_bytes, word_bytes) = try_split_array_at(key)
let mut word_fid_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
// TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
while let Some((key, value)) = iter.next()? {
// only keep the value if their is a change to apply in the DB.
if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
word_fid_docids_writer.insert(key, value)?;
}
let (word, fid) = StrBEU16Codec::bytes_decode(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);
let bitmap = RoaringBitmap::from_iter(Some(document_id));
serialize_roaring_bitmap(&bitmap, &mut value_buffer)?;
// If there are no exact attributes, we do not need to iterate over positions.
if exact_attributes.is_empty() {
word_docids_sorter.insert(word_bytes, &value_buffer)?;
// every words contained in an attribute set to exact must be pushed in the exact_words list.
if exact_attributes.contains(&fid) {
exact_word_docids_sorter.insert(word.as_bytes(), &value)?;
} else {
let mut added_to_exact = false;
let mut added_to_word_docids = false;
for position in read_u32_ne_bytes(positions) {
// as soon as we know that this word had been to both readers, we don't need to
// iterate over the positions.
if added_to_exact && added_to_word_docids {
break;
}
let (fid, _) = relative_from_absolute_position(position);
if exact_attributes.contains(&fid) && !added_to_exact {
exact_word_docids_sorter.insert(word_bytes, &value_buffer)?;
added_to_exact = true;
} else if !added_to_word_docids {
word_docids_sorter.insert(word_bytes, &value_buffer)?;
added_to_word_docids = true;
}
}
word_docids_sorter.insert(word.as_bytes(), &value)?;
}
}
Ok((
sorter_into_reader(word_docids_sorter, indexer)?,
sorter_into_reader(exact_word_docids_sorter, indexer)?,
writer_into_reader(word_fid_docids_writer)?,
))
}
fn words_into_sorter(
document_id: DocumentId,
fid: FieldId,
key_buffer: &mut Vec<u8>,
del_words: &BTreeSet<Vec<u8>>,
add_words: &BTreeSet<Vec<u8>>,
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
puffin::profile_function!();
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
let mut buffer = Vec::new();
for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) {
buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
let word_bytes = match eob {
Left(word_bytes) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
word_bytes
}
Right(word_bytes) => {
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
word_bytes
}
Both(word_bytes, _) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
word_bytes
}
};
key_buffer.clear();
key_buffer.extend_from_slice(&word_bytes);
key_buffer.push(0);
key_buffer.extend_from_slice(&fid.to_be_bytes());
word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
}
Ok(())
}

View File

@@ -1,51 +0,0 @@
use std::fs::File;
use std::io;
use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
try_split_array_at, GrenadParameters,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::{relative_from_absolute_position, DocumentId, Result};
/// Extracts the word, field id, and the documents ids where this word appear at this field id.
#[logging_timer::time]
pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut word_fid_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);
let mut key_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, word_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = DocumentId::from_be_bytes(document_id_bytes);
for position in read_u32_ne_bytes(value) {
key_buffer.clear();
key_buffer.extend_from_slice(word_bytes);
key_buffer.push(0);
let (fid, _) = relative_from_absolute_position(position);
key_buffer.extend_from_slice(&fid.to_be_bytes());
word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
}
}
let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?;
Ok(word_fid_docids_reader)
}

View File

@@ -1,15 +1,17 @@
use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashMap};
use std::collections::{BTreeMap, VecDeque};
use std::fs::File;
use std::{cmp, io, mem, str, vec};
use std::{cmp, io};
use obkv::KvReaderU16;
use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
try_split_array_at, GrenadParameters, MergeFn,
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
writer_into_reader, GrenadParameters, MergeFn,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::proximity::{positions_proximity, MAX_DISTANCE};
use crate::proximity::{index_proximity, MAX_DISTANCE};
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::{DocumentId, Result};
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
@@ -25,58 +27,138 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
let max_memory = indexer.max_memory_by_thread();
let mut word_pair_proximity_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|m| m / 2),
);
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
.into_iter()
.map(|_| {
create_sorter(
grenad::SortAlgorithm::Unstable,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|m| m / MAX_DISTANCE as usize),
)
})
.collect();
// This map is assumed to not consume a lot of memory.
let mut document_word_positions_heap = BinaryHeap::new();
let mut del_word_positions: VecDeque<(String, u16)> =
VecDeque::with_capacity(MAX_DISTANCE as usize);
let mut add_word_positions: VecDeque<(String, u16)> =
VecDeque::with_capacity(MAX_DISTANCE as usize);
let mut del_word_pair_proximity = BTreeMap::new();
let mut add_word_pair_proximity = BTreeMap::new();
let mut current_document_id = None;
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, word_bytes) = try_split_array_at(key)
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);
let word = str::from_utf8(word_bytes)?;
let curr_document_id = *current_document_id.get_or_insert(document_id);
if curr_document_id != document_id {
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
// if we change document, we fill the sorter
if current_document_id.map_or(false, |id| id != document_id) {
puffin::profile_scope!("Document into sorter");
document_word_positions_into_sorter(
curr_document_id,
document_word_positions_heap,
&mut word_pair_proximity_docids_sorter,
current_document_id.unwrap(),
&del_word_pair_proximity,
&add_word_pair_proximity,
&mut word_pair_proximity_docids_sorters,
)?;
current_document_id = Some(document_id);
del_word_pair_proximity.clear();
add_word_pair_proximity.clear();
}
let word = word.to_string();
let mut positions: Vec<_> = read_u32_ne_bytes(value).collect();
positions.sort_unstable();
let mut iter = positions.into_iter();
if let Some(position) = iter.next() {
document_word_positions_heap.push(PeekedWordPosition { word, position, iter });
}
current_document_id = Some(document_id);
let (del, add): (Result<_>, Result<_>) = rayon::join(
|| {
// deletions
if let Some(deletion) = KvReaderDelAdd::new(&value).get(DelAdd::Deletion) {
for (position, word) in KvReaderU16::new(deletion).iter() {
// drain the proximity window until the head word is considered close to the word we are inserting.
while del_word_positions.get(0).map_or(false, |(_w, p)| {
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
}) {
word_positions_into_word_pair_proximity(
&mut del_word_positions,
&mut del_word_pair_proximity,
)?;
}
// insert the new word.
let word = std::str::from_utf8(word)?;
del_word_positions.push_back((word.to_string(), position));
}
while !del_word_positions.is_empty() {
word_positions_into_word_pair_proximity(
&mut del_word_positions,
&mut del_word_pair_proximity,
)?;
}
}
Ok(())
},
|| {
// additions
if let Some(addition) = KvReaderDelAdd::new(&value).get(DelAdd::Addition) {
for (position, word) in KvReaderU16::new(addition).iter() {
// drain the proximity window until the head word is considered close to the word we are inserting.
while add_word_positions.get(0).map_or(false, |(_w, p)| {
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
}) {
word_positions_into_word_pair_proximity(
&mut add_word_positions,
&mut add_word_pair_proximity,
)?;
}
// insert the new word.
let word = std::str::from_utf8(word)?;
add_word_positions.push_back((word.to_string(), position));
}
while !add_word_positions.is_empty() {
word_positions_into_word_pair_proximity(
&mut add_word_positions,
&mut add_word_pair_proximity,
)?;
}
}
Ok(())
},
);
del?;
add?;
}
if let Some(document_id) = current_document_id {
// We must make sure that don't lose the current document field id
// word count map if we break because we reached the end of the chunk.
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
puffin::profile_scope!("Final document into sorter");
document_word_positions_into_sorter(
document_id,
document_word_positions_heap,
&mut word_pair_proximity_docids_sorter,
&del_word_pair_proximity,
&add_word_pair_proximity,
&mut word_pair_proximity_docids_sorters,
)?;
}
{
puffin::profile_scope!("sorter_into_reader");
let mut writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
sorter_into_reader(word_pair_proximity_docids_sorter, indexer)
for sorter in word_pair_proximity_docids_sorters {
sorter.write_into_stream_writer(&mut writer)?;
}
writer_into_reader(writer)
}
}
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
@@ -85,96 +167,66 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
/// close to each other.
fn document_word_positions_into_sorter(
document_id: DocumentId,
mut word_positions_heap: BinaryHeap<PeekedWordPosition<vec::IntoIter<u32>>>,
word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>,
del_word_pair_proximity: &BTreeMap<(String, String), u8>,
add_word_pair_proximity: &BTreeMap<(String, String), u8>,
word_pair_proximity_docids_sorters: &mut Vec<grenad::Sorter<MergeFn>>,
) -> Result<()> {
let mut word_pair_proximity = HashMap::new();
let mut ordered_peeked_word_positions = Vec::new();
while !word_positions_heap.is_empty() {
while let Some(peeked_word_position) = word_positions_heap.pop() {
ordered_peeked_word_positions.push(peeked_word_position);
if ordered_peeked_word_positions.len() == 7 {
break;
}
}
if let Some((head, tail)) = ordered_peeked_word_positions.split_first() {
for PeekedWordPosition { word, position, .. } in tail {
let prox = positions_proximity(head.position, *position);
if prox > 0 && prox < MAX_DISTANCE {
word_pair_proximity
.entry((head.word.clone(), word.clone()))
.and_modify(|p| {
*p = cmp::min(*p, prox);
})
.or_insert(prox);
}
}
// Push the tail in the heap.
let tail_iter = ordered_peeked_word_positions.drain(1..);
word_positions_heap.extend(tail_iter);
// Advance the head and push it in the heap.
if let Some(mut head) = ordered_peeked_word_positions.pop() {
if let Some(next_position) = head.iter.next() {
let prox = positions_proximity(head.position, next_position);
if prox > 0 && prox < MAX_DISTANCE {
word_pair_proximity
.entry((head.word.clone(), head.word.clone()))
.and_modify(|p| {
*p = cmp::min(*p, prox);
})
.or_insert(prox);
}
word_positions_heap.push(PeekedWordPosition {
word: head.word,
position: next_position,
iter: head.iter,
});
}
}
}
}
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
let mut buffer = Vec::new();
let mut key_buffer = Vec::new();
for ((w1, w2), prox) in word_pair_proximity {
for eob in
merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
d.cmp(a)
})
{
buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
let ((w1, w2), prox) = match eob {
Left(key_value) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
key_value
}
Right(key_value) => {
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
key_value
}
Both(key_value, _) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
key_value
}
};
key_buffer.clear();
key_buffer.push(prox as u8);
key_buffer.push(*prox as u8);
key_buffer.extend_from_slice(w1.as_bytes());
key_buffer.push(0);
key_buffer.extend_from_slice(w2.as_bytes());
word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
word_pair_proximity_docids_sorters[*prox as usize - 1]
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
}
Ok(())
}
struct PeekedWordPosition<I> {
word: String,
position: u32,
iter: I,
}
impl<I> Ord for PeekedWordPosition<I> {
fn cmp(&self, other: &Self) -> Ordering {
self.position.cmp(&other.position).reverse()
}
}
impl<I> PartialOrd for PeekedWordPosition<I> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<I> Eq for PeekedWordPosition<I> {}
impl<I> PartialEq for PeekedWordPosition<I> {
fn eq(&self, other: &Self) -> bool {
self.position == other.position
fn word_positions_into_word_pair_proximity(
word_positions: &mut VecDeque<(String, u16)>,
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
) -> Result<()> {
let (head_word, head_position) = word_positions.pop_front().unwrap();
for (word, position) in word_positions.iter() {
let prox = index_proximity(head_position as u32, *position as u32) as u8;
if prox > 0 && prox < MAX_DISTANCE as u8 {
word_pair_proximity
.entry((head_word.clone(), word.clone()))
.and_modify(|p| {
*p = cmp::min(*p, prox);
})
.or_insert(prox);
}
}
Ok(())
}

View File

@@ -1,13 +1,18 @@
use std::collections::BTreeSet;
use std::fs::File;
use std::io;
use obkv::KvReaderU16;
use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
try_split_array_at, GrenadParameters,
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
GrenadParameters,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result};
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::MergeFn;
use crate::{bucketed_position, DocumentId, Result};
/// Extracts the word positions and the documents ids where this word appear.
///
@@ -24,32 +29,110 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
let mut word_position_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);
let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
let mut add_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
let mut current_document_id: Option<u32> = None;
let mut key_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, word_bytes) = try_split_array_at(key)
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = DocumentId::from_be_bytes(document_id_bytes);
for position in read_u32_ne_bytes(value) {
key_buffer.clear();
key_buffer.extend_from_slice(word_bytes);
key_buffer.push(0);
let (_, position) = relative_from_absolute_position(position);
let position = bucketed_position(position);
key_buffer.extend_from_slice(&position.to_be_bytes());
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
if current_document_id.map_or(false, |id| document_id != id) {
words_position_into_sorter(
current_document_id.unwrap(),
&mut key_buffer,
&del_word_positions,
&add_word_positions,
&mut word_position_docids_sorter,
)?;
del_word_positions.clear();
add_word_positions.clear();
}
current_document_id = Some(document_id);
let del_add_reader = KvReaderDelAdd::new(&value);
// extract all unique words to remove.
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
for (position, word_bytes) in KvReaderU16::new(deletion).iter() {
let position = bucketed_position(position);
del_word_positions.insert((position, word_bytes.to_vec()));
}
}
// extract all unique additional words.
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
for (position, word_bytes) in KvReaderU16::new(addition).iter() {
let position = bucketed_position(position);
add_word_positions.insert((position, word_bytes.to_vec()));
}
}
}
if let Some(document_id) = current_document_id {
words_position_into_sorter(
document_id,
&mut key_buffer,
&del_word_positions,
&add_word_positions,
&mut word_position_docids_sorter,
)?;
}
// TODO remove noop DelAdd OBKV
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
Ok(word_position_docids_reader)
}
fn words_position_into_sorter(
document_id: DocumentId,
key_buffer: &mut Vec<u8>,
del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
puffin::profile_function!();
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
let mut buffer = Vec::new();
for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a))
{
buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
let (position, word_bytes) = match eob {
Left(key) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
key
}
Right(key) => {
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
key
}
Both(key, _) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
key
}
};
key_buffer.clear();
key_buffer.extend_from_slice(word_bytes);
key_buffer.push(0);
key_buffer.extend_from_slice(&position.to_be_bytes());
word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
}
Ok(())
}

View File

@@ -6,7 +6,6 @@ mod extract_fid_word_count_docids;
mod extract_geo_points;
mod extract_vector_points;
mod extract_word_docids;
mod extract_word_fid_docids;
mod extract_word_pair_proximity_docids;
mod extract_word_position_docids;
@@ -25,12 +24,11 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
use self::extract_geo_points::extract_geo_points;
use self::extract_vector_points::extract_vector_points;
use self::extract_word_docids::extract_word_docids;
use self::extract_word_fid_docids::extract_word_fid_docids;
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
GrenadParameters, MergeFn, MergeableReader,
as_cloneable_grenad, merge_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
MergeableReader,
};
use super::{helpers, TypedChunk};
use crate::{FieldId, Result};
@@ -93,9 +91,9 @@ pub(crate) fn data_from_obkv_documents(
let (
docid_word_positions_chunks,
(
docid_fid_facet_numbers_chunks,
fid_docid_facet_numbers_chunks,
(
docid_fid_facet_strings_chunks,
fid_docid_facet_strings_chunks,
(
facet_is_null_docids_chunks,
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
@@ -172,15 +170,22 @@ pub(crate) fn data_from_obkv_documents(
"field-id-wordcount-docids",
);
spawn_extraction_task::<_, _, Vec<(grenad::Reader<File>, grenad::Reader<File>)>>(
spawn_extraction_task::<
_,
_,
Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)>,
>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
merge_roaring_bitmaps,
|(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
merge_cbo_roaring_bitmaps,
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
}
},
"word-docids",
);
@@ -194,18 +199,9 @@ pub(crate) fn data_from_obkv_documents(
TypedChunk::WordPositionDocids,
"word-position-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks,
indexer,
lmdb_writer_sx.clone(),
extract_word_fid_docids,
merge_cbo_roaring_bitmaps,
TypedChunk::WordFidDocids,
"word-fid-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_fid_facet_strings_chunks,
fid_docid_facet_strings_chunks,
indexer,
lmdb_writer_sx.clone(),
extract_facet_string_docids,
@@ -215,7 +211,7 @@ pub(crate) fn data_from_obkv_documents(
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_fid_facet_numbers_chunks,
fid_docid_facet_numbers_chunks,
indexer,
lmdb_writer_sx,
extract_facet_number_docids,
@@ -348,7 +344,7 @@ fn send_and_extract_flattened_documents_data(
});
}
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
rayon::join(
|| {
let (documents_ids, docid_word_positions_chunk, script_language_pair) =
@@ -376,8 +372,8 @@ fn send_and_extract_flattened_documents_data(
},
|| {
let ExtractedFacetValues {
docid_fid_facet_numbers_chunk,
docid_fid_facet_strings_chunk,
fid_docid_facet_numbers_chunk,
fid_docid_facet_strings_chunk,
fid_facet_is_null_docids_chunk,
fid_facet_is_empty_docids_chunk,
fid_facet_exists_docids_chunk,
@@ -388,26 +384,26 @@ fn send_and_extract_flattened_documents_data(
geo_fields_ids,
)?;
// send docid_fid_facet_numbers_chunk to DB writer
let docid_fid_facet_numbers_chunk =
unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? };
// send fid_docid_facet_numbers_chunk to DB writer
let fid_docid_facet_numbers_chunk =
unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? };
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
docid_fid_facet_numbers_chunk.clone(),
fid_docid_facet_numbers_chunk.clone(),
)));
// send docid_fid_facet_strings_chunk to DB writer
let docid_fid_facet_strings_chunk =
unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? };
// send fid_docid_facet_strings_chunk to DB writer
let fid_docid_facet_strings_chunk =
unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? };
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
docid_fid_facet_strings_chunk.clone(),
fid_docid_facet_strings_chunk.clone(),
)));
Ok((
docid_fid_facet_numbers_chunk,
fid_docid_facet_numbers_chunk,
(
docid_fid_facet_strings_chunk,
fid_docid_facet_strings_chunk,
(
fid_facet_is_null_docids_chunk,
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
@@ -417,5 +413,5 @@ fn send_and_extract_flattened_documents_data(
},
);
Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?))
Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?))
}

View File

@@ -54,6 +54,7 @@ pub fn sorter_into_reader(
sorter: grenad::Sorter<MergeFn>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
puffin::profile_function!();
let mut writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
@@ -113,6 +114,22 @@ impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
}
}
impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
type Output = (grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>);
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut m1 = MergerBuilder::new(merge_fn);
let mut m2 = MergerBuilder::new(merge_fn);
let mut m3 = MergerBuilder::new(merge_fn);
for (r1, r2, r3) in self.into_iter() {
m1.push(r1)?;
m2.push(r2)?;
m3.push(r3)?;
}
Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?))
}
}
struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
impl<R: io::Read + io::Seek> MergerBuilder<R> {

View File

@@ -6,11 +6,13 @@ use std::result::Result as StdResult;
use roaring::RoaringBitmap;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::transform::Operation;
use crate::Result;
pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
#[allow(unused)]
pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
if values.len() == 1 {
Ok(values[0].clone())
@@ -75,57 +77,123 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<
Ok(obkvs.last().unwrap().clone())
}
pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
pub fn merge_two_del_add_obkvs(
base: obkv::KvReaderU16,
update: obkv::KvReaderU16,
merge_additions: bool,
buffer: &mut Vec<u8>,
) {
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
buffer.clear();
let mut writer = obkv::KvWriter::new(buffer);
let mut value_buffer = Vec::new();
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
match eob {
Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(),
Left((k, v)) => {
if merge_additions {
writer.insert(k, v).unwrap()
} else {
// If merge_additions is false, recreate an obkv keeping the deletions only.
value_buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
let base_reader = KvReaderDelAdd::new(v);
if let Some(deletion) = base_reader.get(DelAdd::Deletion) {
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
value_writer.finish().unwrap();
writer.insert(k, &value_buffer).unwrap()
}
}
}
Right((k, v)) => writer.insert(k, v).unwrap(),
Both((k, base), (_, update)) => {
// merge deletions and additions.
value_buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
let base_reader = KvReaderDelAdd::new(base);
let update_reader = KvReaderDelAdd::new(update);
// keep newest deletion.
if let Some(deletion) = update_reader
.get(DelAdd::Deletion)
.or_else(|| base_reader.get(DelAdd::Deletion))
{
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
}
// keep base addition only if merge_additions is true.
let base_addition =
merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten();
// keep newest addition.
// TODO use or_else
if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) {
value_writer.insert(DelAdd::Addition, addition).unwrap();
}
value_writer.finish().unwrap();
writer.insert(k, &value_buffer).unwrap()
}
}
}
writer.finish().unwrap();
}
/// Merge all the obks in the order we see them.
pub fn merge_obkvs_and_operations<'a>(
/// Merge all the obkvs from the newest to the oldest.
fn inner_merge_del_add_obkvs<'a>(
obkvs: &[Cow<'a, [u8]>],
merge_additions: bool,
) -> Result<Cow<'a, [u8]>> {
// pop the newest operation from the list.
let (newest, obkvs) = obkvs.split_last().unwrap();
// keep the operation type for the returned value.
let newest_operation_type = newest[0];
// treat the newest obkv as the starting point of the merge.
let mut acc_operation_type = newest_operation_type;
let mut acc = newest[1..].to_vec();
let mut buffer = Vec::new();
// reverse iter from the most recent to the oldest.
for current in obkvs.into_iter().rev() {
// if in the previous iteration there was a complete deletion,
// stop the merge process.
if acc_operation_type == Operation::Deletion as u8 {
break;
}
let newest = obkv::KvReader::new(&acc);
let oldest = obkv::KvReader::new(&current[1..]);
merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer);
// we want the result of the merge into our accumulator.
std::mem::swap(&mut acc, &mut buffer);
acc_operation_type = current[0];
}
acc.insert(0, newest_operation_type);
Ok(Cow::from(acc))
}
/// Merge all the obkvs from the newest to the oldest.
pub fn obkvs_merge_additions_and_deletions<'a>(
_key: &[u8],
obkvs: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
// [add, add, delete, add, add]
// we can ignore everything that happened before the last delete.
let starting_position =
obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0);
// [add, add, delete]
// if the last operation was a deletion then we simply return the deletion
if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8
{
return Ok(obkvs[obkvs.len() - 1].clone());
}
let mut buffer = Vec::new();
// (add, add, delete) [add, add]
// in the other case, no deletion will be encountered during the merge
let mut ret =
obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| {
let first = obkv::KvReader::new(&acc);
let second = obkv::KvReader::new(&current[1..]);
merge_two_obkvs(first, second, &mut buffer);
// we want the result of the merge into our accumulator
std::mem::swap(&mut acc, &mut buffer);
acc
});
ret.insert(0, Operation::Addition as u8);
Ok(Cow::from(ret))
inner_merge_del_add_obkvs(obkvs, true)
}
/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions.
pub fn obkvs_keep_last_addition_merge_deletions<'a>(
_key: &[u8],
obkvs: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
inner_merge_del_add_obkvs(obkvs, false)
}
/// Do a union of all the CboRoaringBitmaps in the values.
pub fn merge_cbo_roaring_bitmaps<'a>(
_key: &[u8],
values: &[Cow<'a, [u8]>],
@@ -138,3 +206,36 @@ pub fn merge_cbo_roaring_bitmaps<'a>(
Ok(Cow::from(vec))
}
}
/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv
/// separately and outputs a new DelAdd with both unions.
pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
_key: &[u8],
values: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
if values.len() == 1 {
Ok(values[0].clone())
} else {
// Retrieve the bitmaps from both sides
let mut del_bitmaps_bytes = Vec::new();
let mut add_bitmaps_bytes = Vec::new();
for value in values {
let obkv = KvReaderDelAdd::new(value);
if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
del_bitmaps_bytes.push(bitmap_bytes);
}
if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
add_bitmaps_bytes.push(bitmap_bytes);
}
}
let mut output_deladd_obkv = KvWriterDelAdd::memory();
let mut buffer = Vec::new();
CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
buffer.clear();
CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
}
}

View File

@@ -14,7 +14,8 @@ pub use grenad_helpers::{
};
pub use merge_functions::{
concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_roaring_bitmaps,
obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions,
serialize_roaring_bitmap, MergeFn,
};
@@ -44,6 +45,7 @@ where
Some((head, tail))
}
#[allow(unused)]
pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes)
}

View File

@@ -38,7 +38,7 @@ use crate::update::{
self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
};
use crate::{Index, Result, RoaringBitmapCodec};
use crate::{CboRoaringBitmapCodec, Index, Result};
static MERGED_DATABASE_COUNT: usize = 7;
static PREFIX_DATABASE_COUNT: usize = 5;
@@ -406,13 +406,23 @@ where
}
let typed_chunk = match result? {
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
} => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
word_docids = Some(cloneable_chunk);
let cloneable_chunk =
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
exact_word_docids = Some(cloneable_chunk);
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader }
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
word_fid_docids = Some(cloneable_chunk);
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
}
}
TypedChunk::WordPairProximityDocids(chunk) => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
@@ -424,11 +434,6 @@ where
word_position_docids = Some(cloneable_chunk);
TypedChunk::WordPositionDocids(chunk)
}
TypedChunk::WordFidDocids(chunk) => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
word_fid_docids = Some(cloneable_chunk);
TypedChunk::WordFidDocids(chunk)
}
otherwise => otherwise,
};
@@ -470,13 +475,14 @@ where
let all_documents_ids = index_documents_ids | new_documents_ids;
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
self.execute_prefix_databases(
word_docids,
exact_word_docids,
word_pair_proximity_docids,
word_position_docids,
word_fid_docids,
)?;
// TODO: reactivate prefix DB with diff-indexing
// self.execute_prefix_databases(
// word_docids,
// exact_word_docids,
// word_pair_proximity_docids,
// word_position_docids,
// word_fid_docids,
// )?;
Ok(all_documents_ids.len())
}
@@ -690,8 +696,8 @@ where
fn execute_word_prefix_docids(
txn: &mut heed::RwTxn,
reader: grenad::Reader<Cursor<ClonableMmap>>,
word_docids_db: Database<Str, RoaringBitmapCodec>,
word_prefix_docids_db: Database<Str, RoaringBitmapCodec>,
word_docids_db: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
indexer_config: &IndexerConfig,
new_prefix_fst_words: &[String],
common_prefix_fst_words: &[&[String]],

View File

@@ -7,18 +7,20 @@ use std::io::{Read, Seek};
use fxhash::FxHashMap;
use heed::RoTxn;
use itertools::Itertools;
use obkv::{KvReader, KvWriter};
use obkv::{KvReader, KvReaderU16, KvWriter};
use roaring::RoaringBitmap;
use serde_json::Value;
use smartstring::SmartString;
use super::helpers::{
create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn,
create_sorter, create_writer, obkvs_keep_last_addition_merge_deletions,
obkvs_merge_additions_and_deletions, MergeFn,
};
use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
use crate::index::{db_name, main_key};
use crate::update::del_add::into_del_add_obkv;
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
use crate::{
FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
@@ -106,8 +108,8 @@ impl<'a, 'i> Transform<'a, 'i> {
// We must choose the appropriate merge function for when two or more documents
// with the same user id must be merged or fully replaced in the same batch.
let merge_function = match index_documents_method {
IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv,
IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations,
IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions,
IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions,
};
// We initialize the sorter with the user indexing settings.
@@ -223,19 +225,21 @@ impl<'a, 'i> Transform<'a, 'i> {
let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) {
Entry::Occupied(entry) => *entry.get() as u32,
Entry::Vacant(entry) => {
// If the document was already in the db we mark it as a replaced document.
// It'll be deleted later.
if let Some(docid) = external_documents_ids.get(entry.key()) {
// If it was already in the list of replaced documents it means it was deleted
// by the remove_document method. We should starts as if it never existed.
if self.replaced_documents_ids.insert(docid) {
original_docid = Some(docid);
let docid = match external_documents_ids.get(entry.key()) {
Some(docid) => {
// If it was already in the list of replaced documents it means it was deleted
// by the remove_document method. We should starts as if it never existed.
if self.replaced_documents_ids.insert(docid) {
original_docid = Some(docid);
}
docid
}
}
let docid = self
.available_documents_ids
.next()
.ok_or(UserError::DocumentLimitReached)?;
None => self
.available_documents_ids
.next()
.ok_or(UserError::DocumentLimitReached)?,
};
entry.insert(docid as u64);
docid
}
@@ -263,16 +267,28 @@ impl<'a, 'i> Transform<'a, 'i> {
skip_insertion = true;
} else {
// we associate the base document with the new key, everything will get merged later.
let keep_original_version =
self.index_documents_method == IndexDocumentsMethod::UpdateDocuments;
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(base_obkv);
into_del_add_obkv(
KvReaderU16::new(base_obkv),
true,
keep_original_version,
&mut document_sorter_buffer,
)?;
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
Some(flattened_obkv) => {
// we recreate our buffer with the flattened documents
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(&flattened_obkv);
into_del_add_obkv(
KvReaderU16::new(&flattened_obkv),
true,
keep_original_version,
&mut document_sorter_buffer,
)?;
self.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
}
@@ -288,7 +304,12 @@ impl<'a, 'i> Transform<'a, 'i> {
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(&obkv_buffer);
into_del_add_obkv(
KvReaderU16::new(&obkv_buffer),
false,
true,
&mut document_sorter_buffer,
)?;
// We use the extracted/generated user id as the key for this document.
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
@@ -296,7 +317,12 @@ impl<'a, 'i> Transform<'a, 'i> {
Some(flattened_obkv) => {
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(&flattened_obkv);
into_del_add_obkv(
KvReaderU16::new(&flattened_obkv),
false,
true,
&mut document_sorter_buffer,
)?;
self.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
}
@@ -354,19 +380,25 @@ impl<'a, 'i> Transform<'a, 'i> {
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
let mut documents_deleted = 0;
let mut document_sorter_buffer = Vec::new();
for to_remove in to_remove {
if should_abort() {
return Err(Error::InternalError(InternalError::AbortedIndexation));
}
match self.new_external_documents_ids_builder.entry((*to_remove).into()) {
// Check if the document has been added in the current indexing process.
let deleted_from_current = match self
.new_external_documents_ids_builder
.entry((*to_remove).into())
{
// if the document was added in a previous iteration of the transform we make it as deleted in the sorters.
Entry::Occupied(entry) => {
let doc_id = *entry.get() as u32;
self.original_sorter
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
self.flattened_sorter
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Deletion as u8);
obkv::KvWriterU16::new(&mut document_sorter_buffer).finish().unwrap();
self.original_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?;
self.flattened_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?;
// we must NOT update the list of replaced_documents_ids
// Either:
@@ -375,21 +407,69 @@ impl<'a, 'i> Transform<'a, 'i> {
// we're removing it there is nothing to do.
self.new_documents_ids.remove(doc_id);
entry.remove_entry();
true
}
Entry::Vacant(entry) => {
// If the document was already in the db we mark it as a `to_delete` document.
// It'll be deleted later. We don't need to push anything to the sorters.
if let Some(docid) = external_documents_ids.get(entry.key()) {
self.replaced_documents_ids.insert(docid);
} else {
// if the document is nowehere to be found, there is nothing to do and we must NOT
// increment the count of documents_deleted
continue;
}
}
Entry::Vacant(_) => false,
};
documents_deleted += 1;
// If the document was already in the db we mark it as a `to_delete` document.
// Then we push the document in sorters in deletion mode.
let deleted_from_db = match external_documents_ids.get(&to_remove) {
Some(docid) => {
self.replaced_documents_ids.insert(docid);
// fetch the obkv document
let original_key = BEU32::new(docid);
let base_obkv = self
.index
.documents
.remap_data_type::<heed::types::ByteSlice>()
.get(wtxn, &original_key)?
.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::DOCUMENTS,
key: None,
})?;
// push it as to delete in the original_sorter
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Deletion as u8);
into_del_add_obkv(
KvReaderU16::new(base_obkv),
true,
false,
&mut document_sorter_buffer,
)?;
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
// flatten it and push it as to delete in the flattened_sorter
match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
Some(flattened_obkv) => {
// we recreate our buffer with the flattened documents
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Deletion as u8);
into_del_add_obkv(
KvReaderU16::new(&flattened_obkv),
true,
false,
&mut document_sorter_buffer,
)?;
self.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
}
None => self
.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?,
}
true
}
None => false,
};
// increase counter only if the document existed somewhere before.
if deleted_from_current || deleted_from_db {
documents_deleted += 1;
}
}
Ok(documents_deleted)
@@ -589,9 +669,7 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut documents_count = 0;
while let Some((key, val)) = iter.next()? {
if val[0] == Operation::Deletion as u8 {
continue;
}
// skip first byte corresponding to the operation type (Deletion or Addition).
let val = &val[1..];
// send a callback to show at which step we are
@@ -631,9 +709,7 @@ impl<'a, 'i> Transform<'a, 'i> {
// We get rids of the `Operation` byte and skip the deleted documents as well.
let mut iter = self.flattened_sorter.into_stream_merger_iter()?;
while let Some((key, val)) = iter.next()? {
if val[0] == Operation::Deletion as u8 {
continue;
}
// skip first byte corresponding to the operation type (Deletion or Addition).
let val = &val[1..];
writer.insert(key, val)?;
}
@@ -711,6 +787,7 @@ impl<'a, 'i> Transform<'a, 'i> {
);
let mut obkv_buffer = Vec::new();
let mut document_sorter_buffer = Vec::new();
for result in self.index.all_documents(wtxn)? {
let (docid, obkv) = result?;
@@ -725,7 +802,9 @@ impl<'a, 'i> Transform<'a, 'i> {
}
let buffer = obkv_writer.into_inner()?;
original_writer.insert(docid.to_be_bytes(), &buffer)?;
document_sorter_buffer.clear();
into_del_add_obkv(KvReaderU16::new(buffer), false, true, &mut document_sorter_buffer)?;
original_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
// Once we have the document. We're going to flatten it
// and insert it in the flattened sorter.
@@ -760,7 +839,9 @@ impl<'a, 'i> Transform<'a, 'i> {
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
writer.insert(fid, &value)?;
}
flattened_writer.insert(docid.to_be_bytes(), &buffer)?;
document_sorter_buffer.clear();
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut document_sorter_buffer)?;
flattened_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
}
// Once we have written all the documents, we extract
@@ -824,38 +905,86 @@ mod test {
#[test]
fn merge_obkvs() {
let mut doc_0 = Vec::new();
let mut kv_writer = KvWriter::new(&mut doc_0);
let mut additive_doc_0 = Vec::new();
let mut deletive_doc_0 = Vec::new();
let mut del_add_doc_0 = Vec::new();
let mut kv_writer = KvWriter::memory();
kv_writer.insert(0_u8, [0]).unwrap();
kv_writer.finish().unwrap();
doc_0.insert(0, Operation::Addition as u8);
let buffer = kv_writer.into_inner().unwrap();
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0).unwrap();
additive_doc_0.insert(0, Operation::Addition as u8);
into_del_add_obkv(KvReaderU16::new(&buffer), true, false, &mut deletive_doc_0).unwrap();
deletive_doc_0.insert(0, Operation::Deletion as u8);
into_del_add_obkv(KvReaderU16::new(&buffer), true, true, &mut del_add_doc_0).unwrap();
del_add_doc_0.insert(0, Operation::Addition as u8);
let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap();
assert_eq!(*ret, doc_0);
let mut additive_doc_1 = Vec::new();
let mut kv_writer = KvWriter::memory();
kv_writer.insert(1_u8, [1]).unwrap();
let buffer = kv_writer.into_inner().unwrap();
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_1).unwrap();
additive_doc_1.insert(0, Operation::Addition as u8);
let ret = merge_obkvs_and_operations(
let mut additive_doc_0_1 = Vec::new();
let mut kv_writer = KvWriter::memory();
kv_writer.insert(0_u8, [0]).unwrap();
kv_writer.insert(1_u8, [1]).unwrap();
let buffer = kv_writer.into_inner().unwrap();
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0_1).unwrap();
additive_doc_0_1.insert(0, Operation::Addition as u8);
let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())])
.unwrap();
assert_eq!(*ret, additive_doc_0);
let ret = obkvs_merge_additions_and_deletions(
&[],
&[Cow::from([Operation::Deletion as u8].as_slice()), Cow::from(doc_0.as_slice())],
&[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())],
)
.unwrap();
assert_eq!(*ret, doc_0);
assert_eq!(*ret, del_add_doc_0);
let ret = merge_obkvs_and_operations(
let ret = obkvs_merge_additions_and_deletions(
&[],
&[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())],
&[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())],
)
.unwrap();
assert_eq!(*ret, [Operation::Deletion as u8]);
assert_eq!(*ret, deletive_doc_0);
let ret = merge_obkvs_and_operations(
let ret = obkvs_merge_additions_and_deletions(
&[],
&[
Cow::from([Operation::Addition as u8, 1].as_slice()),
Cow::from([Operation::Deletion as u8].as_slice()),
Cow::from(doc_0.as_slice()),
Cow::from(additive_doc_1.as_slice()),
Cow::from(deletive_doc_0.as_slice()),
Cow::from(additive_doc_0.as_slice()),
],
)
.unwrap();
assert_eq!(*ret, doc_0);
assert_eq!(*ret, del_add_doc_0);
let ret = obkvs_merge_additions_and_deletions(
&[],
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
)
.unwrap();
assert_eq!(*ret, additive_doc_0_1);
let ret = obkvs_keep_last_addition_merge_deletions(
&[],
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
)
.unwrap();
assert_eq!(*ret, additive_doc_0);
let ret = obkvs_keep_last_addition_merge_deletions(
&[],
&[
Cow::from(deletive_doc_0.as_slice()),
Cow::from(additive_doc_1.as_slice()),
Cow::from(additive_doc_0.as_slice()),
],
)
.unwrap();
assert_eq!(*ret, del_add_doc_0);
}
}

View File

@@ -32,9 +32,9 @@ pub(crate) enum TypedChunk {
WordDocids {
word_docids_reader: grenad::Reader<File>,
exact_word_docids_reader: grenad::Reader<File>,
word_fid_docids_reader: grenad::Reader<File>,
},
WordPositionDocids(grenad::Reader<File>),
WordFidDocids(grenad::Reader<File>),
WordPairProximityDocids(grenad::Reader<File>),
FieldIdFacetStringDocids(grenad::Reader<File>),
FieldIdFacetNumberDocids(grenad::Reader<File>),
@@ -43,7 +43,7 @@ pub(crate) enum TypedChunk {
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
GeoPoints(grenad::Reader<File>),
VectorPoints(grenad::Reader<File>),
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
}
impl TypedChunk {
@@ -64,17 +64,19 @@ impl TypedChunk {
TypedChunk::NewDocumentsIds(grenad) => {
format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len())
}
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => format!(
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {} }}",
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
} => format!(
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}",
word_docids_reader.len(),
exact_word_docids_reader.len()
exact_word_docids_reader.len(),
word_fid_docids_reader.len()
),
TypedChunk::WordPositionDocids(grenad) => {
format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len())
}
TypedChunk::WordFidDocids(grenad) => {
format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len())
}
TypedChunk::WordPairProximityDocids(grenad) => {
format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
}
@@ -99,8 +101,8 @@ impl TypedChunk {
TypedChunk::VectorPoints(grenad) => {
format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
}
TypedChunk::ScriptLanguageDocids(grenad) => {
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", grenad.len())
TypedChunk::ScriptLanguageDocids(sl_map) => {
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len())
}
}
}
@@ -138,7 +140,11 @@ pub(crate) fn write_typed_chunk_into_index(
TypedChunk::NewDocumentsIds(documents_ids) => {
return Ok((documents_ids, is_merged_database))
}
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
} => {
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
append_entries_into_database(
word_docids_iter.clone(),
@@ -146,7 +152,7 @@ pub(crate) fn write_typed_chunk_into_index(
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps,
)?;
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
@@ -156,7 +162,17 @@ pub(crate) fn write_typed_chunk_into_index(
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps,
)?;
let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
append_entries_into_database(
word_fid_docids_iter,
&index.word_fid_docids,
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps,
)?;
// create fst from word docids
@@ -182,17 +198,6 @@ pub(crate) fn write_typed_chunk_into_index(
)?;
is_merged_database = true;
}
TypedChunk::WordFidDocids(word_fid_docids_iter) => {
append_entries_into_database(
word_fid_docids_iter,
&index.word_fid_docids,
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps,
)?;
is_merged_database = true;
}
TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
indexer.execute(wtxn)?;
@@ -339,22 +344,25 @@ pub(crate) fn write_typed_chunk_into_index(
log::debug!("There are {} entries in the HNSW so far", hnsw_length);
index.put_vector_hnsw(wtxn, &new_hnsw)?;
}
TypedChunk::ScriptLanguageDocids(hash_pair) => {
let mut buffer = Vec::new();
for (key, value) in hash_pair {
buffer.clear();
TypedChunk::ScriptLanguageDocids(sl_map) => {
for (key, (deletion, addition)) in sl_map {
let mut db_key_exists = false;
let final_value = match index.script_language_docids.get(wtxn, &key)? {
Some(db_values) => {
let mut db_value_buffer = Vec::new();
serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?;
let mut new_value_buffer = Vec::new();
serialize_roaring_bitmap(&value, &mut new_value_buffer)?;
merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?;
RoaringBitmap::deserialize_from(&buffer[..])?
db_key_exists = true;
(db_values - deletion) | addition
}
None => value,
None => addition,
};
index.script_language_docids.put(wtxn, &key, &final_value)?;
if final_value.is_empty() {
// If the database entry exists, delete it.
if db_key_exists == true {
index.script_language_docids.delete(wtxn, &key)?;
}
} else {
index.script_language_docids.put(wtxn, &key, &final_value)?;
}
}
}
}
@@ -379,13 +387,6 @@ fn merge_word_docids_reader_into_fst(
Ok(builder.into_set())
}
fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
let new_value = RoaringBitmap::deserialize_from(new_value)?;
let db_value = RoaringBitmap::deserialize_from(db_value)?;
let value = new_value | db_value;
Ok(serialize_roaring_bitmap(&value, buffer)?)
}
fn merge_cbo_roaring_bitmaps(
new_value: &[u8],
db_value: &[u8],
@@ -455,6 +456,7 @@ where
R: io::Read + io::Seek,
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
K: for<'a> heed::BytesDecode<'a>,
{
puffin::profile_function!(format!("number of entries: {}", data.len()));
@@ -475,6 +477,12 @@ where
let mut cursor = data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
if valid_lmdb_key(key) {
debug_assert!(
K::bytes_decode(&key).is_some(),
"Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}",
key.len(),
&key
);
buffer.clear();
let value = serialize_value(value, &mut buffer)?;
unsafe { database.append(key, value)? };

View File

@@ -21,6 +21,7 @@ pub use self::words_prefixes_fst::WordsPrefixesFst;
mod available_documents_ids;
mod clear_documents;
pub(crate) mod del_add;
mod delete_documents;
pub(crate) mod facet;
mod index_documents;

View File

@@ -5,15 +5,15 @@ use heed::types::{ByteSlice, Str};
use heed::Database;
use crate::update::index_documents::{
create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
CursorClonableMmap, MergeFn,
};
use crate::{Result, RoaringBitmapCodec};
use crate::{CboRoaringBitmapCodec, Result};
pub struct WordPrefixDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
word_docids: Database<Str, RoaringBitmapCodec>,
word_prefix_docids: Database<Str, RoaringBitmapCodec>,
word_docids: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) max_nb_chunks: Option<usize>,
@@ -23,8 +23,8 @@ pub struct WordPrefixDocids<'t, 'u, 'i> {
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
word_docids: Database<Str, RoaringBitmapCodec>,
word_prefix_docids: Database<Str, RoaringBitmapCodec>,
word_docids: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
) -> WordPrefixDocids<'t, 'u, 'i> {
WordPrefixDocids {
wtxn,
@@ -40,6 +40,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
#[logging_timer::time("WordPrefixDocids::{}")]
pub fn execute(
self,
// TODO grenad::Reader<onkv::Reader<Word, obkv::Reader<DelAdd, CboRoaringBitmap>>>
mut new_word_docids_iter: grenad::ReaderCursor<CursorClonableMmap>,
new_prefix_fst_words: &[String],
common_prefix_fst_words: &[&[String]],
@@ -51,7 +52,8 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
// and write into it at the same time, therefore we write into another file.
let mut prefix_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps,
// TODO change to merge_deladd_cbo_roaring_bitmaps
merge_cbo_roaring_bitmaps,
self.chunk_compression_type,
self.chunk_compression_level,
self.max_nb_chunks,
@@ -96,6 +98,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
let prefix = std::str::from_utf8(prefix.as_bytes())?;
for result in db.prefix_iter(self.wtxn, prefix)? {
let (_word, data) = result?;
// TODO fake a DelAdd -> Add(`data`)
prefix_docids_sorter.insert(prefix, data)?;
}
}
@@ -111,11 +114,14 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
drop(iter);
// We finally write the word prefix docids into the LMDB database.
// TODO introduce a new function that is similar to `append_entries_into_database`
// and accepts the `merge_deladd_cbo_roaring_bitmaps` function
sorter_into_lmdb_database(
self.wtxn,
*self.word_prefix_docids.as_polymorph(),
prefix_docids_sorter,
merge_roaring_bitmaps,
// TODO change to `merge_deladd_cbo_roaring_bitmaps`
merge_cbo_roaring_bitmaps,
)?;
Ok(())
@@ -127,6 +133,7 @@ fn write_prefixes_in_sorter(
sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
for (key, data_slices) in prefixes.drain() {
// TODO merge keys before inserting them in the sorter
for data in data_slices {
if valid_lmdb_key(&key) {
sorter.insert(&key, data)?;