Compare commits

...

514 Commits

Author SHA1 Message Date
29622e11f5 Merge pull request #533 from meilisearch/bump-to-v0.9.0
Bump the workspace crates to 0.9.0
2020-03-19 13:50:55 +01:00
3ca8db2cc1 Bump the workspace crates to 0.9.0 2020-03-19 11:56:23 +01:00
cc5eb885ea Merge pull request #531 from meilisearch/bump-rc
Bump the workspace crates to 0.9.0-rc.1
2020-03-16 18:09:11 +01:00
f6972ec682 Bump the workspace crates to 0.9.0-rc.1 2020-03-16 16:58:20 +01:00
cfe21f7b02 Merge pull request #530 from meilisearch/fix-ranking-rules-inference
Ranking fields should be stored and indexed by default
2020-03-16 16:53:06 +01:00
2d82f1b655 ranking fields should be stored and indexed by default; fix #521 2020-03-16 16:19:23 +01:00
cf6e481c14 Merge pull request #520 from meilisearch/fix-http-issues
Fix http issues
2020-03-11 15:21:50 +01:00
7be376721c global settings update make partial update; fix #516 2020-03-11 14:42:58 +01:00
ce0e8415d5 adding primary-key when adding documents does not work; fix #519 2020-03-11 14:12:38 +01:00
4ccf1d10bd error message when impossible to infer the primary-key; fix #517 2020-03-11 12:27:42 +01:00
c25641ff2d fix that AcceptNewFields does not take into account the primary-key; fix #518 2020-03-11 12:00:40 +01:00
14c1aba6c7 Merge pull request #509 from meilisearch/fix-internal-schema
Fix internal schema
2020-03-10 16:25:36 +01:00
8204d961de allow api key in header when no master-key is set; fix #515 2020-03-10 15:59:16 +01:00
ef3bcd65ab fix comments from review 2020-03-10 15:59:11 +01:00
b06e33f3d3 fix errors on http parameter naming 2020-03-10 12:08:10 +01:00
179969a9e2 fix tests + fmt 2020-03-10 11:29:56 +01:00
c984d8d5a5 rename identifier into primaryKey; fix #514 2020-03-09 18:45:29 +01:00
8ffa80883a remove the unused function 2020-03-09 18:45:29 +01:00
86c3482cbd review the internal schema to allow to create schema without identifier; fix #513 2020-03-09 18:45:20 +01:00
16a99aa95e update to infer identifier; fix #498 2020-03-06 10:55:25 +01:00
6d86968c4c Merge pull request #496 from meilisearch/small-fixes-before-0.9
Fix some issues before v0.9
2020-03-06 10:28:45 +01:00
8df6d6e954 fix error 500 when sending bad rankingRules; fix #500 2020-03-06 10:15:19 +01:00
8aeddec982 remove the route to get identifier on settings; fix #502 2020-03-06 10:15:19 +01:00
f4ae0844ab replace index-new-field route to accept-new-fields; fix #503 2020-03-06 10:15:19 +01:00
d56968cb23 default values of synonyms and stop-words; fix #499 fix #504 2020-03-06 10:15:19 +01:00
c5b6e641a4 index UID format; fix #497 2020-03-06 10:15:19 +01:00
041eed2a06 no id returned; fix #492 2020-03-06 10:15:19 +01:00
54c675e195 fix delete-batch route; #493 2020-03-06 10:15:19 +01:00
81ce90e57f update test 2020-03-06 10:15:19 +01:00
6016f2e941 change wording of custom ranking rules dsc -> desc; #490 2020-03-06 10:15:19 +01:00
4d27318b72 remove unnecessary comment on env Opt; #491 2020-03-06 10:15:11 +01:00
decce4d8e4 change route /keys/ -> /keys; #495 2020-03-05 15:33:02 +01:00
1cb9f75026 Merge pull request #507 from meilisearch/fix-documents-fields-order-inference
Fix the inference of the documents searchable fields
2020-03-04 14:16:36 +01:00
5e31d28759 Fix the inference of the documents searchable fields 2020-03-03 20:54:17 +01:00
2b780ab2c5 Merge pull request #489 from meilisearch/fix-rank-distinct
Use distinct on search
2020-03-02 16:34:27 +01:00
a2f0f95337 use distinct on search 2020-03-02 16:19:41 +01:00
72450c765d Merge pull request #484 from meilisearch/fix-reindex-by-chunk
Stop reindexing by chunk during complete reindexing
2020-02-28 18:29:25 +01:00
250aeaa86c stop reindexing by chunk during complete reindexing 2020-02-28 11:49:12 +01:00
06ace88901 Merge pull request #482 from meilisearch/review-settings-endpoint
Review settings endpoint
2020-02-28 11:39:38 +01:00
47009615ee rename words_position to wordsPosition; fix #483 2020-02-27 16:24:49 +01:00
dda08d60d2 cargo fmt 2020-02-27 14:33:57 +01:00
f182afc50b update tests 2020-02-27 11:30:23 +01:00
bb5d931f16 rename criterions on settings route; fix #480 2020-02-27 11:30:22 +01:00
3c74e71d4f show default ranking rules if user reset them; fix #476 2020-02-27 11:30:17 +01:00
79e07fa852 reset value of searchable and displayed attributes; fix #473 2020-02-27 11:04:39 +01:00
aa95c26e07 update tests 2020-02-27 11:04:39 +01:00
2eb6f81c58 rename ranking_distinct to distinct_attribute; fix #474 2020-02-27 11:04:39 +01:00
a067a1b16b replace index_new_fields to accept_new_fields; fix #475 2020-02-27 11:04:38 +01:00
1df51c52e0 Merge pull request #458 from meilisearch/rename-exactness-criterion
Rename the Exact criterion into Exactness
2020-02-25 16:23:57 +01:00
96248d9bfa Change the exactness criterion in the tests 2020-02-25 14:24:15 +01:00
9d167c08f4 Rename the Exact criterion into Exactness 2020-02-25 14:16:55 +01:00
8e6560d102 Merge pull request #464 from meilisearch/simplify-keys
Simplify keys & add launcher resume
2020-02-17 13:59:41 +01:00
ad83c3ab5a add launch resume & environment 2020-02-17 10:13:08 +01:00
257b7b4df4 introduce new key management 2020-02-14 12:54:07 +01:00
5ac757a5fd Merge pull request #465 from meilisearch/fix-un-rankable-fields
fix un-rankable fields errors
2020-02-14 11:27:12 +01:00
2d7a1bfce0 fix un-rankable fields errors; fix #463 2020-02-14 10:34:33 +01:00
3845b89a16 Merge pull request #441 from meilisearch/issues-0.9.0
Stabilize http endpoint
2020-02-13 15:57:37 +01:00
ce8e12c7c5 update tests 2020-02-13 12:24:30 +01:00
4986adc186 move identifier from settings to index; fix #470 2020-02-12 17:00:14 +01:00
dc9ca2ebc9 fixes for review 2020-02-12 16:51:14 +01:00
40d7396d90 update tests for settings 2020-02-11 15:28:01 +01:00
559c2f8907 Add stop words on query 2020-02-11 15:28:00 +01:00
dc6907e748 rebase from master 2020-02-11 15:28:00 +01:00
2143226f04 setup clippy and make a pass on code 2020-02-11 15:28:00 +01:00
ea2a64a504 remove unecessary settings routes 2020-02-11 15:28:00 +01:00
a5b0e468ee fix for review 2020-02-11 15:28:00 +01:00
14b5fc4d6c cargo fmt 2020-02-11 15:28:00 +01:00
f498bfed51 add test on /settings/ranking 2020-02-11 15:27:59 +01:00
50a9825a0f fix some uses cases on settings 2020-02-11 15:27:59 +01:00
5c49f08bb2 update settings routes 2020-02-11 15:27:59 +01:00
bbf9f41a04 add cors 2020-02-11 15:27:59 +01:00
6a32432b01 add /settings/index-new-fields routes 2020-02-11 15:27:59 +01:00
037724576e update tests 2020-02-11 15:27:59 +01:00
10b8a0ab00 add request middleware 2020-02-11 15:27:59 +01:00
faf0dd2f44 do not show matches on undesired fields 2020-02-11 15:27:58 +01:00
585bba43a0 set new attributes indexed if needed 2020-02-11 15:27:58 +01:00
b1528f9466 allow to see highlihts with matches and crop; fix #450 #449 2020-02-11 15:27:58 +01:00
7a491a64c0 add test 2020-02-11 15:27:58 +01:00
57503ad9bf add test on search 2020-02-11 15:27:58 +01:00
c276dda305 run cargo fmt 2020-02-11 15:27:58 +01:00
9c0497c419 change the way settings are show in updates 2020-02-11 15:27:58 +01:00
b33dac9faa add test for search + update ci for test in release 2020-02-11 15:27:57 +01:00
f77f38dfa0 fix update system 2020-02-11 15:27:57 +01:00
58fe87067b finish settings 2020-02-11 15:27:57 +01:00
dbba310770 squash me 2020-02-11 15:27:57 +01:00
6deb481589 definitely remove attributes_ranked on settings; auto create it with ranking_rules 2020-02-11 15:27:57 +01:00
036977bfe4 add the possibility to totally clear the schema 2020-02-11 15:27:57 +01:00
d280848ff6 add test for settings 2020-02-11 15:27:56 +01:00
7a6f583b1f fix issue on ranking rules 2020-02-11 15:27:56 +01:00
e078eafb1f clean unused functions 2020-02-11 15:27:56 +01:00
6f534540a6 fix error on stop words fst 2020-02-11 15:27:56 +01:00
38d57d213f expose api for new settings 2020-02-11 15:27:56 +01:00
7c14769226 add test for index creation 2020-02-11 15:27:56 +01:00
b71bbcffaa simplify error handling 2020-02-11 15:27:56 +01:00
f83e874e35 return the good created_at and updated_at on index creation 2020-02-11 15:27:55 +01:00
ae0a11e422 fix schema & fix tests 2020-02-11 15:27:55 +01:00
116a637cfd set test for healthyness 2020-02-11 15:27:55 +01:00
83cf683db4 introduce test for meilisearch-http 2020-02-11 15:27:55 +01:00
1b3312871e set name optional during index creation 2020-02-11 15:27:55 +01:00
0e12920910 bump tide version 2020-02-11 15:27:55 +01:00
a35eb16a2a store the schema after each document updates 2020-02-11 15:27:54 +01:00
4f0ead625b adapt meilisearch-http to the new schemaless option 2020-02-11 15:27:54 +01:00
21d122a870 rewrite indexed_pos -> field_id for hightligths 2020-02-11 15:27:54 +01:00
130fb74928 introduce a new schemaless way 2020-02-11 15:27:54 +01:00
bbe1845f66 squash-me 2020-02-11 15:27:54 +01:00
2ee90a891c introduce a new settings update system 2020-02-11 15:27:54 +01:00
203c83bdb4 Remove SearchableAttributes; fix #429 2020-02-11 15:27:53 +01:00
73918d803c Rename AttributesToSearchIn into SearchableAttributes; fix #428 2020-02-11 15:27:53 +01:00
110adcae85 Remove the schema; fix #422 2020-02-11 15:27:53 +01:00
c536ea64c3 Change the indexes stats HTTP route; fix #423 2020-02-11 15:27:53 +01:00
aa7a6d5f8c Rewrite the synonyms endpoint; fix #418 2020-02-11 15:27:53 +01:00
91c6539baf Rewrite the stop-words endpoint; fix #417 2020-02-11 15:27:53 +01:00
f0590d3301 Change documents routes; fix #416 2020-02-11 15:27:53 +01:00
a5c5df0290 Merge pull request #443 from curquiza/brew
Add Brew installation in README
2020-02-10 16:36:33 +01:00
f0c2913dcf Add Brew installation in README 2020-02-10 16:26:50 +01:00
9c6d590950 Merge pull request #442 from curquiza/docker-github-action
Change github action for docker latest image
2020-02-10 16:26:14 +01:00
ab3339f5a1 Change github action for docker latest image 2020-02-10 16:11:45 +01:00
43ce45f62b Merge pull request #456 from djKooks/update/cjk-filter-ko-ja
Update CJK filter
2020-01-30 09:46:08 +01:00
2b5d153361 Update cjk filter 2020-01-30 09:55:16 +09:00
cde8845143 Merge pull request #454 from meilisearch/fix-db-compaction
Support compaction with the new split database
2020-01-24 17:45:34 +01:00
7c0d8f073b Support compaction with multi database 2020-01-24 17:38:14 +01:00
69adb1d771 Merge pull request #453 from meilisearch/introduce-query-tree
Introduce a query tree structure
2020-01-23 10:40:53 +01:00
a2bc689b92 Fix the tests a little bit 2020-01-22 18:12:56 +01:00
a9adbda2cd Make the engine support non-exact multi-words synonyms 2020-01-22 18:11:58 +01:00
0b9fe2c072 Introduce the new Query Tree creation supporting more operations 2020-01-22 17:46:46 +01:00
789e05304c Replace prints by debug logs 2020-01-21 11:05:34 +01:00
7604387701 Clean up the dependencies 2020-01-21 11:04:25 +01:00
daffcaf4c6 Make the docids OR operation method conditional 2020-01-19 12:29:06 +01:00
ff1ec599e0 Try a better version of sdset 2020-01-19 12:01:24 +01:00
e44d498c94 Display more debug info for prefix tolerant fetches 2020-01-19 11:07:32 +01:00
c334d6b7fe Avoid sorting sorted sequences, prefer using set operations 2020-01-19 10:58:01 +01:00
5465e401bb Catch query tree related errors 2020-01-17 10:41:27 +01:00
9cc3c56c9c Fix the prefix system 2020-01-16 18:41:27 +01:00
d7a7560220 Use an union instead of a sort for prefix fetching 2020-01-16 17:09:27 +01:00
70a529d197 Reduce the number of args of update functions 2020-01-16 16:29:50 +01:00
be31a14326 Make the clear all operation clear caches 2020-01-16 16:19:04 +01:00
96139da0d2 Reintroduce the distinct search system 2020-01-16 15:55:55 +01:00
74fa9ee4df Introduce a better higlighting system 2020-01-16 14:56:16 +01:00
00336c5154 Reintroduce a basic highlight display 2020-01-16 14:24:45 +01:00
3912d1ec4b Improve query parsing and interpretation 2020-01-16 14:11:17 +01:00
70d4f47f37 Differentiate short words as prefix or exact matches 2020-01-16 12:01:51 +01:00
9809ded23d Implement synonym fetching 2020-01-16 11:38:23 +01:00
5f9a3546e0 Use an union instead of a sort for OR ops 2020-01-15 15:14:24 +01:00
db625a08f7 Update lock file 2020-01-15 12:25:14 +01:00
44fec1b6c9 Cache prefixes of a length of 2 2020-01-14 18:17:52 +01:00
54dacb362d Use different algorithms for different documents ratios 2020-01-14 17:51:08 +01:00
6edb460bea Try with an exponential search 2020-01-14 16:52:24 +01:00
40dab80dfa Change the way we filter the documents 2020-01-14 14:18:01 +01:00
681711fced Fix query ids to be usize 2020-01-14 13:12:42 +01:00
21c1473e0c Introduce the distance data 2020-01-14 11:38:04 +01:00
8acbdcbbad wip: Make the new query tree work with the criteria 2020-01-13 14:36:06 +01:00
da8abebfa2 Introduce the query words mapping along with the query tree 2020-01-13 13:29:47 +01:00
4f7a7ea0bb Faster intersection group by 2020-01-09 16:30:03 +01:00
d6c9ba8f08 Store the postings lists 2020-01-09 15:04:53 +01:00
ec8916bf54 Change the debug outputs 2020-01-09 12:05:39 +01:00
81c573ec92 Add the raw document IDs to the postings lists 2020-01-08 15:30:43 +01:00
9420edadf4 Introduce the Postings type to decorrelate the DocumentIds 2020-01-08 14:48:23 +01:00
d724a7659e Introduce a query tree context struct 2020-01-08 13:37:22 +01:00
887c212b49 Add more logs about the docids construction 2020-01-08 13:22:42 +01:00
07937ed6d7 Use the prefix caches 2020-01-08 13:14:07 +01:00
a262c67ec3 limit the search in the FST 2020-01-08 13:06:12 +01:00
13ca30c4d8 WIP: Made the query tree traversing support prefix search 2020-01-08 12:02:58 +01:00
fbcec2975d wip: Impl a basic tree traversing 2020-01-07 18:24:13 +01:00
6e1f4af833 wip: Create a tree from query but need to show synonyms 2020-01-07 18:24:13 +01:00
856c5c4214 Fix group offset computing 2019-12-31 14:24:10 +01:00
670e80c151 Use the cached postings lists in the query system 2019-12-31 13:32:36 +01:00
eed07c724f Add more logging for postings lists fetching by word 2019-12-31 13:32:36 +01:00
99d35fb940 Introduce a first version of a number of candidates reducer
It works by ignoring the postings lists associated to documents that the previous words did not returned
2019-12-31 13:32:36 +01:00
106b886873 Cache the prefix postings lists 2019-12-30 18:01:32 +01:00
928876b553 Introduce the postings lists caching stores
Currently not used
2019-12-30 18:01:27 +01:00
58836d89aa Rename the PrefixCache into PrefixDocumentsCache 2019-12-30 15:42:09 +01:00
1a5a104f13 Display proximity evaluation number of calls 2019-12-30 15:42:09 +01:00
9790c393a0 Change the time measurement of the query 2019-12-30 15:42:08 +01:00
064cfa4755 Add more debug, where are those 100ms 2019-12-30 15:42:08 +01:00
ed6172aa94 Add a time measurement of the criterion loop 2019-12-30 15:42:08 +01:00
8c140f6bcd Increase the disk usage limit 2019-12-30 15:42:08 +01:00
1e1f0fcaf5 Introduce a basic cache system for first letters 2019-12-30 15:42:08 +01:00
d21352a109 Change the time measurement of the FST 2019-12-30 15:42:08 +01:00
4be11f961b Use an ugly trick to avoid cloning the FST 2019-12-30 15:42:07 +01:00
1163f390b3 Restrict FST search to the first letter of the word 2019-12-30 15:42:07 +01:00
534143e91d Merge pull request #439 from meilisearch/fix-update-deadlock
Fix a blocking channel, appearing like a deadlock
2019-12-30 15:41:26 +01:00
691e2a3c1d Fix a blocking channel, appearing like a deadlock 2019-12-30 15:28:28 +01:00
20b92fcb4c Merge pull request #435 from meilisearch/debug-missing-measurements
Add more debug timings
2019-12-20 18:04:21 +01:00
04bb49989f Add more debug timings 2019-12-20 14:18:48 +01:00
2aa7cb9d20 Merge pull request #433 from meilisearch/fix-index-creation
Set the indexes info in the create_index function
2019-12-19 10:59:47 +01:00
d12ff15ee3 Set the indexes info in the create_index function 2019-12-19 10:38:56 +01:00
11b684114d Merge pull request #431 from curquiza/web-interface-readme
Update REAME with the Web Interface introduction
2019-12-18 13:50:12 +01:00
1bf177f81a Update REAME with the Web Interface introduction
Co-Authored-By: cvermand <33010418+bidoubiwa@users.noreply.github.com>
2019-12-18 13:41:15 +01:00
df7dc54409 Merge pull request #415 from meilisearch/fix-blocking-settings
Use a main read transaction instead of a write one
2019-12-17 16:21:41 +01:00
7e86056a27 Use a main read transaction instead of a write one 2019-12-17 15:48:06 +01:00
59f74dabe7 Merge pull request #407 from meilisearch/friendly-web-interface
Friendly web interface
2019-12-17 14:47:24 +01:00
4610198ba2 Introduce a Bulma based web interface 2019-12-17 14:36:26 +01:00
3d19f566b6 Merge pull request #406 from bidoubiwa/remove_nsfw_movie
Removed nsfw movie from movies.json dataset
2019-12-13 17:56:09 +01:00
8d90cd8e35 Removed nsfw movie from movies.json dataset 2019-12-13 17:21:46 +01:00
610d44e703 Merge pull request #401 from tpayet/feat/heroku-button
Add heroku one-click deploy
2019-12-13 16:26:31 +01:00
0272b44d7e Add heroku one-click deploy 2019-12-13 16:03:00 +01:00
3eccf2fd76 Merge pull request #405 from meilisearch/disable-bench-workflow
Disable the benchmarks github workflow
2019-12-13 15:56:16 +01:00
736f285092 Disable the benchmarks github workflow 2019-12-13 15:37:24 +01:00
020cd7f9e8 Merge pull request #403 from meilisearch/lazy-data-fetching
Criteria lazy data preparation
2019-12-13 14:57:19 +01:00
40c0b14d1c Reintroduce searchable attributes and reordering 2019-12-13 14:38:25 +01:00
a4dd033ccf Rename raw_matches into bare_matches 2019-12-13 14:38:25 +01:00
48e8778881 Clean up the modules declarations 2019-12-13 14:38:25 +01:00
4be23efe66 Remove the AttrCount type
Could probably be reintroduced later
2019-12-13 14:38:25 +01:00
7d67750865 Reintroduce exacteness for one word document field 2019-12-13 14:38:25 +01:00
746e6e170c Make the test pass again 2019-12-13 14:38:24 +01:00
d93e35cace Introduce ContextMut and Context structs 2019-12-13 14:38:24 +01:00
d75339a271 Prefer summing the attribute 2019-12-13 14:38:24 +01:00
86ee0cbd6e Introduce bucket_sort_with_distinct function 2019-12-13 14:38:24 +01:00
248ccfc0d8 Update the criteria to the new ones 2019-12-13 14:38:24 +01:00
ea148575cf Remove the raw_query functions 2019-12-13 14:38:23 +01:00
efc2be0b7b Bump the sdset dependency to 0.3.6 2019-12-13 14:38:23 +01:00
8d71112dcb Rewrite the phrase query postings lists
This simplified the multiword_rewrite_matches function a little bit.
2019-12-13 14:38:23 +01:00
dd03a6256a Debug pre filtered number of documents 2019-12-13 14:38:23 +01:00
9c03bb3428 First probably working phrase query doc filtering 2019-12-13 14:38:23 +01:00
22b19c0d93 Fix the processed distance algorithm 2019-12-13 14:38:22 +01:00
0f698d6bd9 Work in progress: Bad Typo detection
I have an issue where "speakers" is split into "speaker" and "s",
when I compute the distances for the Typo criterion,
it takes "s" into account and put a distance of zero in the bucket 0
(the "speakers" bucket), therefore it reports any document matching "s"
without typos as best results.

I need to make sure to ignore "s" when its associated part "speaker"
doesn't even exist in the document and is not in the place
it should be ("speaker" followed by "s").

This is hard to think that it will had much computation time to
the Typo criterion like in the previous algorithm where I computed
the real query/words indexes based and removed the invalid ones
before sending the documents to the bucket sort.
2019-12-13 14:38:22 +01:00
4e91b31b1f Make the Typo and Words work with synonyms 2019-12-13 14:38:22 +01:00
f87c67fcad Improve the QueryEnhancer by doing a single lookup 2019-12-13 14:38:22 +01:00
902625601a Work in progress: It seems like we support synonyms, split and concat words 2019-12-13 14:38:22 +01:00
d17d4dc5ec Add more debug infos 2019-12-13 14:38:21 +01:00
ef6a4db182 Before improving fields AttrCount
Removing the fields_count fetching reduced by 2 times the serach time, we should look at lazily pulling them form the criterions in needs

ugly-test: Make the fields_count fetching lazy

Just before running the exactness criterion
2019-12-13 14:38:21 +01:00
11f3d7782d Introduce the AttrCount type 2019-12-13 14:38:21 +01:00
5b9fff6636 Merge pull request #352 from meilisearch/add-search-benchmarks
Add some criterion benchmarks to help detect regressions
2019-12-13 14:37:48 +01:00
a8272f0eef Add a benchmark github workflow 2019-12-13 14:17:40 +01:00
951f0bcb10 sqaush-me: Improve benchmarks naming 2019-12-13 14:17:40 +01:00
d8ba405baf Add some criterion benchmarks to help mesure improvements 2019-12-13 14:17:40 +01:00
70f18a8086 Merge pull request #400 from meilisearch/fix-issues
Close multiples issues on HTTP behavior
2019-12-13 10:30:42 +01:00
0b5db77511 Fix erase setting option 2019-12-13 10:22:35 +01:00
3a4130f344 Allow to index files with null or boolean 2019-12-12 19:25:05 +01:00
1ea29bb92e Fix unwrap if schema does not contain ranked attributes on a custom ranking setting 2019-12-12 16:37:46 +01:00
04d34cb8aa Search; return formated section only if it's necessary 2019-12-12 16:36:42 +01:00
bf80729e17 Update message on access forbidden 2019-12-12 15:39:32 +01:00
88b3c05155 Stop words; Do not reindex all documents if there is no documents 2019-12-12 15:31:39 +01:00
6edef07e29 HTTP delete index route; Fix error on index not found 2019-12-12 14:06:16 +01:00
5ad73fe08b Merge pull request #399 from meilisearch/rewrite-synonym-endpoint
Rewrite the synonym endpoint
2019-12-12 12:58:14 +01:00
a4f26e8e48 Rewrite the synonym endpoint 2019-12-12 12:47:02 +01:00
cc10804607 Merge pull request #395 from meilisearch/update-bitly-link
Update the bit.ly movies.json link
2019-12-10 18:13:52 +01:00
f959cd76ae Update the bit.ly movies.json link 2019-12-10 18:07:14 +01:00
dcd332e2e4 Merge pull request #396 from meilisearch/disable-windows-tests
Disable windows tests
2019-12-10 18:03:13 +01:00
f3a276d1e1 Update the workflow README.md 2019-12-10 17:56:24 +01:00
640d21a7d2 Disable the Windows tests workflow 2019-12-10 17:53:26 +01:00
216cccbfba Merge pull request #391 from meilisearch/fix-one-document-route
Do not expect a JSON value as a document indentifer
2019-12-09 21:53:04 +01:00
04d1da11f7 Do not expect a JSON value as a document indentifer 2019-12-09 21:34:40 +01:00
ee4e9dcc74 Merge pull request #388 from meilisearch/remove-synonyms-unwraps
Remove unsound unwraps from the synonym routes
2019-12-09 17:06:02 +01:00
6fef04be20 Remove unsound unwraps from the synonym routes 2019-12-09 16:54:54 +01:00
86347bff3a Merge pull request #384 from curquiza/install-script-prereleases
Change regexp in install script
2019-12-09 15:28:19 +01:00
e291d9954a Change regexp in install script to not take into acccount pre-releases 2019-12-09 15:14:25 +01:00
7a548467b9 Merge pull request #382 from curquiza/health-routes
Keep only useful routes for /health
2019-12-08 18:11:19 +01:00
06d8e00ff3 Keep only useful routes for /health 2019-12-08 17:56:33 +01:00
225f5a172d Merge pull request #381 from curquiza/update-index-httpstatus
Change HTTP status of update index route
2019-12-08 17:53:01 +01:00
e531ff2e98 Change HTTP status of update index route 2019-12-08 17:10:21 +01:00
8c8040884e Merge pull request #376 from meilisearch/windows-support
Update the actions to support Windows
2019-12-07 12:07:27 +01:00
e3611ad0e4 Update the action to test on more platforms 2019-12-07 11:57:33 +01:00
289bc6570b Update the action to publish windows binaries 2019-12-07 11:52:14 +01:00
dc1849d291 Bump heed to 0.6.1 2019-12-07 11:49:45 +01:00
17a66227f4 Merge pull request #375 from nithinkashyapn/master
Docker command updated
2019-12-06 12:11:56 +01:00
0e8b95f4bf Docker command updated
Docker does not allow Uppercase letters, throws this error 

`docker: invalid reference format: repository name must be lowercase.`
2019-12-06 16:30:37 +05:30
5b8344cfc3 Merge pull request #373 from curquiza/stop-words-deletion
Use POST instead of DELETE method to delete stops-word
2019-12-05 23:06:15 +01:00
075f4034d9 Use POST instead of DELETE method to delete stops-word 2019-12-05 18:07:56 +01:00
c616ce99a8 Merge pull request #368 from tpayet/add-push-debpkg
Add publish action to gemfury for apt pkg
2019-12-05 15:35:12 +01:00
6b9b5fda7e Add publish action to gemfury for apt pkg 2019-12-05 14:54:57 +01:00
b756fc382a Merge pull request #367 from meilisearch/support-stdin-example
Allow users to send csv files from stdin in examples
2019-12-05 12:33:18 +01:00
29fd54dcfa Allow users to send csv files from stdin in examples 2019-12-05 12:23:56 +01:00
d664e97104 Merge pull request #365 from meilisearch/update-readme
Reorder "Deploy the server" options on the README
2019-12-04 18:37:40 +01:00
4466097d44 Update readme.md; Deploy part 2019-12-04 18:16:56 +01:00
60b94d2dc1 Merge pull request #366 from tpayet/cargo-deb
Add debian package in CI
2019-12-04 18:14:10 +01:00
51636402c2 Add debian package in CI 2019-12-04 18:02:30 +01:00
fc8182d7d3 Merge pull request #363 from meilisearch/bump-version
Bump meilisearch crates to v0.8.4
2019-12-03 17:30:31 +01:00
4f87465f18 Bump meilisearch crates to v0.8.4 2019-12-03 17:22:45 +01:00
5f1586ae85 Merge pull request #360 from meilisearch/fix-readme-broken-links
Fix README broken links
2019-12-02 19:10:40 +01:00
8d3161a2cf Reorder README parts 2019-12-02 18:29:53 +01:00
8bc8214279 Fix README broken links
Thanks to @baptistejamin!
2019-12-02 16:45:27 +01:00
3ea5aa18a2 Merge pull request #359 from bidoubiwa/fix_wording_in_readme
Fix bad wording in readme file
2019-12-02 14:06:49 +01:00
c4845b78a9 Fix bad wording in readme file 2019-12-02 11:15:39 +01:00
530e913e2f Merge pull request #356 from tpayet/fix-port-readme
Fix port in README & Dockerfile
2019-11-29 19:21:55 +01:00
5917f212ba Fix port in README & Dockerfile 2019-11-29 18:03:54 +01:00
d2b1690191 Merge pull request #355 from tpayet/master
Update binary default settings
2019-11-29 15:47:04 +01:00
710b7ea091 Update default listening port to 7700 2019-11-29 15:25:26 +01:00
089579d835 Update default database directory to working directory 2019-11-29 15:25:26 +01:00
7780293ddb Merge pull request #354 from meilisearch/camelcase-updates-result
Fix updates formattings and namings
2019-11-29 15:19:45 +01:00
773a51e7d0 Rename 'update_type' to 'type' on EnqueuedUpdateResult 2019-11-29 15:09:48 +01:00
7923752513 Serialize updates results to camelCase 2019-11-29 15:05:54 +01:00
9a48091b21 Merge pull request #353 from meilisearch/bump-version
Bump meilisearch crates to v0.8.3
2019-11-29 14:13:37 +01:00
30cb60f679 Bump meilisearch crates to v0.8.3 2019-11-29 14:06:17 +01:00
08687d8dab Merge pull request #351 from meilisearch/status-failed-updates-status
Add status failed on UpdateStatus
2019-11-28 18:53:31 +01:00
3a90233a3d Add status failed on UpdateStatus 2019-11-28 18:41:11 +01:00
32483cae2d Merge pull request #347 from curquiza/installation-script
Add script for binary installation
2019-11-28 18:34:58 +01:00
d7f28e0260 Add script for binary installation 2019-11-28 18:34:12 +01:00
9640c2aaa6 Merge pull request #349 from meilisearch/bump-version
Bump meilisearch crates to v0.8.2
2019-11-28 17:23:40 +01:00
9a2b4d08e1 Bump meilisearch crates to v0.8.2 2019-11-28 17:15:13 +01:00
e91615fe59 Merge pull request #348 from meilisearch/replace-isahc-by-ureq
Replace isahc by ureq
2019-11-28 17:14:32 +01:00
aed02b2e19 Remove many dependencies from the Dockerfile 2019-11-28 17:04:01 +01:00
83ad80d9db Replace isahc by ureq 2019-11-28 16:41:42 +01:00
abdb7793fb Merge pull request #345 from tpayet/readme_changes
Clarification of readme file
2019-11-28 16:35:44 +01:00
387eb3fde3 Clarification of readme file 2019-11-28 16:28:25 +01:00
e640bc90b4 Merge pull request #343 from meilisearch/explicit-index-clear
Change the update loop to be more explicit on index clear
2019-11-28 14:48:37 +01:00
3978378152 Merge pull request #344 from tpayet/patch-1
Update README license badge
2019-11-28 14:35:50 +01:00
61e3e4f0b9 Update README license badge 2019-11-28 14:28:30 +01:00
1def56ea11 Change the update loop to be more explicit on index clear 2019-11-27 13:43:28 +01:00
6d686ac14f Merge pull request #342 from meilisearch/update-lock
Update the lock file
2019-11-27 12:49:47 +01:00
641e0d15f5 Make sure the lock file is up to date 2019-11-27 12:06:14 +01:00
71b39426c0 Update the lock file 2019-11-27 12:01:22 +01:00
57584eaccc Merge pull request #341 from meilisearch/bump-version
Bump meilisearch crates to v0.8.1
2019-11-27 11:54:39 +01:00
f6fb31c531 Bump meilisearch crates to v0.8.1 2019-11-27 11:47:27 +01:00
0cea8ce5b5 Merge pull request #340 from meilisearch/separate-updates-kvstore
Separate the update and main databases
2019-11-27 11:39:14 +01:00
d08b76a323 Separate the update and main databases
We used the heed typed transaction to make it safe (https://github.com/Kerollmops/heed/pull/27).
2019-11-27 11:29:06 +01:00
86a87d6032 Merge pull request #339 from tpayet/action-docker-tag
Update action workflow for docker tagged image
2019-11-26 19:17:19 +01:00
e534929f80 Update action workflow for docker tagged image 2019-11-26 18:18:51 +01:00
fcc154da1c Merge pull request #336 from meilisearch/rename-to-meilisearch
Rename MeiliDB into MeiliSearch
2019-11-26 14:06:01 +01:00
00d1200704 Rename the meilisearch-http binary into meilisearch 2019-11-26 11:17:30 +01:00
7cc096e0a2 Rename MeiliDB into MeiliSearch 2019-11-26 11:12:30 +01:00
58eaf78dc4 Merge pull request #335 from tpayet/github-release-action
GitHub release action
2019-11-25 19:19:08 +01:00
3be2281483 Update workflows README 2019-11-25 18:14:21 +01:00
cc06d96993 Add gh actions to release binaries 2019-11-25 17:27:15 +01:00
93c7e700bc Merge pull request #333 from tpayet/update-dockerfile
Add meilihttp_addr env variable in docker build
2019-11-25 16:41:52 +01:00
97c6757fc7 Add meilihttp_addr env variable in docker build 2019-11-25 16:30:07 +01:00
276d3f8e22 Merge pull request #332 from meilisearch/jemalloc-only-on-linux
Make jemalloc only used on linux
2019-11-25 16:13:54 +01:00
4869a88ae2 Make jemalloc only used on linux 2019-11-25 15:35:13 +01:00
ae88bc31bc Merge pull request #331 from meilisearch/enable-jemalloc-linux-only
Enable jemalloc only on linux OSs
2019-11-25 14:59:56 +01:00
8aed1d96c5 Enable jemalloc only on linux OSs 2019-11-25 14:51:47 +01:00
c93949474c Merge pull request #330 from tpayet/fix-actions-badge-link
Update action badge link
2019-11-25 13:51:07 +01:00
8cf19f1c6b Update action badge link 2019-11-25 13:44:20 +01:00
a82ecb3cef Merge pull request #324 from tpayet/gh-actions
Replace Azure CI by Github Actions
2019-11-25 13:31:15 +01:00
04c2b37d82 Remove Azure CI
Add gh actions for cargo check using rust nightly

Add readme about actions workflows

Add basic Dockerfile

Add action workflow for docker publish

Change check action to test action

Update workflow readme without rust nightly

Rename test action file

Add gh actions to push latest docker image from master

Update github action for publish docker image

Add 2 steps dockerfile based on alpine

Update readme badges to match new CI
2019-11-25 13:20:54 +01:00
ab3e8d6537 Merge pull request #314 from meilisearch/fix-number-ord
Fix the ordering functions of the Number type
2019-11-22 15:14:05 +01:00
fd185a5e6b Add a test for the SorByAttr criterion 2019-11-22 15:04:23 +01:00
d9678f0040 Fix the ordering functions of the Number type 2019-11-22 14:44:02 +01:00
840217b111 Merge pull request #321 from meilisearch/fix-create-index
Fix index creation
2019-11-22 14:10:05 +01:00
9605a2cd88 Make possible to use a custom uid and simplify the usage 2019-11-22 14:01:00 +01:00
0f86ccc035 Index UID generation makes sure to not generate the same number 2019-11-22 14:01:00 +01:00
b3b73e2276 Merge pull request #323 from meilisearch/fix-index-deletion
Fix index deletion once again
2019-11-22 14:00:19 +01:00
f241c999ad Make the CI use rust stable 2019-11-22 13:47:29 +01:00
d4d2a2303a Fix a typo on timeout_ms used for multi index search 2019-11-22 13:47:29 +01:00
c8832409ad Fix the dead lock on index deletion once again 2019-11-22 13:47:29 +01:00
98f76aa952 Merge pull request #320 from meilisearch/send-amplitude-events
Add an Amplitude analysis loop tick
2019-11-22 10:52:29 +01:00
4236632af6 Add an amplitude analysis loop tick 2019-11-21 20:28:58 +01:00
e2c98244ec Merge pull request #313 from meilisearch/fix-dead-lock
Fix dead locks when deleting indexes
2019-11-21 12:42:40 +01:00
c1cf67c008 Join updates threads after dropping the indexes lock and avoid deadlocks 2019-11-21 12:01:46 +01:00
4abea919b2 Merge pull request #311 from meilisearch/add-index-name-and-id
Add index name and change some routes request body & response
2019-11-21 11:59:14 +01:00
d60aa722c0 Allow to update expireAt and revoked on token 2019-11-21 11:49:49 +01:00
055368acd8 Fix for review 2019-11-21 11:49:49 +01:00
7f2e5d091a Rename routes /synonym to /synonyms 2019-11-20 15:33:42 +01:00
c69ae8154f Allow to receive schema update formated as SchemaBuilder 2019-11-20 15:25:34 +01:00
cd95b243bb Add the update index route 2019-11-20 15:00:06 +01:00
1f1cb1f501 Rename browse_documents into get_all_documents and always respond HTTP Ok 2019-11-20 14:18:21 +01:00
530738cfe9 Format code 2019-11-20 14:12:12 +01:00
878dd6912e Return a HTTP 401 instead of 404 if token is not found 2019-11-20 14:06:56 +01:00
5f0f699f37 Move route to clear all synonyms on DELETE /synonyms 2019-11-20 14:03:55 +01:00
ca13900699 Add async routes should return ACCEPTED status code response 2019-11-20 14:03:19 +01:00
cc97889b37 Add stop-word is now PATCH method 2019-11-20 13:56:43 +01:00
45ded0498b Format code with cargo fmt 2019-11-20 11:45:23 +01:00
d01a3944c1 Add last_update information on global /stats route 2019-11-20 11:45:22 +01:00
a0caf0d6d7 Remove unused result response on indexes_uids function 2019-11-20 11:45:22 +01:00
e22debb994 Update index updated_at information at each update callback 2019-11-20 11:45:22 +01:00
1b8df0ed8b Remove last_update from stats 2019-11-20 11:45:22 +01:00
3286a5213c Move fields frequency from common store to index main store 2019-11-20 11:45:22 +01:00
394976d330 Update list_index route to return all index information, not only list of uid 2019-11-20 11:45:22 +01:00
b95acbece0 Function generate_uid return now lowercased uid 2019-11-20 11:45:22 +01:00
c94f4dff71 Do not return update_id on IndexCreateRespnse if it's none 2019-11-20 11:45:22 +01:00
e6465f4ea1 Create a new specific route for schema 2019-11-20 11:45:22 +01:00
2b3c91aabd Update get_index_schema to allow raw response 2019-11-20 11:45:22 +01:00
e97e13ce9f Rename index_name to index_uids 2019-11-20 11:45:22 +01:00
39e2b73718 Add updatedAt on main index store 2019-11-20 11:45:22 +01:00
a90facaa41 Rename index_name by index_uid 2019-11-20 11:45:22 +01:00
5527457655 Rewrite create_index route new path, body request and response 2019-11-20 11:45:21 +01:00
076e781810 Add name, created_at and updated_at informations into main index 2019-11-20 11:45:21 +01:00
750d336018 Bump Cargo.lock meili versions 2019-11-20 11:45:21 +01:00
e8251ad45b Merge pull request #310 from meilisearch/unify-crates-version
Unify the crates versions to 0.8.0
2019-11-20 11:05:54 +01:00
963ca1e2c7 Unify the crates versions to 0.8.0 2019-11-20 10:47:32 +01:00
12a6c7d54d Merge pull request #298 from bidoubiwa/add_ranked_movies_dataset
Create a dataset where the release_date is a numeric timestamp
2019-11-20 10:46:24 +01:00
2d0fc3f9d3 Create a dataset where the release_date is a numeric timestamp 2019-11-20 10:44:32 +01:00
e554784527 Merge pull request #309 from bidoubiwa/remove_stop_words_from_settings
Removed stop words from settings route
2019-11-19 18:35:27 +01:00
2cb43fa638 Removed stop words from settings route 2019-11-19 18:21:44 +01:00
66d5309a51 Merge pull request #308 from meilisearch/improve-structopt
Introduce better argument names
2019-11-19 18:09:44 +01:00
7eeedec7eb Bump meilidb-http to v0.3.0 2019-11-19 17:50:01 +01:00
4b798c71ae Introduce new arguments and understand env vars 2019-11-19 17:50:01 +01:00
685016bfec Bump meilidb-core to v0.7.0 and meilidb-http to v0.2.0 2019-11-18 15:49:23 +01:00
d30e5f6231 Merge pull request #299 from meilisearch/default-update-callbacks
Prefer using a global update callback common to all indexes
2019-11-18 15:05:21 +01:00
e854d67a55 Remove useless routes and checks 2019-11-18 14:41:49 +01:00
23a89732a5 Prefer using a global update callback common to all indexes 2019-11-18 14:41:49 +01:00
3a1f41ebdb Merge pull request #305 from meilisearch/fix-example
Make easier to interact with compacted databases
2019-11-17 20:31:06 +01:00
f873761a27 Make easier to interact with compacted databases 2019-11-17 20:01:02 +01:00
ebf620c7f9 Merge pull request #302 from meilisearch/fix-dataset-schema
Rename the movies dataset schema file
2019-11-17 17:17:33 +01:00
8b92bc3421 Rename the movies dataset schema file 2019-11-17 16:45:13 +01:00
70a5aa61e9 Merge pull request #301 from meilisearch/separate-types
Move the main types to a separate library
2019-11-17 12:45:25 +01:00
a76169042f Make the serde and zerocopy meilidb-types dependencies optional 2019-11-17 12:30:39 +01:00
c9c3cfcee9 Move the main types to a separate library 2019-11-17 12:19:36 +01:00
2e60ac5359 Merge pull request #300 from meilisearch/update-dependencies
Do not use a forked fst dependency
2019-11-17 12:19:08 +01:00
2dd7751e09 Disable the fst MemMap feature 2019-11-17 11:43:00 +01:00
26bdabcdec Do not use a forked fst dependency 2019-11-17 11:14:01 +01:00
fc8c7ed77e Merge pull request #297 from meilisearch/improve-highlights
Improve the highlight formatted outputs
2019-11-15 14:28:27 +01:00
521c96354f Improve the highlight formatted outputs 2019-11-15 14:16:21 +01:00
9788779894 Merge pull request #296 from meilisearch/update-readme
Update the README
2019-11-14 21:32:32 +01:00
9b965764ab Update the README 2019-11-14 19:09:04 +01:00
9a5a543311 Merge pull request #290 from curquiza/deploy-doc
Add information in documentation in Deploy Server part
2019-11-13 16:06:27 +01:00
b18fb868e8 Add information in documentation in Deploy Server part 2019-11-13 15:37:21 +01:00
c734af55c0 Merge pull request #289 from curquiza/status204-delete-index
Change the HTTP status code on index deletion
2019-11-13 15:33:27 +01:00
810b328ad2 Change the HTTP status code on index deletion 2019-11-13 15:14:23 +01:00
0a8039d8d8 Merge pull request #285 from bidoubiwa/remove_catching_same_index_creation
Change the error catching on the index creation route
2019-11-13 15:13:51 +01:00
e51704c09a Remove the error catching on the index creation route when the index already exist 2019-11-13 14:42:59 +01:00
623a9012d5 Merge pull request #279 from bidoubiwa/new_slogan_and_resume
Slogan and Resume proposition
2019-11-13 14:41:21 +01:00
b9a185634f Slogan and Resume proposition 2019-11-13 14:31:22 +01:00
b46889b5f0 Merge pull request #282 from meilisearch/fix-ci-artifacts
Add the meilidb-http binary to the artifacts
2019-11-13 11:39:00 +01:00
ef9a0c07db Add the meilidb-http binary to the artifacts 2019-11-13 11:15:39 +01:00
3a6f3947c9 Merge pull request #281 from meilisearch/fix-attributes-to-search-in
Take attributes to search in into account
2019-11-12 18:45:40 +01:00
5c5f41d755 Take attributes to search in into account 2019-11-12 18:35:58 +01:00
6803a8fad0 Merge pull request #280 from meilisearch/format-updates-json
Format updates json
2019-11-12 18:35:25 +01:00
8e4b362e4d Fixed the display of enqueued updates 2019-11-12 18:21:59 +01:00
acb5e624c6 Add enqueued and processed datetimes 2019-11-12 18:21:59 +01:00
a98949ff1d Improve updates JSON format 2019-11-12 16:57:22 +01:00
f355280250 Merge pull request #278 from meilisearch/mit-license
Change the license to an MIT one
2019-11-12 14:35:32 +01:00
cee8d6a8d9 Change the license to an MIT one 2019-11-12 14:24:28 +01:00
27326ea069 Merge pull request #277 from bidoubiwa/add_cmd_to_compile
Add cmd line to compile binary
2019-11-12 13:55:54 +01:00
7bbe5aca5b Add cmd line to compile binary 2019-11-12 10:57:03 +01:00
1c4afe6d0f Merge pull request #276 from meilisearch/support-slash-tokenizer
Add support for back/slashes
2019-11-11 21:46:14 +01:00
2d8f9a9849 Add support for back/slashes 2019-11-11 21:23:08 +01:00
3f41681b18 Merge pull request #274 from meilisearch/enable-env-logger
Add env logger to enable logging
2019-11-11 19:13:33 +01:00
64791815fa Add env logger to enable logging 2019-11-11 19:03:38 +01:00
8a36571a74 Merge pull request #272 from meilisearch/fix-long-words
Ignore words that are too long
2019-11-10 20:07:22 +01:00
d18e775bec Ignore words that are too long 2019-11-10 17:44:27 +01:00
78381f1818 Merge pull request #271 from meilisearch/update-dependencies
Update Dependencies
2019-11-10 11:17:09 +01:00
7f33a01ae1 Update dependencies 2019-11-10 11:04:56 +01:00
d07d14d33a Update crossbeam-channel to 0.4.0 2019-11-10 11:03:22 +01:00
540d7886ab Merge pull request #266 from meilisearch/update-readme
Update the readme and add a Quick Start section
2019-11-09 13:21:22 +01:00
5a5d10af52 Add an image description of the gif 2019-11-09 13:12:01 +01:00
f95d077ef8 Improve the README a little bit by adding a quick start section 2019-11-09 13:12:01 +01:00
05dd99936f Add a gif to show a demo using crates.io 2019-11-09 12:59:39 +01:00
c086625773 Merge pull request #269 from meilisearch/repo-became-binary
Make the repository be a binary and version the Cargo.lock
2019-11-09 12:58:52 +01:00
dc17bebf4a Make the repository be a binary and version the Cargo.lock 2019-11-09 12:13:28 +01:00
026464b2e4 Bump meilidb-core to v0.6.5 2019-11-06 11:52:34 +01:00
bd42158a70 Merge pull request #264 from meilisearch/index-soft-deletion
Index soft deletion
2019-11-06 11:51:50 +01:00
df066f4321 Introduce a new add or update documents PUT route 2019-11-06 11:42:41 +01:00
69832e8c70 Update the http index deletion route 2019-11-06 11:42:41 +01:00
95eb6ad09a Add a test to check index soft deletion works correctly 2019-11-06 11:02:30 +01:00
f3fc0bed45 Introduce index soft deletion 2019-11-06 11:02:30 +01:00
5dd6b697b9 Bump meilidb-core to v0.6.4 2019-11-05 18:46:16 +01:00
b7d170c7d1 Merge pull request #262 from meilisearch/fix-unidecoded-emojis
Fix an highlighting problem
2019-11-05 17:04:35 +01:00
7541172d12 Make the example show highlighted areas more explicitly 2019-11-05 16:40:48 +01:00
85bf5d113c Fix an highlighting problem when query was longer than original text 2019-11-05 16:40:34 +01:00
89fd397903 Bump meilidb-core to v0.6.3 2019-11-05 15:40:04 +01:00
d8392f2f18 Merge pull request #261 from meilisearch/partial-updates
Introduce the support of partial updates
2019-11-05 15:39:02 +01:00
36b74f0efe Introduce partial updates to the update system 2019-11-05 15:23:41 +01:00
68c0a36b00 Make the deserialization support correctly optional documents 2019-11-05 15:03:18 +01:00
a127b72a74 Merge pull request #259 from meilisearch/allow-add-schema-attributes-at-end
Allow to introduce attributes only at the end of a schema
2019-11-05 12:34:11 +01:00
5782fb9e52 Test the add of attributes only at the end of a schema 2019-11-05 12:09:52 +01:00
20319f7974 Allow to introduce attributes only at the end of a schema 2019-11-05 12:09:52 +01:00
c4087e2ec2 Merge pull request #258 from meilisearch/debug-schema
Implement a better debug for the schema
2019-11-05 11:35:02 +01:00
b1d1f2f627 Implement a better debug system for the schema 2019-11-05 11:21:07 +01:00
62fe6a8263 Merge pull request #257 from meilisearch/bump-version
Bump meilidb-core/tokenizer versions
2019-11-04 17:26:01 +01:00
d88c10f3b4 Bump meilidb-tokenizer to v0.6.1 2019-11-04 17:17:06 +01:00
00f49990c7 Bump meilidb-core to v0.6.2 2019-11-04 17:16:50 +01:00
89f30ad47e Merge pull request #256 from meilisearch/fix-tokenizer
Fix the tokenizer to make it work with unicode chars
2019-11-04 17:15:17 +01:00
3b1cbed238 Check that the unidecoded words are not empty 2019-11-04 17:03:11 +01:00
4571b80a49 Update the tests 2019-11-04 16:41:58 +01:00
de2b8672d4 Make the tokenizer understand strange whitespaces/quotes 2019-11-04 16:41:58 +01:00
ccded7b429 Improve the indexer to not not deunicode before indexing
Revert of #179
2019-11-04 16:41:58 +01:00
1d4e98410a Merge pull request #255 from meilisearch/bump-version
Bump meilidb-core to v0.6.1
2019-11-04 14:47:53 +01:00
e493b27ef1 Bump meilidb-core to v0.6.1 2019-11-04 14:22:08 +01:00
70589c136f Merge pull request #253 from meilisearch/fix-updates-system
Fix the updates system
2019-11-04 13:46:37 +01:00
1c3620a7d4 Add tests to the update system 2019-11-04 13:18:07 +01:00
c2cc0704d7 Clean up the update_awaiter function 2019-11-04 11:11:58 +01:00
2a50e08bb8 Moving to heed v0.5.0 2019-11-04 10:49:27 +01:00
6b326a45d7 Fix the update system to always consume updates even if failing 2019-10-31 17:44:13 +01:00
b73874bf24 Merge pull request #252 from meilisearch/examples-specify-index-name
Allow users to specify the index name to use with examples bins
2019-10-31 17:02:00 +01:00
95c8ad0f80 Allow users to specify the index name to use with examples bins 2019-10-31 16:20:31 +01:00
996763cc52 Merge pull request #251 from meilisearch/update-heed
Moving to heed 0.3.0
2019-10-31 16:20:07 +01:00
6a8171d335 Moving to heed 0.3.0 2019-10-31 16:11:02 +01:00
2f32586dab Merge pull request #250 from meilisearch/new-http-server
Introduce a brand new HTTP server
2019-10-31 16:07:52 +01:00
db898001eb Get rid of rust-crypto and uuid 2019-10-31 15:28:37 +01:00
c2a12b661a Make it a runnable server 2019-10-31 15:27:21 +01:00
f51c49db93 Introduce the HTTP tide based library 2019-10-31 15:02:34 +01:00
1be5b0f327 Bump the meili-core/schema/tokenizer crates to 0.6.0 2019-10-31 14:05:59 +01:00
a136c62208 Merge pull request #249 from meilisearch/display-all-updates
Display enqueued along with processed updates
2019-10-31 13:53:46 +01:00
cc461b1331 Display enqueued along with processed updates 2019-10-31 12:25:52 +01:00
dbe5363672 Merge pull request #248 from meilisearch/fix-highlight-too-long
Correctly highlight when query string is too long
2019-10-30 18:19:06 +01:00
45d4361e7d Correctly highlight when query string is longer 2019-10-30 17:49:50 +01:00
b28c44cc6b Merge pull request #247 from meilisearch/bump-meilidb
Bump the meili-core/schema/tokenizer crates to 0.5.11
2019-10-30 17:48:26 +01:00
b709a7a30a Bump the meili-core/schema/tokenizer crates to 0.5.11 2019-10-30 17:40:31 +01:00
64c25bdb40 Merge pull request #246 from meilisearch/better-highlighting-area
Make the highlight system much better
2019-10-30 17:39:12 +01:00
c230f244be Make the highlight system much better 2019-10-30 17:32:29 +01:00
02af4ff113 Merge pull request #245 from meilisearch/reindex-all-documents-reduce-memory-usage
Reduce the ram consumption when re-indexing all the documents
2019-10-29 17:54:47 +01:00
4dff8a215e Reduce the ram consumption when re-indexing all the documents 2019-10-29 17:46:23 +01:00
41065305aa Merge pull request #244 from meilisearch/reintroduce-stop-words
Reintroduce stop words
2019-10-29 16:35:03 +01:00
e9dce3ce81 Add a test to ensure that the indexer support stop words 2019-10-29 16:18:06 +01:00
ff7dde7522 Make the RawIndexer support stop words 2019-10-29 16:18:06 +01:00
a226fd23c3 Introduce the stop words deletion update type 2019-10-29 16:18:06 +01:00
776673ebae Introduce the stop words addition update type 2019-10-29 15:24:09 +01:00
32d2cc3aea Merge pull request #243 from meilisearch/all-updates-results
Introduce a function to get all updates results
2019-10-29 11:45:55 +01:00
8a17fcdda5 Introduce a function to get all updates results 2019-10-29 11:37:40 +01:00
9602d7a960 Merge pull request #242 from meilisearch/accept-dup-documents
Make documents additions accept only the last duplicate document
2019-10-28 20:52:40 +01:00
ac12a4b9c9 Make documents additions accept only the last duplicate document 2019-10-28 20:40:33 +01:00
af96050944 Merge pull request #241 from meilisearch/fix-dead-locks
Fix dead locks
2019-10-28 18:20:01 +01:00
a43b37dfc1 Send channel notification when clearing documents 2019-10-28 17:58:22 +01:00
c08dcac1d4 Abort the update transaction before calling the update callback 2019-10-28 17:55:43 +01:00
a17dccd84e Merge pull request #237 from meilisearch/fix-exactness-criterion
Fix the exactness criterion algorithm
2019-10-26 18:43:10 +02:00
9a57cab3ee Fix the exactness criterion algorithm 2019-10-26 18:34:40 +02:00
751b060320 Merge pull request #238 from meilisearch/improve-highlighting
Only highlight query words areas not the whole words
2019-10-26 18:23:19 +02:00
4111b99a6d Only highlight query words areas not the whole words 2019-10-26 15:56:34 +02:00
d6fb2b56d1 Merge pull request #236 from meilisearch/reorder-automatons
Make sure that automatons group with more automatons are better
2019-10-24 15:29:16 +02:00
cb5c77e536 Make sure that automatons group with more automatons are better 2019-10-24 15:18:53 +02:00
44c89b1ea2 Merge pull request #235 from meilisearch/readme-concat-split-query-words
Add information about search concat and split query words support
2019-10-23 18:20:59 +02:00
26a285053b Add information about search concat and split query words support 2019-10-23 18:19:15 +02:00
1446a6a2d2 Merge pull request #234 from meilisearch/clear-all-update-variant
Introduce a clear all documents update
2019-10-23 16:45:37 +02:00
047eba3ff3 Introduce a clear all documents update 2019-10-23 16:39:10 +02:00
8d9d183ce6 Merge pull request #233 from meilisearch/commit-when-update-ok
Commit an update only when it is Ok
2019-10-23 16:07:48 +02:00
eb67195840 Commit an update only when it is Ok 2019-10-23 15:52:40 +02:00
93306c2326 Merge pull request #232 from meilisearch/support-splitted-words
Support splitted words
2019-10-23 13:38:16 +02:00
7d9cf8d713 Clean up the fetch algorithm 2019-10-23 12:06:21 +02:00
03eb7898e7 Introduce a basic working version of phrase query for splitting words 2019-10-23 11:40:13 +02:00
0fbd4cd632 Merge pull request #231 from meilisearch/recursive-object-indexing
Make possible to convert recursive object into strings
2019-10-22 16:20:10 +02:00
858bf359b8 Make possible to convert recursive object into strings 2019-10-22 16:02:02 +02:00
5dc8465ebd Merge pull request #181 from meilisearch/diff-schema
Make possible to update an index schema
2019-10-22 14:23:43 +02:00
0f30a221fa Introduce the reindex_all_documents indexing function 2019-10-22 14:07:27 +02:00
e86a547e93 Introduce a basic schema diff function 2019-10-21 17:57:32 +02:00
150 changed files with 138550 additions and 6388 deletions

5
.dockerignore Normal file
View File

@ -0,0 +1,5 @@
target
Dockerfile
.dockerignore
.git
.gitignore

18
.github/workflows/README.md vendored Normal file
View File

@ -0,0 +1,18 @@
# GitHub actions workflow for MeiliDB
> **Note:**
> - We do not use [cache](https://github.com/actions/cache) yet but we could use it to speed up CI
## Workflow
- On each pull request, we are triggering `cargo test`.
- On each tag, we are building:
- the tagged docker image
- the binaries for MacOS, Ubuntu, and Windows
- the debian package
- On each stable release, we are build the latest docker image.
## Problems
- We do not test on Windows because we are unable to make it work, there is a disk space problem.

39
.github/workflows/publish-binaries.yml vendored Normal file
View File

@ -0,0 +1,39 @@
name: Publish binaries to GitHub release
on:
push:
tags:
- '*'
jobs:
publish:
name: Publish for ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
include:
- os: ubuntu-latest
artifact_name: meilisearch
asset_name: meilisearch-linux-amd64
- os: macos-latest
artifact_name: meilisearch
asset_name: meilisearch-macos-amd64
- os: windows-latest
artifact_name: meilisearch
asset_name: meilisearch-windows-amd64
steps:
- uses: hecrj/setup-rust-action@master
with:
rust-version: stable
- uses: actions/checkout@v1
- name: Build
run: cargo build --release --locked
- name: Upload binaries to release
uses: svenstaro/upload-release-action@v1-release
with:
repo_token: ${{ secrets.GITHUB_TOKEN }}
file: target/release/${{ matrix.artifact_name }}
asset_name: ${{ matrix.asset_name }}
tag: ${{ github.ref }}

30
.github/workflows/publish-deb-pkg.yml vendored Normal file
View File

@ -0,0 +1,30 @@
name: Publish deb pkg to GitHub release & apt repository
on:
push:
tags:
- '*'
jobs:
publish:
name: Publish debian packagge
runs-on: ubuntu-latest
steps:
- uses: hecrj/setup-rust-action@master
with:
rust-version: stable
- name: Install cargo-deb
run: cargo install cargo-deb
- uses: actions/checkout@v1
- name: Build deb package
run: cargo deb -p meilisearch-http -o target/debian/meilisearch.deb
- name: Upload debian pkg to release
uses: svenstaro/upload-release-action@v1-release
with:
repo_token: ${{ secrets.GITHUB_TOKEN }}
file: target/debian/meilisearch.deb
asset_name: meilisearch.deb
tag: ${{ github.ref }}
- name: Upload debian pkg to apt repository
run: curl -F package=@target/debian/meilisearch.deb https://${{ secrets.GEMFURY_PUSH_TOKEN }}@push.fury.io/meilisearch/

View File

@ -0,0 +1,19 @@
---
on:
push:
tags:
- 'v[0-9]+.[0-9]+.[0-9]+'
name: Publish latest image to Docker Hub
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- name: Publish to Registry
uses: elgohr/Publish-Docker-Github-Action@master
with:
name: getmeili/meilisearch
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

View File

@ -0,0 +1,20 @@
---
on:
push:
tags:
- '*'
name: Publish tagged image to Docker Hub
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- name: Publish to Registry
uses: elgohr/Publish-Docker-Github-Action@master
with:
name: getmeili/meilisearch
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
tag_names: true

25
.github/workflows/test.yml vendored Normal file
View File

@ -0,0 +1,25 @@
---
on: [pull_request]
name: Test binaries with cargo test
jobs:
check:
name: Test on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
steps:
- uses: actions/checkout@v1
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- name: Run cargo test
uses: actions-rs/cargo@v1
with:
command: test
args: --locked --release

3
.gitignore vendored
View File

@ -1,7 +1,8 @@
/target
Cargo.lock
meilisearch-core/target
**/*.csv
**/*.json_lines
**/*.rs.bk
/*.mdb
/query-history.txt
/data.ms

2696
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,8 +1,10 @@
[workspace]
members = [
"meilidb-core",
"meilidb-schema",
"meilidb-tokenizer",
"meilisearch-core",
"meilisearch-http",
"meilisearch-schema",
"meilisearch-tokenizer",
"meilisearch-types",
]
[profile.release]

27
Dockerfile Normal file
View File

@ -0,0 +1,27 @@
# Compile
FROM alpine:3.10 AS compiler
RUN apk update --quiet
RUN apk add curl
RUN apk add build-base
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
WORKDIR /meilisearch
COPY . .
ENV RUSTFLAGS="-C target-feature=-crt-static"
RUN $HOME/.cargo/bin/cargo build --release
# Run
FROM alpine:3.10
RUN apk update --quiet
RUN apk add libgcc
COPY --from=compiler /meilisearch/target/release/meilisearch .
ENV MEILI_HTTP_ADDR 0.0.0.0:7700
CMD ./meilisearch

26
LICENSE
View File

@ -1,13 +1,21 @@
“Commons Clause” License Condition v1.0
MIT License
The Software is provided to you by the Licensor under the License, as defined below, subject to the following condition.
Copyright (c) 2019-2020 Meili SAS
Without limiting other conditions in the License, the grant of rights under the License will not include, and the License does not grant to you, the right to Sell the Software.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
For purposes of the foregoing, “Sell” means practicing any or all of the rights granted to you under the License to provide to third parties, for a fee or other consideration (including without limitation fees for hosting or consulting/ support services related to the Software), a product or service whose value derives, entirely or substantially, from the functionality of the Software. Any license notice or attribution required by the License must also include this Commons Clause License Condition notice.
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
Software: MeiliDB
License: MIT
Licensor: MEILI SAS
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

1
Procfile Normal file
View File

@ -0,0 +1 @@
web: ./target/release/meilisearch --http-addr=0.0.0.0:$PORT

195
README.md
View File

@ -1,43 +1,169 @@
# MeiliDB
# MeiliSearch
[![Build Status](https://dev.azure.com/thomas0884/thomas/_apis/build/status/meilisearch.MeiliDB?branchName=master)](https://dev.azure.com/thomas0884/thomas/_build/latest?definitionId=1&branchName=master)
[![dependency status](https://deps.rs/repo/github/meilisearch/MeiliDB/status.svg)](https://deps.rs/repo/github/meilisearch/MeiliDB)
[![License](https://img.shields.io/badge/license-commons%20clause-lightgrey)](https://commonsclause.com/)
[![Build Status](https://github.com/meilisearch/MeiliSearch/workflows/Cargo%20test/badge.svg)](https://github.com/meilisearch/MeiliSearch/actions)
[![dependency status](https://deps.rs/repo/github/meilisearch/MeiliSearch/status.svg)](https://deps.rs/repo/github/meilisearch/MeiliSearch)
[![License](https://img.shields.io/badge/license-MIT-informational)](https://github.com/meilisearch/MeiliSearch/blob/master/LICENSE)
A _full-text search database_ based on the fast [LMDB key-value store](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database).
⚡ Ultra relevant and instant full-text search API 🔍
MeiliSearch is a powerful, fast, open-source, easy to use, and deploy search engine. The search and indexation are fully customizable and handles features like typo-tolerance, filters, and synonyms.
For more [details about those features, go to our documentation](https://docs.meilisearch.com/).
[![crates.io demo gif](misc/crates-io-demo.gif)](https://crates.meilisearch.com)
> Meili helps the Rust community find crates on [crates.meilisearch.com](https://crates.meilisearch.com)
## Features
* Search as-you-type experience (answers < 50ms)
* Full-text search
* Typo tolerant (understands typos and spelling mistakes)
* Supports Kanji
* Supports Synonym
* Easy to install, deploy, and maintain
* Whole documents returned
* Highly customizable
* RESTfull API
- Provides [6 default ranking criteria](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/criterion/mod.rs#L107-L113) used to [bucket sort](https://en.wikipedia.org/wiki/Bucket_sort) documents
- Accepts [custom criteria](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/criterion/mod.rs#L24-L33) and can apply them in any custom order
- Support [ranged queries](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L283), useful for paginating results
- Can [distinct](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L265-L270) and [filter](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L246-L259) returned documents based on context defined rules
- Can store complete documents or only [user schema specified fields](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-schema/src/lib.rs#L265-L279)
- The [default tokenizer](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-tokenizer/src/lib.rs) can index latin and kanji based languages
- Returns [the matching text areas](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/lib.rs#L66-L88), useful to highlight matched words in results
- Accepts query time search config like the [searchable attributes](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/query_builder.rs#L272-L275)
- Supports [runtime incremental indexing](https://github.com/meilisearch/MeiliDB/blob/dc5c42821e1340e96cb90a3da472264624a26326/meilidb-core/src/store/mod.rs#L143-L173)
## Quick Start
### Deploy the Server
#### Run it using Docker
It uses [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads. The whole ranking system is [data oriented](https://github.com/meilisearch/MeiliDB/issues/82) and provides great performances.
```bash
docker run -it -p 7700:7700 --rm getmeili/meilisearch
```
You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries or you can take a look at the [typos and ranking rules](typos-ranking-rules.md) if you want to know the default rules used to sort the documents.
#### Installation using Homebrew
We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/meilisearch/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!
```bash
brew update && brew install meilisearch
meilisearch
```
The project is only a library yet. It means that there is no binary provided yet. To get started, you can check the examples wich are made to work with the data located in the `datasets/` folder.
#### Installation using APT
MeiliDB will be a binary in a near future so you will be able to use it as a database out-of-the-box. We should be able to query it using HTTP. This is our current goal, [see the milestones](https://github.com/meilisearch/MeiliDB/milestones). In the end, the binary will be a bunch of network protocols and wrappers around the library - which will also be published on [crates.io](https://crates.io). Both the binary and the library will follow the same update cycle.
```bash
echo "deb [trusted=yes] https://apt.fury.io/meilisearch/ /" > /etc/apt/sources.list.d/fury.list
apt update && apt install meilisearch-http
meilisearch
```
#### Download the binary
```bash
curl -L https://install.meilisearch.com | sh
./meilisearch
```
#### Run it on heroku
[![Deploy](https://www.herokucdn.com/deploy/button.svg)](https://heroku.com/deploy?template=https://github.com/meilisearch/MeiliSearch)
#### Compile and run it from sources
If you have the Rust toolchain already installed, you can compile from the source
```bash
git clone https://github.com/meilisearch/MeiliSearch.git
cd MeiliSearch
cargo run --release
```
### Create an Index and Upload Some Documents
We provide a movie dataset that you can use for testing purposes.
```bash
curl -L 'https://bit.ly/2PAcw9l' -o movies.json
```
MeiliSearch can serve multiple indexes, with different kinds of documents,
therefore, it is required to create the index before sending documents to it.
```bash
curl -i -X POST 'http://127.0.0.1:7700/indexes' --data '{ "name": "Movies", "uid": "movies" }'
```
Now that the server knows about our brand new index, we can send it data.
We provided you a small dataset that is available in the `datasets/` directory.
```bash
curl -i -X POST 'http://127.0.0.1:7700/indexes/movies/documents' \
--header 'content-type: application/json' \
--data-binary @movies.json
```
### Search for Documents
#### In command line
The search engine is now aware of our documents and can serve those via our HTTP server again.
The [`jq` command-line tool](https://stedolan.github.io/jq/) can significantly help you read the server responses.
```bash
curl 'http://127.0.0.1:7700/indexes/movies/search?q=botman+robin&limit=2' | jq
```
```json
{
"hits": [
{
"id": "415",
"title": "Batman & Robin",
"poster": "https://image.tmdb.org/t/p/w1280/79AYCcxw3kSKbhGpx1LiqaCAbwo.jpg",
"overview": "Along with crime-fighting partner Robin and new recruit Batgirl...",
"release_date": "1997-06-20",
},
{
"id": "411736",
"title": "Batman: Return of the Caped Crusaders",
"poster": "https://image.tmdb.org/t/p/w1280/GW3IyMW5Xgl0cgCN8wu96IlNpD.jpg",
"overview": "Adam West and Burt Ward returns to their iconic roles of Batman and Robin...",
"release_date": "2016-10-08",
}
],
"offset": 0,
"limit": 2,
"processingTimeMs": 1,
"query": "botman robin"
}
```
#### With the Web Interface
MeiliSearch provides a simple web interface containing a search bar in order to quickly test the instant search experience with a given set of documents.
This web interface is available in your browser at the root of the server. The default URL is [http://127.0.0.1:7700](http://127.0.0.1:7700).
### Documentation
Now, that you have a running MeiliSearch, you can learn more and tune your search engine using [the documentation](https://docs.meilisearch.com).
## How it works
MeiliSearch uses [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads. The whole ranking system is [data oriented](https://github.com/meilisearch/MeiliSearch/issues/82) and provides great performances.
You can [read the deep dive](deep-dive.md) if you want more information on the engine; it describes the whole process of generating updates and handling queries. Also, you can take a look at the [typos and ranking rules](typos-ranking-rules.md) if you want to know the default rules used to sort the documents.
### Technical features
- Provides [6 default ranking criteria](https://github.com/meilisearch/MeiliSearch/blob/3ea5aa18a209b6973b921542d46a79e1c753c163/meilisearch-core/src/criterion/mod.rs#L106-L111) used to [bucket sort](https://en.wikipedia.org/wiki/Bucket_sort) documents
- Accepts [custom criteria](https://github.com/meilisearch/MeiliSearch/blob/3ea5aa18a209b6973b921542d46a79e1c753c163/meilisearch-core/src/criterion/mod.rs#L20-L29) and can apply them in any custom order
- Support [ranged queries](https://github.com/meilisearch/MeiliSearch/blob/3ea5aa18a209b6973b921542d46a79e1c753c163/meilisearch-core/src/query_builder.rs#L342), useful for paginating results
- Can [distinct](https://github.com/meilisearch/MeiliSearch/blob/3ea5aa18a209b6973b921542d46a79e1c753c163/meilisearch-core/src/query_builder.rs#L324-L329) and [filter](https://github.com/meilisearch/MeiliSearch/blob/3ea5aa18a209b6973b921542d46a79e1c753c163/meilisearch-core/src/query_builder.rs#L313-L318) returned documents based on context defined rules
- Searches for [concatenated](https://github.com/meilisearch/MeiliSearch/pull/164) and [splitted query words](https://github.com/meilisearch/MeiliSearch/pull/232) to improve the search quality.
- Can store complete documents or only [user schema specified fields](https://github.com/meilisearch/MeiliSearch/blob/3ea5aa18a209b6973b921542d46a79e1c753c163/datasets/movies/schema.toml)
- The [default tokenizer](https://github.com/meilisearch/MeiliSearch/blob/3ea5aa18a209b6973b921542d46a79e1c753c163/meilisearch-tokenizer/src/lib.rs) can index latin and kanji based languages
- Returns [the matching text areas](https://github.com/meilisearch/MeiliSearch/blob/3ea5aa18a209b6973b921542d46a79e1c753c163/meilisearch-types/src/lib.rs#L49-L65), useful to highlight matched words in results
- Accepts query time search config like the [searchable attributes](https://github.com/meilisearch/MeiliSearch/blob/3ea5aa18a209b6973b921542d46a79e1c753c163/meilisearch-core/src/query_builder.rs#L331-L336)
- Supports [runtime incremental indexing](https://github.com/meilisearch/MeiliSearch/blob/3ea5aa18a209b6973b921542d46a79e1c753c163/meilisearch-core/src/store/mod.rs#L143-L212)
## Performances
With a database composed of _100 353_ documents with _352_ attributes each and _3_ of them indexed.
With a dataset composed of _100 353_ documents with _352_ attributes each and _3_ of them indexed.
So more than _300 000_ fields indexed for _35 million_ stored we can handle more than _2.8k req/sec_ with an average response time of _9 ms_ on an Intel i7-7700 (8) @ 4.2GHz.
Requests are made using [wrk](https://github.com/wg/wrk) and scripted to simulate real users queries.
Requests are made using [wrk](https://github.com/wg/wrk) and scripted to simulate real users' queries.
```
Running 10s test @ http://localhost:2230
@ -50,29 +176,20 @@ Requests/sec: 2806.46
Transfer/sec: 759.17KB
```
We also indexed a dataset containing something like _12 millions_ cities names in _24 minutes_ on a machine with _8 cores_, _64 GB of RAM_, and a _300 GB NMVe_ SSD.<br/>
The resulting database was _16 GB_ and search results were between _30 ms_ and _4 seconds_ for short prefix queries.
### Notes
With Rust 1.32 the allocator has been [changed to use the system allocator](https://blog.rust-lang.org/2019/01/17/Rust-1.32.0.html#jemalloc-is-removed-by-default).
We have seen much better performances when [using jemalloc as the global allocator](https://github.com/alexcrichton/jemallocator#documentation).
## Usage and examples
## Contributing
Currently MeiliDB do not provide an http server but you can run the example binary.
We will be glad if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/meilisearch/MeiliSearch/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!
The _index_ subcommand has been made to create an index and inject documents into it. Using the command line below, the index will be named _movies_ and the _19 700_ movies of the `datasets/` will be injected in MeiliDB.
### Analytic Events
```bash
cargo run --release --example from_file -- \
index example.mdb datasets/movies/data.csv \
--schema datasets/movies/schema.toml
```
Once the first command is done, you can query the freshly created _movies_ index using the _search_ subcomand. In this example we filtered the dataset to only show _non-adult_ movies using the non-definitive `!adult` syntax filter.
```bash
cargo run --release --example from_file -- \
search example.mdb
--number 4 \
--filter '!adult' \
id popularity adult original_title
```
We send events to our Amplitude instance to be aware of the number of people who use MeiliSearch.<br/>
We only send the platform on which the server runs once by day. No other information is sent.<br/>
If you do not want us to send events, you can disable these analytics by using the `MEILI_NO_ANALYTICS` env variable.

16
app.json Normal file
View File

@ -0,0 +1,16 @@
{
"name": "MeiliSearch",
"description": "Ultra relevant, instant and typo-tolerant full-text search API",
"keywords": [
"search-engine",
"instant search",
"search API"
],
"website": "https://docs.meilisearch.com/",
"repository": "https://github.com/meilisearch/MeiliSearch",
"buildpacks": [
{
"url": "https://github.com/emk/heroku-buildpack-rust"
}
]
}

View File

@ -1,52 +0,0 @@
---
trigger:
branches:
include: [ master ]
pr: [ master ]
jobs:
- job: test
pool:
vmImage: 'Ubuntu 16.04'
container: tpayet/chiquitita:latest
steps:
- script: |
curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly
$HOME/.cargo/bin/rustup component add rustfmt
displayName: 'Install rustc and components'
- script: |
$HOME/.cargo/bin/cargo check
displayName: 'Check MeiliDB'
- script: |
$HOME/.cargo/bin/cargo test
displayName: 'Test MeiliDB'
- script: |
$HOME/.cargo/bin/cargo fmt --all -- --check
displayName: 'Fmt MeiliDB'
- job: build
dependsOn:
- test
condition: succeeded()
pool:
vmImage: 'Ubuntu 16.04'
container: tpayet/chiquitita:latest
steps:
- script: |
curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly
$HOME/.cargo/bin/rustup component add rustfmt
displayName: 'Install rustc and components'
- script: |
$HOME/.cargo/bin/cargo build --release
displayName: 'Build MeiliDB'
- task: CopyFiles@2
inputs:
contents: '$(System.DefaultWorkingDirectory)/target/release/libmeilidb.rlib'
targetFolder: $(Build.ArtifactStagingDirectory)
displayName: 'Copy build'
- task: PublishBuildArtifacts@1
inputs:
artifactName: libmeilidb.rlib
displayName: 'Upload artifacts'

View File

@ -1 +1 @@
_datas in movies.csv are from https://www.themoviedb.org/_
_datas in movies.csv are from https://www.themoviedb.org/_

19654
datasets/movies/movies.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,21 +0,0 @@
# This schema has been generated ...
# The order in which the attributes are declared is important,
# it specify the attribute xxx...
identifier = "id"
[attributes.id]
displayed = true
[attributes.title]
displayed = true
indexed = true
[attributes.overview]
displayed = true
indexed = true
[attributes.release_date]
displayed = true
[attributes.poster]
displayed = true

View File

@ -0,0 +1,11 @@
{
"primaryKey": "id",
"searchableAttributes": ["title", "overview"],
"displayedAttributes": [
"id",
"title",
"overview",
"release_date",
"poster"
]
}

View File

@ -1,8 +1,8 @@
# A deep dive in MeiliDB
# A deep dive in MeiliSearch
On the 15 of May 2019.
MeiliDB is a full text search engine based on a final state transducer named [fst](https://github.com/BurntSushi/fst) and a key-value store named [sled](https://github.com/spacejam/sled). The goal of a search engine is to store data and to respond to queries as accurate and fast as possible. To achieve this it must save the matching words in an [inverted index](https://en.wikipedia.org/wiki/Inverted_index).
MeiliSearch is a full text search engine based on a final state transducer named [fst](https://github.com/BurntSushi/fst) and a key-value store named [sled](https://github.com/spacejam/sled). The goal of a search engine is to store data and to respond to queries as accurate and fast as possible. To achieve this it must save the matching words in an [inverted index](https://en.wikipedia.org/wiki/Inverted_index).
<!-- MarkdownTOC autolink="true" -->
@ -22,7 +22,7 @@ MeiliDB is a full text search engine based on a final state transducer named [fs
## Where is the data stored?
MeiliDB is entirely backed by a key-value store like any good database (i.e. Postgres, MySQL). This brings a great flexibility in the way documents can be stored and updates handled along time.
MeiliSearch is entirely backed by a key-value store like any good database (i.e. Postgres, MySQL). This brings a great flexibility in the way documents can be stored and updates handled along time.
[sled will brings some](https://github.com/spacejam/sled/tree/434533332a3f485e6d2e467023be0a0b55d3a1af#plans) of the [A.C.I.D. properties](https://en.wikipedia.org/wiki/ACID_(computer_science)) to help us be sure the saved data is consistent.
@ -34,7 +34,7 @@ It contain the inverted word index, the schema and the documents fields.
### The inverted word index
[The inverted word index](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/words_index.rs) is a sled Tree dedicated to store and give access to all documents that contains a specific word. The information stored under the word is simply a big ordered array of where in the document the word has been found. In other word, a big list of [`DocIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/lib.rs#L35-L51).
[The inverted word index](https://github.com/meilisearch/MeiliSearch/blob/3db823de002243004612e36a19b4578d800dab97/meilisearch-data/src/database/words_index.rs) is a sled Tree dedicated to store and give access to all documents that contains a specific word. The information stored under the word is simply a big ordered array of where in the document the word has been found. In other word, a big list of [`DocIndex`](https://github.com/meilisearch/MeiliSearch/blob/3db823de002243004612e36a19b4578d800dab97/meilisearch-core/src/lib.rs#L35-L51).
#### A final state transducer
@ -42,27 +42,27 @@ _...also abbreviated fst_
This is the first entry point of the engine, you can read more about how it work with the beautiful blog post of @BurntSushi, [Index 1,600,000,000 Keys with Automata and Rust](https://blog.burntsushi.net/transducers/).
To make it short it is a powerful way to store all the words that are present in the indexed documents. You construct it by giving it all the words you want to index. When you want to search in it you can provide any automaton you want, in MeiliDB [a custom levenshtein automaton](https://github.com/tantivy-search/levenshtein-automata/) is used.
To make it short it is a powerful way to store all the words that are present in the indexed documents. You construct it by giving it all the words you want to index. When you want to search in it you can provide any automaton you want, in MeiliSearch [a custom levenshtein automaton](https://github.com/tantivy-search/levenshtein-automata/) is used.
#### Document indexes
The `fst` will only return the words that match with the search automaton but the goal of the search engine is to retrieve all matches in all the documents when a query is made. You want it to return some sort of position in an attribute in a document, an information about where the given word matched.
To make it possible we retrieve all of the `DocIndex` corresponding to all the matching words in the fst, we use the [`WordsIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/words_index.rs#L11-L21) Tree to get the `DocIndexes` corresponding the words.
To make it possible we retrieve all of the `DocIndex` corresponding to all the matching words in the fst, we use the [`WordsIndex`](https://github.com/meilisearch/MeiliSearch/blob/3db823de002243004612e36a19b4578d800dab97/meilisearch-data/src/database/words_index.rs#L11-L21) Tree to get the `DocIndexes` corresponding the words.
### The schema
The schema is a data structure that represents which documents attributes should be stored and which should be indexed. It is stored under a the [`MainIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/main_index.rs#L12) Tree and given to MeiliDB only at the creation of an index.
The schema is a data structure that represents which documents attributes should be stored and which should be indexed. It is stored under a the [`MainIndex`](https://github.com/meilisearch/MeiliSearch/blob/3db823de002243004612e36a19b4578d800dab97/meilisearch-data/src/database/main_index.rs#L12) Tree and given to MeiliSearch only at the creation of an index.
Each document attribute is associated to a unique 16 bit number named [`SchemaAttr`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/schema.rs#L186).
Each document attribute is associated to a unique 16 bit number named [`SchemaAttr`](https://github.com/meilisearch/MeiliSearch/blob/3db823de002243004612e36a19b4578d800dab97/meilisearch-data/src/schema.rs#L186).
In the future, this schema type could be given along with updates, the database could be able to handled a new schema and reindex the database according to the new one.
### Document attributes
When the engine handle a query the result that the requester want is a document, not only the [`Matches`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/lib.rs#L62-L88) associated to it, fields of the original document must be returned too.
When the engine handle a query the result that the requester want is a document, not only the [`Matches`](https://github.com/meilisearch/MeiliSearch/blob/3db823de002243004612e36a19b4578d800dab97/meilisearch-core/src/lib.rs#L62-L88) associated to it, fields of the original document must be returned too.
So MeiliDB again uses the power of the underlying key-value store and save the documents attributes marked as _STORE_ in the schema. The dedicated Tree for this information is the [`DocumentsIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/documents_index.rs#L11).
So MeiliSearch again uses the power of the underlying key-value store and save the documents attributes marked as _STORE_ in the schema. The dedicated Tree for this information is the [`DocumentsIndex`](https://github.com/meilisearch/MeiliSearch/blob/3db823de002243004612e36a19b4578d800dab97/meilisearch-data/src/database/documents_index.rs#L11).
When a document field is saved in the key-value store its value is binary encoded using [message pack](https://github.com/3Hren/msgpack-rust), so a document must be serializable using serde.
@ -70,26 +70,26 @@ When a document field is saved in the key-value store its value is binary encode
## How is a request processed?
Now that we have our inverted index we are able to return results based on a query. In the MeiliDB universe a query is a simple string containing words.
Now that we have our inverted index we are able to return results based on a query. In the MeiliSearch universe a query is a simple string containing words.
### Query lexemes
The first step to be able to call the underlying structures is to split the query in words, for that we use a [custom tokenizer](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-tokenizer/src/lib.rs#L82-L84). Note that a tokenizer is specialized for a human language, this is the hard part.
The first step to be able to call the underlying structures is to split the query in words, for that we use a [custom tokenizer](https://github.com/meilisearch/MeiliSearch/blob/3db823de002243004612e36a19b4578d800dab97/meilisearch-tokenizer/src/lib.rs#L82-L84). Note that a tokenizer is specialized for a human language, this is the hard part.
### Automatons and query index
So to query the fst we need an automaton, in MeiliDB we use a [levenshtein automaton](https://en.wikipedia.org/wiki/Levenshtein_automaton), this automaton is constructed using a string and a maximum distance. According to the [Algolia's blog post](https://blog.algolia.com/inside-the-algolia-engine-part-3-query-processing/#algolia%e2%80%99s-way-of-searching-for-alternatives) we [created the DFAs](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/automaton.rs#L59-L78) with different settings.
So to query the fst we need an automaton, in MeiliSearch we use a [levenshtein automaton](https://en.wikipedia.org/wiki/Levenshtein_automaton), this automaton is constructed using a string and a maximum distance. According to the [Algolia's blog post](https://blog.algolia.com/inside-the-algolia-engine-part-3-query-processing/#algolia%e2%80%99s-way-of-searching-for-alternatives) we [created the DFAs](https://github.com/meilisearch/MeiliSearch/blob/3db823de002243004612e36a19b4578d800dab97/meilisearch-core/src/automaton.rs#L59-L78) with different settings.
Thanks to the power of the fst library [it is possible to union multiple automatons](https://docs.rs/fst/0.3.2/fst/map/struct.OpBuilder.html#method.union) on the same fst set. The `Stream` is able to return all the matching words. We use these words to find the whole list of `DocIndexes` associated.
With all these informations it is possible [to reconstruct a list of all the `DocIndexes` associated](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/query_builder.rs#L103-L130) with the words queried.
With all these informations it is possible [to reconstruct a list of all the `DocIndexes` associated](https://github.com/meilisearch/MeiliSearch/blob/3db823de002243004612e36a19b4578d800dab97/meilisearch-core/src/query_builder.rs#L103-L130) with the words queried.
### Sort by criteria
Now that we are able to get a big list of [DocIndexes](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/lib.rs#L21-L36) it is not enough to sort them by criteria, we need more informations like the levenshtein distance or the fact that a query word match exactly the word stored in the fst. So [we stuff it a little bit](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L86-L93), and aggregate all these [Matches](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/lib.rs#L47-L74) for each document. This way it will be easy to sort a simple vector of document using a bunch of functions.
Now that we are able to get a big list of [DocIndexes](https://github.com/Kerollmops/MeiliSearch/blob/550dc1e99224e386516877450320f694947332d4/src/lib.rs#L21-L36) it is not enough to sort them by criteria, we need more informations like the levenshtein distance or the fact that a query word match exactly the word stored in the fst. So [we stuff it a little bit](https://github.com/Kerollmops/MeiliSearch/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L86-L93), and aggregate all these [Matches](https://github.com/Kerollmops/MeiliSearch/blob/550dc1e99224e386516877450320f694947332d4/src/lib.rs#L47-L74) for each document. This way it will be easy to sort a simple vector of document using a bunch of functions.
With this big list of documents and associated matches [we are able to sort only the part of the slice that we want](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/query_builder.rs#L160-L188) using bucket sorting. [Each criterion](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/criterion/mod.rs#L95-L101) is evaluated on each subslice without copy, thanks to [GroupByMut](https://docs.rs/slice-group-by/0.2.4/slice_group_by/) which, I hope [will soon be merged](https://github.com/rust-lang/rfcs/pull/2477).
With this big list of documents and associated matches [we are able to sort only the part of the slice that we want](https://github.com/meilisearch/MeiliSearch/blob/3db823de002243004612e36a19b4578d800dab97/meilisearch-core/src/query_builder.rs#L160-L188) using bucket sorting. [Each criterion](https://github.com/meilisearch/MeiliSearch/blob/3db823de002243004612e36a19b4578d800dab97/meilisearch-core/src/criterion/mod.rs#L95-L101) is evaluated on each subslice without copy, thanks to [GroupByMut](https://docs.rs/slice-group-by/0.2.4/slice_group_by/) which, I hope [will soon be merged](https://github.com/rust-lang/rfcs/pull/2477).
Note that it is possible to customize the criteria used by using the `QueryBuilder::with_criteria` constructor, this way you can implement some custom ranking based on the document attributes using the appropriate structure and the [`document` method](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/index.rs#L86).
Note that it is possible to customize the criteria used by using the `QueryBuilder::with_criteria` constructor, this way you can implement some custom ranking based on the document attributes using the appropriate structure and the [`document` method](https://github.com/meilisearch/MeiliSearch/blob/3db823de002243004612e36a19b4578d800dab97/meilisearch-data/src/database/index.rs#L86).
At this point, MeiliDB work is over 🎉
At this point, MeiliSearch work is over 🎉

129
download-latest.sh Normal file
View File

@ -0,0 +1,129 @@
#!/bin/sh
# COLORS
RED='\033[31m'
GREEN='\033[32m'
DEFAULT='\033[0m'
# GLOBALS
GREP_SEMVER_REGEXP='\"v\([0-9]*\)[.]\([0-9]*\)[.]\([0-9]*\)\"' # i.e. "v[number].[number].[number]"
BINARY_NAME='meilisearch'
# semverParseInto and semverLT from https://github.com/cloudflare/semver_bash/blob/master/semver.sh
# usage: semverParseInto version major minor patch special
# version: the string version
# major, minor, patch, special: will be assigned by the function
semverParseInto() {
local RE='[^0-9]*\([0-9]*\)[.]\([0-9]*\)[.]\([0-9]*\)\([0-9A-Za-z-]*\)'
#MAJOR
eval $2=`echo $1 | sed -e "s#$RE#\1#"`
#MINOR
eval $3=`echo $1 | sed -e "s#$RE#\2#"`
#MINOR
eval $4=`echo $1 | sed -e "s#$RE#\3#"`
#SPECIAL
eval $5=`echo $1 | sed -e "s#$RE#\4#"`
}
# usage: semverLT version1 version2
semverLT() {
local MAJOR_A=0
local MINOR_A=0
local PATCH_A=0
local SPECIAL_A=0
local MAJOR_B=0
local MINOR_B=0
local PATCH_B=0
local SPECIAL_B=0
semverParseInto $1 MAJOR_A MINOR_A PATCH_A SPECIAL_A
semverParseInto $2 MAJOR_B MINOR_B PATCH_B SPECIAL_B
if [ $MAJOR_A -lt $MAJOR_B ]; then
return 0
fi
if [ $MAJOR_A -le $MAJOR_B ] && [ $MINOR_A -lt $MINOR_B ]; then
return 0
fi
if [ $MAJOR_A -le $MAJOR_B ] && [ $MINOR_A -le $MINOR_B ] && [ $PATCH_A -lt $PATCH_B ]; then
return 0
fi
if [ "_$SPECIAL_A" == "_" ] && [ "_$SPECIAL_B" == "_" ] ; then
return 1
fi
if [ "_$SPECIAL_A" == "_" ] && [ "_$SPECIAL_B" != "_" ] ; then
return 1
fi
if [ "_$SPECIAL_A" != "_" ] && [ "_$SPECIAL_B" == "_" ] ; then
return 0
fi
if [ "_$SPECIAL_A" < "_$SPECIAL_B" ]; then
return 0
fi
return 1
}
success_usage() {
printf "$GREEN%s\n$DEFAULT" "MeiliSearch binary successfully downloaded as '$BINARY_NAME' file."
echo ''
echo 'Run it:'
echo ' $ ./meilisearch'
echo 'Usage:'
echo ' $ ./meilisearch --help'
}
failure_usage() {
printf "$RED%s\n$DEFAULT" 'ERROR: MeiliSearch binary is not available for your OS distribution yet.'
echo ''
echo 'However, you can easily compile the binary from the source files.'
echo 'Follow the steps on the docs: https://docs.meilisearch.com/advanced_guides/binary.html#how-to-compile-meilisearch'
}
# OS DETECTION
echo 'Detecting OS distribution...'
os_name=$(uname -s)
if [ "$os_name" != "Darwin" ]; then
os_name=$(cat /etc/os-release | grep '^ID=' | tr -d '"' | cut -d '=' -f 2)
fi
echo "OS distribution detected: $os_name"
case "$os_name" in
'Darwin')
os='macos'
;;
'ubuntu' | 'debian')
os='linux'
;;
*)
failure_usage
exit 1
esac
# GET LATEST VERSION
tags=$(curl -s 'https://api.github.com/repos/meilisearch/MeiliSearch/tags' \
| grep "$GREP_SEMVER_REGEXP" \
| grep 'name' \
| tr -d '"' | tr -d ',' | cut -d 'v' -f 2)
latest=""
for tag in $tags; do
if [ "$latest" = "" ]; then
latest="$tag"
else
semverLT $tag $latest
if [ $? -eq 1 ]; then
latest="$tag"
fi
fi
done
# DOWNLOAD THE LATEST
echo "Downloading MeiliSearch binary v$latest for $os..."
release_file="meilisearch-$os-amd64"
link="https://github.com/meilisearch/MeiliSearch/releases/download/v$latest/$release_file"
curl -OL "$link"
mv "$release_file" "$BINARY_NAME"
chmod 744 "$BINARY_NAME"
success_usage

View File

@ -1,45 +0,0 @@
[package]
name = "meilidb-core"
version = "0.1.0"
authors = ["Kerollmops <clement@meilisearch.com>"]
edition = "2018"
[dependencies]
arc-swap = "0.4.3"
bincode = "1.1.4"
byteorder = "1.3.2"
crossbeam-channel = "0.3.9"
deunicode = "1.0.0"
env_logger = "0.7.0"
hashbrown = { version = "0.6.0", features = ["serde"] }
heed = "0.1.0"
log = "0.4.8"
meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
once_cell = "1.2.0"
ordered-float = { version = "1.0.2", features = ["serde"] }
sdset = "0.3.3"
serde = { version = "1.0.101", features = ["derive"] }
serde_json = "1.0.41"
siphasher = "0.3.0"
slice-group-by = "0.2.6"
zerocopy = "0.2.8"
[dependencies.levenshtein_automata]
git = "https://github.com/Kerollmops/levenshtein-automata.git"
branch = "arc-byte-slice"
features = ["fst_automaton"]
[dependencies.fst]
git = "https://github.com/Kerollmops/fst.git"
branch = "arc-byte-slice"
[dev-dependencies]
assert_matches = "1.3"
csv = "1.0.7"
indexmap = { version = "1.2.0", features = ["serde-1"] }
rustyline = { version = "5.0.0", default-features = false }
structopt = "0.3.2"
tempfile = "3.1.0"
termcolor = "1.0.4"
toml = "0.5.3"

View File

@ -1,220 +0,0 @@
mod dfa;
mod query_enhancer;
use std::cmp::Reverse;
use std::vec;
use fst::{IntoStreamer, Streamer};
use levenshtein_automata::DFA;
use meilidb_tokenizer::{is_cjk, split_query_string};
use crate::error::MResult;
use crate::store;
use self::dfa::{build_dfa, build_prefix_dfa};
pub use self::query_enhancer::QueryEnhancer;
use self::query_enhancer::QueryEnhancerBuilder;
const NGRAMS: usize = 3;
pub struct AutomatonProducer {
automatons: Vec<Vec<Automaton>>,
}
impl AutomatonProducer {
pub fn new(
reader: &heed::RoTxn,
query: &str,
main_store: store::Main,
synonyms_store: store::Synonyms,
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
let (automatons, query_enhancer) =
generate_automatons(reader, query, main_store, synonyms_store)?;
Ok((AutomatonProducer { automatons }, query_enhancer))
}
pub fn into_iter(self) -> vec::IntoIter<Vec<Automaton>> {
self.automatons.into_iter()
}
}
#[derive(Debug)]
pub struct Automaton {
pub index: usize,
pub ngram: usize,
pub query_len: usize,
pub is_exact: bool,
pub is_prefix: bool,
pub query: String,
}
impl Automaton {
pub fn dfa(&self) -> DFA {
if self.is_prefix {
build_prefix_dfa(&self.query)
} else {
build_dfa(&self.query)
}
}
fn exact(index: usize, ngram: usize, query: &str) -> Automaton {
Automaton {
index,
ngram,
query_len: query.len(),
is_exact: true,
is_prefix: false,
query: query.to_string(),
}
}
fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton {
Automaton {
index,
ngram,
query_len: query.len(),
is_exact: true,
is_prefix: true,
query: query.to_string(),
}
}
fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton {
Automaton {
index,
ngram,
query_len: query.len(),
is_exact: false,
is_prefix: false,
query: query.to_string(),
}
}
}
pub fn normalize_str(string: &str) -> String {
let mut string = string.to_lowercase();
if !string.contains(is_cjk) {
string = deunicode::deunicode_with_tofu(&string, "");
}
string
}
fn generate_automatons(
reader: &heed::RoTxn,
query: &str,
main_store: store::Main,
synonym_store: store::Synonyms,
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
let synonyms = match main_store.synonyms_fst(reader)? {
Some(synonym) => synonym,
None => fst::Set::default(),
};
let mut automaton_index = 0;
let mut automatons = Vec::new();
let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
// We must not declare the original words to the query enhancer
// *but* we need to push them in the automatons list first
let mut original_automatons = Vec::new();
let mut original_words = query_words.iter().peekable();
while let Some(word) = original_words.next() {
let has_following_word = original_words.peek().is_some();
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
let automaton = if not_prefix_dfa {
Automaton::exact(automaton_index, 1, word)
} else {
Automaton::prefix_exact(automaton_index, 1, word)
};
automaton_index += 1;
original_automatons.push(automaton);
}
automatons.push(original_automatons);
for n in 1..=NGRAMS {
let mut ngrams = query_words.windows(n).enumerate().peekable();
while let Some((query_index, ngram_slice)) = ngrams.next() {
let query_range = query_index..query_index + n;
let ngram_nb_words = ngram_slice.len();
let ngram = ngram_slice.join(" ");
let has_following_word = ngrams.peek().is_some();
let not_prefix_dfa =
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
// automaton of synonyms of the ngrams
let normalized = normalize_str(&ngram);
let lev = if not_prefix_dfa {
build_dfa(&normalized)
} else {
build_prefix_dfa(&normalized)
};
let mut stream = synonyms.search(&lev).into_stream();
while let Some(base) = stream.next() {
// only trigger alternatives when the last word has been typed
// i.e. "new " do not but "new yo" triggers alternatives to "new york"
let base = std::str::from_utf8(base).unwrap();
let base_nb_words = split_query_string(base).count();
if ngram_nb_words != base_nb_words {
continue;
}
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
let mut stream = synonyms.into_stream();
while let Some(synonyms) = stream.next() {
let synonyms = std::str::from_utf8(synonyms).unwrap();
let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
let nb_synonym_words = synonyms_words.len();
let real_query_index = automaton_index;
enhancer_builder.declare(
query_range.clone(),
real_query_index,
&synonyms_words,
);
for synonym in synonyms_words {
let automaton = if nb_synonym_words == 1 {
Automaton::exact(automaton_index, n, synonym)
} else {
Automaton::non_exact(automaton_index, n, synonym)
};
automaton_index += 1;
automatons.push(vec![automaton]);
}
}
}
}
if n != 1 {
// automaton of concatenation of query words
let concat = ngram_slice.concat();
let normalized = normalize_str(&concat);
let real_query_index = automaton_index;
enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
let automaton = Automaton::exact(automaton_index, n, &normalized);
automaton_index += 1;
automatons.push(vec![automaton]);
}
}
}
// order automatons, the most important first,
// we keep the original automatons at the front.
automatons[1..].sort_by_key(|a| {
let a = a.first().unwrap();
(Reverse(a.is_exact), a.ngram)
});
Ok((automatons, enhancer_builder.build()))
}

View File

@ -1,423 +0,0 @@
use std::cmp::Ordering::{Equal, Greater, Less};
use std::ops::Range;
/// Return `true` if the specified range can accept the given replacements words.
/// Returns `false` if the replacements words are already present in the original query
/// or if there is fewer replacement words than the range to replace.
//
//
// ## Ignored because already present in original
//
// new york city subway
// -------- ^^^^
// / \
// [new york city]
//
//
// ## Ignored because smaller than the original
//
// new york city subway
// -------------
// \ /
// [new york]
//
//
// ## Accepted because bigger than the original
//
// NYC subway
// ---
// / \
// / \
// / \
// / \
// / \
// [new york city]
//
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
where
S: AsRef<str>,
T: AsRef<str>,
{
if words.len() <= range.len() {
// there is fewer or equal replacement words
// than there is already in the replaced range
return false;
}
// retrieve the part to rewrite but with the length
// of the replacement part
let original = query.iter().skip(range.start).take(words.len());
// check if the original query doesn't already contain
// the replacement words
!original
.map(AsRef::as_ref)
.eq(words.iter().map(AsRef::as_ref))
}
type Origin = usize;
type RealLength = usize;
struct FakeIntervalTree {
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl FakeIntervalTree {
fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
FakeIntervalTree { intervals }
}
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
let element = self.intervals.binary_search_by(|(r, _)| {
if point >= r.start {
if point < r.end {
Equal
} else {
Less
}
} else {
Greater
}
});
let n = match element {
Ok(n) => n,
Err(n) => n,
};
match self.intervals.get(n) {
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
_otherwise => None,
}
}
}
pub struct QueryEnhancerBuilder<'a, S> {
query: &'a [S],
origins: Vec<usize>,
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
// we initialize origins query indices based on their positions
let origins: Vec<_> = (0..=query.len()).collect();
let real_to_origin = origins.iter().map(|&o| (o..o + 1, (o, 1))).collect();
QueryEnhancerBuilder {
query,
origins,
real_to_origin,
}
}
/// Update the final real to origin query indices mapping.
///
/// `range` is the original words range that this `replacement` words replace
/// and `real` is the first real query index of these replacement words.
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
where
T: AsRef<str>,
{
// check if the range of original words
// can be rewritten with the replacement words
if rewrite_range_with(self.query, range.clone(), replacement) {
// this range can be replaced so we need to
// modify the origins accordingly
let offset = replacement.len() - range.len();
let previous_padding = self.origins[range.end - 1];
let current_offset = (self.origins[range.end] - 1) - previous_padding;
let diff = offset.saturating_sub(current_offset);
self.origins[range.end] += diff;
for r in &mut self.origins[range.end + 1..] {
*r += diff;
}
}
// we need to store the real number and origins relations
// this way it will be possible to know by how many
// we need to pad real query indices
let real_range = real..real + replacement.len().max(range.len());
let real_length = replacement.len();
self.real_to_origin
.push((real_range, (range.start, real_length)));
}
pub fn build(self) -> QueryEnhancer {
QueryEnhancer {
origins: self.origins,
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
}
}
}
pub struct QueryEnhancer {
origins: Vec<usize>,
real_to_origin: FakeIntervalTree,
}
impl QueryEnhancer {
/// Returns the query indices to use to replace this real query index.
pub fn replacement(&self, real: u32) -> Range<u32> {
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) = self
.real_to_origin
.query(real)
.expect("real has never been declared");
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 {
new_origin = origin + i;
break;
}
}
let n = real - range.start;
let start = self.origins[origin];
let end = self.origins[new_origin + 1];
let remaining = (end - start) - n;
Range {
start: (start + n) as u32,
end: (start + n + remaining) as u32,
}
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = self.origins[origin];
Range {
start: (origin + n) as u32,
end: (origin + n + 1) as u32,
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn original_unmodified() {
let query = ["new", "york", "city", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..2); // york
assert_eq!(enhancer.replacement(2), 2..3); // city
assert_eq!(enhancer.replacement(3), 3..4); // subway
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
}
#[test]
fn simple_growing() {
let query = ["new", "york", "subway"];
// 0 1 2
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 3, &["new", "york", "city"]);
// ^ 3 4 5
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..3); // york
assert_eq!(enhancer.replacement(2), 3..4); // subway
assert_eq!(enhancer.replacement(3), 0..1); // new
assert_eq!(enhancer.replacement(4), 1..2); // york
assert_eq!(enhancer.replacement(5), 2..3); // city
}
#[test]
fn same_place_growings() {
let query = ["NY", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NY = new york
builder.declare(0..1, 2, &["new", "york"]);
// ^ 2 3
// NY = new york city
builder.declare(0..1, 4, &["new", "york", "city"]);
// ^ 4 5 6
// NY = NYC
builder.declare(0..1, 7, &["NYC"]);
// ^ 7
// NY = new york city
builder.declare(0..1, 8, &["new", "york", "city"]);
// ^ 8 9 10
// subway = underground train
builder.declare(1..2, 11, &["underground", "train"]);
// ^ 11 12
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NY
assert_eq!(enhancer.replacement(1), 3..5); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..3); // york
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
assert_eq!(enhancer.replacement(7), 0..3); // NYC
assert_eq!(enhancer.replacement(8), 0..1); // new
assert_eq!(enhancer.replacement(9), 1..2); // york
assert_eq!(enhancer.replacement(10), 2..3); // city
assert_eq!(enhancer.replacement(11), 3..4); // underground
assert_eq!(enhancer.replacement(12), 4..5); // train
}
#[test]
fn bigger_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(0..1, 2, &["new", "york", "city"]);
// ^ 2 3 4
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NYC
assert_eq!(enhancer.replacement(1), 3..4); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..2); // york
assert_eq!(enhancer.replacement(4), 2..3); // city
}
#[test]
fn middle_query_growing() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..6); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
}
#[test]
fn end_query_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(1..2, 2, &["underground", "train"]);
// ^ 2 3
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // NYC
assert_eq!(enhancer.replacement(1), 1..3); // subway
assert_eq!(enhancer.replacement(2), 1..2); // underground
assert_eq!(enhancer.replacement(3), 2..3); // train
}
#[test]
fn multiple_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
}
#[test]
fn multiple_probable_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
// great awesome = good
builder.declare(0..2, 9, &["good"]);
// ^ 9
// awesome NYC = NY
builder.declare(1..3, 10, &["NY"]);
// ^^ 10
// NYC subway = metro
builder.declare(2..4, 11, &["metro"]);
// ^^ 11
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
assert_eq!(enhancer.replacement(9), 0..2); // good
assert_eq!(enhancer.replacement(10), 1..5); // NY
assert_eq!(enhancer.replacement(11), 2..5); // metro
}
}

View File

@ -1,16 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use std::cmp::Ordering;
#[derive(Debug, Clone, Copy)]
pub struct DocumentId;
impl Criterion for DocumentId {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
lhs.id.cmp(&rhs.id)
}
fn name(&self) -> &str {
"DocumentId"
}
}

View File

@ -1,133 +0,0 @@
use std::cmp::Ordering;
use meilidb_schema::SchemaAttr;
use sdset::Set;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn number_exact_matches(
query_index: &[u32],
attribute: &[u16],
is_exact: &[bool],
fields_counts: &Set<(SchemaAttr, u64)>,
) -> usize {
let mut count = 0;
let mut index = 0;
for group in query_index.linear_group() {
let len = group.len();
let mut found_exact = false;
for (pos, _) in is_exact[index..index + len]
.iter()
.filter(|x| **x)
.enumerate()
{
found_exact = true;
if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) {
let (_, count) = fields_counts[pos];
if count == 1 {
return usize::max_value();
}
}
}
count += found_exact as usize;
index += len;
}
count
}
#[derive(Debug, Clone, Copy)]
pub struct Exact;
impl Criterion for Exact {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let is_exact = lhs.is_exact();
let attribute = lhs.attribute();
let fields_counts = &lhs.fields_counts;
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
let rhs = {
let query_index = rhs.query_index();
let is_exact = rhs.is_exact();
let attribute = rhs.attribute();
let fields_counts = &rhs.fields_counts;
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &str {
"Exact"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "souliereres rouge"
#[test]
fn easy_case() {
let doc0 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
let doc1 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[false];
let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "soulier"
//
// doc0: { 0. "soulier" }
// doc1: { 0. "soulier bleu et blanc" }
#[test]
fn basic() {
let doc0 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
let doc1 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
}

View File

@ -1,121 +0,0 @@
mod document_id;
mod exact;
mod number_of_words;
mod sort_by_attr;
mod sum_of_typos;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod words_proximity;
use crate::RawDocument;
use std::cmp::Ordering;
pub use self::{
document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords,
sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos,
sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition,
words_proximity::WordsProximity,
};
pub trait Criterion: Send + Sync {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
fn name(&self) -> &str;
#[inline]
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
self.evaluate(lhs, rhs) == Ordering::Equal
}
}
impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
(**self).evaluate(lhs, rhs)
}
fn name(&self) -> &str {
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
}
impl<T: Criterion + ?Sized> Criterion for Box<T> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
(**self).evaluate(lhs, rhs)
}
fn name(&self) -> &str {
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
}
#[derive(Default)]
pub struct CriteriaBuilder<'a> {
inner: Vec<Box<dyn Criterion + 'a>>,
}
impl<'a> CriteriaBuilder<'a> {
pub fn new() -> CriteriaBuilder<'a> {
CriteriaBuilder { inner: Vec::new() }
}
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
CriteriaBuilder {
inner: Vec::with_capacity(capacity),
}
}
pub fn reserve(&mut self, additional: usize) {
self.inner.reserve(additional)
}
pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
where
C: Criterion,
{
self.push(criterion);
self
}
pub fn push<C: 'a>(&mut self, criterion: C)
where
C: Criterion,
{
self.inner.push(Box::new(criterion));
}
pub fn build(self) -> Criteria<'a> {
Criteria { inner: self.inner }
}
}
pub struct Criteria<'a> {
inner: Vec<Box<dyn Criterion + 'a>>,
}
impl<'a> Default for Criteria<'a> {
fn default() -> Self {
CriteriaBuilder::with_capacity(7)
.add(SumOfTypos)
.add(NumberOfWords)
.add(WordsProximity)
.add(SumOfWordsAttribute)
.add(SumOfWordsPosition)
.add(Exact)
.add(DocumentId)
.build()
}
}
impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
&self.inner
}
}

View File

@ -1,31 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
#[inline]
fn number_of_query_words(query_index: &[u32]) -> usize {
query_index.linear_group().count()
}
#[derive(Debug, Clone, Copy)]
pub struct NumberOfWords;
impl Criterion for NumberOfWords {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
number_of_query_words(query_index)
};
let rhs = {
let query_index = rhs.query_index();
number_of_query_words(query_index)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &str {
"NumberOfWords"
}
}

View File

@ -1,116 +0,0 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
// This function is a wrong logarithmic 10 function.
// It is safe to panic on input number higher than 3,
// the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
match n {
0 => 0.0, // log(1)
1 => 0.30102, // log(2)
2 => 0.47712, // log(3)
3 => 0.60205, // log(4)
_ => panic!("invalid number"),
}
}
#[inline]
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
let mut number_words: usize = 0;
let mut sum_typos = 0.0;
let mut index = 0;
for group in query_index.linear_group() {
sum_typos += custom_log10(distance[index]);
number_words += 1;
index += group.len();
}
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfTypos;
impl Criterion for SumOfTypos {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let distance = lhs.distance();
sum_matches_typos(query_index, distance)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
sum_matches_typos(query_index, distance)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &str {
"SumOfTypos"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "Geox CEO"
//
// doc0: "Geox SpA: CEO and Executive"
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
#[test]
fn one_typo_reference() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0, 1];
let distance1 = &[1, 0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchette"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn no_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchztte"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn one_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 1];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
}

View File

@ -1,64 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
#[inline]
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
let mut sum_attributes = 0;
let mut index = 0;
for group in query_index.linear_group() {
sum_attributes += attribute[index] as usize;
index += group.len();
}
sum_attributes
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute;
impl Criterion for SumOfWordsAttribute {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let attribute = lhs.attribute();
sum_matches_attributes(query_index, attribute)
};
let rhs = {
let query_index = rhs.query_index();
let attribute = rhs.attribute();
sum_matches_attributes(query_index, attribute)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &str {
"SumOfWordsAttribute"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: { 0. "Soulier bleu", 1. "bla bla bla" }
// doc1: { 0. "Botte rouge", 1. "Soulier en cuir" }
#[test]
fn title_vs_description() {
let query_index0 = &[0];
let attribute0 = &[0];
let query_index1 = &[0];
let attribute1 = &[1];
let doc0 = sum_matches_attributes(query_index0, attribute0);
let doc1 = sum_matches_attributes(query_index1, attribute1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}

View File

@ -1,64 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
#[inline]
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
let mut sum_word_index = 0;
let mut index = 0;
for group in query_index.linear_group() {
sum_word_index += word_index[index] as usize;
index += group.len();
}
sum_word_index
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition;
impl Criterion for SumOfWordsPosition {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let word_index = lhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let word_index = rhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &str {
"SumOfWordsPosition"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "Botte rouge et soulier noir"
#[test]
fn easy_case() {
let query_index0 = &[0];
let word_index0 = &[0];
let query_index1 = &[0];
let word_index1 = &[3];
let doc0 = sum_matches_attribute_index(query_index0, word_index0);
let doc1 = sum_matches_attribute_index(query_index1, word_index1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}

View File

@ -1,164 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::{self, Ordering};
const MAX_DISTANCE: u16 = 8;
#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
(a.clone(), b.clone())
}
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE)
} else {
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
}
}
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
if lattr != rattr {
return MAX_DISTANCE;
}
index_proximity(lwi, rwi)
}
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
let mut min_prox = u16::max_value();
for a in lattr.iter().zip(lwi) {
for b in rattr.iter().zip(rwi) {
let a = clone_tuple(a);
let b = clone_tuple(b);
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
}
}
min_prox
}
fn matches_proximity(
query_index: &[u32],
distance: &[u8],
attribute: &[u16],
word_index: &[u16],
) -> u16 {
let mut query_index_groups = query_index.linear_group();
let mut proximity = 0;
let mut index = 0;
let get_attr_wi = |index: usize, group_len: usize| {
// retrieve the first distance group (with the lowest values)
let len = distance[index..index + group_len]
.linear_group()
.next()
.unwrap()
.len();
let rattr = &attribute[index..index + len];
let rwi = &word_index[index..index + len];
(rattr, rwi)
};
let mut last = query_index_groups.next().map(|group| {
let attr_wi = get_attr_wi(index, group.len());
index += group.len();
attr_wi
});
// iter by windows of size 2
while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
let attr_wi = get_attr_wi(index, rhs.len());
proximity += min_proximity(lhs, attr_wi);
last = Some(attr_wi);
index += rhs.len();
}
proximity
}
#[derive(Debug, Clone, Copy)]
pub struct WordsProximity;
impl Criterion for WordsProximity {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let distance = lhs.distance();
let attribute = lhs.attribute();
let word_index = lhs.word_index();
matches_proximity(query_index, distance, attribute, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
let attribute = rhs.attribute();
let word_index = rhs.word_index();
matches_proximity(query_index, distance, attribute, word_index)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &str {
"WordsProximity"
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn three_different_attributes() {
// "soup" "of the" "the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 0 }
// { id: 2, attr: 1, attr_index: 1 }
// { id: 2, attr: 2, attr_index: 0 }
// { id: 3, attr: 3, attr_index: 1 }
let query_index = &[0, 1, 2, 2, 3];
let distance = &[0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 2, 3];
let word_index = &[0, 0, 1, 0, 1];
// soup -> of = 8
// + of -> the = 1
// + the -> day = 8 (not 1)
assert_eq!(
matches_proximity(query_index, distance, attribute, word_index),
17
);
}
#[test]
fn two_different_attributes() {
// "soup day" "soup of the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 0, attr: 1, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 1 }
// { id: 2, attr: 1, attr_index: 2 }
// { id: 3, attr: 0, attr_index: 1 }
// { id: 3, attr: 1, attr_index: 3 }
let query_index = &[0, 0, 1, 2, 3, 3];
let distance = &[0, 0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 1, 0, 1];
let word_index = &[0, 0, 1, 2, 1, 3];
// soup -> of = 1
// + of -> the = 1
// + the -> day = 1
assert_eq!(
matches_proximity(query_index, distance, attribute, word_index),
3
);
}
}

View File

@ -1,200 +0,0 @@
use std::collections::hash_map::{Entry, HashMap};
use std::fs::File;
use std::path::Path;
use std::sync::{Arc, RwLock};
use std::{fs, thread};
use crossbeam_channel::Receiver;
use heed::types::{Str, Unit};
use heed::{CompactionOption, Result as ZResult};
use log::{debug, error};
use crate::{store, update, Index, MResult};
pub type BoxUpdateFn = Box<dyn Fn(update::UpdateResult) + Send + Sync + 'static>;
type ArcSwapFn = arc_swap::ArcSwapOption<BoxUpdateFn>;
pub struct Database {
pub env: heed::Env,
common_store: heed::PolyDatabase,
indexes_store: heed::Database<Str, Unit>,
indexes: RwLock<HashMap<String, (Index, Arc<ArcSwapFn>, thread::JoinHandle<()>)>>,
}
fn update_awaiter(receiver: Receiver<()>, env: heed::Env, update_fn: Arc<ArcSwapFn>, index: Index) {
for () in receiver {
// consume all updates in order (oldest first)
loop {
let mut writer = match env.write_txn() {
Ok(writer) => writer,
Err(e) => {
error!("LMDB writer transaction begin failed: {}", e);
break;
}
};
match update::update_task(&mut writer, index.clone()) {
Ok(Some(status)) => {
if let Err(e) = writer.commit() {
error!("update transaction failed: {}", e)
}
if let Some(ref callback) = *update_fn.load() {
(callback)(status);
}
}
// no more updates to handle for now
Ok(None) => {
debug!("no more updates");
writer.abort();
break;
}
Err(e) => {
error!("update task failed: {}", e);
writer.abort()
}
}
}
}
}
impl Database {
pub fn open_or_create(path: impl AsRef<Path>) -> MResult<Database> {
fs::create_dir_all(path.as_ref())?;
let env = heed::EnvOpenOptions::new()
.map_size(10 * 1024 * 1024 * 1024) // 10GB
.max_dbs(3000)
.open(path)?;
let common_store = env.create_dyn_database(Some("common"))?;
let indexes_store = env.create_database::<Str, Unit>(Some("indexes"))?;
// list all indexes that needs to be opened
let mut must_open = Vec::new();
let reader = env.read_txn()?;
for result in indexes_store.iter(&reader)? {
let (index_name, _) = result?;
must_open.push(index_name.to_owned());
}
reader.abort();
// open the previously aggregated indexes
let mut indexes = HashMap::new();
for index_name in must_open {
let (sender, receiver) = crossbeam_channel::bounded(100);
let index = match store::open(&env, &index_name, sender.clone())? {
Some(index) => index,
None => {
log::warn!(
"the index {} doesn't exist or has not all the databases",
index_name
);
continue;
}
};
let update_fn = Arc::new(ArcSwapFn::empty());
let env_clone = env.clone();
let index_clone = index.clone();
let update_fn_clone = update_fn.clone();
let handle = thread::spawn(move || {
update_awaiter(receiver, env_clone, update_fn_clone, index_clone)
});
// send an update notification to make sure that
// possible pre-boot updates are consumed
sender.send(()).unwrap();
let result = indexes.insert(index_name, (index, update_fn, handle));
assert!(
result.is_none(),
"The index should not have been already open"
);
}
Ok(Database {
env,
common_store,
indexes_store,
indexes: RwLock::new(indexes),
})
}
pub fn open_index(&self, name: impl AsRef<str>) -> Option<Index> {
let indexes_lock = self.indexes.read().unwrap();
match indexes_lock.get(name.as_ref()) {
Some((index, ..)) => Some(index.clone()),
None => None,
}
}
pub fn create_index(&self, name: impl AsRef<str>) -> MResult<Index> {
let name = name.as_ref();
let mut indexes_lock = self.indexes.write().unwrap();
match indexes_lock.entry(name.to_owned()) {
Entry::Occupied(_) => Err(crate::Error::IndexAlreadyExists),
Entry::Vacant(entry) => {
let (sender, receiver) = crossbeam_channel::bounded(100);
let index = store::create(&self.env, name, sender)?;
let mut writer = self.env.write_txn()?;
self.indexes_store.put(&mut writer, name, &())?;
let env_clone = self.env.clone();
let index_clone = index.clone();
let no_update_fn = Arc::new(ArcSwapFn::empty());
let no_update_fn_clone = no_update_fn.clone();
let handle = thread::spawn(move || {
update_awaiter(receiver, env_clone, no_update_fn_clone, index_clone)
});
writer.commit()?;
entry.insert((index.clone(), no_update_fn, handle));
Ok(index)
}
}
}
pub fn set_update_callback(&self, name: impl AsRef<str>, update_fn: BoxUpdateFn) -> bool {
let indexes_lock = self.indexes.read().unwrap();
match indexes_lock.get(name.as_ref()) {
Some((_, current_update_fn, _)) => {
let update_fn = Some(Arc::new(update_fn));
current_update_fn.swap(update_fn);
true
}
None => false,
}
}
pub fn unset_update_callback(&self, name: impl AsRef<str>) -> bool {
let indexes_lock = self.indexes.read().unwrap();
match indexes_lock.get(name.as_ref()) {
Some((_, current_update_fn, _)) => {
current_update_fn.swap(None);
true
}
None => false,
}
}
pub fn copy_and_compact_to_path<P: AsRef<Path>>(&self, path: P) -> ZResult<File> {
self.env.copy_to_path(path, CompactionOption::Enabled)
}
pub fn indexes_names(&self) -> MResult<Vec<String>> {
let indexes = self.indexes.read().unwrap();
Ok(indexes.keys().cloned().collect())
}
pub fn common_store(&self) -> heed::PolyDatabase {
self.common_store
}
}

View File

@ -1,168 +0,0 @@
#[cfg(test)]
#[macro_use]
extern crate assert_matches;
mod automaton;
pub mod criterion;
mod database;
mod distinct_map;
mod error;
mod number;
mod query_builder;
mod ranked_map;
mod raw_document;
pub mod raw_indexer;
mod reordered_attrs;
pub mod serde;
pub mod store;
mod update;
pub use self::database::{BoxUpdateFn, Database};
pub use self::error::{Error, MResult};
pub use self::number::{Number, ParseNumberError};
pub use self::ranked_map::RankedMap;
pub use self::raw_document::RawDocument;
pub use self::store::Index;
pub use self::update::{UpdateResult, UpdateStatus, UpdateType};
use ::serde::{Deserialize, Serialize};
use zerocopy::{AsBytes, FromBytes};
/// Represent an internally generated document unique identifier.
///
/// It is used to inform the database the document you want to deserialize.
/// Helpful for custom ranking.
#[derive(
Debug,
Copy,
Clone,
Eq,
PartialEq,
PartialOrd,
Ord,
Hash,
Serialize,
Deserialize,
AsBytes,
FromBytes,
)]
#[repr(C)]
pub struct DocumentId(pub u64);
/// This structure represent the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, AsBytes, FromBytes)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
pub word_index: u16,
/// The position in bytes where the word was found
/// along with the length of it.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
pub char_length: u16,
}
/// This structure represent a matching word with informations
/// on the location of the word in the document.
///
/// The order of the field is important because it defines
/// the way these structures are ordered between themselves.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Highlight {
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
/// The position in bytes where the word was found.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
/// The length in bytes of the found word.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_length: u16,
}
#[doc(hidden)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TmpMatch {
pub query_index: u32,
pub distance: u8,
pub attribute: u16,
pub word_index: u16,
pub is_exact: bool,
}
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Document {
pub id: DocumentId,
pub highlights: Vec<Highlight>,
#[cfg(test)]
pub matches: Vec<TmpMatch>,
}
impl Document {
#[cfg(not(test))]
fn from_raw(raw: RawDocument) -> Document {
Document {
id: raw.id,
highlights: raw.highlights,
}
}
#[cfg(test)]
fn from_raw(raw: RawDocument) -> Document {
let len = raw.query_index().len();
let mut matches = Vec::with_capacity(len);
let query_index = raw.query_index();
let distance = raw.distance();
let attribute = raw.attribute();
let word_index = raw.word_index();
let is_exact = raw.is_exact();
for i in 0..len {
let match_ = TmpMatch {
query_index: query_index[i],
distance: distance[i],
attribute: attribute[i],
word_index: word_index[i],
is_exact: is_exact[i],
};
matches.push(match_);
}
Document {
id: raw.id,
matches,
highlights: raw.highlights,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 16);
}
}

File diff suppressed because it is too large Load Diff

View File

@ -1,398 +0,0 @@
use std::ops::Range;
use std::cmp::Ordering::{Less, Greater, Equal};
/// Return `true` if the specified range can accept the given replacements words.
/// Returns `false` if the replacements words are already present in the original query
/// or if there is fewer replacement words than the range to replace.
//
//
// ## Ignored because already present in original
//
// new york city subway
// -------- ^^^^
// / \
// [new york city]
//
//
// ## Ignored because smaller than the original
//
// new york city subway
// -------------
// \ /
// [new york]
//
//
// ## Accepted because bigger than the original
//
// NYC subway
// ---
// / \
// / \
// / \
// / \
// / \
// [new york city]
//
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
where S: AsRef<str>,
T: AsRef<str>,
{
if words.len() <= range.len() {
// there is fewer or equal replacement words
// than there is already in the replaced range
return false
}
// retrieve the part to rewrite but with the length
// of the replacement part
let original = query.iter().skip(range.start).take(words.len());
// check if the original query doesn't already contain
// the replacement words
!original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
}
type Origin = usize;
type RealLength = usize;
struct FakeIntervalTree {
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl FakeIntervalTree {
fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
FakeIntervalTree { intervals }
}
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
let element = self.intervals.binary_search_by(|(r, _)| {
if point >= r.start {
if point < r.end { Equal } else { Less }
} else { Greater }
});
let n = match element { Ok(n) => n, Err(n) => n };
match self.intervals.get(n) {
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
_otherwise => None,
}
}
}
pub struct QueryEnhancerBuilder<'a, S> {
query: &'a [S],
origins: Vec<usize>,
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
// we initialize origins query indices based on their positions
let origins: Vec<_> = (0..query.len() + 1).collect();
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
QueryEnhancerBuilder { query, origins, real_to_origin }
}
/// Update the final real to origin query indices mapping.
///
/// `range` is the original words range that this `replacement` words replace
/// and `real` is the first real query index of these replacement words.
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
where T: AsRef<str>,
{
// check if the range of original words
// can be rewritten with the replacement words
if rewrite_range_with(self.query, range.clone(), replacement) {
// this range can be replaced so we need to
// modify the origins accordingly
let offset = replacement.len() - range.len();
let previous_padding = self.origins[range.end - 1];
let current_offset = (self.origins[range.end] - 1) - previous_padding;
let diff = offset.saturating_sub(current_offset);
self.origins[range.end] += diff;
for r in &mut self.origins[range.end + 1..] {
*r += diff;
}
}
// we need to store the real number and origins relations
// this way it will be possible to know by how many
// we need to pad real query indices
let real_range = real..real + replacement.len().max(range.len());
let real_length = replacement.len();
self.real_to_origin.push((real_range, (range.start, real_length)));
}
pub fn build(self) -> QueryEnhancer {
QueryEnhancer {
origins: self.origins,
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
}
}
}
pub struct QueryEnhancer {
origins: Vec<usize>,
real_to_origin: FakeIntervalTree,
}
impl QueryEnhancer {
/// Returns the query indices to use to replace this real query index.
pub fn replacement(&self, real: u32) -> Range<u32> {
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) =
self.real_to_origin
.query(real)
.expect("real has never been declared");
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 { new_origin = origin + i; break }
}
let n = real - range.start;
let start = self.origins[origin];
let end = self.origins[new_origin + 1];
let remaining = (end - start) - n;
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = self.origins[origin];
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn original_unmodified() {
let query = ["new", "york", "city", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..2); // york
assert_eq!(enhancer.replacement(2), 2..3); // city
assert_eq!(enhancer.replacement(3), 3..4); // subway
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
}
#[test]
fn simple_growing() {
let query = ["new", "york", "subway"];
// 0 1 2
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 3, &["new", "york", "city"]);
// ^ 3 4 5
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..3); // york
assert_eq!(enhancer.replacement(2), 3..4); // subway
assert_eq!(enhancer.replacement(3), 0..1); // new
assert_eq!(enhancer.replacement(4), 1..2); // york
assert_eq!(enhancer.replacement(5), 2..3); // city
}
#[test]
fn same_place_growings() {
let query = ["NY", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NY = new york
builder.declare(0..1, 2, &["new", "york"]);
// ^ 2 3
// NY = new york city
builder.declare(0..1, 4, &["new", "york", "city"]);
// ^ 4 5 6
// NY = NYC
builder.declare(0..1, 7, &["NYC"]);
// ^ 7
// NY = new york city
builder.declare(0..1, 8, &["new", "york", "city"]);
// ^ 8 9 10
// subway = underground train
builder.declare(1..2, 11, &["underground", "train"]);
// ^ 11 12
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NY
assert_eq!(enhancer.replacement(1), 3..5); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..3); // york
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
assert_eq!(enhancer.replacement(7), 0..3); // NYC
assert_eq!(enhancer.replacement(8), 0..1); // new
assert_eq!(enhancer.replacement(9), 1..2); // york
assert_eq!(enhancer.replacement(10), 2..3); // city
assert_eq!(enhancer.replacement(11), 3..4); // underground
assert_eq!(enhancer.replacement(12), 4..5); // train
}
#[test]
fn bigger_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(0..1, 2, &["new", "york", "city"]);
// ^ 2 3 4
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NYC
assert_eq!(enhancer.replacement(1), 3..4); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..2); // york
assert_eq!(enhancer.replacement(4), 2..3); // city
}
#[test]
fn middle_query_growing() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..6); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
}
#[test]
fn end_query_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(1..2, 2, &["underground", "train"]);
// ^ 2 3
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // NYC
assert_eq!(enhancer.replacement(1), 1..3); // subway
assert_eq!(enhancer.replacement(2), 1..2); // underground
assert_eq!(enhancer.replacement(3), 2..3); // train
}
#[test]
fn multiple_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
}
#[test]
fn multiple_probable_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
// great awesome = good
builder.declare(0..2, 9, &["good"]);
// ^ 9
// awesome NYC = NY
builder.declare(1..3, 10, &["NY"]);
// ^^ 10
// NYC subway = metro
builder.declare(2..4, 11, &["metro"]);
// ^^ 11
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
assert_eq!(enhancer.replacement(9), 0..2); // good
assert_eq!(enhancer.replacement(10), 1..5); // NY
assert_eq!(enhancer.replacement(11), 2..5); // metro
}
}

View File

@ -1,186 +0,0 @@
use std::fmt;
use std::sync::Arc;
use meilidb_schema::SchemaAttr;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::{DocumentId, Highlight, TmpMatch};
#[derive(Clone)]
pub struct RawDocument {
pub id: DocumentId,
pub matches: SharedMatches,
pub highlights: Vec<Highlight>,
pub fields_counts: SetBuf<(SchemaAttr, u64)>,
}
impl RawDocument {
pub fn query_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe {
&self
.matches
.matches
.query_index
.get_unchecked(r.start..r.end)
}
}
pub fn distance(&self) -> &[u8] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
}
pub fn attribute(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
}
pub fn word_index(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe {
&self
.matches
.matches
.word_index
.get_unchecked(r.start..r.end)
}
}
pub fn is_exact(&self) -> &[bool] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
}
}
impl fmt::Debug for RawDocument {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("RawDocument {\r\n")?;
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"query_index",
self.query_index()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"distance",
self.distance()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"attribute",
self.attribute()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"word_index",
self.word_index()
))?;
f.write_fmt(format_args!(
"{:>15}: {:^5?},\r\n",
"is_exact",
self.is_exact()
))?;
f.write_str("}")?;
Ok(())
}
}
pub fn raw_documents_from(
matches: SetBuf<(DocumentId, TmpMatch)>,
highlights: SetBuf<(DocumentId, Highlight)>,
fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
) -> Vec<RawDocument> {
let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
let mut matches2 = Matches::with_capacity(matches.len());
let matches = matches.linear_group_by_key(|(id, _)| *id);
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);
for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
debug_assert_eq!(mgroup[0].0, hgroup[0].0);
debug_assert_eq!(mgroup[0].0, fgroup[0].0);
let document_id = mgroup[0].0;
let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
let end = start + mgroup.len();
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();
docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
matches2.extend_from_slice(mgroup);
}
let matches = Arc::new(matches2);
docs_ranges
.into_iter()
.map(|(id, range, highlights, fields_counts)| {
let matches = SharedMatches {
range,
matches: matches.clone(),
};
RawDocument {
id,
matches,
highlights,
fields_counts,
}
})
.collect()
}
#[derive(Debug, Copy, Clone)]
struct Range {
start: usize,
end: usize,
}
#[derive(Clone)]
pub struct SharedMatches {
range: Range,
matches: Arc<Matches>,
}
#[derive(Clone)]
struct Matches {
query_index: Vec<u32>,
distance: Vec<u8>,
attribute: Vec<u16>,
word_index: Vec<u16>,
is_exact: Vec<bool>,
}
impl Matches {
fn with_capacity(cap: usize) -> Matches {
Matches {
query_index: Vec::with_capacity(cap),
distance: Vec::with_capacity(cap),
attribute: Vec::with_capacity(cap),
word_index: Vec::with_capacity(cap),
is_exact: Vec::with_capacity(cap),
}
}
fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
for (_, match_) in matches {
self.query_index.push(match_.query_index);
self.distance.push(match_.distance);
self.attribute.push(match_.attribute);
self.word_index.push(match_.word_index);
self.is_exact.push(match_.is_exact);
}
}
}

View File

@ -1,255 +0,0 @@
use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;
use crate::{DocIndex, DocumentId};
use deunicode::deunicode_with_tofu;
use meilidb_schema::SchemaAttr;
use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
use sdset::SetBuf;
type Word = Vec<u8>; // TODO make it be a SmallVec
pub struct RawIndexer {
word_limit: usize, // the maximum number of indexed words
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
docs_words: HashMap<DocumentId, Vec<Word>>,
}
pub struct Indexed {
pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
pub docs_words: HashMap<DocumentId, fst::Set>,
}
impl RawIndexer {
pub fn new() -> RawIndexer {
RawIndexer::with_word_limit(1000)
}
pub fn with_word_limit(limit: usize) -> RawIndexer {
RawIndexer {
word_limit: limit,
words_doc_indexes: BTreeMap::new(),
docs_words: HashMap::new(),
}
}
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) -> usize {
let mut number_of_words = 0;
let lowercase_text = text.to_lowercase();
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
// TODO compute the deunicoded version after the cjk check
let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded {
Some(deunicoded)
} else {
None
};
let iter = Some(lowercase_text).into_iter().chain(next);
for text in iter {
// we must not count 2 times the same words
number_of_words = 0;
for token in Tokenizer::new(&text) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue {
break;
}
number_of_words += 1;
}
}
number_of_words
}
pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
where
I: IntoIterator<Item = &'a str, IntoIter = IT>,
IT: Iterator<Item = &'a str> + Clone,
{
// TODO serialize this to one call to the SeqTokenizer loop
let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect();
let iter = lowercased.iter().map(|t| t.as_str());
for token in SeqTokenizer::new(iter) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue {
break;
}
}
let deunicoded: Vec<_> = lowercased
.into_iter()
.map(|lowercase_text| {
if lowercase_text.contains(is_cjk) {
return lowercase_text;
}
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
if lowercase_text != deunicoded {
deunicoded
} else {
lowercase_text
}
})
.collect();
let iter = deunicoded.iter().map(|t| t.as_str());
for token in SeqTokenizer::new(iter) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue {
break;
}
}
}
pub fn build(self) -> Indexed {
let words_doc_indexes = self
.words_doc_indexes
.into_iter()
.map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
.collect();
let docs_words = self
.docs_words
.into_iter()
.map(|(id, mut words)| {
words.sort_unstable();
words.dedup();
(id, fst::Set::from_iter(words).unwrap())
})
.collect();
Indexed {
words_doc_indexes,
docs_words,
}
}
}
impl Default for RawIndexer {
fn default() -> Self {
Self::new()
}
}
fn index_token(
token: Token,
id: DocumentId,
attr: SchemaAttr,
word_limit: usize,
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool {
if token.word_index >= word_limit {
return false;
}
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
}
None => return false,
}
true
}
fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
let word_index = u16::try_from(token.word_index).ok()?;
let char_index = u16::try_from(token.char_index).ok()?;
let char_length = u16::try_from(token.word.chars().count()).ok()?;
let docindex = DocIndex {
document_id: id,
attribute: attr.0,
word_index,
char_index,
char_length,
};
Some(docindex)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn strange_apostrophe() {
let mut indexer = RawIndexer::new();
let docid = DocumentId(0);
let attr = SchemaAttr(0);
let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
indexer.index_text(docid, attr, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes
.get(&"l’éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
fn strange_apostrophe_in_sequence() {
let mut indexer = RawIndexer::new();
let docid = DocumentId(0);
let attr = SchemaAttr(0);
let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
indexer.index_text_seq(docid, attr, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes
.get(&"l’éteindre".to_owned().into_bytes())
.is_some());
}
}

View File

@ -1,27 +0,0 @@
#[derive(Default, Clone)]
pub struct ReorderedAttrs {
count: usize,
reorders: Vec<Option<u16>>,
}
impl ReorderedAttrs {
pub fn new() -> ReorderedAttrs {
ReorderedAttrs {
count: 0,
reorders: Vec::new(),
}
}
pub fn insert_attribute(&mut self, attribute: u16) {
self.reorders.resize(attribute as usize + 1, None);
self.reorders[attribute as usize] = Some(self.count as u16);
self.count += 1;
}
pub fn get(&self, attribute: u16) -> Option<u16> {
match self.reorders.get(attribute as usize) {
Some(Some(attribute)) => Some(*attribute),
_ => None,
}
}
}

View File

@ -1,74 +0,0 @@
use heed::types::{ByteSlice, OwnedType};
use heed::Result as ZResult;
use meilidb_schema::SchemaAttr;
use super::DocumentAttrKey;
use crate::DocumentId;
#[derive(Copy, Clone)]
pub struct DocumentsFields {
pub(crate) documents_fields: heed::Database<OwnedType<DocumentAttrKey>, ByteSlice>,
}
impl DocumentsFields {
pub fn put_document_field(
self,
writer: &mut heed::RwTxn,
document_id: DocumentId,
attribute: SchemaAttr,
value: &[u8],
) -> ZResult<()> {
let key = DocumentAttrKey::new(document_id, attribute);
self.documents_fields.put(writer, &key, value)
}
pub fn del_all_document_fields(
self,
writer: &mut heed::RwTxn,
document_id: DocumentId,
) -> ZResult<usize> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
self.documents_fields.delete_range(writer, start..=end)
}
pub fn document_attribute<'txn>(
self,
reader: &'txn heed::RoTxn,
document_id: DocumentId,
attribute: SchemaAttr,
) -> ZResult<Option<&'txn [u8]>> {
let key = DocumentAttrKey::new(document_id, attribute);
self.documents_fields.get(reader, &key)
}
pub fn document_fields<'txn>(
self,
reader: &'txn heed::RoTxn,
document_id: DocumentId,
) -> ZResult<DocumentFieldsIter<'txn>> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
let iter = self.documents_fields.range(reader, start..=end)?;
Ok(DocumentFieldsIter { iter })
}
}
pub struct DocumentFieldsIter<'txn> {
iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, ByteSlice>,
}
impl<'txn> Iterator for DocumentFieldsIter<'txn> {
type Item = ZResult<(SchemaAttr, &'txn [u8])>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {
Some(Ok((key, bytes))) => {
let attr = SchemaAttr(key.attr.get());
Some(Ok((attr, bytes)))
}
Some(Err(e)) => Some(Err(e)),
None => None,
}
}
}

View File

@ -1,102 +0,0 @@
use crate::RankedMap;
use heed::types::{ByteSlice, OwnedType, SerdeBincode, Str};
use heed::Result as ZResult;
use meilidb_schema::Schema;
use std::sync::Arc;
const CUSTOMS_KEY: &str = "customs-key";
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
const RANKED_MAP_KEY: &str = "ranked-map";
const SCHEMA_KEY: &str = "schema";
const SYNONYMS_KEY: &str = "synonyms";
const WORDS_KEY: &str = "words";
#[derive(Copy, Clone)]
pub struct Main {
pub(crate) main: heed::PolyDatabase,
}
impl Main {
pub fn put_words_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes();
self.main.put::<Str, ByteSlice>(writer, WORDS_KEY, bytes)
}
pub fn words_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}
None => Ok(None),
}
}
pub fn put_schema(self, writer: &mut heed::RwTxn, schema: &Schema) -> ZResult<()> {
self.main
.put::<Str, SerdeBincode<Schema>>(writer, SCHEMA_KEY, schema)
}
pub fn schema(self, reader: &heed::RoTxn) -> ZResult<Option<Schema>> {
self.main
.get::<Str, SerdeBincode<Schema>>(reader, SCHEMA_KEY)
}
pub fn put_ranked_map(self, writer: &mut heed::RwTxn, ranked_map: &RankedMap) -> ZResult<()> {
self.main
.put::<Str, SerdeBincode<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
}
pub fn ranked_map(self, reader: &heed::RoTxn) -> ZResult<Option<RankedMap>> {
self.main
.get::<Str, SerdeBincode<RankedMap>>(reader, RANKED_MAP_KEY)
}
pub fn put_synonyms_fst(self, writer: &mut heed::RwTxn, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes();
self.main.put::<Str, ByteSlice>(writer, SYNONYMS_KEY, bytes)
}
pub fn synonyms_fst(self, reader: &heed::RoTxn) -> ZResult<Option<fst::Set>> {
match self.main.get::<Str, ByteSlice>(reader, SYNONYMS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::from(bytes);
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}
None => Ok(None),
}
}
pub fn put_number_of_documents<F>(self, writer: &mut heed::RwTxn, f: F) -> ZResult<u64>
where
F: Fn(u64) -> u64,
{
let new = self.number_of_documents(writer).map(f)?;
self.main
.put::<Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?;
Ok(new)
}
pub fn number_of_documents(self, reader: &heed::RoTxn) -> ZResult<u64> {
match self
.main
.get::<Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)?
{
Some(value) => Ok(value),
None => Ok(0),
}
}
pub fn put_customs(self, writer: &mut heed::RwTxn, customs: &[u8]) -> ZResult<()> {
self.main
.put::<Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
}
pub fn customs<'txn>(self, reader: &'txn heed::RoTxn) -> ZResult<Option<&'txn [u8]>> {
self.main.get::<Str, ByteSlice>(reader, CUSTOMS_KEY)
}
}

View File

@ -1,325 +0,0 @@
mod docs_words;
mod documents_fields;
mod documents_fields_counts;
mod main;
mod postings_lists;
mod synonyms;
mod updates;
mod updates_results;
pub use self::docs_words::DocsWords;
pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
pub use self::documents_fields_counts::{
DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
};
pub use self::main::Main;
pub use self::postings_lists::PostingsLists;
pub use self::synonyms::Synonyms;
pub use self::updates::Updates;
pub use self::updates_results::UpdatesResults;
use std::collections::HashSet;
use heed::Result as ZResult;
use meilidb_schema::{Schema, SchemaAttr};
use serde::de;
use zerocopy::{AsBytes, FromBytes};
use crate::criterion::Criteria;
use crate::serde::Deserializer;
use crate::{query_builder::QueryBuilder, update, DocumentId, Error, MResult};
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
type BEU16 = zerocopy::U16<byteorder::BigEndian>;
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
#[repr(C)]
pub struct DocumentAttrKey {
docid: BEU64,
attr: BEU16,
}
impl DocumentAttrKey {
fn new(docid: DocumentId, attr: SchemaAttr) -> DocumentAttrKey {
DocumentAttrKey {
docid: BEU64::new(docid.0),
attr: BEU16::new(attr.0),
}
}
}
fn main_name(name: &str) -> String {
format!("store-{}", name)
}
fn postings_lists_name(name: &str) -> String {
format!("store-{}-postings-lists", name)
}
fn documents_fields_name(name: &str) -> String {
format!("store-{}-documents-fields", name)
}
fn documents_fields_counts_name(name: &str) -> String {
format!("store-{}-documents-fields-counts", name)
}
fn synonyms_name(name: &str) -> String {
format!("store-{}-synonyms", name)
}
fn docs_words_name(name: &str) -> String {
format!("store-{}-docs-words", name)
}
fn updates_name(name: &str) -> String {
format!("store-{}-updates", name)
}
fn updates_results_name(name: &str) -> String {
format!("store-{}-updates-results", name)
}
#[derive(Clone)]
pub struct Index {
pub main: Main,
pub postings_lists: PostingsLists,
pub documents_fields: DocumentsFields,
pub documents_fields_counts: DocumentsFieldsCounts,
pub synonyms: Synonyms,
pub docs_words: DocsWords,
pub updates: Updates,
pub updates_results: UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
}
impl Index {
pub fn document<T: de::DeserializeOwned>(
&self,
reader: &heed::RoTxn,
attributes: Option<&HashSet<&str>>,
document_id: DocumentId,
) -> MResult<Option<T>> {
let schema = self.main.schema(reader)?;
let schema = schema.ok_or(Error::SchemaMissing)?;
let attributes = match attributes {
Some(attributes) => attributes
.iter()
.map(|name| schema.attribute(name))
.collect(),
None => None,
};
let mut deserializer = Deserializer {
document_id,
reader,
documents_fields: self.documents_fields,
schema: &schema,
attributes: attributes.as_ref(),
};
// TODO: currently we return an error if all document fields are missing,
// returning None would have been better
Ok(T::deserialize(&mut deserializer).map(Some)?)
}
pub fn document_attribute<T: de::DeserializeOwned>(
&self,
reader: &heed::RoTxn,
document_id: DocumentId,
attribute: SchemaAttr,
) -> MResult<Option<T>> {
let bytes = self
.documents_fields
.document_attribute(reader, document_id, attribute)?;
match bytes {
Some(bytes) => Ok(Some(serde_json::from_slice(bytes)?)),
None => Ok(None),
}
}
pub fn schema_update(&self, writer: &mut heed::RwTxn, schema: Schema) -> MResult<u64> {
let _ = self.updates_notifier.send(());
update::push_schema_update(writer, self.updates, self.updates_results, schema)
}
pub fn customs_update(&self, writer: &mut heed::RwTxn, customs: Vec<u8>) -> ZResult<u64> {
let _ = self.updates_notifier.send(());
update::push_customs_update(writer, self.updates, self.updates_results, customs)
}
pub fn documents_addition<D>(&self) -> update::DocumentsAddition<D> {
update::DocumentsAddition::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn documents_deletion(&self) -> update::DocumentsDeletion {
update::DocumentsDeletion::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn synonyms_addition(&self) -> update::SynonymsAddition {
update::SynonymsAddition::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn synonyms_deletion(&self) -> update::SynonymsDeletion {
update::SynonymsDeletion::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn current_update_id(&self, reader: &heed::RoTxn) -> MResult<Option<u64>> {
match self.updates.last_update_id(reader)? {
Some((id, _)) => Ok(Some(id)),
None => Ok(None),
}
}
pub fn update_status(
&self,
reader: &heed::RoTxn,
update_id: u64,
) -> MResult<update::UpdateStatus> {
update::update_status(reader, self.updates, self.updates_results, update_id)
}
pub fn query_builder(&self) -> QueryBuilder {
QueryBuilder::new(
self.main,
self.postings_lists,
self.documents_fields_counts,
self.synonyms,
)
}
pub fn query_builder_with_criteria<'c, 'f, 'd>(
&self,
criteria: Criteria<'c>,
) -> QueryBuilder<'c, 'f, 'd> {
QueryBuilder::with_criteria(
self.main,
self.postings_lists,
self.documents_fields_counts,
self.synonyms,
criteria,
)
}
}
pub fn create(
env: &heed::Env,
name: &str,
updates_notifier: crossbeam_channel::Sender<()>,
) -> MResult<Index> {
// create all the store names
let main_name = main_name(name);
let postings_lists_name = postings_lists_name(name);
let documents_fields_name = documents_fields_name(name);
let documents_fields_counts_name = documents_fields_counts_name(name);
let synonyms_name = synonyms_name(name);
let docs_words_name = docs_words_name(name);
let updates_name = updates_name(name);
let updates_results_name = updates_results_name(name);
// open all the stores
let main = env.create_dyn_database(Some(&main_name))?;
let postings_lists = env.create_database(Some(&postings_lists_name))?;
let documents_fields = env.create_database(Some(&documents_fields_name))?;
let documents_fields_counts = env.create_database(Some(&documents_fields_counts_name))?;
let synonyms = env.create_database(Some(&synonyms_name))?;
let docs_words = env.create_database(Some(&docs_words_name))?;
let updates = env.create_database(Some(&updates_name))?;
let updates_results = env.create_database(Some(&updates_results_name))?;
Ok(Index {
main: Main { main },
postings_lists: PostingsLists { postings_lists },
documents_fields: DocumentsFields { documents_fields },
documents_fields_counts: DocumentsFieldsCounts {
documents_fields_counts,
},
synonyms: Synonyms { synonyms },
docs_words: DocsWords { docs_words },
updates: Updates { updates },
updates_results: UpdatesResults { updates_results },
updates_notifier,
})
}
pub fn open(
env: &heed::Env,
name: &str,
updates_notifier: crossbeam_channel::Sender<()>,
) -> MResult<Option<Index>> {
// create all the store names
let main_name = main_name(name);
let postings_lists_name = postings_lists_name(name);
let documents_fields_name = documents_fields_name(name);
let documents_fields_counts_name = documents_fields_counts_name(name);
let synonyms_name = synonyms_name(name);
let docs_words_name = docs_words_name(name);
let updates_name = updates_name(name);
let updates_results_name = updates_results_name(name);
// open all the stores
let main = match env.open_dyn_database(Some(&main_name))? {
Some(main) => main,
None => return Ok(None),
};
let postings_lists = match env.open_database(Some(&postings_lists_name))? {
Some(postings_lists) => postings_lists,
None => return Ok(None),
};
let documents_fields = match env.open_database(Some(&documents_fields_name))? {
Some(documents_fields) => documents_fields,
None => return Ok(None),
};
let documents_fields_counts = match env.open_database(Some(&documents_fields_counts_name))? {
Some(documents_fields_counts) => documents_fields_counts,
None => return Ok(None),
};
let synonyms = match env.open_database(Some(&synonyms_name))? {
Some(synonyms) => synonyms,
None => return Ok(None),
};
let docs_words = match env.open_database(Some(&docs_words_name))? {
Some(docs_words) => docs_words,
None => return Ok(None),
};
let updates = match env.open_database(Some(&updates_name))? {
Some(updates) => updates,
None => return Ok(None),
};
let updates_results = match env.open_database(Some(&updates_results_name))? {
Some(updates_results) => updates_results,
None => return Ok(None),
};
Ok(Some(Index {
main: Main { main },
postings_lists: PostingsLists { postings_lists },
documents_fields: DocumentsFields { documents_fields },
documents_fields_counts: DocumentsFieldsCounts {
documents_fields_counts,
},
synonyms: Synonyms { synonyms },
docs_words: DocsWords { docs_words },
updates: Updates { updates },
updates_results: UpdatesResults { updates_results },
updates_notifier,
}))
}

View File

@ -1,37 +0,0 @@
use crate::DocIndex;
use heed::types::{ByteSlice, CowSlice};
use heed::Result as ZResult;
use sdset::{Set, SetBuf};
use std::borrow::Cow;
#[derive(Copy, Clone)]
pub struct PostingsLists {
pub(crate) postings_lists: heed::Database<ByteSlice, CowSlice<DocIndex>>,
}
impl PostingsLists {
pub fn put_postings_list(
self,
writer: &mut heed::RwTxn,
word: &[u8],
words_indexes: &Set<DocIndex>,
) -> ZResult<()> {
self.postings_lists.put(writer, word, words_indexes)
}
pub fn del_postings_list(self, writer: &mut heed::RwTxn, word: &[u8]) -> ZResult<bool> {
self.postings_lists.delete(writer, word)
}
pub fn postings_list<'txn>(
self,
reader: &'txn heed::RoTxn,
word: &[u8],
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
match self.postings_lists.get(reader, word)? {
Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))),
Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))),
None => Ok(None),
}
}
}

View File

@ -1,194 +0,0 @@
use std::collections::{HashMap, HashSet};
use fst::{set::OpBuilder, SetBuilder};
use sdset::{duo::Union, SetOperation};
use serde::Serialize;
use crate::raw_indexer::RawIndexer;
use crate::serde::{extract_document_id, RamDocumentStore, Serializer};
use crate::store;
use crate::update::{apply_documents_deletion, next_update_id, Update};
use crate::{Error, MResult, RankedMap};
pub struct DocumentsAddition<D> {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
documents: Vec<D>,
}
impl<D> DocumentsAddition<D> {
pub fn new(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
) -> DocumentsAddition<D> {
DocumentsAddition {
updates_store,
updates_results_store,
updates_notifier,
documents: Vec::new(),
}
}
pub fn update_document(&mut self, document: D) {
self.documents.push(document);
}
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64>
where
D: serde::Serialize,
{
let _ = self.updates_notifier.send(());
let update_id = push_documents_addition(
writer,
self.updates_store,
self.updates_results_store,
self.documents,
)?;
Ok(update_id)
}
}
impl<D> Extend<D> for DocumentsAddition<D> {
fn extend<T: IntoIterator<Item = D>>(&mut self, iter: T) {
self.documents.extend(iter)
}
}
pub fn push_documents_addition<D: serde::Serialize>(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: Vec<D>,
) -> MResult<u64> {
let mut values = Vec::with_capacity(addition.len());
for add in addition {
let vec = serde_json::to_vec(&add)?;
let add = serde_json::from_slice(&vec)?;
values.push(add);
}
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::DocumentsAddition(values);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_documents_addition(
writer: &mut heed::RwTxn,
main_store: store::Main,
documents_fields_store: store::DocumentsFields,
documents_fields_counts_store: store::DocumentsFieldsCounts,
postings_lists_store: store::PostingsLists,
docs_words_store: store::DocsWords,
mut ranked_map: RankedMap,
addition: Vec<serde_json::Value>,
) -> MResult<()> {
let mut document_ids = HashSet::new();
let mut document_store = RamDocumentStore::new();
let mut document_fields_counts = HashMap::new();
let mut indexer = RawIndexer::new();
let schema = match main_store.schema(writer)? {
Some(schema) => schema,
None => return Err(Error::SchemaMissing),
};
let identifier = schema.identifier_name();
for document in addition {
let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
// 1. store the document id for future deletion
document_ids.insert(document_id);
// 2. index the document fields in ram stores
let serializer = Serializer {
schema: &schema,
document_store: &mut document_store,
document_fields_counts: &mut document_fields_counts,
indexer: &mut indexer,
ranked_map: &mut ranked_map,
document_id,
};
document.serialize(serializer)?;
}
// 1. remove the previous documents match indexes
let documents_to_insert = document_ids.iter().cloned().collect();
apply_documents_deletion(
writer,
main_store,
documents_fields_store,
documents_fields_counts_store,
postings_lists_store,
docs_words_store,
ranked_map.clone(),
documents_to_insert,
)?;
// 2. insert new document attributes in the database
for ((id, attr), value) in document_store.into_inner() {
documents_fields_store.put_document_field(writer, id, attr, &value)?;
}
// 3. insert new document attributes counts
for ((id, attr), count) in document_fields_counts {
documents_fields_counts_store.put_document_field_count(writer, id, attr, count)?;
}
let indexed = indexer.build();
let mut delta_words_builder = SetBuilder::memory();
for (word, delta_set) in indexed.words_doc_indexes {
delta_words_builder.insert(&word).unwrap();
let set = match postings_lists_store.postings_list(writer, &word)? {
Some(set) => Union::new(&set, &delta_set).into_set_buf(),
None => delta_set,
};
postings_lists_store.put_postings_list(writer, &word, &set)?;
}
for (id, words) in indexed.docs_words {
docs_words_store.put_doc_words(writer, id, &words)?;
}
let delta_words = delta_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
let words = match main_store.words_fst(writer)? {
Some(words) => {
let op = OpBuilder::new()
.add(words.stream())
.add(delta_words.stream())
.r#union();
let mut words_builder = SetBuilder::memory();
words_builder.extend_stream(op).unwrap();
words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
}
None => delta_words,
};
main_store.put_words_fst(writer, &words)?;
main_store.put_ranked_map(writer, &ranked_map)?;
let inserted_documents_len = document_ids.len() as u64;
main_store.put_number_of_documents(writer, |old| old + inserted_documents_len)?;
Ok(())
}

View File

@ -1,220 +0,0 @@
mod customs_update;
mod documents_addition;
mod documents_deletion;
mod schema_update;
mod synonyms_addition;
mod synonyms_deletion;
pub use self::customs_update::{apply_customs_update, push_customs_update};
pub use self::documents_addition::{apply_documents_addition, DocumentsAddition};
pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion};
pub use self::schema_update::{apply_schema_update, push_schema_update};
pub use self::synonyms_addition::{apply_synonyms_addition, SynonymsAddition};
pub use self::synonyms_deletion::{apply_synonyms_deletion, SynonymsDeletion};
use std::cmp;
use std::collections::BTreeMap;
use std::time::{Duration, Instant};
use heed::Result as ZResult;
use log::debug;
use serde::{Deserialize, Serialize};
use crate::{store, DocumentId, MResult, RankedMap};
use meilidb_schema::Schema;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Update {
Schema(Schema),
Customs(Vec<u8>),
DocumentsAddition(Vec<serde_json::Value>),
DocumentsDeletion(Vec<DocumentId>),
SynonymsAddition(BTreeMap<String, Vec<String>>),
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateType {
Schema { schema: Schema },
Customs,
DocumentsAddition { number: usize },
DocumentsDeletion { number: usize },
SynonymsAddition { number: usize },
SynonymsDeletion { number: usize },
}
#[derive(Clone, Serialize, Deserialize)]
pub struct DetailedDuration {
pub main: Duration,
}
#[derive(Clone, Serialize, Deserialize)]
pub struct UpdateResult {
pub update_id: u64,
pub update_type: UpdateType,
pub result: Result<(), String>,
pub detailed_duration: DetailedDuration,
}
#[derive(Clone, Serialize, Deserialize)]
pub enum UpdateStatus {
Enqueued,
Processed(UpdateResult),
Unknown,
}
pub fn update_status(
reader: &heed::RoTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
update_id: u64,
) -> MResult<UpdateStatus> {
match updates_results_store.update_result(reader, update_id)? {
Some(result) => Ok(UpdateStatus::Processed(result)),
None => {
if updates_store.contains(reader, update_id)? {
Ok(UpdateStatus::Enqueued)
} else {
Ok(UpdateStatus::Unknown)
}
}
}
}
pub fn next_update_id(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
) -> ZResult<u64> {
let last_update_id = updates_store.last_update_id(writer)?;
let last_update_id = last_update_id.map(|(n, _)| n);
let last_update_results_id = updates_results_store.last_update_id(writer)?;
let last_update_results_id = last_update_results_id.map(|(n, _)| n);
let max_update_id = cmp::max(last_update_id, last_update_results_id);
let new_update_id = max_update_id.map_or(0, |n| n + 1);
Ok(new_update_id)
}
pub fn update_task(writer: &mut heed::RwTxn, index: store::Index) -> MResult<Option<UpdateResult>> {
let (update_id, update) = match index.updates.pop_front(writer)? {
Some(value) => value,
None => return Ok(None),
};
debug!("Processing update number {}", update_id);
let (update_type, result, duration) = match update {
Update::Schema(schema) => {
let start = Instant::now();
let update_type = UpdateType::Schema {
schema: schema.clone(),
};
let result = apply_schema_update(writer, index.main, &schema);
(update_type, result, start.elapsed())
}
Update::Customs(customs) => {
let start = Instant::now();
let update_type = UpdateType::Customs;
let result = apply_customs_update(writer, index.main, &customs).map_err(Into::into);
(update_type, result, start.elapsed())
}
Update::DocumentsAddition(documents) => {
let start = Instant::now();
let ranked_map = match index.main.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let update_type = UpdateType::DocumentsAddition {
number: documents.len(),
};
let result = apply_documents_addition(
writer,
index.main,
index.documents_fields,
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
ranked_map,
documents,
);
(update_type, result, start.elapsed())
}
Update::DocumentsDeletion(documents) => {
let start = Instant::now();
let ranked_map = match index.main.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let update_type = UpdateType::DocumentsDeletion {
number: documents.len(),
};
let result = apply_documents_deletion(
writer,
index.main,
index.documents_fields,
index.documents_fields_counts,
index.postings_lists,
index.docs_words,
ranked_map,
documents,
);
(update_type, result, start.elapsed())
}
Update::SynonymsAddition(synonyms) => {
let start = Instant::now();
let update_type = UpdateType::SynonymsAddition {
number: synonyms.len(),
};
let result = apply_synonyms_addition(writer, index.main, index.synonyms, synonyms);
(update_type, result, start.elapsed())
}
Update::SynonymsDeletion(synonyms) => {
let start = Instant::now();
let update_type = UpdateType::SynonymsDeletion {
number: synonyms.len(),
};
let result = apply_synonyms_deletion(writer, index.main, index.synonyms, synonyms);
(update_type, result, start.elapsed())
}
};
debug!(
"Processed update number {} {:?} {:?}",
update_id, update_type, result
);
let detailed_duration = DetailedDuration { main: duration };
let status = UpdateResult {
update_id,
update_type,
result: result.map_err(|e| e.to_string()),
detailed_duration,
};
index
.updates_results
.put_update_result(writer, update_id, &status)?;
Ok(Some(status))
}

View File

@ -1,31 +0,0 @@
use crate::update::{next_update_id, Update};
use crate::{error::UnsupportedOperation, store, MResult};
use meilidb_schema::Schema;
pub fn apply_schema_update(
writer: &mut heed::RwTxn,
main_store: store::Main,
new_schema: &Schema,
) -> MResult<()> {
if main_store.schema(writer)?.is_some() {
return Err(UnsupportedOperation::SchemaAlreadyExists.into());
}
main_store
.put_schema(writer, new_schema)
.map_err(Into::into)
}
pub fn push_schema_update(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
schema: Schema,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::Schema(schema);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}

View File

@ -1,118 +0,0 @@
use std::collections::BTreeMap;
use fst::{set::OpBuilder, SetBuilder};
use sdset::SetBuf;
use crate::automaton::normalize_str;
use crate::update::{next_update_id, Update};
use crate::{store, MResult};
pub struct SynonymsAddition {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
synonyms: BTreeMap<String, Vec<String>>,
}
impl SynonymsAddition {
pub fn new(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
) -> SynonymsAddition {
SynonymsAddition {
updates_store,
updates_results_store,
updates_notifier,
synonyms: BTreeMap::new(),
}
}
pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
where
S: AsRef<str>,
T: AsRef<str>,
I: IntoIterator<Item = T>,
{
let synonym = normalize_str(synonym.as_ref());
let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
self.synonyms
.entry(synonym)
.or_insert_with(Vec::new)
.extend(alternatives);
}
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
let update_id = push_synonyms_addition(
writer,
self.updates_store,
self.updates_results_store,
self.synonyms,
)?;
Ok(update_id)
}
}
pub fn push_synonyms_addition(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: BTreeMap<String, Vec<String>>,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::SynonymsAddition(addition);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_synonyms_addition(
writer: &mut heed::RwTxn,
main_store: store::Main,
synonyms_store: store::Synonyms,
addition: BTreeMap<String, Vec<String>>,
) -> MResult<()> {
let mut synonyms_builder = SetBuilder::memory();
for (word, alternatives) in addition {
synonyms_builder.insert(&word).unwrap();
let alternatives = {
let alternatives = SetBuf::from_dirty(alternatives);
let mut alternatives_builder = SetBuilder::memory();
alternatives_builder.extend_iter(alternatives).unwrap();
let bytes = alternatives_builder.into_inner().unwrap();
fst::Set::from_bytes(bytes).unwrap()
};
synonyms_store.put_synonyms(writer, word.as_bytes(), &alternatives)?;
}
let delta_synonyms = synonyms_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
let synonyms = match main_store.synonyms_fst(writer)? {
Some(synonyms) => {
let op = OpBuilder::new()
.add(synonyms.stream())
.add(delta_synonyms.stream())
.r#union();
let mut synonyms_builder = SetBuilder::memory();
synonyms_builder.extend_stream(op).unwrap();
synonyms_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
}
None => delta_synonyms,
};
main_store.put_synonyms_fst(writer, &synonyms)?;
Ok(())
}

View File

@ -1,156 +0,0 @@
use std::collections::BTreeMap;
use std::iter::FromIterator;
use fst::{set::OpBuilder, SetBuilder};
use sdset::SetBuf;
use crate::automaton::normalize_str;
use crate::update::{next_update_id, Update};
use crate::{store, MResult};
pub struct SynonymsDeletion {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
synonyms: BTreeMap<String, Option<Vec<String>>>,
}
impl SynonymsDeletion {
pub fn new(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: crossbeam_channel::Sender<()>,
) -> SynonymsDeletion {
SynonymsDeletion {
updates_store,
updates_results_store,
updates_notifier,
synonyms: BTreeMap::new(),
}
}
pub fn delete_all_alternatives_of<S: AsRef<str>>(&mut self, synonym: S) {
let synonym = normalize_str(synonym.as_ref());
self.synonyms.insert(synonym, None);
}
pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I)
where
S: AsRef<str>,
T: AsRef<str>,
I: Iterator<Item = T>,
{
let synonym = normalize_str(synonym.as_ref());
let value = self.synonyms.entry(synonym).or_insert(None);
let alternatives = alternatives.map(|s| s.as_ref().to_lowercase());
match value {
Some(v) => v.extend(alternatives),
None => *value = Some(Vec::from_iter(alternatives)),
}
}
pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult<u64> {
let _ = self.updates_notifier.send(());
let update_id = push_synonyms_deletion(
writer,
self.updates_store,
self.updates_results_store,
self.synonyms,
)?;
Ok(update_id)
}
}
pub fn push_synonyms_deletion(
writer: &mut heed::RwTxn,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
deletion: BTreeMap<String, Option<Vec<String>>>,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::SynonymsDeletion(deletion);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_synonyms_deletion(
writer: &mut heed::RwTxn,
main_store: store::Main,
synonyms_store: store::Synonyms,
deletion: BTreeMap<String, Option<Vec<String>>>,
) -> MResult<()> {
let mut delete_whole_synonym_builder = SetBuilder::memory();
for (synonym, alternatives) in deletion {
match alternatives {
Some(alternatives) => {
let prev_alternatives = synonyms_store.synonyms(writer, synonym.as_bytes())?;
let prev_alternatives = match prev_alternatives {
Some(alternatives) => alternatives,
None => continue,
};
let delta_alternatives = {
let alternatives = SetBuf::from_dirty(alternatives);
let mut builder = SetBuilder::memory();
builder.extend_iter(alternatives).unwrap();
builder.into_inner().and_then(fst::Set::from_bytes).unwrap()
};
let op = OpBuilder::new()
.add(prev_alternatives.stream())
.add(delta_alternatives.stream())
.difference();
let (alternatives, empty_alternatives) = {
let mut builder = SetBuilder::memory();
let len = builder.get_ref().len();
builder.extend_stream(op).unwrap();
let is_empty = len == builder.get_ref().len();
let bytes = builder.into_inner().unwrap();
let alternatives = fst::Set::from_bytes(bytes).unwrap();
(alternatives, is_empty)
};
if empty_alternatives {
delete_whole_synonym_builder.insert(synonym.as_bytes())?;
} else {
synonyms_store.put_synonyms(writer, synonym.as_bytes(), &alternatives)?;
}
}
None => {
delete_whole_synonym_builder.insert(&synonym).unwrap();
synonyms_store.del_synonyms(writer, synonym.as_bytes())?;
}
}
}
let delta_synonyms = delete_whole_synonym_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
let synonyms = match main_store.synonyms_fst(writer)? {
Some(synonyms) => {
let op = OpBuilder::new()
.add(synonyms.stream())
.add(delta_synonyms.stream())
.difference();
let mut synonyms_builder = SetBuilder::memory();
synonyms_builder.extend_stream(op).unwrap();
synonyms_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
}
None => fst::Set::default(),
};
main_store.put_synonyms_fst(writer, &synonyms)?;
Ok(())
}

View File

@ -1,306 +0,0 @@
use std::collections::{BTreeMap, HashMap};
use std::ops::BitOr;
use std::sync::Arc;
use std::{fmt, u16};
use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
pub const DISPLAYED: SchemaProps = SchemaProps {
displayed: true,
indexed: false,
ranked: false,
};
pub const INDEXED: SchemaProps = SchemaProps {
displayed: false,
indexed: true,
ranked: false,
};
pub const RANKED: SchemaProps = SchemaProps {
displayed: false,
indexed: false,
ranked: true,
};
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps {
#[serde(default)]
pub displayed: bool,
#[serde(default)]
pub indexed: bool,
#[serde(default)]
pub ranked: bool,
}
impl SchemaProps {
pub fn is_displayed(self) -> bool {
self.displayed
}
pub fn is_indexed(self) -> bool {
self.indexed
}
pub fn is_ranked(self) -> bool {
self.ranked
}
}
impl BitOr for SchemaProps {
type Output = Self;
fn bitor(self, other: Self) -> Self::Output {
SchemaProps {
displayed: self.displayed | other.displayed,
indexed: self.indexed | other.indexed,
ranked: self.ranked | other.ranked,
}
}
}
#[derive(Serialize, Deserialize)]
pub struct SchemaBuilder {
identifier: String,
attributes: IndexMap<String, SchemaProps>,
}
impl SchemaBuilder {
pub fn with_identifier<S: Into<String>>(name: S) -> SchemaBuilder {
SchemaBuilder {
identifier: name.into(),
attributes: IndexMap::new(),
}
}
pub fn new_attribute<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr {
let len = self.attributes.len();
if self.attributes.insert(name.into(), props).is_some() {
panic!("Field already inserted.")
}
SchemaAttr(len as u16)
}
pub fn build(self) -> Schema {
let mut attrs = HashMap::new();
let mut props = Vec::new();
for (i, (name, prop)) in self.attributes.into_iter().enumerate() {
attrs.insert(name.clone(), SchemaAttr(i as u16));
props.push((name, prop));
}
let identifier = self.identifier;
Schema {
inner: Arc::new(InnerSchema {
identifier,
attrs,
props,
}),
}
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Schema {
inner: Arc<InnerSchema>,
}
#[derive(Debug, Clone, PartialEq, Eq)]
struct InnerSchema {
identifier: String,
attrs: HashMap<String, SchemaAttr>,
props: Vec<(String, SchemaProps)>,
}
impl Schema {
fn to_builder(&self) -> SchemaBuilder {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
SchemaBuilder {
identifier,
attributes,
}
}
fn attributes_ordered(&self) -> IndexMap<String, SchemaProps> {
let mut ordered = BTreeMap::new();
for (name, attr) in &self.inner.attrs {
let (_, props) = self.inner.props[attr.0 as usize];
ordered.insert(attr.0, (name, props));
}
let mut attributes = IndexMap::with_capacity(ordered.len());
for (_, (name, props)) in ordered {
attributes.insert(name.clone(), props);
}
attributes
}
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
let (_, props) = self.inner.props[attr.0 as usize];
props
}
pub fn identifier_name(&self) -> &str {
&self.inner.identifier
}
pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> {
self.inner.attrs.get(name.as_ref()).cloned()
}
pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
let (name, _) = &self.inner.props[attr.0 as usize];
name
}
pub fn iter<'a>(&'a self) -> impl Iterator<Item = (&str, SchemaAttr, SchemaProps)> + 'a {
self.inner.props.iter().map(move |(name, prop)| {
let attr = self.inner.attrs.get(name).unwrap();
(name.as_str(), *attr, *prop)
})
}
}
impl Serialize for Schema {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::ser::Serializer,
{
self.to_builder().serialize(serializer)
}
}
impl<'de> Deserialize<'de> for Schema {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let builder = SchemaBuilder::deserialize(deserializer)?;
Ok(builder.build())
}
}
#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct SchemaAttr(pub u16);
impl SchemaAttr {
pub const fn new(value: u16) -> SchemaAttr {
SchemaAttr(value)
}
pub const fn min() -> SchemaAttr {
SchemaAttr(u16::min_value())
}
pub const fn max() -> SchemaAttr {
SchemaAttr(u16::max_value())
}
pub fn next(self) -> Option<SchemaAttr> {
self.0.checked_add(1).map(SchemaAttr)
}
pub fn prev(self) -> Option<SchemaAttr> {
self.0.checked_sub(1).map(SchemaAttr)
}
}
impl fmt::Display for SchemaAttr {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.0.fmt(f)
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
#[test]
fn serialize_deserialize() -> bincode::Result<()> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut buffer = Vec::new();
bincode::serialize_into(&mut buffer, &schema)?;
let schema2 = bincode::deserialize_from(buffer.as_slice())?;
assert_eq!(schema, schema2);
Ok(())
}
#[test]
fn serialize_deserialize_toml() -> Result<(), Box<dyn Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let buffer = toml::to_vec(&schema)?;
let schema2 = toml::from_slice(buffer.as_slice())?;
assert_eq!(schema, schema2);
let data = r#"
identifier = "id"
[attributes."alpha"]
displayed = true
[attributes."beta"]
displayed = true
indexed = true
[attributes."gamma"]
indexed = true
"#;
let schema2 = toml::from_str(data)?;
assert_eq!(schema, schema2);
Ok(())
}
#[test]
fn serialize_deserialize_json() -> Result<(), Box<dyn Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let buffer = serde_json::to_vec(&schema)?;
let schema2 = serde_json::from_slice(buffer.as_slice())?;
assert_eq!(schema, schema2);
let data = r#"
{
"identifier": "id",
"attributes": {
"alpha": {
"displayed": true
},
"beta": {
"displayed": true,
"indexed": true
},
"gamma": {
"indexed": true
}
}
}"#;
let schema2 = serde_json::from_str(data)?;
assert_eq!(schema, schema2);
Ok(())
}
}

View File

@ -0,0 +1,50 @@
[package]
name = "meilisearch-core"
version = "0.9.0"
license = "MIT"
authors = ["Kerollmops <clement@meilisearch.com>"]
edition = "2018"
[dependencies]
arc-swap = "0.4.3"
bincode = "1.1.4"
byteorder = "1.3.2"
chrono = { version = "0.4.9", features = ["serde"] }
compact_arena = "0.4.0"
crossbeam-channel = "0.4.0"
deunicode = "1.0.0"
env_logger = "0.7.0"
fst = { version = "0.3.5", default-features = false }
hashbrown = { version = "0.6.0", features = ["serde"] }
heed = "0.6.1"
indexmap = { version = "1.2.0", features = ["serde-1"] }
intervaltree = "0.2.5"
itertools = "0.8.2"
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
log = "0.4.8"
meilisearch-schema = { path = "../meilisearch-schema", version = "0.9.0" }
meilisearch-tokenizer = { path = "../meilisearch-tokenizer", version = "0.9.0" }
meilisearch-types = { path = "../meilisearch-types", version = "0.9.0" }
once_cell = "1.2.0"
ordered-float = { version = "1.0.2", features = ["serde"] }
regex = "1.3.1"
sdset = "0.3.6"
serde = { version = "1.0.101", features = ["derive"] }
serde_json = "1.0.41"
siphasher = "0.3.1"
slice-group-by = "0.2.6"
zerocopy = "0.2.8"
[dev-dependencies]
assert_matches = "1.3"
criterion = "0.3"
csv = "1.0.7"
jemallocator = "0.3.2"
rustyline = { version = "5.0.0", default-features = false }
structopt = "0.3.2"
tempfile = "3.1.0"
termcolor = "1.0.4"
[[bench]]
name = "search_benchmark"
harness = false

View File

@ -0,0 +1,95 @@
#[cfg(test)]
#[macro_use]
extern crate assert_matches;
use std::sync::mpsc;
use std::path::Path;
use std::fs;
use std::iter;
use meilisearch_core::Database;
use meilisearch_core::{ProcessedUpdateResult, UpdateStatus};
use serde_json::Value;
use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
fn prepare_database(path: &Path) -> Database {
let database = Database::open_or_create(path).unwrap();
let db = &database;
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |_name: &str, update: ProcessedUpdateResult| {
sender.send(update.update_id).unwrap()
};
let index = database.create_index("bench").unwrap();
database.set_update_callback(Box::new(update_fn));
let schema = {
let path = concat!(env!("CARGO_MANIFEST_DIR"), "/../datasets/movies/schema.toml");
let string = fs::read_to_string(path).expect("find schema");
toml::from_str(&string).unwrap()
};
let mut update_writer = db.update_write_txn().unwrap();
let _update_id = index.schema_update(&mut update_writer, schema).unwrap();
update_writer.commit().unwrap();
let mut additions = index.documents_addition();
let json: Value = {
let path = concat!(env!("CARGO_MANIFEST_DIR"), "/../datasets/movies/movies.json");
let movies_file = fs::File::open(path).expect("find movies");
serde_json::from_reader(movies_file).unwrap()
};
let documents = json.as_array().unwrap();
for document in documents {
additions.update_document(document);
}
let mut update_writer = db.update_write_txn().unwrap();
let update_id = additions.finalize(&mut update_writer).unwrap();
update_writer.commit().unwrap();
// block until the transaction is processed
let _ = receiver.into_iter().find(|id| *id == update_id);
let update_reader = db.update_read_txn().unwrap();
let result = index.update_status(&update_reader, update_id).unwrap();
assert_matches!(result, Some(UpdateStatus::Processed { content }) if content.error.is_none());
database
}
pub fn criterion_benchmark(c: &mut Criterion) {
let dir = tempfile::tempdir().unwrap();
let database = prepare_database(dir.path());
let reader = database.main_read_txn().unwrap();
let index = database.open_index("bench").unwrap();
let mut count = 0;
let query = "I love paris ";
let iter = iter::from_fn(|| {
count += 1;
query.get(0..count)
});
let mut group = c.benchmark_group("searching in movies (19654 docs)");
group.sample_size(10);
for query in iter {
let bench_name = BenchmarkId::from_parameter(format!("{:?}", query));
group.bench_with_input(bench_name, &query, |b, query| b.iter(|| {
let builder = index.query_builder();
builder.query(&reader, query, 0..20).unwrap();
}));
}
group.finish();
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

View File

@ -1,7 +1,7 @@
use std::collections::btree_map::{BTreeMap, Entry};
use std::collections::HashSet;
use std::collections::btree_map::{BTreeMap, Entry};
use std::error::Error;
use std::io::Write;
use std::io::{Read, Write};
use std::iter::FromIterator;
use std::path::{Path, PathBuf};
use std::time::{Duration, Instant};
@ -12,10 +12,13 @@ use serde::{Deserialize, Serialize};
use structopt::StructOpt;
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use meilidb_core::{Database, Highlight, UpdateResult};
use meilidb_schema::SchemaAttr;
use meilisearch_core::{Database, Highlight, ProcessedUpdateResult};
use meilisearch_core::settings::Settings;
use meilisearch_schema::FieldId;
const INDEX_NAME: &str = "default";
// #[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
#[derive(Debug, StructOpt)]
struct IndexCommand {
@ -23,13 +26,16 @@ struct IndexCommand {
#[structopt(parse(from_os_str))]
database_path: PathBuf,
/// The csv file to index.
#[structopt(long, default_value = "default")]
index_uid: String,
/// The csv file path to index, you can also use `-` to specify the standard input.
#[structopt(parse(from_os_str))]
csv_data_path: PathBuf,
/// The path to the schema.
/// The path to the settings.
#[structopt(long, parse(from_os_str))]
schema: PathBuf,
settings: PathBuf,
#[structopt(long)]
update_group_size: Option<usize>,
@ -40,10 +46,13 @@ struct IndexCommand {
#[derive(Debug, StructOpt)]
struct SearchCommand {
/// The destination where the database must be created.
/// The path of the database to work with.
#[structopt(parse(from_os_str))]
database_path: PathBuf,
#[structopt(long, default_value = "default")]
index_uid: String,
/// Timeout after which the search will return results.
#[structopt(long)]
fetch_timeout_ms: Option<u64>,
@ -65,10 +74,21 @@ struct SearchCommand {
displayed_fields: Vec<String>,
}
#[derive(Debug, StructOpt)]
struct ShowUpdatesCommand {
/// The path of the database to work with.
#[structopt(parse(from_os_str))]
database_path: PathBuf,
#[structopt(long, default_value = "default")]
index_uid: String,
}
#[derive(Debug, StructOpt)]
enum Command {
Index(IndexCommand),
Search(SearchCommand),
ShowUpdates(ShowUpdatesCommand),
}
impl Command {
@ -76,6 +96,7 @@ impl Command {
match self {
Command::Index(command) => &command.database_path,
Command::Search(command) => &command.database_path,
Command::ShowUpdates(command) => &command.database_path,
}
}
}
@ -88,37 +109,34 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
let start = Instant::now();
let (sender, receiver) = mpsc::sync_channel(100);
let update_fn = move |update: UpdateResult| sender.send(update.update_id).unwrap();
let index = match database.open_index(INDEX_NAME) {
let update_fn =
move |_name: &str, update: ProcessedUpdateResult| sender.send(update.update_id).unwrap();
let index = match database.open_index(&command.index_uid) {
Some(index) => index,
None => database.create_index(INDEX_NAME).unwrap(),
None => database.create_index(&command.index_uid).unwrap(),
};
let done = database.set_update_callback(INDEX_NAME, Box::new(update_fn));
assert!(done, "could not set the index update function");
database.set_update_callback(Box::new(update_fn));
let env = &database.env;
let db = &database;
let schema = {
let string = fs::read_to_string(&command.schema)?;
toml::from_str(&string).unwrap()
let settings = {
let string = fs::read_to_string(&command.settings)?;
let settings: Settings = serde_json::from_str(&string).unwrap();
settings.into_update().unwrap()
};
let mut writer = env.write_txn().unwrap();
match index.main.schema(&writer)? {
Some(current_schema) => {
if current_schema != schema {
return Err(meilidb_core::Error::SchemaDiffer.into());
}
writer.abort();
}
None => {
index.schema_update(&mut writer, schema)?;
writer.commit().unwrap();
}
}
let mut update_writer = db.update_write_txn().unwrap();
index.settings_update(&mut update_writer, settings)?;
update_writer.commit().unwrap();
let mut rdr = if command.csv_data_path.as_os_str() == "-" {
csv::Reader::from_reader(Box::new(io::stdin()) as Box<dyn Read>)
} else {
let file = std::fs::File::open(command.csv_data_path)?;
csv::Reader::from_reader(Box::new(file) as Box<dyn Read>)
};
let mut rdr = csv::Reader::from_path(command.csv_data_path)?;
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();
@ -157,10 +175,10 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
println!();
let mut writer = env.write_txn().unwrap();
let mut update_writer = db.update_write_txn().unwrap();
println!("committing update...");
let update_id = additions.finalize(&mut writer)?;
writer.commit().unwrap();
let update_id = additions.finalize(&mut update_writer)?;
update_writer.commit().unwrap();
max_update_id = max_update_id.max(update_id);
println!("committed update {}", update_id);
}
@ -179,8 +197,9 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
);
if let Some(path) = command.compact_to_path {
fs::create_dir_all(&path)?;
let start = Instant::now();
let _file = database.copy_and_compact_to_path(&path)?;
let _file = database.copy_and_compact_to_path(path.join("data.mdb"))?;
println!(
"database compacted in {:.2?} at: {:?}",
start.elapsed(),
@ -201,7 +220,11 @@ fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
_ => unreachable!(),
};
if highlighted {
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
stdout.set_color(
ColorSpec::new()
.set_fg(Some(Color::Yellow))
.set_underline(true),
)?;
}
write!(&mut stdout, "{}", &text[start..end])?;
stdout.reset()?;
@ -295,15 +318,16 @@ fn crop_text(
}
fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<dyn Error>> {
let env = &database.env;
let db = &database;
let index = database
.open_index(INDEX_NAME)
.open_index(&command.index_uid)
.expect("Could not find index");
let reader = env.read_txn().unwrap();
let reader = db.main_read_txn().unwrap();
let schema = index.main.schema(&reader)?;
reader.abort();
let schema = schema.ok_or(meilidb_core::Error::SchemaMissing)?;
let schema = schema.ok_or(meilisearch_core::Error::SchemaMissing)?;
let fields = command.displayed_fields.iter().map(String::as_str);
let fields = HashSet::from_iter(fields);
@ -317,7 +341,7 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
Ok(query) => {
let start_total = Instant::now();
let reader = env.read_txn().unwrap();
let reader = db.main_read_txn().unwrap();
let ref_index = &index;
let ref_reader = &reader;
@ -335,7 +359,7 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
};
let attr = schema
.attribute(&filter)
.id(filter)
.expect("Could not find filtered attribute");
builder.with_filter(move |document_id| {
@ -366,11 +390,11 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
for (name, text) in document.0 {
print!("{}: ", name);
let attr = schema.attribute(&name).unwrap();
let attr = schema.id(&name).unwrap();
let highlights = doc
.highlights
.iter()
.filter(|m| SchemaAttr::new(m.attribute) == attr)
.filter(|m| FieldId::new(m.attribute) == attr)
.cloned();
let (text, highlights) =
crop_text(&text, highlights, command.char_context);
@ -385,8 +409,8 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
let mut matching_attributes = HashSet::new();
for highlight in doc.highlights {
let attr = SchemaAttr::new(highlight.attribute);
let name = schema.attribute_name(attr);
let attr = FieldId::new(highlight.attribute);
let name = schema.name(attr);
matching_attributes.insert(name);
}
@ -418,6 +442,23 @@ fn search_command(command: SearchCommand, database: Database) -> Result<(), Box<
Ok(())
}
fn show_updates_command(
command: ShowUpdatesCommand,
database: Database,
) -> Result<(), Box<dyn Error>> {
let db = &database;
let index = database
.open_index(&command.index_uid)
.expect("Could not find index");
let reader = db.update_read_txn().unwrap();
let updates = index.all_updates_status(&reader)?;
println!("{:#?}", updates);
reader.abort();
Ok(())
}
fn main() -> Result<(), Box<dyn Error>> {
env_logger::init();
@ -427,5 +468,6 @@ fn main() -> Result<(), Box<dyn Error>> {
match opt {
Command::Index(command) => index_command(command, database),
Command::Search(command) => search_command(command, database),
Command::ShowUpdates(command) => show_updates_command(command, database),
}
}

View File

@ -46,3 +46,8 @@ pub fn build_prefix_dfa(query: &str) -> DFA {
pub fn build_dfa(query: &str) -> DFA {
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
}
pub fn build_exact_dfa(query: &str) -> DFA {
let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
builder.build_dfa(query)
}

View File

@ -0,0 +1,15 @@
mod dfa;
use meilisearch_tokenizer::is_cjk;
pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
pub fn normalize_str(string: &str) -> String {
let mut string = string.to_lowercase();
if !string.contains(is_cjk) {
string = deunicode::deunicode_with_tofu(&string, "");
}
string
}

View File

@ -0,0 +1,560 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::mem;
use std::ops::Deref;
use std::ops::Range;
use std::rc::Rc;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Instant;
use std::fmt;
use compact_arena::{SmallArena, Idx32, mk_arena};
use log::debug;
use meilisearch_types::DocIndex;
use sdset::{Set, SetBuf, exponential_search};
use slice_group_by::{GroupBy, GroupByMut};
use crate::error::Error;
use crate::criterion::{Criteria, Context, ContextMut};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::raw_document::RawDocument;
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
use crate::{store, Document, DocumentId, MResult};
use crate::query_tree::{create_query_tree, traverse_query_tree};
use crate::query_tree::{Operation, QueryResult, QueryKind, QueryId, PostingsKey};
use crate::query_tree::Context as QTContext;
pub fn bucket_sort<'c, FI>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
filter: Option<FI>,
criteria: Criteria<'c>,
searchable_attrs: Option<ReorderedAttrs>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
prefix_documents_cache_store: store::PrefixDocumentsCache,
prefix_postings_lists_cache_store: store::PrefixPostingsListsCache,
) -> MResult<Vec<Document>>
where
FI: Fn(DocumentId) -> bool,
{
// We delegate the filter work to the distinct query builder,
// specifying a distinct rule that has no effect.
if filter.is_some() {
let distinct = |_| None;
let distinct_size = 1;
return bucket_sort_with_distinct(
reader,
query,
range,
filter,
distinct,
distinct_size,
criteria,
searchable_attrs,
main_store,
postings_lists_store,
documents_fields_counts_store,
synonyms_store,
prefix_documents_cache_store,
prefix_postings_lists_cache_store,
);
}
let words_set = match unsafe { main_store.static_words_fst(reader)? } {
Some(words) => words,
None => return Ok(Vec::new()),
};
let stop_words = main_store.stop_words_fst(reader)?.unwrap_or_default();
let context = QTContext {
words_set,
stop_words,
synonyms: synonyms_store,
postings_lists: postings_lists_store,
prefix_postings_lists: prefix_postings_lists_cache_store,
};
let (operation, mapping) = create_query_tree(reader, &context, query)?;
debug!("operation:\n{:?}", operation);
debug!("mapping:\n{:?}", mapping);
fn recurs_operation<'o>(map: &mut HashMap<QueryId, &'o QueryKind>, operation: &'o Operation) {
match operation {
Operation::And(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
Operation::Or(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
Operation::Query(query) => { map.insert(query.id, &query.kind); },
}
}
let mut queries_kinds = HashMap::new();
recurs_operation(&mut queries_kinds, &operation);
let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation)?;
debug!("found {} documents", docids.len());
debug!("number of postings {:?}", queries.len());
let before = Instant::now();
mk_arena!(arena);
let mut bare_matches = cleanup_bare_matches(&mut arena, &docids, queries);
debug!("matches cleaned in {:.02?}", before.elapsed());
let before_bucket_sort = Instant::now();
let before_raw_documents_building = Instant::now();
let mut raw_documents = Vec::new();
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
raw_documents.push(raw_document);
}
debug!("creating {} candidates documents took {:.02?}",
raw_documents.len(),
before_raw_documents_building.elapsed(),
);
let before_criterion_loop = Instant::now();
let proximity_count = AtomicUsize::new(0);
let mut groups = vec![raw_documents.as_mut_slice()];
'criteria: for criterion in criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
for mut group in tmp_groups {
let before_criterion_preparation = Instant::now();
let ctx = ContextMut {
reader,
postings_lists: &mut arena,
query_mapping: &mapping,
documents_fields_counts_store,
};
criterion.prepare(ctx, &mut group)?;
debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
let ctx = Context {
postings_lists: &arena,
query_mapping: &mapping,
};
let before_criterion_sort = Instant::now();
group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b));
debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) {
debug!("{:?} produced a group of size {}", criterion.name(), group.len());
documents_seen += group.len();
groups.push(group);
// we have sort enough documents if the last document sorted is after
// the end of the requested range, we can continue to the next criterion
if documents_seen >= range.end {
continue 'criteria;
}
}
}
}
debug!("criterion loop took {:.02?}", before_criterion_loop.elapsed());
debug!("proximity evaluation called {} times", proximity_count.load(Ordering::Relaxed));
let schema = main_store.schema(reader)?.ok_or(Error::SchemaMissing)?;
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
let iter = iter.map(|rd| Document::from_raw(rd, &queries_kinds, &arena, searchable_attrs.as_ref(), &schema));
let documents = iter.collect();
debug!("bucket sort took {:.02?}", before_bucket_sort.elapsed());
Ok(documents)
}
pub fn bucket_sort_with_distinct<'c, FI, FD>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
filter: Option<FI>,
distinct: FD,
distinct_size: usize,
criteria: Criteria<'c>,
searchable_attrs: Option<ReorderedAttrs>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
synonyms_store: store::Synonyms,
_prefix_documents_cache_store: store::PrefixDocumentsCache,
prefix_postings_lists_cache_store: store::PrefixPostingsListsCache,
) -> MResult<Vec<Document>>
where
FI: Fn(DocumentId) -> bool,
FD: Fn(DocumentId) -> Option<u64>,
{
let words_set = match unsafe { main_store.static_words_fst(reader)? } {
Some(words) => words,
None => return Ok(Vec::new()),
};
let stop_words = main_store.stop_words_fst(reader)?.unwrap_or_default();
let context = QTContext {
words_set,
stop_words,
synonyms: synonyms_store,
postings_lists: postings_lists_store,
prefix_postings_lists: prefix_postings_lists_cache_store,
};
let (operation, mapping) = create_query_tree(reader, &context, query)?;
debug!("operation:\n{:?}", operation);
debug!("mapping:\n{:?}", mapping);
fn recurs_operation<'o>(map: &mut HashMap<QueryId, &'o QueryKind>, operation: &'o Operation) {
match operation {
Operation::And(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
Operation::Or(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
Operation::Query(query) => { map.insert(query.id, &query.kind); },
}
}
let mut queries_kinds = HashMap::new();
recurs_operation(&mut queries_kinds, &operation);
let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation)?;
debug!("found {} documents", docids.len());
debug!("number of postings {:?}", queries.len());
let before = Instant::now();
mk_arena!(arena);
let mut bare_matches = cleanup_bare_matches(&mut arena, &docids, queries);
debug!("matches cleaned in {:.02?}", before.elapsed());
let before_raw_documents_building = Instant::now();
let mut raw_documents = Vec::new();
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
raw_documents.push(raw_document);
}
debug!("creating {} candidates documents took {:.02?}",
raw_documents.len(),
before_raw_documents_building.elapsed(),
);
let mut groups = vec![raw_documents.as_mut_slice()];
let mut key_cache = HashMap::new();
let mut filter_map = HashMap::new();
// these two variables informs on the current distinct map and
// on the raw offset of the start of the group where the
// range.start bound is located according to the distinct function
let mut distinct_map = DistinctMap::new(distinct_size);
let mut distinct_raw_offset = 0;
'criteria: for criterion in criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
let mut documents_seen = 0;
for mut group in tmp_groups {
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < distinct_raw_offset {
documents_seen += group.len();
groups.push(group);
continue;
}
let ctx = ContextMut {
reader,
postings_lists: &mut arena,
query_mapping: &mapping,
documents_fields_counts_store,
};
let before_criterion_preparation = Instant::now();
criterion.prepare(ctx, &mut group)?;
debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
let ctx = Context {
postings_lists: &arena,
query_mapping: &mapping,
};
let before_criterion_sort = Instant::now();
group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b));
debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) {
// we must compute the real distinguished len of this sub-group
for document in group.iter() {
let filter_accepted = match &filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id))
}
None => true,
};
if filter_accepted {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));
match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
};
}
// the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end {
break;
}
}
documents_seen += group.len();
groups.push(group);
// if this sub-group does not overlap with the requested range
// we must update the distinct map and its start index
if buf_distinct.len() < range.start {
buf_distinct.transfert_to_internal();
distinct_raw_offset = documents_seen;
}
// we have sort enough documents if the last document sorted is after
// the end of the requested range, we can continue to the next criterion
if buf_distinct.len() >= range.end {
continue 'criteria;
}
}
}
}
// once we classified the documents related to the current
// automatons we save that as the next valid result
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
let schema = main_store.schema(reader)?.ok_or(Error::SchemaMissing)?;
let mut documents = Vec::with_capacity(range.len());
for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) {
let filter_accepted = match &filter {
Some(_) => filter_map.remove(&raw_document.id).unwrap(),
None => true,
};
if filter_accepted {
let key = key_cache.remove(&raw_document.id).unwrap();
let distinct_accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
};
if distinct_accepted && seen.len() > range.start {
documents.push(Document::from_raw(raw_document, &queries_kinds, &arena, searchable_attrs.as_ref(), &schema));
if documents.len() == range.len() {
break;
}
}
}
}
Ok(documents)
}
fn cleanup_bare_matches<'tag, 'txn>(
arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
docids: &Set<DocumentId>,
queries: HashMap<PostingsKey, Cow<'txn, Set<DocIndex>>>,
) -> Vec<BareMatch<'tag>>
{
let docidslen = docids.len() as f32;
let mut bare_matches = Vec::new();
for (PostingsKey { query, input, distance, is_exact }, matches) in queries {
let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches));
let pllen = postings_list_view.len() as f32;
if docidslen / pllen >= 0.8 {
let mut offset = 0;
for matches in postings_list_view.linear_group_by_key(|m| m.document_id) {
let document_id = matches[0].document_id;
if docids.contains(&document_id) {
let range = postings_list_view.range(offset, matches.len());
let posting_list_index = arena.add(range);
let bare_match = BareMatch {
document_id,
query_index: query.id,
distance,
is_exact,
postings_list: posting_list_index,
};
bare_matches.push(bare_match);
}
offset += matches.len();
}
} else {
let mut offset = 0;
for id in docids.as_slice() {
let di = DocIndex { document_id: *id, ..DocIndex::default() };
let pos = exponential_search(&postings_list_view[offset..], &di).unwrap_or_else(|x| x);
offset += pos;
let group = postings_list_view[offset..]
.linear_group_by_key(|m| m.document_id)
.next()
.filter(|matches| matches[0].document_id == *id);
if let Some(matches) = group {
let range = postings_list_view.range(offset, matches.len());
let posting_list_index = arena.add(range);
let bare_match = BareMatch {
document_id: *id,
query_index: query.id,
distance,
is_exact,
postings_list: posting_list_index,
};
bare_matches.push(bare_match);
}
}
}
}
let before_raw_documents_presort = Instant::now();
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
bare_matches
}
pub struct BareMatch<'tag> {
pub document_id: DocumentId,
pub query_index: usize,
pub distance: u8,
pub is_exact: bool,
pub postings_list: Idx32<'tag>,
}
impl fmt::Debug for BareMatch<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("BareMatch")
.field("document_id", &self.document_id)
.field("query_index", &self.query_index)
.field("distance", &self.distance)
.field("is_exact", &self.is_exact)
.finish()
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct SimpleMatch {
pub query_index: usize,
pub distance: u8,
pub attribute: u16,
pub word_index: u16,
pub is_exact: bool,
}
#[derive(Clone)]
pub enum PostingsListView<'txn> {
Original {
input: Rc<[u8]>,
postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
offset: usize,
len: usize,
},
Rewritten {
input: Rc<[u8]>,
postings_list: SetBuf<DocIndex>,
},
}
impl fmt::Debug for PostingsListView<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("PostingsListView")
.field("input", &std::str::from_utf8(&self.input()).unwrap())
.field("postings_list", &self.as_ref())
.finish()
}
}
impl<'txn> PostingsListView<'txn> {
pub fn original(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
let len = postings_list.len();
PostingsListView::Original { input, postings_list, offset: 0, len }
}
pub fn rewritten(input: Rc<[u8]>, postings_list: SetBuf<DocIndex>) -> PostingsListView<'txn> {
PostingsListView::Rewritten { input, postings_list }
}
pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) {
let input = match self {
PostingsListView::Original { input, .. } => input.clone(),
PostingsListView::Rewritten { input, .. } => input.clone(),
};
*self = PostingsListView::rewritten(input, postings_list);
}
pub fn len(&self) -> usize {
match self {
PostingsListView::Original { len, .. } => *len,
PostingsListView::Rewritten { postings_list, .. } => postings_list.len(),
}
}
pub fn input(&self) -> &[u8] {
match self {
PostingsListView::Original { ref input, .. } => input,
PostingsListView::Rewritten { ref input, .. } => input,
}
}
pub fn range(&self, range_offset: usize, range_len: usize) -> PostingsListView<'txn> {
match self {
PostingsListView::Original { input, postings_list, offset, len } => {
assert!(range_offset + range_len <= *len);
PostingsListView::Original {
input: input.clone(),
postings_list: postings_list.clone(),
offset: offset + range_offset,
len: range_len,
}
},
PostingsListView::Rewritten { .. } => {
panic!("Cannot create a range on a rewritten postings list view");
}
}
}
}
impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
fn as_ref(&self) -> &Set<DocIndex> {
self
}
}
impl Deref for PostingsListView<'_> {
type Target = Set<DocIndex>;
fn deref(&self) -> &Set<DocIndex> {
match *self {
PostingsListView::Original { ref postings_list, offset, len, .. } => {
Set::new_unchecked(&postings_list[offset..offset + len])
},
PostingsListView::Rewritten { ref postings_list, .. } => postings_list,
}
}
}

View File

@ -0,0 +1,37 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::{RawDocument, MResult};
use crate::bucket_sort::SimpleMatch;
use super::{Criterion, Context, ContextMut, prepare_bare_matches};
pub struct Attribute;
impl Criterion for Attribute {
fn name(&self) -> &str { "attribute" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn sum_of_attribute(matches: &[SimpleMatch]) -> usize {
let mut sum_of_attribute = 0;
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_of_attribute += group[0].attribute as usize;
}
sum_of_attribute
}
let lhs = sum_of_attribute(&lhs.processed_matches);
let rhs = sum_of_attribute(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}

View File

@ -0,0 +1,16 @@
use std::cmp::Ordering;
use crate::RawDocument;
use super::{Criterion, Context};
pub struct DocumentId;
impl Criterion for DocumentId {
fn name(&self) -> &str { "stable document id" }
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = &lhs.id;
let rhs = &rhs.id;
lhs.cmp(rhs)
}
}

View File

@ -0,0 +1,78 @@
use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::{HashMap, Entry};
use meilisearch_schema::IndexedPos;
use slice_group_by::GroupBy;
use crate::{RawDocument, MResult};
use crate::bucket_sort::BareMatch;
use super::{Criterion, Context, ContextMut};
pub struct Exactness;
impl Criterion for Exactness {
fn name(&self) -> &str { "exactness" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
let store = ctx.documents_fields_counts_store;
let reader = ctx.reader;
'documents: for doc in documents {
doc.bare_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact)));
// mark the document if we find a "one word field" that matches
let mut fields_counts = HashMap::new();
for group in doc.bare_matches.linear_group_by_key(|bm| bm.query_index) {
for group in group.linear_group_by_key(|bm| bm.is_exact) {
if !group[0].is_exact { break }
for bm in group {
for di in ctx.postings_lists[bm.postings_list].as_ref() {
let attr = IndexedPos(di.attribute);
let count = match fields_counts.entry(attr) {
Entry::Occupied(entry) => *entry.get(),
Entry::Vacant(entry) => {
let count = store.document_field_count(reader, doc.id, attr)?;
*entry.insert(count)
},
};
if count == Some(1) {
doc.contains_one_word_field = true;
continue 'documents
}
}
}
}
}
}
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn sum_exact_query_words(matches: &[BareMatch]) -> usize {
let mut sum_exact_query_words = 0;
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_exact_query_words += group[0].is_exact as usize;
}
sum_exact_query_words
}
// does it contains a "one word field"
lhs.contains_one_word_field.cmp(&rhs.contains_one_word_field).reverse()
// if not, with document contains the more exact words
.then_with(|| {
let lhs = sum_exact_query_words(&lhs.bare_matches);
let rhs = sum_exact_query_words(&rhs.bare_matches);
lhs.cmp(&rhs).reverse()
})
}
}

View File

@ -0,0 +1,291 @@
use std::cmp::{self, Ordering};
use std::collections::HashMap;
use std::ops::Range;
use compact_arena::SmallArena;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::bucket_sort::{SimpleMatch, PostingsListView};
use crate::database::MainT;
use crate::query_tree::QueryId;
use crate::{store, RawDocument, MResult};
mod typo;
mod words;
mod proximity;
mod attribute;
mod words_position;
mod exactness;
mod document_id;
mod sort_by_attr;
pub use self::typo::Typo;
pub use self::words::Words;
pub use self::proximity::Proximity;
pub use self::attribute::Attribute;
pub use self::words_position::WordsPosition;
pub use self::exactness::Exactness;
pub use self::document_id::DocumentId;
pub use self::sort_by_attr::SortByAttr;
pub trait Criterion {
fn name(&self) -> &str;
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
_ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
_documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
Ok(())
}
fn evaluate<'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: &Context<'p, 'tag, 'txn, 'q>,
lhs: &RawDocument<'r, 'tag>,
rhs: &RawDocument<'r, 'tag>,
) -> Ordering;
#[inline]
fn eq<'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: &Context<'p, 'tag, 'txn, 'q>,
lhs: &RawDocument<'r, 'tag>,
rhs: &RawDocument<'r, 'tag>,
) -> bool
{
self.evaluate(ctx, lhs, rhs) == Ordering::Equal
}
}
pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q> {
pub reader: &'h heed::RoTxn<MainT>,
pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>,
pub query_mapping: &'q HashMap<QueryId, Range<usize>>,
pub documents_fields_counts_store: store::DocumentsFieldsCounts,
}
pub struct Context<'p, 'tag, 'txn, 'q> {
pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>,
pub query_mapping: &'q HashMap<QueryId, Range<usize>>,
}
#[derive(Default)]
pub struct CriteriaBuilder<'a> {
inner: Vec<Box<dyn Criterion + 'a>>,
}
impl<'a> CriteriaBuilder<'a> {
pub fn new() -> CriteriaBuilder<'a> {
CriteriaBuilder { inner: Vec::new() }
}
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
CriteriaBuilder {
inner: Vec::with_capacity(capacity),
}
}
pub fn reserve(&mut self, additional: usize) {
self.inner.reserve(additional)
}
pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
where
C: Criterion,
{
self.push(criterion);
self
}
pub fn push<C: 'a>(&mut self, criterion: C)
where
C: Criterion,
{
self.inner.push(Box::new(criterion));
}
pub fn build(self) -> Criteria<'a> {
Criteria { inner: self.inner }
}
}
pub struct Criteria<'a> {
inner: Vec<Box<dyn Criterion + 'a>>,
}
impl<'a> Default for Criteria<'a> {
fn default() -> Self {
CriteriaBuilder::with_capacity(7)
.add(Typo)
.add(Words)
.add(Proximity)
.add(Attribute)
.add(WordsPosition)
.add(Exactness)
.add(DocumentId)
.build()
}
}
impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
&self.inner
}
}
fn prepare_query_distances<'a, 'tag, 'txn>(
documents: &mut [RawDocument<'a, 'tag>],
query_mapping: &HashMap<QueryId, Range<usize>>,
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
) {
for document in documents {
if !document.processed_distances.is_empty() { continue }
let mut processed = Vec::new();
for m in document.bare_matches.iter() {
if postings_lists[m.postings_list].is_empty() { continue }
let range = query_mapping[&(m.query_index as usize)].clone();
let new_len = cmp::max(range.end as usize, processed.len());
processed.resize(new_len, None);
for index in range {
let index = index as usize;
processed[index] = match processed[index] {
Some(distance) if distance > m.distance => Some(m.distance),
Some(distance) => Some(distance),
None => Some(m.distance),
};
}
}
document.processed_distances = processed;
}
}
fn prepare_bare_matches<'a, 'tag, 'txn>(
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
query_mapping: &HashMap<QueryId, Range<usize>>,
) {
for document in documents {
if !document.processed_matches.is_empty() { continue }
let mut processed = Vec::new();
for m in document.bare_matches.iter() {
let postings_list = &postings_lists[m.postings_list];
processed.reserve(postings_list.len());
for di in postings_list.as_ref() {
let simple_match = SimpleMatch {
query_index: m.query_index,
distance: m.distance,
attribute: di.attribute,
word_index: di.word_index,
is_exact: m.is_exact,
};
processed.push(simple_match);
}
}
let processed = multiword_rewrite_matches(&mut processed, query_mapping);
document.processed_matches = processed.into_vec();
}
}
fn multiword_rewrite_matches(
matches: &mut [SimpleMatch],
query_mapping: &HashMap<QueryId, Range<usize>>,
) -> SetBuf<SimpleMatch>
{
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
let mut padded_matches = Vec::with_capacity(matches.len());
// let before_padding = Instant::now();
// for each attribute of each document
for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {
// padding will only be applied
// to word indices in the same attribute
let mut padding = 0;
let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index);
// for each match at the same position
// in this document attribute
while let Some(same_word_index) = iter.next() {
// find the biggest padding
let mut biggest = 0;
for match_ in same_word_index {
let mut replacement = query_mapping[&(match_.query_index as usize)].clone();
let replacement_len = replacement.len();
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
if let Some(query_index) = replacement.next() {
let word_index = match_.word_index + padding as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
}
let mut found = false;
// look ahead and if there already is a match
// corresponding to this padding word, abort the padding
'padding: for (x, next_group) in nexts.enumerate() {
for (i, query_index) in replacement.clone().enumerate().skip(x) {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
for nmatch_ in next_group {
let mut rep = query_mapping[&(nmatch_.query_index as usize)].clone();
let query_index = rep.next().unwrap();
if query_index == padmatch.query_index {
if !found {
// if we find a corresponding padding for the
// first time we must push preceding paddings
for (i, query_index) in replacement.clone().enumerate().take(i) {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
biggest = biggest.max(i + 1);
}
}
padded_matches.push(padmatch);
found = true;
continue 'padding;
}
}
}
// if we do not find a corresponding padding in the
// next groups so stop here and pad what was found
break;
}
if !found {
// if no padding was found in the following matches
// we must insert the entire padding
for (i, query_index) in replacement.enumerate() {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
}
biggest = biggest.max(replacement_len - 1);
}
}
padding += biggest;
}
}
// debug!("padding matches took {:.02?}", before_padding.elapsed());
// With this check we can see that the loop above takes something
// like 43% of the search time even when no rewrite is needed.
// assert_eq!(before_matches, padded_matches);
SetBuf::from_dirty(padded_matches)
}

View File

@ -0,0 +1,68 @@
use std::cmp::{self, Ordering};
use slice_group_by::GroupBy;
use crate::bucket_sort::{SimpleMatch};
use crate::{RawDocument, MResult};
use super::{Criterion, Context, ContextMut, prepare_bare_matches};
const MAX_DISTANCE: u16 = 8;
pub struct Proximity;
impl Criterion for Proximity {
fn name(&self) -> &str { "proximity" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE)
} else {
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
}
}
fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 {
if lhs.attribute != rhs.attribute { MAX_DISTANCE }
else { index_proximity(lhs.word_index, rhs.word_index) }
}
fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 {
let mut min_prox = u16::max_value();
for a in lhs {
for b in rhs {
let prox = attribute_proximity(*a, *b);
min_prox = cmp::min(min_prox, prox);
}
}
min_prox
}
fn matches_proximity(matches: &[SimpleMatch],) -> u16 {
let mut proximity = 0;
let mut iter = matches.linear_group_by_key(|m| m.query_index);
// iterate over groups by windows of size 2
let mut last = iter.next();
while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
proximity += min_proximity(lhs, rhs);
last = Some(rhs);
}
proximity
}
let lhs = matches_proximity(&lhs.processed_matches);
let rhs = matches_proximity(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}

View File

@ -1,10 +1,9 @@
use std::cmp::Ordering;
use std::error::Error;
use std::fmt;
use crate::criterion::Criterion;
use meilisearch_schema::{Schema, FieldId};
use crate::{RankedMap, RawDocument};
use meilidb_schema::{Schema, SchemaAttr};
use super::{Criterion, Context};
/// An helper struct that permit to sort documents by
/// some of their stored attributes.
@ -23,17 +22,17 @@ use meilidb_schema::{Schema, SchemaAttr};
///
/// ```ignore
/// use serde_derive::Deserialize;
/// use meilidb::rank::criterion::*;
/// use meilisearch::rank::criterion::*;
///
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
///
/// let builder = CriteriaBuilder::with_capacity(8)
/// .add(SumOfTypos)
/// .add(NumberOfWords)
/// .add(WordsProximity)
/// .add(SumOfWordsAttribute)
/// .add(SumOfWordsPosition)
/// .add(Exact)
/// .add(Typo)
/// .add(Words)
/// .add(Proximity)
/// .add(Attribute)
/// .add(WordsPosition)
/// .add(Exactness)
/// .add(custom_ranking)
/// .add(DocumentId);
///
@ -42,7 +41,7 @@ use meilidb_schema::{Schema, SchemaAttr};
/// ```
pub struct SortByAttr<'a> {
ranked_map: &'a RankedMap,
attr: SchemaAttr,
field_id: FieldId,
reversed: bool,
}
@ -69,27 +68,31 @@ impl<'a> SortByAttr<'a> {
attr_name: &str,
reversed: bool,
) -> Result<SortByAttr<'a>, SortByAttrError> {
let attr = match schema.attribute(attr_name) {
Some(attr) => attr,
let field_id = match schema.id(attr_name) {
Some(field_id) => field_id,
None => return Err(SortByAttrError::AttributeNotFound),
};
if !schema.props(attr).is_ranked() {
if !schema.is_ranked(field_id) {
return Err(SortByAttrError::AttributeNotRegisteredForRanking);
}
Ok(SortByAttr {
ranked_map,
attr,
field_id,
reversed,
})
}
}
impl<'a> Criterion for SortByAttr<'a> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = self.ranked_map.get(lhs.id, self.attr);
let rhs = self.ranked_map.get(rhs.id, self.attr);
impl Criterion for SortByAttr<'_> {
fn name(&self) -> &str {
"sort by attribute"
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = self.ranked_map.get(lhs.id, self.field_id);
let rhs = self.ranked_map.get(rhs.id, self.field_id);
match (lhs, rhs) {
(Some(lhs), Some(rhs)) => {
@ -105,10 +108,6 @@ impl<'a> Criterion for SortByAttr<'a> {
(None, None) => Ordering::Equal,
}
}
fn name(&self) -> &str {
"SortByAttr"
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]

View File

@ -0,0 +1,55 @@
use std::cmp::Ordering;
use crate::{RawDocument, MResult};
use super::{Criterion, Context, ContextMut, prepare_query_distances};
pub struct Typo;
impl Criterion for Typo {
fn name(&self) -> &str { "typo" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
// This function is a wrong logarithmic 10 function.
// It is safe to panic on input number higher than 3,
// the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
match n {
0 => 0.0, // log(1)
1 => 0.30102, // log(2)
2 => 0.47712, // log(3)
3 => 0.60205, // log(4)
_ => panic!("invalid number"),
}
}
#[inline]
fn compute_typos(distances: &[Option<u8>]) -> usize {
let mut number_words: usize = 0;
let mut sum_typos = 0.0;
for distance in distances {
if let Some(distance) = distance {
sum_typos += custom_log10(*distance);
number_words += 1;
}
}
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}
let lhs = compute_typos(&lhs.processed_distances);
let rhs = compute_typos(&rhs.processed_distances);
lhs.cmp(&rhs).reverse()
}
}

View File

@ -0,0 +1,31 @@
use std::cmp::Ordering;
use crate::{RawDocument, MResult};
use super::{Criterion, Context, ContextMut, prepare_query_distances};
pub struct Words;
impl Criterion for Words {
fn name(&self) -> &str { "words" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn number_of_query_words(distances: &[Option<u8>]) -> usize {
distances.iter().cloned().filter(Option::is_some).count()
}
let lhs = number_of_query_words(&lhs.processed_distances);
let rhs = number_of_query_words(&rhs.processed_distances);
lhs.cmp(&rhs).reverse()
}
}

View File

@ -0,0 +1,37 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::bucket_sort::SimpleMatch;
use crate::{RawDocument, MResult};
use super::{Criterion, Context, ContextMut, prepare_bare_matches};
pub struct WordsPosition;
impl Criterion for WordsPosition {
fn name(&self) -> &str { "words position" }
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
&self,
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
documents: &mut [RawDocument<'r, 'tag>],
) -> MResult<()>
{
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
Ok(())
}
fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
#[inline]
fn sum_words_position(matches: &[SimpleMatch]) -> usize {
let mut sum_words_position = 0;
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_words_position += group[0].word_index as usize;
}
sum_words_position
}
let lhs = sum_words_position(&lhs.processed_matches);
let rhs = sum_words_position(&rhs.processed_matches);
lhs.cmp(&rhs)
}
}

File diff suppressed because it is too large Load Diff

View File

@ -2,16 +2,22 @@ use crate::serde::{DeserializerError, SerializerError};
use serde_json::Error as SerdeJsonError;
use std::{error, fmt, io};
pub use heed::Error as HeedError;
pub use fst::Error as FstError;
pub use bincode::Error as BincodeError;
pub type MResult<T> = Result<T, Error>;
#[derive(Debug)]
pub enum Error {
Io(io::Error),
IndexAlreadyExists,
SchemaDiffer,
MissingPrimaryKey,
SchemaMissing,
WordIndexMissing,
MissingDocumentId,
MaxFieldsLimitExceeded,
Schema(meilisearch_schema::Error),
Zlmdb(heed::Error),
Fst(fst::Error),
SerdeJson(SerdeJsonError),
@ -27,14 +33,20 @@ impl From<io::Error> for Error {
}
}
impl From<heed::Error> for Error {
fn from(error: heed::Error) -> Error {
impl From<meilisearch_schema::Error> for Error {
fn from(error: meilisearch_schema::Error) -> Error {
Error::Schema(error)
}
}
impl From<HeedError> for Error {
fn from(error: HeedError) -> Error {
Error::Zlmdb(error)
}
}
impl From<fst::Error> for Error {
fn from(error: fst::Error) -> Error {
impl From<FstError> for Error {
fn from(error: FstError) -> Error {
Error::Fst(error)
}
}
@ -45,8 +57,8 @@ impl From<SerdeJsonError> for Error {
}
}
impl From<bincode::Error> for Error {
fn from(error: bincode::Error) -> Error {
impl From<BincodeError> for Error {
fn from(error: BincodeError) -> Error {
Error::Bincode(error)
}
}
@ -75,10 +87,12 @@ impl fmt::Display for Error {
match self {
Io(e) => write!(f, "{}", e),
IndexAlreadyExists => write!(f, "index already exists"),
SchemaDiffer => write!(f, "schemas differ"),
MissingPrimaryKey => write!(f, "schema cannot be built without a primary key"),
SchemaMissing => write!(f, "this index does not have a schema"),
WordIndexMissing => write!(f, "this index does not have a word index"),
MissingDocumentId => write!(f, "document id is missing"),
MaxFieldsLimitExceeded => write!(f, "maximum number of fields in a document exceeded"),
Schema(e) => write!(f, "schema error; {}", e),
Zlmdb(e) => write!(f, "heed error; {}", e),
Fst(e) => write!(f, "fst error; {}", e),
SerdeJson(e) => write!(f, "serde json error; {}", e),
@ -95,6 +109,10 @@ impl error::Error for Error {}
#[derive(Debug)]
pub enum UnsupportedOperation {
SchemaAlreadyExists,
CannotUpdateSchemaPrimaryKey,
CannotReorderSchemaAttribute,
CanOnlyIntroduceNewSchemaAttributesAtEnd,
CannotRemoveSchemaAttribute,
}
impl fmt::Display for UnsupportedOperation {
@ -102,6 +120,12 @@ impl fmt::Display for UnsupportedOperation {
use self::UnsupportedOperation::*;
match self {
SchemaAlreadyExists => write!(f, "Cannot update index which already have a schema"),
CannotUpdateSchemaPrimaryKey => write!(f, "Cannot update the primary key of a schema"),
CannotReorderSchemaAttribute => write!(f, "Cannot reorder the attributes of a schema"),
CanOnlyIntroduceNewSchemaAttributesAtEnd => {
write!(f, "Can only introduce new attributes at end of a schema")
}
CannotRemoveSchemaAttribute => write!(f, "Cannot remove attributes from a schema"),
}
}
}

View File

@ -0,0 +1,134 @@
use std::cmp::min;
use std::collections::BTreeMap;
use std::ops::{Index, IndexMut};
// A simple wrapper around vec so we can get contiguous but index it like it's 2D array.
struct N2Array<T> {
y_size: usize,
buf: Vec<T>,
}
impl<T: Clone> N2Array<T> {
fn new(x: usize, y: usize, value: T) -> N2Array<T> {
N2Array {
y_size: y,
buf: vec![value; x * y],
}
}
}
impl<T> Index<(usize, usize)> for N2Array<T> {
type Output = T;
#[inline]
fn index(&self, (x, y): (usize, usize)) -> &T {
&self.buf[(x * self.y_size) + y]
}
}
impl<T> IndexMut<(usize, usize)> for N2Array<T> {
#[inline]
fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
&mut self.buf[(x * self.y_size) + y]
}
}
pub fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) {
let (n, m) = (source.len(), target.len());
assert!(
n <= m,
"the source string must be shorter than the target one"
);
if n == 0 {
return (m as u32, 0);
}
if m == 0 {
return (n as u32, 0);
}
if n == m && source == target {
return (0, m);
}
let inf = n + m;
let mut matrix = N2Array::new(n + 2, m + 2, 0);
matrix[(0, 0)] = inf;
for i in 0..n + 1 {
matrix[(i + 1, 0)] = inf;
matrix[(i + 1, 1)] = i;
}
for j in 0..m + 1 {
matrix[(0, j + 1)] = inf;
matrix[(1, j + 1)] = j;
}
let mut last_row = BTreeMap::new();
for (row, char_s) in source.iter().enumerate() {
let mut last_match_col = 0;
let row = row + 1;
for (col, char_t) in target.iter().enumerate() {
let col = col + 1;
let last_match_row = *last_row.get(&char_t).unwrap_or(&0);
let cost = if char_s == char_t { 0 } else { 1 };
let dist_add = matrix[(row, col + 1)] + 1;
let dist_del = matrix[(row + 1, col)] + 1;
let dist_sub = matrix[(row, col)] + cost;
let dist_trans = matrix[(last_match_row, last_match_col)]
+ (row - last_match_row - 1)
+ 1
+ (col - last_match_col - 1);
let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans));
matrix[(row + 1, col + 1)] = dist;
if cost == 0 {
last_match_col = col;
}
}
last_row.insert(char_s, row);
}
let mut minimum = (u32::max_value(), 0);
for x in n..=m {
let dist = matrix[(n + 1, x + 1)] as u32;
if dist < minimum.0 {
minimum = (dist, x)
}
}
minimum
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn matched_length() {
let query = "Levenste";
let text = "Levenshtein";
let (dist, length) = prefix_damerau_levenshtein(query.as_bytes(), text.as_bytes());
assert_eq!(dist, 1);
assert_eq!(&text[..length], "Levenshte");
}
#[test]
#[should_panic]
fn matched_length_panic() {
let query = "Levenshtein";
let text = "Levenste";
// this function will panic if source if longer than target
prefix_damerau_levenshtein(query.as_bytes(), text.as_bytes());
}
}

191
meilisearch-core/src/lib.rs Normal file
View File

@ -0,0 +1,191 @@
#[cfg(test)]
#[macro_use]
extern crate assert_matches;
mod automaton;
mod bucket_sort;
mod database;
mod distinct_map;
mod error;
mod levenshtein;
mod number;
mod query_builder;
mod query_tree;
mod query_words_mapper;
mod ranked_map;
mod raw_document;
mod reordered_attrs;
mod update;
pub mod settings;
pub mod criterion;
pub mod raw_indexer;
pub mod serde;
pub mod store;
pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT};
pub use self::error::{Error, HeedError, FstError, MResult};
pub use self::number::{Number, ParseNumberError};
pub use self::ranked_map::RankedMap;
pub use self::raw_document::RawDocument;
pub use self::store::Index;
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
pub use meilisearch_schema::Schema;
pub use query_words_mapper::QueryWordsMapper;
use std::convert::TryFrom;
use std::collections::HashMap;
use compact_arena::SmallArena;
use log::{error, trace};
use crate::bucket_sort::PostingsListView;
use crate::levenshtein::prefix_damerau_levenshtein;
use crate::query_tree::{QueryId, QueryKind};
use crate::reordered_attrs::ReorderedAttrs;
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Document {
pub id: DocumentId,
pub highlights: Vec<Highlight>,
#[cfg(test)]
pub matches: Vec<crate::bucket_sort::SimpleMatch>,
}
fn highlights_from_raw_document<'a, 'tag, 'txn>(
raw_document: &RawDocument<'a, 'tag>,
queries_kinds: &HashMap<QueryId, &QueryKind>,
arena: &SmallArena<'tag, PostingsListView<'txn>>,
searchable_attrs: Option<&ReorderedAttrs>,
schema: &Schema,
) -> Vec<Highlight>
{
let mut highlights = Vec::new();
for bm in raw_document.bare_matches.iter() {
let postings_list = &arena[bm.postings_list];
let input = postings_list.input();
let kind = &queries_kinds.get(&bm.query_index);
for di in postings_list.iter() {
let covered_area = match kind {
Some(QueryKind::NonTolerant(query)) | Some(QueryKind::Tolerant(query)) => {
let len = if query.len() > input.len() {
input.len()
} else {
prefix_damerau_levenshtein(query.as_bytes(), input).1
};
u16::try_from(len).unwrap_or(u16::max_value())
},
_ => di.char_length,
};
let attribute = searchable_attrs
.and_then(|sa| sa.reverse(di.attribute))
.unwrap_or(di.attribute);
let attribute = match schema.indexed_pos_to_field_id(attribute) {
Some(field_id) => field_id.0,
None => {
error!("Cannot convert indexed_pos {} to field_id", attribute);
trace!("Schema is compromized; {:?}", schema);
continue
}
};
let highlight = Highlight {
attribute,
char_index: di.char_index,
char_length: covered_area,
};
highlights.push(highlight);
}
}
highlights
}
impl Document {
#[cfg(not(test))]
pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
Document { id, highlights: highlights.to_owned() }
}
#[cfg(test)]
pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
Document { id, highlights: highlights.to_owned(), matches: Vec::new() }
}
#[cfg(not(test))]
pub fn from_raw<'a, 'tag, 'txn>(
raw_document: RawDocument<'a, 'tag>,
queries_kinds: &HashMap<QueryId, &QueryKind>,
arena: &SmallArena<'tag, PostingsListView<'txn>>,
searchable_attrs: Option<&ReorderedAttrs>,
schema: &Schema,
) -> Document
{
let highlights = highlights_from_raw_document(
&raw_document,
queries_kinds,
arena,
searchable_attrs,
schema,
);
Document { id: raw_document.id, highlights }
}
#[cfg(test)]
pub fn from_raw<'a, 'tag, 'txn>(
raw_document: RawDocument<'a, 'tag>,
queries_kinds: &HashMap<QueryId, &QueryKind>,
arena: &SmallArena<'tag, PostingsListView<'txn>>,
searchable_attrs: Option<&ReorderedAttrs>,
schema: &Schema,
) -> Document
{
use crate::bucket_sort::SimpleMatch;
let highlights = highlights_from_raw_document(
&raw_document,
queries_kinds,
arena,
searchable_attrs,
schema,
);
let mut matches = Vec::new();
for sm in raw_document.processed_matches {
let attribute = searchable_attrs
.and_then(|sa| sa.reverse(sm.attribute))
.unwrap_or(sm.attribute);
let attribute = match schema.indexed_pos_to_field_id(attribute) {
Some(field_id) => field_id.0,
None => {
error!("Cannot convert indexed_pos {} to field_id", attribute);
trace!("Schema is compromized; {:?}", schema);
continue
}
};
matches.push(SimpleMatch { attribute, ..sm });
}
matches.sort_unstable();
Document { id: raw_document.id, highlights, matches }
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 16);
}
}

View File

@ -1,3 +1,4 @@
use std::cmp::Ordering;
use std::fmt;
use std::num::{ParseFloatError, ParseIntError};
use std::str::FromStr;
@ -5,11 +6,18 @@ use std::str::FromStr;
use ordered_float::OrderedFloat;
use serde::{Deserialize, Serialize};
#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[derive(Serialize, Deserialize, Debug, Copy, Clone, Hash)]
pub enum Number {
Unsigned(u64),
Signed(i64),
Float(OrderedFloat<f64>),
Null,
}
impl Default for Number {
fn default() -> Self {
Self::Null
}
}
impl FromStr for Number {
@ -39,6 +47,53 @@ impl FromStr for Number {
}
}
impl PartialEq for Number {
fn eq(&self, other: &Number) -> bool {
self.cmp(other) == Ordering::Equal
}
}
impl Eq for Number {}
impl PartialOrd for Number {
fn partial_cmp(&self, other: &Number) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Number {
fn cmp(&self, other: &Self) -> Ordering {
use Number::{Float, Signed, Unsigned, Null};
match (*self, *other) {
(Unsigned(a), Unsigned(b)) => a.cmp(&b),
(Unsigned(a), Signed(b)) => {
if b < 0 {
Ordering::Greater
} else {
a.cmp(&(b as u64))
}
}
(Unsigned(a), Float(b)) => (OrderedFloat(a as f64)).cmp(&b),
(Signed(a), Unsigned(b)) => {
if a < 0 {
Ordering::Less
} else {
(a as u64).cmp(&b)
}
}
(Signed(a), Signed(b)) => a.cmp(&b),
(Signed(a), Float(b)) => OrderedFloat(a as f64).cmp(&b),
(Float(a), Unsigned(b)) => a.cmp(&OrderedFloat(b as f64)),
(Float(a), Signed(b)) => a.cmp(&OrderedFloat(b as f64)),
(Float(a), Float(b)) => a.cmp(&b),
(Null, Null) => Ordering::Equal,
(_, Null) => Ordering::Less,
(Null, _) => Ordering::Greater,
}
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseNumberError {
uint_error: ParseIntError,

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,560 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::ops::Range;
use std::time::Instant;
use std::{cmp, fmt, iter::once};
use fst::{IntoStreamer, Streamer};
use itertools::{EitherOrBoth, merge_join_by};
use meilisearch_tokenizer::split_query_string;
use sdset::{Set, SetBuf, SetOperation};
use log::debug;
use crate::database::MainT;
use crate::{store, DocumentId, DocIndex, MResult};
use crate::automaton::{normalize_str, build_dfa, build_prefix_dfa, build_exact_dfa};
use crate::QueryWordsMapper;
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum Operation {
And(Vec<Operation>),
Or(Vec<Operation>),
Query(Query),
}
impl fmt::Debug for Operation {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fn pprint_tree(f: &mut fmt::Formatter<'_>, op: &Operation, depth: usize) -> fmt::Result {
match op {
Operation::And(children) => {
writeln!(f, "{:1$}AND", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
},
Operation::Or(children) => {
writeln!(f, "{:1$}OR", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
},
Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2),
}
}
pprint_tree(f, self, 0)
}
}
impl Operation {
fn tolerant(id: QueryId, prefix: bool, s: &str) -> Operation {
Operation::Query(Query { id, prefix, exact: true, kind: QueryKind::Tolerant(s.to_string()) })
}
fn non_tolerant(id: QueryId, prefix: bool, s: &str) -> Operation {
Operation::Query(Query { id, prefix, exact: true, kind: QueryKind::NonTolerant(s.to_string()) })
}
fn phrase2(id: QueryId, prefix: bool, (left, right): (&str, &str)) -> Operation {
let kind = QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]);
Operation::Query(Query { id, prefix, exact: true, kind })
}
}
pub type QueryId = usize;
#[derive(Clone, Eq)]
pub struct Query {
pub id: QueryId,
pub prefix: bool,
pub exact: bool,
pub kind: QueryKind,
}
impl PartialEq for Query {
fn eq(&self, other: &Self) -> bool {
self.prefix == other.prefix && self.kind == other.kind
}
}
impl Hash for Query {
fn hash<H: Hasher>(&self, state: &mut H) {
self.prefix.hash(state);
self.kind.hash(state);
}
}
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum QueryKind {
Tolerant(String),
NonTolerant(String),
Phrase(Vec<String>),
}
impl fmt::Debug for Query {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let Query { id, prefix, kind, .. } = self;
let prefix = if *prefix { String::from("Prefix") } else { String::default() };
match kind {
QueryKind::NonTolerant(word) => {
f.debug_struct(&(prefix + "NonTolerant")).field("id", &id).field("word", &word).finish()
},
QueryKind::Tolerant(word) => {
f.debug_struct(&(prefix + "Tolerant")).field("id", &id).field("word", &word).finish()
},
QueryKind::Phrase(words) => {
f.debug_struct(&(prefix + "Phrase")).field("id", &id).field("words", &words).finish()
},
}
}
}
#[derive(Debug, Default)]
pub struct PostingsList {
docids: SetBuf<DocumentId>,
matches: SetBuf<DocIndex>,
}
pub struct Context {
pub words_set: fst::Set,
pub stop_words: fst::Set,
pub synonyms: store::Synonyms,
pub postings_lists: store::PostingsLists,
pub prefix_postings_lists: store::PrefixPostingsListsCache,
}
fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &'a str) -> MResult<Option<(&'a str, &'a str)>> {
let chars = word.char_indices().skip(1);
let mut best = None;
for (i, _) in chars {
let (left, right) = word.split_at(i);
let left_freq = ctx.postings_lists
.postings_list(reader, left.as_bytes())?
.map(|p| p.docids.len())
.unwrap_or(0);
let right_freq = ctx.postings_lists
.postings_list(reader, right.as_bytes())?
.map(|p| p.docids.len())
.unwrap_or(0);
let min_freq = cmp::min(left_freq, right_freq);
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
best = Some((min_freq, left, right));
}
}
Ok(best.map(|(_, l, r)| (l, r)))
}
fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
let words = normalize_str(&words.join(" "));
let set = ctx.synonyms.synonyms(reader, words.as_bytes())?.unwrap_or_default();
let mut strings = Vec::new();
let mut stream = set.stream();
while let Some(input) = stream.next() {
if let Ok(input) = std::str::from_utf8(input) {
let alts = input.split_ascii_whitespace().map(ToOwned::to_owned).collect();
strings.push(alts);
}
}
Ok(strings)
}
fn create_operation<I, F>(iter: I, f: F) -> Operation
where I: IntoIterator<Item=Operation>,
F: Fn(Vec<Operation>) -> Operation,
{
let mut iter = iter.into_iter();
match (iter.next(), iter.next()) {
(Some(first), None) => first,
(first, second) => f(first.into_iter().chain(second).chain(iter).collect()),
}
}
const MAX_NGRAM: usize = 3;
pub fn create_query_tree(
reader: &heed::RoTxn<MainT>,
ctx: &Context,
query: &str,
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
{
let words = split_query_string(query).map(str::to_lowercase);
let words = words.filter(|w| !ctx.stop_words.contains(w));
let words: Vec<_> = words.enumerate().collect();
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
fn create_inner(
reader: &heed::RoTxn<MainT>,
ctx: &Context,
mapper: &mut QueryWordsMapper,
words: &[(usize, String)],
) -> MResult<Vec<Operation>>
{
let mut alts = Vec::new();
for ngram in 1..=MAX_NGRAM {
if let Some(group) = words.get(..ngram) {
let mut group_ops = Vec::new();
let tail = &words[ngram..];
let is_last = tail.is_empty();
let mut group_alts = Vec::new();
match group {
[(id, word)] => {
let mut idgen = ((id + 1) * 100)..;
let range = (*id)..id+1;
let phrase = split_best_frequency(reader, ctx, word)?
.map(|ws| {
let id = idgen.next().unwrap();
idgen.next().unwrap();
mapper.declare(range.clone(), id, &[ws.0, ws.1]);
Operation::phrase2(id, is_last, ws)
});
let synonyms = fetch_synonyms(reader, ctx, &[word])?
.into_iter()
.map(|alts| {
let exact = alts.len() == 1;
let id = idgen.next().unwrap();
mapper.declare(range.clone(), id, &alts);
let mut idgen = once(id).chain(&mut idgen);
let iter = alts.into_iter().map(|w| {
let id = idgen.next().unwrap();
let kind = QueryKind::NonTolerant(w);
Operation::Query(Query { id, prefix: false, exact, kind })
});
create_operation(iter, Operation::And)
});
let original = Operation::tolerant(*id, is_last, word);
group_alts.push(original);
group_alts.extend(synonyms.chain(phrase));
},
words => {
let id = words[0].0;
let mut idgen = ((id + 1) * 100_usize.pow(ngram as u32))..;
let range = id..id+ngram;
let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect();
for synonym in fetch_synonyms(reader, ctx, &words)? {
let exact = synonym.len() == 1;
let id = idgen.next().unwrap();
mapper.declare(range.clone(), id, &synonym);
let mut idgen = once(id).chain(&mut idgen);
let synonym = synonym.into_iter().map(|s| {
let id = idgen.next().unwrap();
let kind = QueryKind::NonTolerant(s);
Operation::Query(Query { id, prefix: false, exact, kind })
});
group_alts.push(create_operation(synonym, Operation::And));
}
let id = idgen.next().unwrap();
let concat = words.concat();
mapper.declare(range.clone(), id, &[&concat]);
group_alts.push(Operation::non_tolerant(id, is_last, &concat));
}
}
group_ops.push(create_operation(group_alts, Operation::Or));
if !tail.is_empty() {
let tail_ops = create_inner(reader, ctx, mapper, tail)?;
group_ops.push(create_operation(tail_ops, Operation::Or));
}
alts.push(create_operation(group_ops, Operation::And));
}
}
Ok(alts)
}
let alternatives = create_inner(reader, ctx, &mut mapper, &words)?;
let operation = Operation::Or(alternatives);
let mapping = mapper.mapping();
Ok((operation, mapping))
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PostingsKey<'o> {
pub query: &'o Query,
pub input: Vec<u8>,
pub distance: u8,
pub is_exact: bool,
}
pub type Postings<'o, 'txn> = HashMap<PostingsKey<'o>, Cow<'txn, Set<DocIndex>>>;
pub type Cache<'o, 'txn> = HashMap<&'o Operation, Cow<'txn, Set<DocumentId>>>;
pub struct QueryResult<'o, 'txn> {
pub docids: Cow<'txn, Set<DocumentId>>,
pub queries: Postings<'o, 'txn>,
}
pub fn traverse_query_tree<'o, 'txn>(
reader: &'txn heed::RoTxn<MainT>,
ctx: &Context,
tree: &'o Operation,
) -> MResult<QueryResult<'o, 'txn>>
{
fn execute_and<'o, 'txn>(
reader: &'txn heed::RoTxn<MainT>,
ctx: &Context,
cache: &mut Cache<'o, 'txn>,
postings: &mut Postings<'o, 'txn>,
depth: usize,
operations: &'o [Operation],
) -> MResult<Cow<'txn, Set<DocumentId>>>
{
debug!("{:1$}AND", "", depth * 2);
let before = Instant::now();
let mut results = Vec::new();
for op in operations {
if cache.get(op).is_none() {
let docids = match op {
Operation::And(ops) => execute_and(reader, ctx, cache, postings, depth + 1, &ops)?,
Operation::Or(ops) => execute_or(reader, ctx, cache, postings, depth + 1, &ops)?,
Operation::Query(query) => execute_query(reader, ctx, postings, depth + 1, &query)?,
};
cache.insert(op, docids);
}
}
for op in operations {
if let Some(docids) = cache.get(op) {
results.push(docids.as_ref());
}
}
let op = sdset::multi::Intersection::new(results);
let docids = op.into_set_buf();
debug!("{:3$}--- AND fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
Ok(Cow::Owned(docids))
}
fn execute_or<'o, 'txn>(
reader: &'txn heed::RoTxn<MainT>,
ctx: &Context,
cache: &mut Cache<'o, 'txn>,
postings: &mut Postings<'o, 'txn>,
depth: usize,
operations: &'o [Operation],
) -> MResult<Cow<'txn, Set<DocumentId>>>
{
debug!("{:1$}OR", "", depth * 2);
let before = Instant::now();
let mut results = Vec::new();
for op in operations {
if cache.get(op).is_none() {
let docids = match op {
Operation::And(ops) => execute_and(reader, ctx, cache, postings, depth + 1, &ops)?,
Operation::Or(ops) => execute_or(reader, ctx, cache, postings, depth + 1, &ops)?,
Operation::Query(query) => execute_query(reader, ctx, postings, depth + 1, &query)?,
};
cache.insert(op, docids);
}
}
for op in operations {
if let Some(docids) = cache.get(op) {
results.push(docids.as_ref());
}
}
let op = sdset::multi::Union::new(results);
let docids = op.into_set_buf();
debug!("{:3$}--- OR fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
Ok(Cow::Owned(docids))
}
fn execute_query<'o, 'txn>(
reader: &'txn heed::RoTxn<MainT>,
ctx: &Context,
postings: &mut Postings<'o, 'txn>,
depth: usize,
query: &'o Query,
) -> MResult<Cow<'txn, Set<DocumentId>>>
{
let before = Instant::now();
let Query { prefix, kind, exact, .. } = query;
let docids: Cow<Set<_>> = match kind {
QueryKind::Tolerant(word) => {
if *prefix && word.len() <= 2 {
let prefix = {
let mut array = [0; 4];
let bytes = word.as_bytes();
array[..bytes.len()].copy_from_slice(bytes);
array
};
// We retrieve the cached postings lists for all
// the words that starts with this short prefix.
let result = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default();
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: false };
postings.insert(key, result.matches);
let prefix_docids = &result.docids;
// We retrieve the exact postings list for the prefix,
// because we must consider these matches as exact.
let result = ctx.postings_lists.postings_list(reader, word.as_bytes())?.unwrap_or_default();
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: true };
postings.insert(key, result.matches);
let exact_docids = &result.docids;
let before = Instant::now();
let docids = sdset::duo::Union::new(prefix_docids, exact_docids).into_set_buf();
debug!("{:4$}prefix docids ({} and {}) construction took {:.02?}",
"", prefix_docids.len(), exact_docids.len(), before.elapsed(), depth * 2);
Cow::Owned(docids)
} else {
let dfa = if *prefix { build_prefix_dfa(word) } else { build_dfa(word) };
let byte = word.as_bytes()[0];
let mut stream = if byte == u8::max_value() {
ctx.words_set.search(&dfa).ge(&[byte]).into_stream()
} else {
ctx.words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
};
let before = Instant::now();
let mut results = Vec::new();
while let Some(input) = stream.next() {
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
let distance = dfa.eval(input).to_u8();
let is_exact = *exact && distance == 0 && input.len() == word.len();
results.push(result.docids);
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact };
postings.insert(key, result.matches);
}
}
debug!("{:3$}docids retrieval ({:?}) took {:.02?}", "", results.len(), before.elapsed(), depth * 2);
let before = Instant::now();
let docids = if results.len() > 10 {
let cap = results.iter().map(|dis| dis.len()).sum();
let mut docids = Vec::with_capacity(cap);
for dis in results {
docids.extend_from_slice(&dis);
}
SetBuf::from_dirty(docids)
} else {
let sets = results.iter().map(AsRef::as_ref).collect();
sdset::multi::Union::new(sets).into_set_buf()
};
debug!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
Cow::Owned(docids)
}
},
QueryKind::NonTolerant(word) => {
// TODO support prefix and non-prefix exact DFA
let dfa = build_exact_dfa(word);
let byte = word.as_bytes()[0];
let mut stream = if byte == u8::max_value() {
ctx.words_set.search(&dfa).ge(&[byte]).into_stream()
} else {
ctx.words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
};
let before = Instant::now();
let mut results = Vec::new();
while let Some(input) = stream.next() {
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
let distance = dfa.eval(input).to_u8();
results.push(result.docids);
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact: *exact };
postings.insert(key, result.matches);
}
}
debug!("{:3$}docids retrieval ({:?}) took {:.02?}", "", results.len(), before.elapsed(), depth * 2);
let before = Instant::now();
let docids = if results.len() > 10 {
let cap = results.iter().map(|dis| dis.len()).sum();
let mut docids = Vec::with_capacity(cap);
for dis in results {
docids.extend_from_slice(&dis);
}
SetBuf::from_dirty(docids)
} else {
let sets = results.iter().map(AsRef::as_ref).collect();
sdset::multi::Union::new(sets).into_set_buf()
};
debug!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
Cow::Owned(docids)
},
QueryKind::Phrase(words) => {
// TODO support prefix and non-prefix exact DFA
if let [first, second] = words.as_slice() {
let first = ctx.postings_lists.postings_list(reader, first.as_bytes())?.unwrap_or_default();
let second = ctx.postings_lists.postings_list(reader, second.as_bytes())?.unwrap_or_default();
let iter = merge_join_by(first.matches.as_slice(), second.matches.as_slice(), |a, b| {
let x = (a.document_id, a.attribute, (a.word_index as u32) + 1);
let y = (b.document_id, b.attribute, b.word_index as u32);
x.cmp(&y)
});
let matches: Vec<_> = iter
.filter_map(EitherOrBoth::both)
.flat_map(|(a, b)| once(*a).chain(Some(*b)))
.collect();
let before = Instant::now();
let mut docids: Vec<_> = matches.iter().map(|m| m.document_id).collect();
docids.dedup();
let docids = SetBuf::new(docids).unwrap();
debug!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
let matches = Cow::Owned(SetBuf::new(matches).unwrap());
let key = PostingsKey { query, input: vec![], distance: 0, is_exact: true };
postings.insert(key, matches);
Cow::Owned(docids)
} else {
debug!("{:2$}{:?} skipped", "", words, depth * 2);
Cow::default()
}
},
};
debug!("{:4$}{:?} fetched {:?} documents in {:.02?}", "", query, docids.len(), before.elapsed(), depth * 2);
Ok(docids)
}
let mut cache = Cache::new();
let mut postings = Postings::new();
let docids = match tree {
Operation::And(ops) => execute_and(reader, ctx, &mut cache, &mut postings, 0, &ops)?,
Operation::Or(ops) => execute_or(reader, ctx, &mut cache, &mut postings, 0, &ops)?,
Operation::Query(query) => execute_query(reader, ctx, &mut postings, 0, &query)?,
};
Ok(QueryResult { docids, queries: postings })
}

View File

@ -0,0 +1,415 @@
use std::collections::HashMap;
use std::iter::FromIterator;
use std::ops::Range;
use intervaltree::{Element, IntervalTree};
pub type QueryId = usize;
pub struct QueryWordsMapper {
originals: Vec<String>,
mappings: HashMap<QueryId, (Range<usize>, Vec<String>)>,
}
impl QueryWordsMapper {
pub fn new<I, A>(originals: I) -> QueryWordsMapper
where I: IntoIterator<Item = A>,
A: ToString,
{
let originals = originals.into_iter().map(|s| s.to_string()).collect();
QueryWordsMapper { originals, mappings: HashMap::new() }
}
pub fn declare<I, A>(&mut self, range: Range<usize>, id: QueryId, replacement: I)
where I: IntoIterator<Item = A>,
A: ToString,
{
assert!(range.len() != 0);
assert!(self.originals.get(range.clone()).is_some());
assert!(id >= self.originals.len());
let replacement: Vec<_> = replacement.into_iter().map(|s| s.to_string()).collect();
assert!(!replacement.is_empty());
// We detect words at the end and at the front of the
// replacement that are common with the originals:
//
// x a b c d e f g
// ^^^/ \^^^
// a b x c d k j e f
// ^^^ ^^^
//
let left = &self.originals[..range.start];
let right = &self.originals[range.end..];
let common_left = longest_common_prefix(left, &replacement);
let common_right = longest_common_prefix(&replacement, right);
for i in 0..common_left {
let range = range.start - common_left + i..range.start - common_left + i + 1;
let replacement = vec![replacement[i].clone()];
self.mappings.insert(id + i, (range, replacement));
}
{
let replacement = replacement[common_left..replacement.len() - common_right].iter().cloned().collect();
self.mappings.insert(id + common_left, (range.clone(), replacement));
}
for i in 0..common_right {
let id = id + replacement.len() - common_right + i;
let range = range.end + i..range.end + i + 1;
let replacement = vec![replacement[replacement.len() - common_right + i].clone()];
self.mappings.insert(id, (range, replacement));
}
}
pub fn mapping(self) -> HashMap<QueryId, Range<usize>> {
let mappings = self.mappings.into_iter().map(|(i, (r, v))| (r, (i, v)));
let intervals = IntervalTree::from_iter(mappings);
let mut output = HashMap::new();
let mut offset = 0;
// We map each original word to the biggest number of
// associated words.
for i in 0..self.originals.len() {
let max = intervals.query_point(i)
.filter_map(|e| {
if e.range.end - 1 == i {
let len = e.value.1.iter().skip(i - e.range.start).count();
if len != 0 { Some(len) } else { None }
} else { None }
})
.max()
.unwrap_or(1);
let range = i + offset..i + offset + max;
output.insert(i, range);
offset += max - 1;
}
// We retrieve the range that each original word
// is mapped to and apply it to each of the words.
for i in 0..self.originals.len() {
let iter = intervals.query_point(i).filter(|e| e.range.end - 1 == i);
for Element { range, value: (id, words) } in iter {
// We ask for the complete range mapped to the area we map.
let start = output.get(&range.start).map(|r| r.start).unwrap_or(range.start);
let end = output.get(&(range.end - 1)).map(|r| r.end).unwrap_or(range.end);
let range = start..end;
// We map each query id to one word until the last,
// we map it to the remainings words.
let add = range.len() - words.len();
for (j, x) in range.take(words.len()).enumerate() {
let add = if j == words.len() - 1 { add } else { 0 }; // is last?
let range = x..x + 1 + add;
output.insert(id + j, range);
}
}
}
output
}
}
fn longest_common_prefix<T: Eq + std::fmt::Debug>(a: &[T], b: &[T]) -> usize {
let mut best = None;
for i in (0..a.len()).rev() {
let count = a[i..].iter().zip(b).take_while(|(a, b)| a == b).count();
best = match best {
Some(old) if count > old => Some(count),
Some(_) => break,
None => Some(count),
};
}
best.unwrap_or(0)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn original_unmodified() {
let query = ["new", "york", "city", "subway"];
// 0 1 2 3
let mut builder = QueryWordsMapper::new(&query);
// new york = new york city
builder.declare(0..2, 4, &["new", "york", "city"]);
// ^ 4 5 6
// new = new york city
builder.declare(0..1, 7, &["new", "york", "city"]);
// ^ 7 8 9
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // new
assert_eq!(mapping[&1], 1..2); // york
assert_eq!(mapping[&2], 2..3); // city
assert_eq!(mapping[&3], 3..4); // subway
assert_eq!(mapping[&4], 0..1); // new
assert_eq!(mapping[&5], 1..2); // york
assert_eq!(mapping[&6], 2..3); // city
assert_eq!(mapping[&7], 0..1); // new
assert_eq!(mapping[&8], 1..2); // york
assert_eq!(mapping[&9], 2..3); // city
}
#[test]
fn original_unmodified2() {
let query = ["new", "york", "city", "subway"];
// 0 1 2 3
let mut builder = QueryWordsMapper::new(&query);
// city subway = new york city underground train
builder.declare(2..4, 4, &["new", "york", "city", "underground", "train"]);
// ^ 4 5 6 7 8
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // new
assert_eq!(mapping[&1], 1..2); // york
assert_eq!(mapping[&2], 2..3); // city
assert_eq!(mapping[&3], 3..5); // subway
assert_eq!(mapping[&4], 0..1); // new
assert_eq!(mapping[&5], 1..2); // york
assert_eq!(mapping[&6], 2..3); // city
assert_eq!(mapping[&7], 3..4); // underground
assert_eq!(mapping[&8], 4..5); // train
}
#[test]
fn original_unmodified3() {
let query = ["a", "b", "x", "x", "a", "b", "c", "d", "e", "f", "g"];
// 0 1 2 3 4 5 6 7 8 9 10
let mut builder = QueryWordsMapper::new(&query);
// c d = a b x c d k j e f
builder.declare(6..8, 11, &["a", "b", "x", "c", "d", "k", "j", "e", "f"]);
// ^^ 11 12 13 14 15 16 17 18 19
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // a
assert_eq!(mapping[&1], 1..2); // b
assert_eq!(mapping[&2], 2..3); // x
assert_eq!(mapping[&3], 3..4); // x
assert_eq!(mapping[&4], 4..5); // a
assert_eq!(mapping[&5], 5..6); // b
assert_eq!(mapping[&6], 6..7); // c
assert_eq!(mapping[&7], 7..11); // d
assert_eq!(mapping[&8], 11..12); // e
assert_eq!(mapping[&9], 12..13); // f
assert_eq!(mapping[&10], 13..14); // g
assert_eq!(mapping[&11], 4..5); // a
assert_eq!(mapping[&12], 5..6); // b
assert_eq!(mapping[&13], 6..7); // x
assert_eq!(mapping[&14], 7..8); // c
assert_eq!(mapping[&15], 8..9); // d
assert_eq!(mapping[&16], 9..10); // k
assert_eq!(mapping[&17], 10..11); // j
assert_eq!(mapping[&18], 11..12); // e
assert_eq!(mapping[&19], 12..13); // f
}
#[test]
fn simple_growing() {
let query = ["new", "york", "subway"];
// 0 1 2
let mut builder = QueryWordsMapper::new(&query);
// new york = new york city
builder.declare(0..2, 3, &["new", "york", "city"]);
// ^ 3 4 5
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // new
assert_eq!(mapping[&1], 1..3); // york
assert_eq!(mapping[&2], 3..4); // subway
assert_eq!(mapping[&3], 0..1); // new
assert_eq!(mapping[&4], 1..2); // york
assert_eq!(mapping[&5], 2..3); // city
}
#[test]
fn same_place_growings() {
let query = ["NY", "subway"];
// 0 1
let mut builder = QueryWordsMapper::new(&query);
// NY = new york
builder.declare(0..1, 2, &["new", "york"]);
// ^ 2 3
// NY = new york city
builder.declare(0..1, 4, &["new", "york", "city"]);
// ^ 4 5 6
// NY = NYC
builder.declare(0..1, 7, &["NYC"]);
// ^ 7
// NY = new york city
builder.declare(0..1, 8, &["new", "york", "city"]);
// ^ 8 9 10
// subway = underground train
builder.declare(1..2, 11, &["underground", "train"]);
// ^ 11 12
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..3); // NY
assert_eq!(mapping[&1], 3..5); // subway
assert_eq!(mapping[&2], 0..1); // new
assert_eq!(mapping[&3], 1..3); // york
assert_eq!(mapping[&4], 0..1); // new
assert_eq!(mapping[&5], 1..2); // york
assert_eq!(mapping[&6], 2..3); // city
assert_eq!(mapping[&7], 0..3); // NYC
assert_eq!(mapping[&8], 0..1); // new
assert_eq!(mapping[&9], 1..2); // york
assert_eq!(mapping[&10], 2..3); // city
assert_eq!(mapping[&11], 3..4); // underground
assert_eq!(mapping[&12], 4..5); // train
}
#[test]
fn bigger_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryWordsMapper::new(&query);
// NYC = new york city
builder.declare(0..1, 2, &["new", "york", "city"]);
// ^ 2 3 4
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..3); // NYC
assert_eq!(mapping[&1], 3..4); // subway
assert_eq!(mapping[&2], 0..1); // new
assert_eq!(mapping[&3], 1..2); // york
assert_eq!(mapping[&4], 2..3); // city
}
#[test]
fn middle_query_growing() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryWordsMapper::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // great
assert_eq!(mapping[&1], 1..2); // awesome
assert_eq!(mapping[&2], 2..5); // NYC
assert_eq!(mapping[&3], 5..6); // subway
assert_eq!(mapping[&4], 2..3); // new
assert_eq!(mapping[&5], 3..4); // york
assert_eq!(mapping[&6], 4..5); // city
}
#[test]
fn end_query_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryWordsMapper::new(&query);
// NYC = new york city
builder.declare(1..2, 2, &["underground", "train"]);
// ^ 2 3
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // NYC
assert_eq!(mapping[&1], 1..3); // subway
assert_eq!(mapping[&2], 1..2); // underground
assert_eq!(mapping[&3], 2..3); // train
}
#[test]
fn multiple_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryWordsMapper::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // great
assert_eq!(mapping[&1], 1..2); // awesome
assert_eq!(mapping[&2], 2..5); // NYC
assert_eq!(mapping[&3], 5..7); // subway
assert_eq!(mapping[&4], 2..3); // new
assert_eq!(mapping[&5], 3..4); // york
assert_eq!(mapping[&6], 4..5); // city
assert_eq!(mapping[&7], 5..6); // underground
assert_eq!(mapping[&8], 6..7); // train
}
#[test]
fn multiple_probable_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryWordsMapper::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
// great awesome = good
builder.declare(0..2, 9, &["good"]);
// ^ 9
// awesome NYC = NY
builder.declare(1..3, 10, &["NY"]);
// ^^ 10
// NYC subway = metro
builder.declare(2..4, 11, &["metro"]);
// ^^ 11
let mapping = builder.mapping();
assert_eq!(mapping[&0], 0..1); // great
assert_eq!(mapping[&1], 1..2); // awesome
assert_eq!(mapping[&2], 2..5); // NYC
assert_eq!(mapping[&3], 5..7); // subway
assert_eq!(mapping[&4], 2..3); // new
assert_eq!(mapping[&5], 3..4); // york
assert_eq!(mapping[&6], 4..5); // city
assert_eq!(mapping[&7], 5..6); // underground
assert_eq!(mapping[&8], 6..7); // train
assert_eq!(mapping[&9], 0..2); // good
assert_eq!(mapping[&10], 1..5); // NY
assert_eq!(mapping[&11], 2..7); // metro
}
}

View File

@ -1,14 +1,14 @@
use std::io::{Read, Write};
use hashbrown::HashMap;
use meilidb_schema::SchemaAttr;
use meilisearch_schema::FieldId;
use serde::{Deserialize, Serialize};
use crate::{DocumentId, Number};
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(transparent)]
pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>);
pub struct RankedMap(HashMap<(DocumentId, FieldId), Number>);
impl RankedMap {
pub fn len(&self) -> usize {
@ -19,16 +19,16 @@ impl RankedMap {
self.0.is_empty()
}
pub fn insert(&mut self, document: DocumentId, attribute: SchemaAttr, number: Number) {
self.0.insert((document, attribute), number);
pub fn insert(&mut self, document: DocumentId, field: FieldId, number: Number) {
self.0.insert((document, field), number);
}
pub fn remove(&mut self, document: DocumentId, attribute: SchemaAttr) {
self.0.remove(&(document, attribute));
pub fn remove(&mut self, document: DocumentId, field: FieldId) {
self.0.remove(&(document, field));
}
pub fn get(&self, document: DocumentId, attribute: SchemaAttr) -> Option<Number> {
self.0.get(&(document, attribute)).cloned()
pub fn get(&self, document: DocumentId, field: FieldId) -> Option<Number> {
self.0.get(&(document, field)).cloned()
}
pub fn read_from_bin<R: Read>(reader: R) -> bincode::Result<RankedMap> {

View File

@ -0,0 +1,51 @@
use compact_arena::SmallArena;
use sdset::SetBuf;
use crate::DocIndex;
use crate::bucket_sort::{SimpleMatch, BareMatch, PostingsListView};
use crate::reordered_attrs::ReorderedAttrs;
pub struct RawDocument<'a, 'tag> {
pub id: crate::DocumentId,
pub bare_matches: &'a mut [BareMatch<'tag>],
pub processed_matches: Vec<SimpleMatch>,
/// The list of minimum `distance` found
pub processed_distances: Vec<Option<u8>>,
/// Does this document contains a field
/// with one word that is exactly matching
pub contains_one_word_field: bool,
}
impl<'a, 'tag> RawDocument<'a, 'tag> {
pub fn new<'txn>(
bare_matches: &'a mut [BareMatch<'tag>],
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
searchable_attrs: Option<&ReorderedAttrs>,
) -> RawDocument<'a, 'tag>
{
if let Some(reordered_attrs) = searchable_attrs {
for bm in bare_matches.iter() {
let postings_list = &postings_lists[bm.postings_list];
let mut rewritten = Vec::new();
for di in postings_list.iter() {
if let Some(attribute) = reordered_attrs.get(di.attribute) {
rewritten.push(DocIndex { attribute, ..*di });
}
}
let new_postings = SetBuf::from_dirty(rewritten);
postings_lists[bm.postings_list].rewrite_with(new_postings);
}
}
bare_matches.sort_unstable_by_key(|m| m.query_index);
RawDocument {
id: bare_matches[0].document_id,
bare_matches,
processed_matches: Vec::new(),
processed_distances: Vec::new(),
contains_one_word_field: false,
}
}
}

View File

@ -0,0 +1,272 @@
use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;
use crate::{DocIndex, DocumentId};
use deunicode::deunicode_with_tofu;
use meilisearch_schema::IndexedPos;
use meilisearch_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
use sdset::SetBuf;
const WORD_LENGTH_LIMIT: usize = 80;
type Word = Vec<u8>; // TODO make it be a SmallVec
pub struct RawIndexer {
word_limit: usize, // the maximum number of indexed words
stop_words: fst::Set,
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
docs_words: HashMap<DocumentId, Vec<Word>>,
}
pub struct Indexed {
pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
pub docs_words: HashMap<DocumentId, fst::Set>,
}
impl RawIndexer {
pub fn new(stop_words: fst::Set) -> RawIndexer {
RawIndexer::with_word_limit(stop_words, 1000)
}
pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
RawIndexer {
word_limit: limit,
stop_words,
words_doc_indexes: BTreeMap::new(),
docs_words: HashMap::new(),
}
}
pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
let mut number_of_words = 0;
for token in Tokenizer::new(text) {
let must_continue = index_token(
token,
id,
indexed_pos,
self.word_limit,
&self.stop_words,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
number_of_words += 1;
if !must_continue {
break;
}
}
number_of_words
}
pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I)
where
I: IntoIterator<Item = &'a str>,
{
let iter = iter.into_iter();
for token in SeqTokenizer::new(iter) {
let must_continue = index_token(
token,
id,
indexed_pos,
self.word_limit,
&self.stop_words,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue {
break;
}
}
}
pub fn build(self) -> Indexed {
let words_doc_indexes = self
.words_doc_indexes
.into_iter()
.map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
.collect();
let docs_words = self
.docs_words
.into_iter()
.map(|(id, mut words)| {
words.sort_unstable();
words.dedup();
(id, fst::Set::from_iter(words).unwrap())
})
.collect();
Indexed {
words_doc_indexes,
docs_words,
}
}
}
fn index_token(
token: Token,
id: DocumentId,
indexed_pos: IndexedPos,
word_limit: usize,
stop_words: &fst::Set,
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool {
if token.word_index >= word_limit {
return false;
}
let lower = token.word.to_lowercase();
let token = Token {
word: &lower,
..token
};
if !stop_words.contains(&token.word) {
match token_to_docindex(id, indexed_pos, token) {
Some(docindex) => {
let word = Vec::from(token.word);
if word.len() <= WORD_LENGTH_LIMIT {
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
if !lower.contains(is_cjk) {
let unidecoded = deunicode_with_tofu(&lower, "");
if unidecoded != lower && !unidecoded.is_empty() {
let word = Vec::from(unidecoded);
if word.len() <= WORD_LENGTH_LIMIT {
words_doc_indexes
.entry(word.clone())
.or_insert_with(Vec::new)
.push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
}
}
}
}
}
None => return false,
}
}
true
}
fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: Token) -> Option<DocIndex> {
let word_index = u16::try_from(token.word_index).ok()?;
let char_index = u16::try_from(token.char_index).ok()?;
let char_length = u16::try_from(token.word.chars().count()).ok()?;
let docindex = DocIndex {
document_id: id,
attribute: indexed_pos.0,
word_index,
char_index,
char_length,
};
Some(docindex)
}
#[cfg(test)]
mod tests {
use super::*;
use meilisearch_schema::IndexedPos;
#[test]
fn strange_apostrophe() {
let mut indexer = RawIndexer::new(fst::Set::default());
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
indexer.index_text(docid, indexed_pos, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
assert!(words_doc_indexes
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
fn strange_apostrophe_in_sequence() {
let mut indexer = RawIndexer::new(fst::Set::default());
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
indexer.index_text_seq(docid, indexed_pos, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
assert!(words_doc_indexes
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
fn basic_stop_words() {
let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
let stop_words = fst::Set::from_iter(stop_words).unwrap();
let mut indexer = RawIndexer::new(stop_words);
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
indexer.index_text(docid, indexed_pos, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_none());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"j"[..]).is_none());
assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
assert!(words_doc_indexes.get(&b"de"[..]).is_none());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
assert!(words_doc_indexes
.get(&"éteindre".to_owned().into_bytes())
.is_some());
}
#[test]
fn no_empty_unidecode() {
let mut indexer = RawIndexer::new(fst::Set::default());
let docid = DocumentId(0);
let indexed_pos = IndexedPos(0);
let text = "🇯🇵";
indexer.index_text(docid, indexed_pos, text);
let Indexed {
words_doc_indexes, ..
} = indexer.build();
assert!(words_doc_indexes
.get(&"🇯🇵".to_owned().into_bytes())
.is_some());
}
}

View File

@ -0,0 +1,31 @@
use std::cmp;
#[derive(Default, Clone)]
pub struct ReorderedAttrs {
reorders: Vec<Option<u16>>,
reverse: Vec<u16>,
}
impl ReorderedAttrs {
pub fn new() -> ReorderedAttrs {
ReorderedAttrs { reorders: Vec::new(), reverse: Vec::new() }
}
pub fn insert_attribute(&mut self, attribute: u16) {
let new_len = cmp::max(attribute as usize + 1, self.reorders.len());
self.reorders.resize(new_len, None);
self.reorders[attribute as usize] = Some(self.reverse.len() as u16);
self.reverse.push(attribute);
}
pub fn get(&self, attribute: u16) -> Option<u16> {
match self.reorders.get(attribute as usize)? {
Some(attribute) => Some(*attribute),
None => None,
}
}
pub fn reverse(&self, attribute: u16) -> Option<u16> {
self.reverse.get(attribute as usize).copied()
}
}

View File

@ -12,8 +12,8 @@ impl ser::Serializer for ConvertToString {
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapConvertToString;
type SerializeStruct = StructConvertToString;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
@ -169,7 +169,9 @@ impl ser::Serializer for ConvertToString {
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "map" })
Ok(MapConvertToString {
text: String::new(),
})
}
fn serialize_struct(
@ -177,8 +179,8 @@ impl ser::Serializer for ConvertToString {
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Err(SerializerError::UnserializableType {
type_name: "struct",
Ok(StructConvertToString {
text: String::new(),
})
}
@ -194,3 +196,63 @@ impl ser::Serializer for ConvertToString {
})
}
}
pub struct MapConvertToString {
text: String,
}
impl ser::SerializeMap for MapConvertToString {
type Ok = String;
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let text = key.serialize(ConvertToString)?;
self.text.push_str(&text);
self.text.push_str(" ");
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.text.push_str(&text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(self.text)
}
}
pub struct StructConvertToString {
text: String,
}
impl ser::SerializeStruct for StructConvertToString {
type Ok = String;
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where
T: ser::Serialize,
{
let value = value.serialize(ConvertToString)?;
self.text.push_str(key);
self.text.push_str(" ");
self.text.push_str(&value);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(self.text)
}
}

View File

@ -2,12 +2,13 @@ use std::collections::HashSet;
use std::io::Cursor;
use std::{error::Error, fmt};
use meilidb_schema::{Schema, SchemaAttr};
use meilisearch_schema::{Schema, FieldId};
use serde::{de, forward_to_deserialize_any};
use serde_json::de::IoRead as SerdeJsonIoRead;
use serde_json::Deserializer as SerdeJsonDeserializer;
use serde_json::Error as SerdeJsonError;
use crate::database::MainT;
use crate::store::DocumentsFields;
use crate::DocumentId;
@ -50,10 +51,10 @@ impl From<heed::Error> for DeserializerError {
pub struct Deserializer<'a> {
pub document_id: DocumentId,
pub reader: &'a heed::RoTxn,
pub reader: &'a heed::RoTxn<MainT>,
pub documents_fields: DocumentsFields,
pub schema: &'a Schema,
pub attributes: Option<&'a HashSet<SchemaAttr>>,
pub fields: Option<&'a HashSet<FieldId>>,
}
impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
@ -63,13 +64,14 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
where
V: de::Visitor<'de>,
{
self.deserialize_map(visitor)
self.deserialize_option(visitor)
}
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf option unit unit_struct newtype_struct seq tuple
tuple_struct struct enum identifier ignored_any
fn deserialize_option<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where
V: de::Visitor<'de>,
{
self.deserialize_map(visitor)
}
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
@ -90,30 +92,45 @@ impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> {
}
};
let is_displayed = self.schema.props(attr).is_displayed();
if is_displayed && self.attributes.map_or(true, |f| f.contains(&attr)) {
let attribute_name = self.schema.attribute_name(attr);
let is_displayed = self.schema.is_displayed(attr);
if is_displayed && self.fields.map_or(true, |f| f.contains(&attr)) {
if let Some(attribute_name) = self.schema.name(attr) {
let cursor = Cursor::new(value.to_owned());
let ioread = SerdeJsonIoRead::new(cursor);
let value = Value(SerdeJsonDeserializer::new(ioread));
let cursor = Cursor::new(value.to_owned());
let ioread = SerdeJsonIoRead::new(cursor);
let value = Value(SerdeJsonDeserializer::new(ioread));
Some((attribute_name, value))
Some((attribute_name, value))
} else {
None
}
} else {
None
}
});
let map_deserializer = de::value::MapDeserializer::new(iter);
let result = visitor
.visit_map(map_deserializer)
.map_err(DeserializerError::from);
let mut iter = iter.peekable();
let result = match iter.peek() {
Some(_) => {
let map_deserializer = de::value::MapDeserializer::new(iter);
visitor
.visit_some(map_deserializer)
.map_err(DeserializerError::from)
}
None => visitor.visit_none(),
};
match error.take() {
Some(error) => Err(error.into()),
None => result,
}
}
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf unit unit_struct newtype_struct seq tuple
tuple_struct struct enum identifier ignored_any
}
}
struct Value(SerdeJsonDeserializer<SerdeJsonIoRead<Cursor<Vec<u8>>>>);

View File

@ -2,28 +2,43 @@ use std::hash::{Hash, Hasher};
use crate::DocumentId;
use serde::{ser, Serialize};
use serde_json::Value;
use serde_json::{Value, Number};
use siphasher::sip::SipHasher;
use super::{ConvertToString, SerializerError};
pub fn extract_document_id<D>(
identifier: &str,
primary_key: &str,
document: &D,
) -> Result<Option<DocumentId>, SerializerError>
where
D: serde::Serialize,
{
let serializer = ExtractDocumentId { identifier };
let serializer = ExtractDocumentId { primary_key };
document.serialize(serializer)
}
fn validate_number(value: &Number) -> Option<String> {
if value.is_f64() {
return None
}
Some(value.to_string())
}
fn validate_string(value: &str) -> Option<String> {
if value.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') {
Some(value.to_string())
} else {
None
}
}
pub fn value_to_string(value: &Value) -> Option<String> {
match value {
Value::Null => None,
Value::Bool(_) => None,
Value::Number(value) => Some(value.to_string()),
Value::String(value) => Some(value.to_string()),
Value::Number(value) => validate_number(value),
Value::String(value) => validate_string(value),
Value::Array(_) => None,
Value::Object(_) => None,
}
@ -37,7 +52,7 @@ pub fn compute_document_id<H: Hash>(t: H) -> DocumentId {
}
struct ExtractDocumentId<'a> {
identifier: &'a str,
primary_key: &'a str,
}
impl<'a> ser::Serializer for ExtractDocumentId<'a> {
@ -173,7 +188,7 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
let serializer = ExtractDocumentIdMapSerializer {
identifier: self.identifier,
primary_key: self.primary_key,
document_id: None,
current_key_name: None,
};
@ -187,7 +202,7 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
let serializer = ExtractDocumentIdStructSerializer {
identifier: self.identifier,
primary_key: self.primary_key,
document_id: None,
};
@ -208,7 +223,7 @@ impl<'a> ser::Serializer for ExtractDocumentId<'a> {
}
pub struct ExtractDocumentIdMapSerializer<'a> {
identifier: &'a str,
primary_key: &'a str,
document_id: Option<DocumentId>,
current_key_name: Option<String>,
}
@ -245,7 +260,7 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
{
let key = key.serialize(ConvertToString)?;
if self.identifier == key {
if self.primary_key == key {
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
match value_to_string(&value).map(|s| compute_document_id(&s)) {
Some(document_id) => self.document_id = Some(document_id),
@ -262,7 +277,7 @@ impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
}
pub struct ExtractDocumentIdStructSerializer<'a> {
identifier: &'a str,
primary_key: &'a str,
document_id: Option<DocumentId>,
}
@ -278,7 +293,7 @@ impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> {
where
T: Serialize,
{
if self.identifier == key {
if self.primary_key == key {
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
match value_to_string(&value).map(compute_document_id) {
Some(document_id) => self.document_id = Some(document_id),

View File

@ -1,4 +1,4 @@
use meilidb_schema::SchemaAttr;
use meilisearch_schema::IndexedPos;
use serde::ser;
use serde::Serialize;
@ -7,7 +7,7 @@ use crate::raw_indexer::RawIndexer;
use crate::DocumentId;
pub struct Indexer<'a> {
pub attribute: SchemaAttr,
pub pos: IndexedPos,
pub indexer: &'a mut RawIndexer,
pub document_id: DocumentId,
}
@ -20,13 +20,11 @@ impl<'a> ser::Serializer for Indexer<'a> {
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapIndexer<'a>;
type SerializeStruct = StructSerializer<'a>;
type SerializeStruct = StructIndexer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType {
type_name: "boolean",
})
Ok(None)
}
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
@ -87,7 +85,7 @@ impl<'a> ser::Serializer for Indexer<'a> {
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
let number_of_words = self
.indexer
.index_text(self.document_id, self.attribute, text);
.index_text(self.document_id, self.pos, text);
Ok(Some(number_of_words))
}
@ -96,9 +94,7 @@ impl<'a> ser::Serializer for Indexer<'a> {
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType {
type_name: "Option",
})
Ok(None)
}
fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
@ -108,18 +104,16 @@ impl<'a> ser::Serializer for Indexer<'a> {
let text = value.serialize(ConvertToString)?;
let number_of_words = self
.indexer
.index_text(self.document_id, self.attribute, &text);
.index_text(self.document_id, self.pos, &text);
Ok(Some(number_of_words))
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "()" })
Ok(None)
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType {
type_name: "unit struct",
})
Ok(None)
}
fn serialize_unit_variant(
@ -128,9 +122,7 @@ impl<'a> ser::Serializer for Indexer<'a> {
_variant_index: u32,
_variant: &'static str,
) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType {
type_name: "unit variant",
})
Ok(None)
}
fn serialize_newtype_struct<T: ?Sized>(
@ -161,7 +153,7 @@ impl<'a> ser::Serializer for Indexer<'a> {
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
let indexer = SeqIndexer {
attribute: self.attribute,
pos: self.pos,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
@ -172,7 +164,7 @@ impl<'a> ser::Serializer for Indexer<'a> {
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
let indexer = TupleIndexer {
attribute: self.attribute,
pos: self.pos,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
@ -205,7 +197,7 @@ impl<'a> ser::Serializer for Indexer<'a> {
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
let indexer = MapIndexer {
attribute: self.attribute,
pos: self.pos,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
@ -219,9 +211,14 @@ impl<'a> ser::Serializer for Indexer<'a> {
_name: &'static str,
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Err(SerializerError::UnindexableType {
type_name: "struct",
})
let indexer = StructIndexer {
pos: self.pos,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_struct_variant(
@ -238,7 +235,7 @@ impl<'a> ser::Serializer for Indexer<'a> {
}
pub struct SeqIndexer<'a> {
attribute: SchemaAttr,
pos: IndexedPos,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
@ -260,13 +257,13 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer
.index_text_seq(self.document_id, self.attribute, texts);
.index_text_seq(self.document_id, self.pos, texts);
Ok(None)
}
}
pub struct MapIndexer<'a> {
attribute: SchemaAttr,
pos: IndexedPos,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
@ -297,19 +294,19 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer
.index_text_seq(self.document_id, self.attribute, texts);
.index_text_seq(self.document_id, self.pos, texts);
Ok(None)
}
}
pub struct StructSerializer<'a> {
attribute: SchemaAttr,
pub struct StructIndexer<'a> {
pos: IndexedPos,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
impl<'a> ser::SerializeStruct for StructIndexer<'a> {
type Ok = Option<usize>;
type Error = SerializerError;
@ -331,13 +328,13 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer
.index_text_seq(self.document_id, self.attribute, texts);
.index_text_seq(self.document_id, self.pos, texts);
Ok(None)
}
}
pub struct TupleIndexer<'a> {
attribute: SchemaAttr,
pos: IndexedPos,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
@ -359,7 +356,7 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer
.index_text_seq(self.document_id, self.attribute, texts);
.index_text_seq(self.document_id, self.pos, texts);
Ok(None)
}
}

View File

@ -20,16 +20,15 @@ pub use self::convert_to_string::ConvertToString;
pub use self::deserializer::{Deserializer, DeserializerError};
pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string};
pub use self::indexer::Indexer;
pub use self::serializer::Serializer;
pub use self::serializer::{serialize_value, serialize_value_with_id, Serializer};
use std::collections::BTreeMap;
use std::{error::Error, fmt};
use meilidb_schema::SchemaAttr;
use serde::ser;
use serde_json::Error as SerdeJsonError;
use meilisearch_schema::Error as SchemaError;
use crate::{DocumentId, ParseNumberError};
use crate::ParseNumberError;
#[derive(Debug)]
pub enum SerializerError {
@ -38,6 +37,7 @@ pub enum SerializerError {
Zlmdb(heed::Error),
SerdeJson(SerdeJsonError),
ParseNumber(ParseNumberError),
Schema(SchemaError),
UnserializableType { type_name: &'static str },
UnindexableType { type_name: &'static str },
UnrankableType { type_name: &'static str },
@ -57,13 +57,14 @@ impl fmt::Display for SerializerError {
f.write_str("serialized document does not have an id according to the schema")
}
SerializerError::InvalidDocumentIdType => {
f.write_str("document identifier can only be of type string or number")
f.write_str("a document primary key can be of type integer or string only composed of alphanumeric characters, hyphens (-) and underscores (_).")
}
SerializerError::Zlmdb(e) => write!(f, "heed related error: {}", e),
SerializerError::SerdeJson(e) => write!(f, "serde json error: {}", e),
SerializerError::ParseNumber(e) => {
write!(f, "error while trying to parse a number: {}", e)
}
SerializerError::Schema(e) => write!(f, "impossible to update schema: {}", e),
SerializerError::UnserializableType { type_name } => {
write!(f, "{} is not a serializable type", type_name)
}
@ -104,24 +105,8 @@ impl From<ParseNumberError> for SerializerError {
}
}
pub struct RamDocumentStore(BTreeMap<(DocumentId, SchemaAttr), Vec<u8>>);
impl RamDocumentStore {
pub fn new() -> RamDocumentStore {
RamDocumentStore(BTreeMap::new())
}
pub fn set_document_field(&mut self, id: DocumentId, attr: SchemaAttr, value: Vec<u8>) {
self.0.insert((id, attr), value);
}
pub fn into_inner(self) -> BTreeMap<(DocumentId, SchemaAttr), Vec<u8>> {
self.0
}
}
impl Default for RamDocumentStore {
fn default() -> Self {
Self::new()
}
impl From<SchemaError> for SerializerError {
fn from(error: SchemaError) -> SerializerError {
SerializerError::Schema(error)
}
}

View File

@ -1,31 +1,32 @@
use meilidb_schema::{Schema, SchemaAttr};
use meilisearch_schema::{Schema, FieldId};
use serde::ser;
use std::collections::HashMap;
use crate::database::MainT;
use crate::raw_indexer::RawIndexer;
use crate::serde::RamDocumentStore;
use crate::store::{DocumentsFields, DocumentsFieldsCounts};
use crate::{DocumentId, RankedMap};
use super::{ConvertToNumber, ConvertToString, Indexer, SerializerError};
pub struct Serializer<'a> {
pub schema: &'a Schema,
pub document_store: &'a mut RamDocumentStore,
pub document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
pub struct Serializer<'a, 'b> {
pub txn: &'a mut heed::RwTxn<'b, MainT>,
pub schema: &'a mut Schema,
pub document_store: DocumentsFields,
pub document_fields_counts: DocumentsFieldsCounts,
pub indexer: &'a mut RawIndexer,
pub ranked_map: &'a mut RankedMap,
pub document_id: DocumentId,
}
impl<'a> ser::Serializer for Serializer<'a> {
impl<'a, 'b> ser::Serializer for Serializer<'a, 'b> {
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapSerializer<'a>;
type SerializeStruct = StructSerializer<'a>;
type SerializeMap = MapSerializer<'a, 'b>;
type SerializeStruct = StructSerializer<'a, 'b>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
@ -150,6 +151,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer {
txn: self.txn,
schema: self.schema,
document_id: self.document_id,
document_store: self.document_store,
@ -166,6 +168,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
_len: usize,
) -> Result<Self::SerializeStruct, Self::Error> {
Ok(StructSerializer {
txn: self.txn,
schema: self.schema,
document_id: self.document_id,
document_store: self.document_store,
@ -188,17 +191,18 @@ impl<'a> ser::Serializer for Serializer<'a> {
}
}
pub struct MapSerializer<'a> {
schema: &'a Schema,
pub struct MapSerializer<'a, 'b> {
txn: &'a mut heed::RwTxn<'b, MainT>,
schema: &'a mut Schema,
document_id: DocumentId,
document_store: &'a mut RamDocumentStore,
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
document_store: DocumentsFields,
document_fields_counts: DocumentsFieldsCounts,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
current_key_name: Option<String>,
}
impl<'a> ser::SerializeMap for MapSerializer<'a> {
impl<'a, 'b> ser::SerializeMap for MapSerializer<'a, 'b> {
type Ok = ();
type Error = SerializerError;
@ -229,15 +233,15 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
V: ser::Serialize,
{
let key = key.serialize(ConvertToString)?;
serialize_value(
self.txn,
key.as_str(),
self.schema,
self.document_id,
self.document_store,
self.document_fields_counts,
self.indexer,
self.ranked_map,
&key,
value,
)
}
@ -247,16 +251,17 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
}
}
pub struct StructSerializer<'a> {
schema: &'a Schema,
pub struct StructSerializer<'a, 'b> {
txn: &'a mut heed::RwTxn<'b, MainT>,
schema: &'a mut Schema,
document_id: DocumentId,
document_store: &'a mut RamDocumentStore,
document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
document_store: DocumentsFields,
document_fields_counts: DocumentsFieldsCounts,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
impl<'a, 'b> ser::SerializeStruct for StructSerializer<'a, 'b> {
type Ok = ();
type Error = SerializerError;
@ -269,13 +274,14 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
T: ser::Serialize,
{
serialize_value(
self.txn,
key,
self.schema,
self.document_id,
self.document_store,
self.document_fields_counts,
self.indexer,
self.ranked_map,
key,
value,
)
}
@ -285,40 +291,70 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
}
}
fn serialize_value<T: ?Sized>(
schema: &Schema,
pub fn serialize_value<'a, T: ?Sized>(
txn: &mut heed::RwTxn<MainT>,
attribute: &str,
schema: &'a mut Schema,
document_id: DocumentId,
document_store: &mut RamDocumentStore,
documents_fields_counts: &mut HashMap<(DocumentId, SchemaAttr), u64>,
document_store: DocumentsFields,
documents_fields_counts: DocumentsFieldsCounts,
indexer: &mut RawIndexer,
ranked_map: &mut RankedMap,
key: &str,
value: &T,
) -> Result<(), SerializerError>
where
T: ser::Serialize,
{
if let Some(attribute) = schema.attribute(key) {
let props = schema.props(attribute);
let field_id = schema.insert_and_index(&attribute)?;
serialize_value_with_id(
txn,
field_id,
schema,
document_id,
document_store,
documents_fields_counts,
indexer,
ranked_map,
value,
)
}
let serialized = serde_json::to_vec(value)?;
document_store.set_document_field(document_id, attribute, serialized);
pub fn serialize_value_with_id<'a, T: ?Sized>(
txn: &mut heed::RwTxn<MainT>,
field_id: FieldId,
schema: &'a Schema,
document_id: DocumentId,
document_store: DocumentsFields,
documents_fields_counts: DocumentsFieldsCounts,
indexer: &mut RawIndexer,
ranked_map: &mut RankedMap,
value: &T,
) -> Result<(), SerializerError>
where
T: ser::Serialize,
{
let serialized = serde_json::to_vec(value)?;
document_store.put_document_field(txn, document_id, field_id, &serialized)?;
if props.is_indexed() {
let indexer = Indexer {
attribute,
indexer,
if let Some(indexed_pos) = schema.is_indexed(field_id) {
let indexer = Indexer {
pos: *indexed_pos,
indexer,
document_id,
};
if let Some(number_of_words) = value.serialize(indexer)? {
documents_fields_counts.put_document_field_count(
txn,
document_id,
};
if let Some(number_of_words) = value.serialize(indexer)? {
documents_fields_counts.insert((document_id, attribute), number_of_words as u64);
}
*indexed_pos,
number_of_words as u16,
)?;
}
}
if props.is_ranked() {
let number = value.serialize(ConvertToNumber)?;
ranked_map.insert(document_id, attribute, number);
}
if schema.is_ranked(field_id) {
let number = value.serialize(ConvertToNumber).unwrap_or_default();
ranked_map.insert(document_id, field_id, number);
}
Ok(())

View File

@ -0,0 +1,184 @@
use std::collections::{BTreeMap, BTreeSet, HashSet};
use std::str::FromStr;
use std::iter::IntoIterator;
use serde::{Deserialize, Deserializer, Serialize};
use once_cell::sync::Lazy;
use self::RankingRule::*;
pub const DEFAULT_RANKING_RULES: [RankingRule; 6] = [Typo, Words, Proximity, Attribute, WordsPosition, Exactness];
static RANKING_RULE_REGEX: Lazy<regex::Regex> = Lazy::new(|| {
let regex = regex::Regex::new(r"(asc|desc)\(([a-zA-Z0-9-_]*)\)").unwrap();
regex
});
#[derive(Default, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct Settings {
#[serde(default, deserialize_with = "deserialize_some")]
pub ranking_rules: Option<Option<Vec<String>>>,
#[serde(default, deserialize_with = "deserialize_some")]
pub distinct_attribute: Option<Option<String>>,
#[serde(default, deserialize_with = "deserialize_some")]
pub searchable_attributes: Option<Option<Vec<String>>>,
#[serde(default, deserialize_with = "deserialize_some")]
pub displayed_attributes: Option<Option<HashSet<String>>>,
#[serde(default, deserialize_with = "deserialize_some")]
pub stop_words: Option<Option<BTreeSet<String>>>,
#[serde(default, deserialize_with = "deserialize_some")]
pub synonyms: Option<Option<BTreeMap<String, Vec<String>>>>,
#[serde(default, deserialize_with = "deserialize_some")]
pub accept_new_fields: Option<Option<bool>>,
}
// Any value that is present is considered Some value, including null.
fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error>
where T: Deserialize<'de>,
D: Deserializer<'de>
{
Deserialize::deserialize(deserializer).map(Some)
}
impl Settings {
pub fn into_update(&self) -> Result<SettingsUpdate, RankingRuleConversionError> {
let settings = self.clone();
let ranking_rules = match settings.ranking_rules {
Some(Some(rules)) => UpdateState::Update(RankingRule::from_iter(rules.iter())?),
Some(None) => UpdateState::Clear,
None => UpdateState::Nothing,
};
Ok(SettingsUpdate {
ranking_rules,
distinct_attribute: settings.distinct_attribute.into(),
primary_key: UpdateState::Nothing,
searchable_attributes: settings.searchable_attributes.into(),
displayed_attributes: settings.displayed_attributes.into(),
stop_words: settings.stop_words.into(),
synonyms: settings.synonyms.into(),
accept_new_fields: settings.accept_new_fields.into(),
})
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateState<T> {
Update(T),
Clear,
Nothing,
}
impl <T> From<Option<Option<T>>> for UpdateState<T> {
fn from(opt: Option<Option<T>>) -> UpdateState<T> {
match opt {
Some(Some(t)) => UpdateState::Update(t),
Some(None) => UpdateState::Clear,
None => UpdateState::Nothing,
}
}
}
#[derive(Debug, Clone)]
pub struct RankingRuleConversionError;
impl std::fmt::Display for RankingRuleConversionError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "impossible to convert into RankingRule")
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RankingRule {
Typo,
Words,
Proximity,
Attribute,
WordsPosition,
Exactness,
Asc(String),
Desc(String),
}
impl std::fmt::Display for RankingRule {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
RankingRule::Typo => f.write_str("typo"),
RankingRule::Words => f.write_str("words"),
RankingRule::Proximity => f.write_str("proximity"),
RankingRule::Attribute => f.write_str("attribute"),
RankingRule::WordsPosition => f.write_str("wordsPosition"),
RankingRule::Exactness => f.write_str("exactness"),
RankingRule::Asc(field) => write!(f, "asc({})", field),
RankingRule::Desc(field) => write!(f, "desc({})", field),
}
}
}
impl FromStr for RankingRule {
type Err = RankingRuleConversionError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let rule = match s {
"typo" => RankingRule::Typo,
"words" => RankingRule::Words,
"proximity" => RankingRule::Proximity,
"attribute" => RankingRule::Attribute,
"wordsPosition" => RankingRule::WordsPosition,
"exactness" => RankingRule::Exactness,
_ => {
let captures = RANKING_RULE_REGEX.captures(s).ok_or(RankingRuleConversionError)?;
match (captures.get(1).map(|m| m.as_str()), captures.get(2)) {
(Some("asc"), Some(field)) => RankingRule::Asc(field.as_str().to_string()),
(Some("desc"), Some(field)) => RankingRule::Desc(field.as_str().to_string()),
_ => return Err(RankingRuleConversionError)
}
}
};
Ok(rule)
}
}
impl RankingRule {
pub fn field(&self) -> Option<&str> {
match self {
RankingRule::Asc(field) | RankingRule::Desc(field) => Some(field),
_ => None,
}
}
pub fn from_iter(rules: impl IntoIterator<Item = impl AsRef<str>>) -> Result<Vec<RankingRule>, RankingRuleConversionError> {
rules.into_iter()
.map(|s| RankingRule::from_str(s.as_ref()))
.collect()
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SettingsUpdate {
pub ranking_rules: UpdateState<Vec<RankingRule>>,
pub distinct_attribute: UpdateState<String>,
pub primary_key: UpdateState<String>,
pub searchable_attributes: UpdateState<Vec<String>>,
pub displayed_attributes: UpdateState<HashSet<String>>,
pub stop_words: UpdateState<BTreeSet<String>>,
pub synonyms: UpdateState<BTreeMap<String, Vec<String>>>,
pub accept_new_fields: UpdateState<bool>,
}
impl Default for SettingsUpdate {
fn default() -> Self {
Self {
ranking_rules: UpdateState::Nothing,
distinct_attribute: UpdateState::Nothing,
primary_key: UpdateState::Nothing,
searchable_attributes: UpdateState::Nothing,
displayed_attributes: UpdateState::Nothing,
stop_words: UpdateState::Nothing,
synonyms: UpdateState::Nothing,
accept_new_fields: UpdateState::Nothing,
}
}
}

View File

@ -1,4 +1,5 @@
use super::BEU64;
use crate::database::MainT;
use crate::DocumentId;
use heed::types::{ByteSlice, OwnedType};
use heed::Result as ZResult;
@ -12,7 +13,7 @@ pub struct DocsWords {
impl DocsWords {
pub fn put_doc_words(
self,
writer: &mut heed::RwTxn,
writer: &mut heed::RwTxn<MainT>,
document_id: DocumentId,
words: &fst::Set,
) -> ZResult<()> {
@ -21,21 +22,25 @@ impl DocsWords {
self.docs_words.put(writer, &document_id, bytes)
}
pub fn del_doc_words(self, writer: &mut heed::RwTxn, document_id: DocumentId) -> ZResult<bool> {
pub fn del_doc_words(self, writer: &mut heed::RwTxn<MainT>, document_id: DocumentId) -> ZResult<bool> {
let document_id = BEU64::new(document_id.0);
self.docs_words.delete(writer, &document_id)
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.docs_words.clear(writer)
}
pub fn doc_words(
self,
reader: &heed::RoTxn,
reader: &heed::RoTxn<MainT>,
document_id: DocumentId,
) -> ZResult<Option<fst::Set>> {
let document_id = BEU64::new(document_id.0);
match self.docs_words.get(reader, &document_id)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::from(bytes);
let bytes = Arc::new(bytes.to_owned());
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}

View File

@ -0,0 +1,79 @@
use heed::types::{ByteSlice, OwnedType};
use crate::database::MainT;
use heed::Result as ZResult;
use meilisearch_schema::FieldId;
use super::DocumentFieldStoredKey;
use crate::DocumentId;
#[derive(Copy, Clone)]
pub struct DocumentsFields {
pub(crate) documents_fields: heed::Database<OwnedType<DocumentFieldStoredKey>, ByteSlice>,
}
impl DocumentsFields {
pub fn put_document_field(
self,
writer: &mut heed::RwTxn<MainT>,
document_id: DocumentId,
field: FieldId,
value: &[u8],
) -> ZResult<()> {
let key = DocumentFieldStoredKey::new(document_id, field);
self.documents_fields.put(writer, &key, value)
}
pub fn del_all_document_fields(
self,
writer: &mut heed::RwTxn<MainT>,
document_id: DocumentId,
) -> ZResult<usize> {
let start = DocumentFieldStoredKey::new(document_id, FieldId::min());
let end = DocumentFieldStoredKey::new(document_id, FieldId::max());
self.documents_fields.delete_range(writer, &(start..=end))
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.documents_fields.clear(writer)
}
pub fn document_attribute<'txn>(
self,
reader: &'txn heed::RoTxn<MainT>,
document_id: DocumentId,
field: FieldId,
) -> ZResult<Option<&'txn [u8]>> {
let key = DocumentFieldStoredKey::new(document_id, field);
self.documents_fields.get(reader, &key)
}
pub fn document_fields<'txn>(
self,
reader: &'txn heed::RoTxn<MainT>,
document_id: DocumentId,
) -> ZResult<DocumentFieldsIter<'txn>> {
let start = DocumentFieldStoredKey::new(document_id, FieldId::min());
let end = DocumentFieldStoredKey::new(document_id, FieldId::max());
let iter = self.documents_fields.range(reader, &(start..=end))?;
Ok(DocumentFieldsIter { iter })
}
}
pub struct DocumentFieldsIter<'txn> {
iter: heed::RoRange<'txn, OwnedType<DocumentFieldStoredKey>, ByteSlice>,
}
impl<'txn> Iterator for DocumentFieldsIter<'txn> {
type Item = ZResult<(FieldId, &'txn [u8])>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {
Some(Ok((key, bytes))) => {
let field_id = FieldId(key.field_id.get());
Some(Ok((field_id, bytes)))
}
Some(Err(e)) => Some(Err(e)),
None => None,
}
}
}

View File

@ -1,44 +1,48 @@
use super::DocumentAttrKey;
use super::DocumentFieldIndexedKey;
use crate::database::MainT;
use crate::DocumentId;
use heed::types::OwnedType;
use heed::Result as ZResult;
use meilidb_schema::SchemaAttr;
use meilisearch_schema::IndexedPos;
#[derive(Copy, Clone)]
pub struct DocumentsFieldsCounts {
pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u64>>,
pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentFieldIndexedKey>, OwnedType<u16>>,
}
impl DocumentsFieldsCounts {
pub fn put_document_field_count(
self,
writer: &mut heed::RwTxn,
writer: &mut heed::RwTxn<MainT>,
document_id: DocumentId,
attribute: SchemaAttr,
value: u64,
attribute: IndexedPos,
value: u16,
) -> ZResult<()> {
let key = DocumentAttrKey::new(document_id, attribute);
let key = DocumentFieldIndexedKey::new(document_id, attribute);
self.documents_fields_counts.put(writer, &key, &value)
}
pub fn del_all_document_fields_counts(
self,
writer: &mut heed::RwTxn,
writer: &mut heed::RwTxn<MainT>,
document_id: DocumentId,
) -> ZResult<usize> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
self.documents_fields_counts
.delete_range(writer, start..=end)
let start = DocumentFieldIndexedKey::new(document_id, IndexedPos::min());
let end = DocumentFieldIndexedKey::new(document_id, IndexedPos::max());
self.documents_fields_counts.delete_range(writer, &(start..=end))
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.documents_fields_counts.clear(writer)
}
pub fn document_field_count(
self,
reader: &heed::RoTxn,
reader: &heed::RoTxn<MainT>,
document_id: DocumentId,
attribute: SchemaAttr,
) -> ZResult<Option<u64>> {
let key = DocumentAttrKey::new(document_id, attribute);
attribute: IndexedPos,
) -> ZResult<Option<u16>> {
let key = DocumentFieldIndexedKey::new(document_id, attribute);
match self.documents_fields_counts.get(reader, &key)? {
Some(count) => Ok(Some(count)),
None => Ok(None),
@ -47,16 +51,16 @@ impl DocumentsFieldsCounts {
pub fn document_fields_counts<'txn>(
self,
reader: &'txn heed::RoTxn,
reader: &'txn heed::RoTxn<MainT>,
document_id: DocumentId,
) -> ZResult<DocumentFieldsCountsIter<'txn>> {
let start = DocumentAttrKey::new(document_id, SchemaAttr::min());
let end = DocumentAttrKey::new(document_id, SchemaAttr::max());
let iter = self.documents_fields_counts.range(reader, start..=end)?;
let start = DocumentFieldIndexedKey::new(document_id, IndexedPos::min());
let end = DocumentFieldIndexedKey::new(document_id, IndexedPos::max());
let iter = self.documents_fields_counts.range(reader, &(start..=end))?;
Ok(DocumentFieldsCountsIter { iter })
}
pub fn documents_ids<'txn>(self, reader: &'txn heed::RoTxn) -> ZResult<DocumentsIdsIter<'txn>> {
pub fn documents_ids<'txn>(self, reader: &'txn heed::RoTxn<MainT>) -> ZResult<DocumentsIdsIter<'txn>> {
let iter = self.documents_fields_counts.iter(reader)?;
Ok(DocumentsIdsIter {
last_seen_id: None,
@ -66,7 +70,7 @@ impl DocumentsFieldsCounts {
pub fn all_documents_fields_counts<'txn>(
self,
reader: &'txn heed::RoTxn,
reader: &'txn heed::RoTxn<MainT>,
) -> ZResult<AllDocumentsFieldsCountsIter<'txn>> {
let iter = self.documents_fields_counts.iter(reader)?;
Ok(AllDocumentsFieldsCountsIter { iter })
@ -74,17 +78,17 @@ impl DocumentsFieldsCounts {
}
pub struct DocumentFieldsCountsIter<'txn> {
iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoRange<'txn, OwnedType<DocumentFieldIndexedKey>, OwnedType<u16>>,
}
impl Iterator for DocumentFieldsCountsIter<'_> {
type Item = ZResult<(SchemaAttr, u64)>;
type Item = ZResult<(IndexedPos, u16)>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {
Some(Ok((key, count))) => {
let attr = SchemaAttr(key.attr.get());
Some(Ok((attr, count)))
let indexed_pos = IndexedPos(key.indexed_pos.get());
Some(Ok((indexed_pos, count)))
}
Some(Err(e)) => Some(Err(e)),
None => None,
@ -94,7 +98,7 @@ impl Iterator for DocumentFieldsCountsIter<'_> {
pub struct DocumentsIdsIter<'txn> {
last_seen_id: Option<DocumentId>,
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoIter<'txn, OwnedType<DocumentFieldIndexedKey>, OwnedType<u16>>,
}
impl Iterator for DocumentsIdsIter<'_> {
@ -118,18 +122,18 @@ impl Iterator for DocumentsIdsIter<'_> {
}
pub struct AllDocumentsFieldsCountsIter<'txn> {
iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
iter: heed::RoIter<'txn, OwnedType<DocumentFieldIndexedKey>, OwnedType<u16>>,
}
impl<'r> Iterator for AllDocumentsFieldsCountsIter<'r> {
type Item = ZResult<(DocumentId, SchemaAttr, u64)>;
impl Iterator for AllDocumentsFieldsCountsIter<'_> {
type Item = ZResult<(DocumentId, IndexedPos, u16)>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {
Some(Ok((key, count))) => {
let docid = DocumentId(key.docid.get());
let attr = SchemaAttr(key.attr.get());
Some(Ok((docid, attr, count)))
let indexed_pos = IndexedPos(key.indexed_pos.get());
Some(Ok((docid, indexed_pos, count)))
}
Some(Err(e)) => Some(Err(e)),
None => None,

View File

@ -0,0 +1,226 @@
use std::sync::Arc;
use std::collections::HashMap;
use chrono::{DateTime, Utc};
use heed::types::{ByteSlice, OwnedType, SerdeBincode, Str};
use heed::Result as ZResult;
use meilisearch_schema::Schema;
use crate::database::MainT;
use crate::RankedMap;
use crate::settings::RankingRule;
const CREATED_AT_KEY: &str = "created-at";
const RANKING_RULES_KEY: &str = "ranking-rules";
const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute";
const STOP_WORDS_KEY: &str = "stop-words";
const SYNONYMS_KEY: &str = "synonyms";
const CUSTOMS_KEY: &str = "customs";
const FIELDS_FREQUENCY_KEY: &str = "fields-frequency";
const NAME_KEY: &str = "name";
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
const RANKED_MAP_KEY: &str = "ranked-map";
const SCHEMA_KEY: &str = "schema";
const UPDATED_AT_KEY: &str = "updated-at";
const WORDS_KEY: &str = "words";
pub type FreqsMap = HashMap<String, usize>;
type SerdeFreqsMap = SerdeBincode<FreqsMap>;
type SerdeDatetime = SerdeBincode<DateTime<Utc>>;
#[derive(Copy, Clone)]
pub struct Main {
pub(crate) main: heed::PolyDatabase,
}
impl Main {
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.main.clear(writer)
}
pub fn put_name(self, writer: &mut heed::RwTxn<MainT>, name: &str) -> ZResult<()> {
self.main.put::<_, Str, Str>(writer, NAME_KEY, name)
}
pub fn name(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<String>> {
Ok(self
.main
.get::<_, Str, Str>(reader, NAME_KEY)?
.map(|name| name.to_owned()))
}
pub fn put_created_at(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.main
.put::<_, Str, SerdeDatetime>(writer, CREATED_AT_KEY, &Utc::now())
}
pub fn created_at(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<DateTime<Utc>>> {
self.main.get::<_, Str, SerdeDatetime>(reader, CREATED_AT_KEY)
}
pub fn put_updated_at(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.main
.put::<_, Str, SerdeDatetime>(writer, UPDATED_AT_KEY, &Utc::now())
}
pub fn updated_at(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<DateTime<Utc>>> {
self.main.get::<_, Str, SerdeDatetime>(reader, UPDATED_AT_KEY)
}
pub fn put_words_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes();
self.main.put::<_, Str, ByteSlice>(writer, WORDS_KEY, bytes)
}
pub unsafe fn static_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? {
Some(bytes) => {
let bytes: &'static [u8] = std::mem::transmute(bytes);
let set = fst::Set::from_static_slice(bytes).unwrap();
Ok(Some(set))
}
None => Ok(None),
}
}
pub fn words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::new(bytes.to_owned());
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}
None => Ok(None),
}
}
pub fn put_schema(self, writer: &mut heed::RwTxn<MainT>, schema: &Schema) -> ZResult<()> {
self.main.put::<_, Str, SerdeBincode<Schema>>(writer, SCHEMA_KEY, schema)
}
pub fn schema(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<Schema>> {
self.main.get::<_, Str, SerdeBincode<Schema>>(reader, SCHEMA_KEY)
}
pub fn delete_schema(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<bool> {
self.main.delete::<_, Str>(writer, SCHEMA_KEY)
}
pub fn put_ranked_map(self, writer: &mut heed::RwTxn<MainT>, ranked_map: &RankedMap) -> ZResult<()> {
self.main.put::<_, Str, SerdeBincode<RankedMap>>(writer, RANKED_MAP_KEY, &ranked_map)
}
pub fn ranked_map(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<RankedMap>> {
self.main.get::<_, Str, SerdeBincode<RankedMap>>(reader, RANKED_MAP_KEY)
}
pub fn put_synonyms_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes();
self.main.put::<_, Str, ByteSlice>(writer, SYNONYMS_KEY, bytes)
}
pub fn synonyms_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
match self.main.get::<_, Str, ByteSlice>(reader, SYNONYMS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::new(bytes.to_owned());
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}
None => Ok(None),
}
}
pub fn put_stop_words_fst(self, writer: &mut heed::RwTxn<MainT>, fst: &fst::Set) -> ZResult<()> {
let bytes = fst.as_fst().as_bytes();
self.main.put::<_, Str, ByteSlice>(writer, STOP_WORDS_KEY, bytes)
}
pub fn stop_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
match self.main.get::<_, Str, ByteSlice>(reader, STOP_WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::new(bytes.to_owned());
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}
None => Ok(None),
}
}
pub fn put_number_of_documents<F>(self, writer: &mut heed::RwTxn<MainT>, f: F) -> ZResult<u64>
where
F: Fn(u64) -> u64,
{
let new = self.number_of_documents(&*writer).map(f)?;
self.main
.put::<_, Str, OwnedType<u64>>(writer, NUMBER_OF_DOCUMENTS_KEY, &new)?;
Ok(new)
}
pub fn number_of_documents(self, reader: &heed::RoTxn<MainT>) -> ZResult<u64> {
match self
.main
.get::<_, Str, OwnedType<u64>>(reader, NUMBER_OF_DOCUMENTS_KEY)?
{
Some(value) => Ok(value),
None => Ok(0),
}
}
pub fn put_fields_frequency(
self,
writer: &mut heed::RwTxn<MainT>,
fields_frequency: &FreqsMap,
) -> ZResult<()> {
self.main
.put::<_, Str, SerdeFreqsMap>(writer, FIELDS_FREQUENCY_KEY, fields_frequency)
}
pub fn fields_frequency(&self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<FreqsMap>> {
match self
.main
.get::<_, Str, SerdeFreqsMap>(reader, FIELDS_FREQUENCY_KEY)?
{
Some(freqs) => Ok(Some(freqs)),
None => Ok(None),
}
}
pub fn ranking_rules(&self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<Vec<RankingRule>>> {
self.main.get::<_, Str, SerdeBincode<Vec<RankingRule>>>(reader, RANKING_RULES_KEY)
}
pub fn put_ranking_rules(self, writer: &mut heed::RwTxn<MainT>, value: &[RankingRule]) -> ZResult<()> {
self.main.put::<_, Str, SerdeBincode<Vec<RankingRule>>>(writer, RANKING_RULES_KEY, &value.to_vec())
}
pub fn delete_ranking_rules(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<bool> {
self.main.delete::<_, Str>(writer, RANKING_RULES_KEY)
}
pub fn distinct_attribute(&self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<String>> {
if let Some(value) = self.main.get::<_, Str, Str>(reader, DISTINCT_ATTRIBUTE_KEY)? {
return Ok(Some(value.to_owned()))
}
return Ok(None)
}
pub fn put_distinct_attribute(self, writer: &mut heed::RwTxn<MainT>, value: &str) -> ZResult<()> {
self.main.put::<_, Str, Str>(writer, DISTINCT_ATTRIBUTE_KEY, value)
}
pub fn delete_distinct_attribute(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<bool> {
self.main.delete::<_, Str>(writer, DISTINCT_ATTRIBUTE_KEY)
}
pub fn put_customs(self, writer: &mut heed::RwTxn<MainT>, customs: &[u8]) -> ZResult<()> {
self.main
.put::<_, Str, ByteSlice>(writer, CUSTOMS_KEY, customs)
}
pub fn customs<'txn>(self, reader: &'txn heed::RoTxn<MainT>) -> ZResult<Option<&'txn [u8]>> {
self.main.get::<_, Str, ByteSlice>(reader, CUSTOMS_KEY)
}
}

View File

@ -0,0 +1,518 @@
mod docs_words;
mod prefix_documents_cache;
mod prefix_postings_lists_cache;
mod documents_fields;
mod documents_fields_counts;
mod main;
mod postings_lists;
mod synonyms;
mod updates;
mod updates_results;
pub use self::docs_words::DocsWords;
pub use self::prefix_documents_cache::PrefixDocumentsCache;
pub use self::prefix_postings_lists_cache::PrefixPostingsListsCache;
pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
pub use self::documents_fields_counts::{
DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
};
pub use self::main::Main;
pub use self::postings_lists::PostingsLists;
pub use self::synonyms::Synonyms;
pub use self::updates::Updates;
pub use self::updates_results::UpdatesResults;
use std::borrow::Cow;
use std::collections::HashSet;
use std::convert::TryInto;
use std::{mem, ptr};
use heed::Result as ZResult;
use heed::{BytesEncode, BytesDecode};
use meilisearch_schema::{IndexedPos, FieldId};
use sdset::{Set, SetBuf};
use serde::de::{self, Deserialize};
use zerocopy::{AsBytes, FromBytes};
use crate::criterion::Criteria;
use crate::database::{MainT, UpdateT};
use crate::database::{UpdateEvent, UpdateEventsEmitter};
use crate::serde::Deserializer;
use crate::settings::SettingsUpdate;
use crate::{query_builder::QueryBuilder, update, DocIndex, DocumentId, Error, MResult};
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
type BEU16 = zerocopy::U16<byteorder::BigEndian>;
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
#[repr(C)]
pub struct DocumentFieldIndexedKey {
docid: BEU64,
indexed_pos: BEU16,
}
impl DocumentFieldIndexedKey {
fn new(docid: DocumentId, indexed_pos: IndexedPos) -> DocumentFieldIndexedKey {
DocumentFieldIndexedKey {
docid: BEU64::new(docid.0),
indexed_pos: BEU16::new(indexed_pos.0),
}
}
}
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
#[repr(C)]
pub struct DocumentFieldStoredKey {
docid: BEU64,
field_id: BEU16,
}
impl DocumentFieldStoredKey {
fn new(docid: DocumentId, field_id: FieldId) -> DocumentFieldStoredKey {
DocumentFieldStoredKey {
docid: BEU64::new(docid.0),
field_id: BEU16::new(field_id.0),
}
}
}
#[derive(Default, Debug)]
pub struct Postings<'a> {
pub docids: Cow<'a, Set<DocumentId>>,
pub matches: Cow<'a, Set<DocIndex>>,
}
pub struct PostingsCodec;
impl<'a> BytesEncode<'a> for PostingsCodec {
type EItem = Postings<'a>;
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
let u64_size = mem::size_of::<u64>();
let docids_size = item.docids.len() * mem::size_of::<DocumentId>();
let matches_size = item.matches.len() * mem::size_of::<DocIndex>();
let mut buffer = Vec::with_capacity(u64_size + docids_size + matches_size);
let docids_len = item.docids.len();
buffer.extend_from_slice(&docids_len.to_be_bytes());
buffer.extend_from_slice(item.docids.as_bytes());
buffer.extend_from_slice(item.matches.as_bytes());
Some(Cow::Owned(buffer))
}
}
fn aligned_to(bytes: &[u8], align: usize) -> bool {
(bytes as *const _ as *const () as usize) % align == 0
}
fn from_bytes_to_set<'a, T: 'a>(bytes: &'a [u8]) -> Option<Cow<'a, Set<T>>>
where T: Clone + FromBytes
{
match zerocopy::LayoutVerified::<_, [T]>::new_slice(bytes) {
Some(layout) => Some(Cow::Borrowed(Set::new_unchecked(layout.into_slice()))),
None => {
let len = bytes.len();
let elem_size = mem::size_of::<T>();
// ensure that it is the alignment that is wrong
// and the length is valid
if len % elem_size == 0 && !aligned_to(bytes, mem::align_of::<T>()) {
let elems = len / elem_size;
let mut vec = Vec::<T>::with_capacity(elems);
unsafe {
let dst = vec.as_mut_ptr() as *mut u8;
ptr::copy_nonoverlapping(bytes.as_ptr(), dst, len);
vec.set_len(elems);
}
return Some(Cow::Owned(SetBuf::new_unchecked(vec)));
}
None
}
}
}
impl<'a> BytesDecode<'a> for PostingsCodec {
type DItem = Postings<'a>;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let u64_size = mem::size_of::<u64>();
let docid_size = mem::size_of::<DocumentId>();
let (len_bytes, bytes) = bytes.split_at(u64_size);
let docids_len = len_bytes.try_into().ok().map(u64::from_be_bytes)? as usize;
let docids_size = docids_len * docid_size;
let docids_bytes = &bytes[..docids_size];
let matches_bytes = &bytes[docids_size..];
let docids = from_bytes_to_set(docids_bytes)?;
let matches = from_bytes_to_set(matches_bytes)?;
Some(Postings { docids, matches })
}
}
fn main_name(name: &str) -> String {
format!("store-{}", name)
}
fn postings_lists_name(name: &str) -> String {
format!("store-{}-postings-lists", name)
}
fn documents_fields_name(name: &str) -> String {
format!("store-{}-documents-fields", name)
}
fn documents_fields_counts_name(name: &str) -> String {
format!("store-{}-documents-fields-counts", name)
}
fn synonyms_name(name: &str) -> String {
format!("store-{}-synonyms", name)
}
fn docs_words_name(name: &str) -> String {
format!("store-{}-docs-words", name)
}
fn prefix_documents_cache_name(name: &str) -> String {
format!("store-{}-prefix-documents-cache", name)
}
fn prefix_postings_lists_cache_name(name: &str) -> String {
format!("store-{}-prefix-postings-lists-cache", name)
}
fn updates_name(name: &str) -> String {
format!("store-{}-updates", name)
}
fn updates_results_name(name: &str) -> String {
format!("store-{}-updates-results", name)
}
#[derive(Clone)]
pub struct Index {
pub main: Main,
pub postings_lists: PostingsLists,
pub documents_fields: DocumentsFields,
pub documents_fields_counts: DocumentsFieldsCounts,
pub synonyms: Synonyms,
pub docs_words: DocsWords,
pub prefix_documents_cache: PrefixDocumentsCache,
pub prefix_postings_lists_cache: PrefixPostingsListsCache,
pub updates: Updates,
pub updates_results: UpdatesResults,
pub(crate) updates_notifier: UpdateEventsEmitter,
}
impl Index {
pub fn document<T: de::DeserializeOwned>(
&self,
reader: &heed::RoTxn<MainT>,
attributes: Option<&HashSet<&str>>,
document_id: DocumentId,
) -> MResult<Option<T>> {
let schema = self.main.schema(reader)?;
let schema = schema.ok_or(Error::SchemaMissing)?;
let attributes = match attributes {
Some(attributes) => Some(attributes.iter().filter_map(|name| schema.id(*name)).collect()),
None => None,
};
let mut deserializer = Deserializer {
document_id,
reader,
documents_fields: self.documents_fields,
schema: &schema,
fields: attributes.as_ref(),
};
Ok(Option::<T>::deserialize(&mut deserializer)?)
}
pub fn document_attribute<T: de::DeserializeOwned>(
&self,
reader: &heed::RoTxn<MainT>,
document_id: DocumentId,
attribute: FieldId,
) -> MResult<Option<T>> {
let bytes = self
.documents_fields
.document_attribute(reader, document_id, attribute)?;
match bytes {
Some(bytes) => Ok(Some(serde_json::from_slice(bytes)?)),
None => Ok(None),
}
}
pub fn document_attribute_bytes<'txn>(
&self,
reader: &'txn heed::RoTxn<MainT>,
document_id: DocumentId,
attribute: FieldId,
) -> MResult<Option<&'txn [u8]>> {
let bytes = self
.documents_fields
.document_attribute(reader, document_id, attribute)?;
match bytes {
Some(bytes) => Ok(Some(bytes)),
None => Ok(None),
}
}
pub fn customs_update(&self, writer: &mut heed::RwTxn<UpdateT>, customs: Vec<u8>) -> ZResult<u64> {
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
update::push_customs_update(writer, self.updates, self.updates_results, customs)
}
pub fn settings_update(&self, writer: &mut heed::RwTxn<UpdateT>, update: SettingsUpdate) -> ZResult<u64> {
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
update::push_settings_update(writer, self.updates, self.updates_results, update)
}
pub fn documents_addition<D>(&self) -> update::DocumentsAddition<D> {
update::DocumentsAddition::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn documents_partial_addition<D>(&self) -> update::DocumentsAddition<D> {
update::DocumentsAddition::new_partial(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn documents_deletion(&self) -> update::DocumentsDeletion {
update::DocumentsDeletion::new(
self.updates,
self.updates_results,
self.updates_notifier.clone(),
)
}
pub fn clear_all(&self, writer: &mut heed::RwTxn<UpdateT>) -> MResult<u64> {
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
update::push_clear_all(writer, self.updates, self.updates_results)
}
pub fn current_update_id(&self, reader: &heed::RoTxn<UpdateT>) -> MResult<Option<u64>> {
match self.updates.last_update(reader)? {
Some((id, _)) => Ok(Some(id)),
None => Ok(None),
}
}
pub fn update_status(
&self,
reader: &heed::RoTxn<UpdateT>,
update_id: u64,
) -> MResult<Option<update::UpdateStatus>> {
update::update_status(reader, self.updates, self.updates_results, update_id)
}
pub fn all_updates_status(&self, reader: &heed::RoTxn<UpdateT>) -> MResult<Vec<update::UpdateStatus>> {
let mut updates = Vec::new();
let mut last_update_result_id = 0;
// retrieve all updates results
if let Some((last_id, _)) = self.updates_results.last_update(reader)? {
updates.reserve(last_id as usize);
for id in 0..=last_id {
if let Some(update) = self.update_status(reader, id)? {
updates.push(update);
last_update_result_id = id;
}
}
}
// retrieve all enqueued updates
if let Some((last_id, _)) = self.updates.last_update(reader)? {
for id in last_update_result_id + 1..=last_id {
if let Some(update) = self.update_status(reader, id)? {
updates.push(update);
}
}
}
Ok(updates)
}
pub fn query_builder(&self) -> QueryBuilder {
QueryBuilder::new(
self.main,
self.postings_lists,
self.documents_fields_counts,
self.synonyms,
self.prefix_documents_cache,
self.prefix_postings_lists_cache,
)
}
pub fn query_builder_with_criteria<'c, 'f, 'd>(
&self,
criteria: Criteria<'c>,
) -> QueryBuilder<'c, 'f, 'd> {
QueryBuilder::with_criteria(
self.main,
self.postings_lists,
self.documents_fields_counts,
self.synonyms,
self.prefix_documents_cache,
self.prefix_postings_lists_cache,
criteria,
)
}
}
pub fn create(
env: &heed::Env,
update_env: &heed::Env,
name: &str,
updates_notifier: UpdateEventsEmitter,
) -> MResult<Index> {
// create all the store names
let main_name = main_name(name);
let postings_lists_name = postings_lists_name(name);
let documents_fields_name = documents_fields_name(name);
let documents_fields_counts_name = documents_fields_counts_name(name);
let synonyms_name = synonyms_name(name);
let docs_words_name = docs_words_name(name);
let prefix_documents_cache_name = prefix_documents_cache_name(name);
let prefix_postings_lists_cache_name = prefix_postings_lists_cache_name(name);
let updates_name = updates_name(name);
let updates_results_name = updates_results_name(name);
// open all the stores
let main = env.create_poly_database(Some(&main_name))?;
let postings_lists = env.create_database(Some(&postings_lists_name))?;
let documents_fields = env.create_database(Some(&documents_fields_name))?;
let documents_fields_counts = env.create_database(Some(&documents_fields_counts_name))?;
let synonyms = env.create_database(Some(&synonyms_name))?;
let docs_words = env.create_database(Some(&docs_words_name))?;
let prefix_documents_cache = env.create_database(Some(&prefix_documents_cache_name))?;
let prefix_postings_lists_cache = env.create_database(Some(&prefix_postings_lists_cache_name))?;
let updates = update_env.create_database(Some(&updates_name))?;
let updates_results = update_env.create_database(Some(&updates_results_name))?;
Ok(Index {
main: Main { main },
postings_lists: PostingsLists { postings_lists },
documents_fields: DocumentsFields { documents_fields },
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
synonyms: Synonyms { synonyms },
docs_words: DocsWords { docs_words },
prefix_postings_lists_cache: PrefixPostingsListsCache { prefix_postings_lists_cache },
prefix_documents_cache: PrefixDocumentsCache { prefix_documents_cache },
updates: Updates { updates },
updates_results: UpdatesResults { updates_results },
updates_notifier,
})
}
pub fn open(
env: &heed::Env,
update_env: &heed::Env,
name: &str,
updates_notifier: UpdateEventsEmitter,
) -> MResult<Option<Index>> {
// create all the store names
let main_name = main_name(name);
let postings_lists_name = postings_lists_name(name);
let documents_fields_name = documents_fields_name(name);
let documents_fields_counts_name = documents_fields_counts_name(name);
let synonyms_name = synonyms_name(name);
let docs_words_name = docs_words_name(name);
let prefix_documents_cache_name = prefix_documents_cache_name(name);
let prefix_postings_lists_cache_name = prefix_postings_lists_cache_name(name);
let updates_name = updates_name(name);
let updates_results_name = updates_results_name(name);
// open all the stores
let main = match env.open_poly_database(Some(&main_name))? {
Some(main) => main,
None => return Ok(None),
};
let postings_lists = match env.open_database(Some(&postings_lists_name))? {
Some(postings_lists) => postings_lists,
None => return Ok(None),
};
let documents_fields = match env.open_database(Some(&documents_fields_name))? {
Some(documents_fields) => documents_fields,
None => return Ok(None),
};
let documents_fields_counts = match env.open_database(Some(&documents_fields_counts_name))? {
Some(documents_fields_counts) => documents_fields_counts,
None => return Ok(None),
};
let synonyms = match env.open_database(Some(&synonyms_name))? {
Some(synonyms) => synonyms,
None => return Ok(None),
};
let docs_words = match env.open_database(Some(&docs_words_name))? {
Some(docs_words) => docs_words,
None => return Ok(None),
};
let prefix_documents_cache = match env.open_database(Some(&prefix_documents_cache_name))? {
Some(prefix_documents_cache) => prefix_documents_cache,
None => return Ok(None),
};
let prefix_postings_lists_cache = match env.open_database(Some(&prefix_postings_lists_cache_name))? {
Some(prefix_postings_lists_cache) => prefix_postings_lists_cache,
None => return Ok(None),
};
let updates = match update_env.open_database(Some(&updates_name))? {
Some(updates) => updates,
None => return Ok(None),
};
let updates_results = match update_env.open_database(Some(&updates_results_name))? {
Some(updates_results) => updates_results,
None => return Ok(None),
};
Ok(Some(Index {
main: Main { main },
postings_lists: PostingsLists { postings_lists },
documents_fields: DocumentsFields { documents_fields },
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
synonyms: Synonyms { synonyms },
docs_words: DocsWords { docs_words },
prefix_documents_cache: PrefixDocumentsCache { prefix_documents_cache },
prefix_postings_lists_cache: PrefixPostingsListsCache { prefix_postings_lists_cache },
updates: Updates { updates },
updates_results: UpdatesResults { updates_results },
updates_notifier,
}))
}
pub fn clear(
writer: &mut heed::RwTxn<MainT>,
update_writer: &mut heed::RwTxn<UpdateT>,
index: &Index,
) -> MResult<()> {
// clear all the stores
index.main.clear(writer)?;
index.postings_lists.clear(writer)?;
index.documents_fields.clear(writer)?;
index.documents_fields_counts.clear(writer)?;
index.synonyms.clear(writer)?;
index.docs_words.clear(writer)?;
index.prefix_documents_cache.clear(writer)?;
index.prefix_postings_lists_cache.clear(writer)?;
index.updates.clear(update_writer)?;
index.updates_results.clear(update_writer)?;
Ok(())
}

View File

@ -0,0 +1,47 @@
use std::borrow::Cow;
use heed::Result as ZResult;
use heed::types::ByteSlice;
use sdset::{Set, SetBuf};
use slice_group_by::GroupBy;
use crate::database::MainT;
use crate::DocIndex;
use crate::store::{Postings, PostingsCodec};
#[derive(Copy, Clone)]
pub struct PostingsLists {
pub(crate) postings_lists: heed::Database<ByteSlice, PostingsCodec>,
}
impl PostingsLists {
pub fn put_postings_list(
self,
writer: &mut heed::RwTxn<MainT>,
word: &[u8],
matches: &Set<DocIndex>,
) -> ZResult<()> {
let docids = matches.linear_group_by_key(|m| m.document_id).map(|g| g[0].document_id).collect();
let docids = Cow::Owned(SetBuf::new_unchecked(docids));
let matches = Cow::Borrowed(matches);
let postings = Postings { docids, matches };
self.postings_lists.put(writer, word, &postings)
}
pub fn del_postings_list(self, writer: &mut heed::RwTxn<MainT>, word: &[u8]) -> ZResult<bool> {
self.postings_lists.delete(writer, word)
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.postings_lists.clear(writer)
}
pub fn postings_list<'txn>(
self,
reader: &'txn heed::RoTxn<MainT>,
word: &[u8],
) -> ZResult<Option<Postings<'txn>>> {
self.postings_lists.get(reader, word)
}
}

View File

@ -0,0 +1,80 @@
use std::borrow::Cow;
use heed::types::{OwnedType, CowSlice};
use heed::Result as ZResult;
use zerocopy::{AsBytes, FromBytes};
use super::BEU64;
use crate::{DocumentId, Highlight};
use crate::database::MainT;
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
#[repr(C)]
pub struct PrefixKey {
prefix: [u8; 4],
index: BEU64,
docid: BEU64,
}
impl PrefixKey {
pub fn new(prefix: [u8; 4], index: u64, docid: u64) -> PrefixKey {
PrefixKey {
prefix,
index: BEU64::new(index),
docid: BEU64::new(docid),
}
}
}
#[derive(Copy, Clone)]
pub struct PrefixDocumentsCache {
pub(crate) prefix_documents_cache: heed::Database<OwnedType<PrefixKey>, CowSlice<Highlight>>,
}
impl PrefixDocumentsCache {
pub fn put_prefix_document(
self,
writer: &mut heed::RwTxn<MainT>,
prefix: [u8; 4],
index: usize,
docid: DocumentId,
highlights: &[Highlight],
) -> ZResult<()> {
let key = PrefixKey::new(prefix, index as u64, docid.0);
self.prefix_documents_cache.put(writer, &key, highlights)
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.prefix_documents_cache.clear(writer)
}
pub fn prefix_documents<'txn>(
self,
reader: &'txn heed::RoTxn<MainT>,
prefix: [u8; 4],
) -> ZResult<PrefixDocumentsIter<'txn>> {
let start = PrefixKey::new(prefix, 0, 0);
let end = PrefixKey::new(prefix, u64::max_value(), u64::max_value());
let iter = self.prefix_documents_cache.range(reader, &(start..=end))?;
Ok(PrefixDocumentsIter { iter })
}
}
pub struct PrefixDocumentsIter<'txn> {
iter: heed::RoRange<'txn, OwnedType<PrefixKey>, CowSlice<Highlight>>,
}
impl<'txn> Iterator for PrefixDocumentsIter<'txn> {
type Item = ZResult<(DocumentId, Cow<'txn, [Highlight]>)>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {
Some(Ok((key, highlights))) => {
let docid = DocumentId(key.docid.get());
Some(Ok((docid, highlights)))
}
Some(Err(e)) => Some(Err(e)),
None => None,
}
}
}

View File

@ -0,0 +1,45 @@
use std::borrow::Cow;
use heed::Result as ZResult;
use heed::types::OwnedType;
use sdset::{Set, SetBuf};
use slice_group_by::GroupBy;
use crate::database::MainT;
use crate::DocIndex;
use crate::store::{PostingsCodec, Postings};
#[derive(Copy, Clone)]
pub struct PrefixPostingsListsCache {
pub(crate) prefix_postings_lists_cache: heed::Database<OwnedType<[u8; 4]>, PostingsCodec>,
}
impl PrefixPostingsListsCache {
pub fn put_prefix_postings_list(
self,
writer: &mut heed::RwTxn<MainT>,
prefix: [u8; 4],
matches: &Set<DocIndex>,
) -> ZResult<()>
{
let docids = matches.linear_group_by_key(|m| m.document_id).map(|g| g[0].document_id).collect();
let docids = Cow::Owned(SetBuf::new_unchecked(docids));
let matches = Cow::Borrowed(matches);
let postings = Postings { docids, matches };
self.prefix_postings_lists_cache.put(writer, &prefix, &postings)
}
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.prefix_postings_lists_cache.clear(writer)
}
pub fn prefix_postings_list<'txn>(
self,
reader: &'txn heed::RoTxn<MainT>,
prefix: [u8; 4],
) -> ZResult<Option<Postings<'txn>>>
{
self.prefix_postings_lists_cache.get(reader, &prefix)
}
}

View File

@ -1,4 +1,5 @@
use heed::types::ByteSlice;
use crate::database::MainT;
use heed::Result as ZResult;
use std::sync::Arc;
@ -10,7 +11,7 @@ pub struct Synonyms {
impl Synonyms {
pub fn put_synonyms(
self,
writer: &mut heed::RwTxn,
writer: &mut heed::RwTxn<MainT>,
word: &[u8],
synonyms: &fst::Set,
) -> ZResult<()> {
@ -18,15 +19,19 @@ impl Synonyms {
self.synonyms.put(writer, word, bytes)
}
pub fn del_synonyms(self, writer: &mut heed::RwTxn, word: &[u8]) -> ZResult<bool> {
pub fn del_synonyms(self, writer: &mut heed::RwTxn<MainT>, word: &[u8]) -> ZResult<bool> {
self.synonyms.delete(writer, word)
}
pub fn synonyms(self, reader: &heed::RoTxn, word: &[u8]) -> ZResult<Option<fst::Set>> {
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
self.synonyms.clear(writer)
}
pub fn synonyms(self, reader: &heed::RoTxn<MainT>, word: &[u8]) -> ZResult<Option<fst::Set>> {
match self.synonyms.get(reader, word)? {
Some(bytes) => {
let len = bytes.len();
let bytes = Arc::from(bytes);
let bytes = Arc::new(bytes.to_owned());
let fst = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
Ok(Some(fst::Set::from(fst)))
}

View File

@ -1,4 +1,5 @@
use super::BEU64;
use crate::database::UpdateT;
use crate::update::Update;
use heed::types::{OwnedType, SerdeJson};
use heed::Result as ZResult;
@ -10,7 +11,7 @@ pub struct Updates {
impl Updates {
// TODO do not trigger deserialize if possible
pub fn last_update_id(self, reader: &heed::RoTxn) -> ZResult<Option<(u64, Update)>> {
pub fn last_update(self, reader: &heed::RoTxn<UpdateT>) -> ZResult<Option<(u64, Update)>> {
match self.updates.last(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None),
@ -18,7 +19,7 @@ impl Updates {
}
// TODO do not trigger deserialize if possible
fn first_update_id(self, reader: &heed::RoTxn) -> ZResult<Option<(u64, Update)>> {
pub fn first_update(self, reader: &heed::RoTxn<UpdateT>) -> ZResult<Option<(u64, Update)>> {
match self.updates.first(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None),
@ -26,14 +27,14 @@ impl Updates {
}
// TODO do not trigger deserialize if possible
pub fn contains(self, reader: &heed::RoTxn, update_id: u64) -> ZResult<bool> {
pub fn get(self, reader: &heed::RoTxn<UpdateT>, update_id: u64) -> ZResult<Option<Update>> {
let update_id = BEU64::new(update_id);
self.updates.get(reader, &update_id).map(|v| v.is_some())
self.updates.get(reader, &update_id)
}
pub fn put_update(
self,
writer: &mut heed::RwTxn,
writer: &mut heed::RwTxn<UpdateT>,
update_id: u64,
update: &Update,
) -> ZResult<()> {
@ -42,8 +43,13 @@ impl Updates {
self.updates.put(writer, &update_id, update)
}
pub fn pop_front(self, writer: &mut heed::RwTxn) -> ZResult<Option<(u64, Update)>> {
match self.first_update_id(writer)? {
pub fn del_update(self, writer: &mut heed::RwTxn<UpdateT>, update_id: u64) -> ZResult<bool> {
let update_id = BEU64::new(update_id);
self.updates.delete(writer, &update_id)
}
pub fn pop_front(self, writer: &mut heed::RwTxn<UpdateT>) -> ZResult<Option<(u64, Update)>> {
match self.first_update(writer)? {
Some((update_id, update)) => {
let key = BEU64::new(update_id);
self.updates.delete(writer, &key)?;
@ -52,4 +58,8 @@ impl Updates {
None => Ok(None),
}
}
pub fn clear(self, writer: &mut heed::RwTxn<UpdateT>) -> ZResult<()> {
self.updates.clear(writer)
}
}

View File

@ -1,15 +1,19 @@
use super::BEU64;
use crate::update::UpdateResult;
use heed::types::{OwnedType, SerdeBincode};
use crate::database::UpdateT;
use crate::update::ProcessedUpdateResult;
use heed::types::{OwnedType, SerdeJson};
use heed::Result as ZResult;
#[derive(Copy, Clone)]
pub struct UpdatesResults {
pub(crate) updates_results: heed::Database<OwnedType<BEU64>, SerdeBincode<UpdateResult>>,
pub(crate) updates_results: heed::Database<OwnedType<BEU64>, SerdeJson<ProcessedUpdateResult>>,
}
impl UpdatesResults {
pub fn last_update_id(self, reader: &heed::RoTxn) -> ZResult<Option<(u64, UpdateResult)>> {
pub fn last_update(
self,
reader: &heed::RoTxn<UpdateT>,
) -> ZResult<Option<(u64, ProcessedUpdateResult)>> {
match self.updates_results.last(reader)? {
Some((key, data)) => Ok(Some((key.get(), data))),
None => Ok(None),
@ -18,9 +22,9 @@ impl UpdatesResults {
pub fn put_update_result(
self,
writer: &mut heed::RwTxn,
writer: &mut heed::RwTxn<UpdateT>,
update_id: u64,
update_result: &UpdateResult,
update_result: &ProcessedUpdateResult,
) -> ZResult<()> {
let update_id = BEU64::new(update_id);
self.updates_results.put(writer, &update_id, update_result)
@ -28,10 +32,14 @@ impl UpdatesResults {
pub fn update_result(
self,
reader: &heed::RoTxn,
reader: &heed::RoTxn<UpdateT>,
update_id: u64,
) -> ZResult<Option<UpdateResult>> {
) -> ZResult<Option<ProcessedUpdateResult>> {
let update_id = BEU64::new(update_id);
self.updates_results.get(reader, &update_id)
}
pub fn clear(self, writer: &mut heed::RwTxn<UpdateT>) -> ZResult<()> {
self.updates_results.clear(writer)
}
}

View File

@ -0,0 +1,32 @@
use crate::database::{MainT, UpdateT};
use crate::update::{next_update_id, Update};
use crate::{store, MResult, RankedMap};
pub fn apply_clear_all(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
) -> MResult<()> {
index.main.put_words_fst(writer, &fst::Set::default())?;
index.main.put_ranked_map(writer, &RankedMap::default())?;
index.main.put_number_of_documents(writer, |_| 0)?;
index.documents_fields.clear(writer)?;
index.documents_fields_counts.clear(writer)?;
index.postings_lists.clear(writer)?;
index.docs_words.clear(writer)?;
index.prefix_documents_cache.clear(writer)?;
index.prefix_postings_lists_cache.clear(writer)?;
Ok(())
}
pub fn push_clear_all(
writer: &mut heed::RwTxn<UpdateT>,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
) -> MResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::clear_all();
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}

View File

@ -1,9 +1,11 @@
use crate::store;
use crate::update::{next_update_id, Update};
use heed::Result as ZResult;
use crate::database::{MainT, UpdateT};
use crate::store;
use crate::update::{next_update_id, Update};
pub fn apply_customs_update(
writer: &mut heed::RwTxn,
writer: &mut heed::RwTxn<MainT>,
main_store: store::Main,
customs: &[u8],
) -> ZResult<()> {
@ -11,14 +13,14 @@ pub fn apply_customs_update(
}
pub fn push_customs_update(
writer: &mut heed::RwTxn,
writer: &mut heed::RwTxn<UpdateT>,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
customs: Vec<u8>,
) -> ZResult<u64> {
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = Update::Customs(customs);
let update = Update::customs(customs);
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)

View File

@ -0,0 +1,382 @@
use std::collections::HashMap;
use fst::{set::OpBuilder, SetBuilder};
use indexmap::IndexMap;
use sdset::{duo::Union, SetOperation};
use serde::{Deserialize, Serialize};
use crate::database::{MainT, UpdateT};
use crate::database::{UpdateEvent, UpdateEventsEmitter};
use crate::raw_indexer::RawIndexer;
use crate::serde::{extract_document_id, serialize_value_with_id, Deserializer, Serializer};
use crate::store;
use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update};
use crate::{Error, MResult, RankedMap};
pub struct DocumentsAddition<D> {
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: UpdateEventsEmitter,
documents: Vec<D>,
is_partial: bool,
}
impl<D> DocumentsAddition<D> {
pub fn new(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: UpdateEventsEmitter,
) -> DocumentsAddition<D> {
DocumentsAddition {
updates_store,
updates_results_store,
updates_notifier,
documents: Vec::new(),
is_partial: false,
}
}
pub fn new_partial(
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
updates_notifier: UpdateEventsEmitter,
) -> DocumentsAddition<D> {
DocumentsAddition {
updates_store,
updates_results_store,
updates_notifier,
documents: Vec::new(),
is_partial: true,
}
}
pub fn update_document(&mut self, document: D) {
self.documents.push(document);
}
pub fn finalize(self, writer: &mut heed::RwTxn<UpdateT>) -> MResult<u64>
where
D: serde::Serialize,
{
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
let update_id = push_documents_addition(
writer,
self.updates_store,
self.updates_results_store,
self.documents,
self.is_partial,
)?;
Ok(update_id)
}
}
impl<D> Extend<D> for DocumentsAddition<D> {
fn extend<T: IntoIterator<Item = D>>(&mut self, iter: T) {
self.documents.extend(iter)
}
}
pub fn push_documents_addition<D: serde::Serialize>(
writer: &mut heed::RwTxn<UpdateT>,
updates_store: store::Updates,
updates_results_store: store::UpdatesResults,
addition: Vec<D>,
is_partial: bool,
) -> MResult<u64> {
let mut values = Vec::with_capacity(addition.len());
for add in addition {
let vec = serde_json::to_vec(&add)?;
let add = serde_json::from_slice(&vec)?;
values.push(add);
}
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
let update = if is_partial {
Update::documents_partial(values)
} else {
Update::documents_addition(values)
};
updates_store.put_update(writer, last_update_id, &update)?;
Ok(last_update_id)
}
pub fn apply_documents_addition<'a, 'b>(
writer: &'a mut heed::RwTxn<'b, MainT>,
index: &store::Index,
addition: Vec<IndexMap<String, serde_json::Value>>,
) -> MResult<()> {
let mut documents_additions = HashMap::new();
let mut schema = match index.main.schema(writer)? {
Some(schema) => schema,
None => return Err(Error::SchemaMissing),
};
let primary_key = schema.primary_key().ok_or(Error::MissingPrimaryKey)?;
// 1. store documents ids for future deletion
for document in addition {
let document_id = match extract_document_id(&primary_key, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
documents_additions.insert(document_id, document);
}
// 2. remove the documents posting lists
let number_of_inserted_documents = documents_additions.len();
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
apply_documents_deletion(writer, index, documents_ids)?;
let mut ranked_map = match index.main.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let stop_words = match index.main.stop_words_fst(writer)? {
Some(stop_words) => stop_words,
None => fst::Set::default(),
};
// 3. index the documents fields in the stores
let mut indexer = RawIndexer::new(stop_words);
for (document_id, document) in documents_additions {
let serializer = Serializer {
txn: writer,
schema: &mut schema,
document_store: index.documents_fields,
document_fields_counts: index.documents_fields_counts,
indexer: &mut indexer,
ranked_map: &mut ranked_map,
document_id,
};
document.serialize(serializer)?;
}
write_documents_addition_index(
writer,
index,
&ranked_map,
number_of_inserted_documents,
indexer,
)?;
index.main.put_schema(writer, &schema)?;
Ok(())
}
pub fn apply_documents_partial_addition<'a, 'b>(
writer: &'a mut heed::RwTxn<'b, MainT>,
index: &store::Index,
addition: Vec<IndexMap<String, serde_json::Value>>,
) -> MResult<()> {
let mut documents_additions = HashMap::new();
let mut schema = match index.main.schema(writer)? {
Some(schema) => schema,
None => return Err(Error::SchemaMissing),
};
let primary_key = schema.primary_key().ok_or(Error::MissingPrimaryKey)?;
// 1. store documents ids for future deletion
for mut document in addition {
let document_id = match extract_document_id(&primary_key, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
let mut deserializer = Deserializer {
document_id,
reader: writer,
documents_fields: index.documents_fields,
schema: &schema,
fields: None,
};
// retrieve the old document and
// update the new one with missing keys found in the old one
let result = Option::<HashMap<String, serde_json::Value>>::deserialize(&mut deserializer)?;
if let Some(old_document) = result {
for (key, value) in old_document {
document.entry(key).or_insert(value);
}
}
documents_additions.insert(document_id, document);
}
// 2. remove the documents posting lists
let number_of_inserted_documents = documents_additions.len();
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
apply_documents_deletion(writer, index, documents_ids)?;
let mut ranked_map = match index.main.ranked_map(writer)? {
Some(ranked_map) => ranked_map,
None => RankedMap::default(),
};
let stop_words = match index.main.stop_words_fst(writer)? {
Some(stop_words) => stop_words,
None => fst::Set::default(),
};
// 3. index the documents fields in the stores
let mut indexer = RawIndexer::new(stop_words);
for (document_id, document) in documents_additions {
let serializer = Serializer {
txn: writer,
schema: &mut schema,
document_store: index.documents_fields,
document_fields_counts: index.documents_fields_counts,
indexer: &mut indexer,
ranked_map: &mut ranked_map,
document_id,
};
document.serialize(serializer)?;
}
write_documents_addition_index(
writer,
index,
&ranked_map,
number_of_inserted_documents,
indexer,
)?;
index.main.put_schema(writer, &schema)?;
Ok(())
}
pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Index) -> MResult<()> {
let schema = match index.main.schema(writer)? {
Some(schema) => schema,
None => return Err(Error::SchemaMissing),
};
let mut ranked_map = RankedMap::default();
// 1. retrieve all documents ids
let mut documents_ids_to_reindex = Vec::new();
for result in index.documents_fields_counts.documents_ids(writer)? {
let document_id = result?;
documents_ids_to_reindex.push(document_id);
}
// 2. remove the documents posting lists
index.main.put_words_fst(writer, &fst::Set::default())?;
index.main.put_ranked_map(writer, &ranked_map)?;
index.main.put_number_of_documents(writer, |_| 0)?;
index.postings_lists.clear(writer)?;
index.docs_words.clear(writer)?;
let stop_words = match index.main.stop_words_fst(writer)? {
Some(stop_words) => stop_words,
None => fst::Set::default(),
};
let number_of_inserted_documents = documents_ids_to_reindex.len();
let mut indexer = RawIndexer::new(stop_words);
let mut ram_store = HashMap::new();
for document_id in documents_ids_to_reindex {
for result in index.documents_fields.document_fields(writer, document_id)? {
let (field_id, bytes) = result?;
let value: serde_json::Value = serde_json::from_slice(bytes)?;
ram_store.insert((document_id, field_id), value);
}
for ((docid, field_id), value) in ram_store.drain() {
serialize_value_with_id(
writer,
field_id,
&schema,
docid,
index.documents_fields,
index.documents_fields_counts,
&mut indexer,
&mut ranked_map,
&value
)?;
}
}
// 4. write the new index in the main store
write_documents_addition_index(
writer,
index,
&ranked_map,
number_of_inserted_documents,
indexer,
)?;
index.main.put_schema(writer, &schema)?;
Ok(())
}
pub fn write_documents_addition_index(
writer: &mut heed::RwTxn<MainT>,
index: &store::Index,
ranked_map: &RankedMap,
number_of_inserted_documents: usize,
indexer: RawIndexer,
) -> MResult<()> {
let indexed = indexer.build();
let mut delta_words_builder = SetBuilder::memory();
for (word, delta_set) in indexed.words_doc_indexes {
delta_words_builder.insert(&word).unwrap();
let set = match index.postings_lists.postings_list(writer, &word)? {
Some(postings) => Union::new(&postings.matches, &delta_set).into_set_buf(),
None => delta_set,
};
index.postings_lists.put_postings_list(writer, &word, &set)?;
}
for (id, words) in indexed.docs_words {
index.docs_words.put_doc_words(writer, id, &words)?;
}
let delta_words = delta_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
let words = match index.main.words_fst(writer)? {
Some(words) => {
let op = OpBuilder::new()
.add(words.stream())
.add(delta_words.stream())
.r#union();
let mut words_builder = SetBuilder::memory();
words_builder.extend_stream(op).unwrap();
words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
}
None => delta_words,
};
index.main.put_words_fst(writer, &words)?;
index.main.put_ranked_map(writer, ranked_map)?;
index.main.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?;
compute_short_prefixes(writer, index)?;
Ok(())
}

Some files were not shown because too many files have changed in this diff Show More