Compare commits

...

219 Commits

Author SHA1 Message Date
aaeb25828f Merge pull request #183 from meilisearch/number-of-documents
Compute the number of documents on updates
2019-09-14 16:32:18 +02:00
af26c39482 test: Improve the tests of the number of documents counting 2019-09-14 15:29:46 +02:00
2006259a23 feat: Improve the number of documents counting 2019-09-14 15:26:41 +02:00
707e2f4d77 feat: Update the number of documents in the KV 2019-09-14 15:26:39 +02:00
8d8aed36a8 feat: Count the number of deleted/inserted documents 2019-09-14 15:24:39 +02:00
2658ef0176 Merge pull request #182 from meilisearch/replace-sled-by-rocksdb
Replace sled by RocksDB
2019-09-14 11:32:26 +02:00
400d542fef feat: Update the README to reflect the kv store update 2019-09-12 16:28:23 +02:00
f46868407c feat: Make RocksDB works seemlessly like sled 2019-09-05 18:43:10 +02:00
e3fa07077c feat: Introduce the CfTree and CfIter types 2019-09-05 14:53:09 +02:00
e5763e73eb chore: Prefer using const names to avoid typos 2019-09-05 13:22:53 +02:00
fd880e0a0e Merge pull request #175 from meilisearch/moving-back-to-sled
Moving back to sled
2019-09-05 13:14:48 +02:00
e33cc89846 feat: Introduce update callbacks 2019-09-05 11:48:26 +02:00
f40b373f9f feat: Introduce the UpdateStatus type 2019-09-05 11:48:26 +02:00
cd8535d410 feat: Introduce the update_status/_blocking functions 2019-09-05 11:48:25 +02:00
f07b99fe97 fix: Make the tests work with the new update system 2019-09-05 11:48:25 +02:00
f45a00df3b fix: Cloned ArcSwaps are unsynchronized versions 2019-09-05 11:46:02 +02:00
cd864c40bc feat: Make the update update serialization be based on message pack 2019-09-05 11:46:02 +02:00
91b44a2759 chore: Change the Box<Error> to be marked dyn 2019-09-05 11:46:01 +02:00
d8cd8c5def chore: Move the updates in their own module 2019-09-05 11:46:01 +02:00
b0be06540a chore: Simplify the update application 2019-09-05 11:46:01 +02:00
4deee93a55 feat: Introduce synonyms deletion using the update system 2019-09-05 11:33:11 +02:00
451c0a6d03 feat: Introduce synonyms addition using the update system 2019-09-05 11:33:10 +02:00
0db3e6c58c feat: Introduce documents deletion using the update system 2019-09-05 11:33:10 +02:00
f83d6df4ef feat: Introduce documents addition using the update system 2019-09-05 11:33:10 +02:00
5a9e25c315 feat: Introduce the UpdatesIndex type 2019-09-05 11:14:11 +02:00
50e3c2c3de chore: Upgrade the meilidb-data dependencies 2019-09-05 10:49:46 +02:00
093ee9732f Merge pull request #180 from meilisearch/store-every-document
Change the STORED attribute property by DISPLAYED
2019-09-04 14:45:00 +02:00
333189ee51 fix: Change every stored schema property by displayed 2019-09-04 11:16:36 +02:00
50b8a66794 feat: Change the STORED attribute property by DISPLAYED 2019-09-03 11:14:20 +02:00
8be3fc1a66 Merge pull request #179 from meilisearch/deunicode-before-tokenize
Improve the tokenizer by split after deunicode
2019-09-02 17:20:30 +02:00
b5503989f9 feat: Improve the tokenizer by split after deunicode 2019-09-02 16:54:54 +02:00
5b8bc09826 Merge pull request #176 from meilisearch/no-more-hanging-threads
Replace the rayon::scope by always checking time
2019-09-01 20:02:03 +02:00
c8ee21f227 feat: Replace the rayon::scope by always checking time 2019-09-01 18:52:38 +02:00
a420fbf1e8 Merge pull request #174 from meilisearch/arc-fst-sets
Do not clone probably large fst::Sets, Arc them
2019-08-30 14:52:28 +02:00
ca34c28335 feat: Do not clone probably large fst::Sets, Arc them 2019-08-30 14:37:28 +02:00
3e1b81c4ce Merge pull request #173 from meilisearch/fix-ranked-map-set
Use the right ranked-map key name
2019-08-30 14:21:14 +02:00
9b353dfda6 chore: Use const names to avoid typos 2019-08-30 12:36:10 +02:00
d8dcc6f34b fix: Use the right ranked-map key name 2019-08-30 12:21:00 +02:00
fba1272a3e Merge pull request #172 from meilisearch/expose-internal-functions
Expose some internal functions
2019-08-29 15:26:42 +02:00
e20a038970 fix: Expose some internal functions 2019-08-29 15:11:51 +02:00
6f34dccc89 Merge pull request #171 from meilisearch/stringify-document-id
Transform identifiers fields into a string before hashing it
2019-08-29 13:42:46 +02:00
f5b0eb044a fix: Transform the identifier value into a string before hashing it 2019-08-29 11:41:20 +02:00
bae86e978e Merge pull request #170 from meilisearch/async-word-index-fetching-with-rayon-scope
Async word index fetching with rayon scope
2019-08-28 14:37:38 +02:00
8030a822ab test: Add a way to setup the fetch timeout of the query-database example 2019-08-28 13:42:20 +02:00
9c5ec110e5 feat: Introduce a way to enable or disable query timeouts 2019-08-28 13:24:34 +02:00
67302d09f3 feat: Multiword rewrite while there is time 2019-08-19 11:12:23 +02:00
7dc9ea78fa feat: Make the automaton DFA construction lazy 2019-08-19 11:12:23 +02:00
0ee56314fb feat: Try to simplify Store trait bound with a rayon scope 2019-08-19 11:10:54 +02:00
b7b60b5fe5 feat: Introduce a new thread to avoid waiting on doc indexes fetchs 2019-08-16 16:35:19 +02:00
d9c9fafd78 feat: Fetch doc indexes while there is time 2019-08-16 15:01:25 +02:00
bb0a79c577 feat: Process automatons in the order they were sort 2019-08-16 12:25:35 +02:00
81d44a0854 feat: Order automatons by importance 2019-08-16 12:19:34 +02:00
ebc95cb8f2 feat: Display the documents fields in the order they were declared 2019-08-16 11:25:42 +02:00
a488c00a2e feat: Use RustyLine in the query-database example 2019-08-16 11:25:42 +02:00
bf3c2c3725 feat: Move the multi-word rewriting algorithm into its own function 2019-08-16 11:25:42 +02:00
89df496f0c feat: Separate highlights from matches to make the code easier to follow 2019-08-16 11:25:42 +02:00
9959f2e952 feat: Move the RawDocument type to its own module 2019-08-16 11:25:42 +02:00
795557c046 feat: Remove query splitting from the automaton generation 2019-08-16 11:25:42 +02:00
225a3bf184 test: Produce tests that work with the new cumulative word index system 2019-08-16 11:25:42 +02:00
e65d7418b7 feat: Remove the query index from the Automaton type 2019-08-16 11:25:42 +02:00
f478bbf826 feat: Introduce the QueryEnhancer in the query synonym system 2019-08-16 11:25:42 +02:00
5e691c2140 feat: Introduce the QueryEnhancer type 2019-08-16 11:25:42 +02:00
e0cadaa68d Merge pull request #165 from meilisearch/reorder-schema-attributes
Reorder schema attributes
2019-07-01 16:12:33 +02:00
9175e4686b feat: Collect TmpMatches only on tests, producing data useful for tests 2019-07-01 14:55:47 +02:00
e8afca614c chore: Little clean ups of meilidb-core 2019-07-01 14:34:06 +02:00
4f4b630ae9 fix: Make the examples compile with the new Highlight type 2019-07-01 12:06:17 +02:00
6b6db2f8e6 feat: Introduce the Highlight type to simplify the data oriented design 2019-07-01 12:06:16 +02:00
b7ed22bc59 feat: Introduce on the fly attributes reordering with meilidb-core 2019-07-01 12:03:31 +02:00
97cc3c7cce Merge pull request #166 from meilisearch/split-query-words
Split query words
2019-06-28 18:30:13 +02:00
f5d52396f5 feat: Support query words splits 2019-06-28 18:04:35 +02:00
9cc154da05 chore: Rewrite tests to use iterators and be easily testable 2019-06-28 18:04:35 +02:00
5aa49d232c feat: Rewrite Automaton generation related code 2019-06-28 18:04:35 +02:00
1cb42cbb30 Merge pull request #164 from meilisearch/concat-query-words
Support query words concatenation
2019-06-28 18:03:49 +02:00
9f320590d3 feat: Support query words concatenation 2019-06-27 10:14:17 +02:00
1b0fd2e0ba Merge pull request #160 from meilisearch/synonyms
Support all types of synonyms
2019-06-26 14:59:45 +02:00
b249b2a81b feat: Support removing specific synonym alternatives 2019-06-26 10:45:51 +02:00
0a5d4eb7ed feat: Normalize synonym strings and query strings to search for synonyms 2019-06-26 10:45:51 +02:00
3dcbc737f3 feat: Make synonyms be not considered like exact matches 2019-06-26 10:45:51 +02:00
43f11e929d fix: Do not trigger a synonym when its not the last word and is a prefix 2019-06-26 10:45:51 +02:00
8f2a551cca feat: Trigger synonym replacement only when the last word is tipped 2019-06-26 10:45:50 +02:00
8f044c6853 fix: Only create non-prefix DFA when generating synonyms alternatives 2019-06-26 10:45:50 +02:00
a76c00a787 feat: Create types to edit synonyms and keep them in the database 2019-06-26 10:45:50 +02:00
0633f16b4d feat: Make multi-word support multi-word synonyms 2019-06-26 10:45:50 +02:00
59fafb8b30 feat: Support one word has multi-word alternatives 2019-06-26 10:45:50 +02:00
d2bd99cc2a fix: Append DocIndexes when building InMemorySetStore from an Iterator 2019-06-26 10:45:50 +02:00
62930ecc4e feat: Deduplicate automatons when synonyms produce duplicated ones 2019-06-26 10:45:49 +02:00
6cb57aa8a4 feat: Unique word has multi-word synonyms basically work 2019-06-26 10:45:49 +02:00
9861c3878e tests: Add more tests about synonyms 2019-06-26 10:45:49 +02:00
707d7b062b feat: Made query handle synonyms via the Store 2019-06-26 10:45:49 +02:00
18736bdcd0 feat: Introduce the synonyms concept to the Store trait 2019-06-26 10:45:49 +02:00
e8b2e86007 feat: Introduce a basic way to handle synonyms 2019-06-26 10:45:48 +02:00
ae8b4f56f2 Merge pull request #163 from meilisearch/export-compute-docid
Expose a function to compute the DocumentId from an Hashable value
2019-06-25 12:25:38 +02:00
28a0074497 feat: Expose a function to compute the DocumentId from an Hashable value 2019-06-25 11:21:12 +02:00
71c039db09 Merge pull request #162 from meilisearch/trustful-hash
Prefer using a reliable SipHash to compute document ids
2019-06-22 11:51:52 +02:00
15646c258b fix: Prefer using a reliable SipHash to compute document ids 2019-06-22 11:22:21 +02:00
25a5605b35 Merge pull request #161 from meilisearch/remove-tide
Remove tide as it break compilation on the latest nightly
2019-06-18 14:04:47 +02:00
b630e32c6a fix: Remove tide as it break compilation on the latest nightly 2019-06-18 13:40:46 +02:00
c39254bf98 Merge pull request #159 from meilisearch/create-specific-schema-crate
Move the Schema to its own workspace crate
2019-06-03 09:17:14 +02:00
994a0e78f1 feat: Move the Schema to its own workspace crate 2019-05-29 15:37:28 +02:00
ab2ca15c5c Merge pull request #158 from meilisearch/moving-back-to-rocksdb
Moving back to RocksDB
2019-05-29 14:56:55 +02:00
07f447c457 feat: Force RocksDB compaction 2019-05-28 17:38:59 +02:00
62c8f1ba04 feat: Fix the index opening when index already exists 2019-05-26 11:36:47 +02:00
e08edc2d6b feat: Introduce some stats to ease debugging 2019-05-25 12:12:24 +02:00
a147c09b06 feat: Make more functions accessible on the custom settings 2019-05-24 14:37:04 +02:00
9fca74443e feat: Wrap the database index access to improve usability 2019-05-24 14:26:05 +02:00
6f258f71d5 feat: Implement some convenient accessors for custom settings 2019-05-23 15:43:41 +02:00
ce61c16dbe feat: Disable all the default RocksDB compression features 2019-05-23 15:35:53 +02:00
4c973238a1 feat: Introduce a basic RocksDB based version 2019-05-23 14:57:29 +02:00
3a8da82792 Merge pull request #157 from meilisearch/update-readme
Fix some badly spelled sentences
2019-05-22 14:01:33 +02:00
f10da122ff doc: Fix some badly spelled sentences 2019-05-22 11:41:03 +02:00
ec20a8cacb Merge pull request #156 from meilisearch/clippy-pass
Do a little clippy pass
2019-05-22 11:33:55 +02:00
102fb506db chore: Do a little clippy pass 2019-05-22 11:00:58 +02:00
34ba520f44 Merge pull request #155 from meilisearch/update-sdset
Use safest SetBuf constructor instead of new_unchecked
2019-05-21 18:23:39 +02:00
fa099555c0 feat: Use safest SetBuf constructor instead of new_unchecked 2019-05-21 18:15:48 +02:00
8387c5b14e Merge pull request #153 from meilisearch/example-expose-system-stats
Output more informations from the examples on document injection
2019-05-21 16:50:25 +02:00
5040095228 feat: Output more informations from the examples on document injection 2019-05-21 16:37:17 +02:00
788fae59a1 Merge pull request #154 from meilisearch/reintroduce-sort-by-attr
Reintroduce the `SortByAttr` custom criterion
2019-05-21 16:32:12 +02:00
e042f44e0d feat: Reintroduce the SortByAttr custom criterion 2019-05-21 16:22:23 +02:00
b1fc3e5cec Merge pull request #152 from meilisearch/documents-deletion-updates-ranked-map
Remove the documents from the ranked map on documents deletion
2019-05-21 13:59:21 +02:00
d7b1b7a2a9 feat: Remove the documents from the ranked map on documents deletion 2019-05-21 13:33:42 +02:00
97744ad24f Merge pull request #151 from meilisearch/expose-sled-compression-factor
Expose the sled compression setting
2019-05-20 15:03:43 +02:00
2e79b2a871 feat: Expose the sled compression setting 2019-05-20 14:41:15 +02:00
349f0f7068 Merge pull request #148 from meilisearch/split-fst-docindexes
Split fst doc-indexes
2019-05-20 14:24:48 +02:00
94f9587db1 feat: Implement Debug on RawDocument for more convenience 2019-05-20 11:21:41 +02:00
6df8f62022 test: Add more test to some criteria 2019-05-20 11:21:40 +02:00
8c71473498 feat: Introduce the Criterion::name to allow better debugging 2019-05-20 11:21:40 +02:00
08d89053da feat: Introduce a little simple http server for demo 2019-05-16 17:09:41 +02:00
4b36fa0739 test: Add tests about additions and deletions of documents 2019-05-16 13:44:21 +02:00
921b063a71 feat: Make the DocumentsDeletion public interface to take serde types 2019-05-16 12:04:08 +02:00
3de633c869 feat: Reexport sled to reduce user level library incompatibilities 2019-05-16 12:04:08 +02:00
021f0545eb doc: Update the deep-dive explanation text 2019-05-16 12:04:08 +02:00
b701eb85b8 doc: Update the README features links 2019-05-16 12:04:08 +02:00
4e80378a77 chore: Rename the ebay example into kaggle 2019-05-16 12:04:07 +02:00
830d2f28b9 feat: Introduce a custom tree for user custom settings 2019-05-16 12:04:07 +02:00
c5ba34d0b0 chore: Replace crate only public interface to be completely public 2019-05-16 12:04:07 +02:00
2e31bb519a chore: Split the database structure internal types 2019-05-16 12:04:07 +02:00
169bd4cb39 feat: Store all documents words by document rather than by attribute 2019-05-15 15:42:13 +02:00
aa90f22865 feat: Remove the Index dependency of the Serializer 2019-05-15 15:42:12 +02:00
9bba90c47e fix: Fix a bug in the Database open-index method 2019-05-15 15:42:12 +02:00
2844cb5bca fix: Make the examples compile 2019-05-15 15:42:12 +02:00
dff81bb161 feat: Prefer set/del methods instead of set with an Option type 2019-05-15 15:42:12 +02:00
1f2abce7c3 feat: Introduce the DocumentsDeletion type 2019-05-15 15:42:11 +02:00
e67ada8823 feat: Introduce the DocumentsAddition type 2019-05-15 15:42:11 +02:00
42e39f6eb5 feat: Introduce a simplified version of the Store trait 2019-05-15 15:42:11 +02:00
f317a7a322 feat: implement open/create_index on the Database type 2019-05-15 15:42:11 +02:00
8434ecbb43 feat: Introduce the RankedMap real type 2019-05-15 15:42:10 +02:00
0c18026240 feat: Introduce Tree wrappers for each index component 2019-05-15 15:42:10 +02:00
6eb25687f8 feat: Handle word doc-indexes sled tree errors 2019-05-15 15:42:10 +02:00
737db5668b chore: Remove the WriteToBytes trait 2019-05-15 15:42:10 +02:00
f16e0333e4 chore: Remove the SharedData/Cursor types 2019-05-15 15:42:09 +02:00
27ffcaabe9 chore: Remove the DocIndexes type 2019-05-15 15:42:09 +02:00
db031a5b95 chore: Remove the DocIds type 2019-05-15 15:42:09 +02:00
2e9fbd07cd chore: Remove most of the warnings 2019-05-15 15:42:09 +02:00
74acf83464 chore: Remove the NewIndexEvent type 2019-05-15 15:42:08 +02:00
3dc057ca9c feat: Introduce the new Index system 2019-05-15 15:42:08 +02:00
e142339106 Merge pull request #150 from felixonmars/patch-1
chore: Fix some typos
2019-05-06 15:00:53 +02:00
39038750a8 chore: Fix some typos 2019-05-06 20:12:33 +08:00
f68733bf11 Merge pull request #149 from meilisearch/ci-only-nightly
Update ci with rust nightly only
2019-05-02 15:43:53 +02:00
85edb3e90c Update ci with rust nightly only 2019-05-02 11:43:45 +02:00
d7ce6d016b Merge pull request #147 from meilisearch/moving-to-sled
Make the repository a workspace and move to sled
2019-04-29 15:21:02 +02:00
9023a12ad4 feat: Introduce the unrankable error variant 2019-04-29 14:32:04 +02:00
0547671246 feat: Take ranked attributes into account 2019-04-29 14:32:04 +02:00
068f1bc202 feat: Index unidecoded words 2019-04-29 14:32:04 +02:00
7035f76077 squash-me: Make better measurements of the retrieving spent time 2019-04-29 14:32:04 +02:00
f0268d49fe fix: Always lowercase indexed tokens 2019-04-29 14:32:04 +02:00
7dbf5d6319 fix: Make the examples build 2019-04-29 14:32:03 +02:00
ed6b6038ee feat: Finalize index merging on document insertion 2019-04-29 14:32:03 +02:00
ad24ef8a25 feat: Index words of structs, maps and tuples 2019-04-29 14:32:03 +02:00
645bab7748 feat: Index documents using the Serializer struct 2019-04-29 14:32:03 +02:00
abd7d1de48 feat: Introduce the extract_document_id function 2019-04-29 14:32:03 +02:00
ea0ee070ef feat: Introduce the Serializer
Which will serialize documents fields as message pack in the kv-store
2019-04-29 14:32:03 +02:00
2a69170f14 feat: Introduce the DocumentsDeletion type 2019-04-29 14:32:02 +02:00
725e7b4229 chore: Move the Deserializer into the the serde module 2019-04-29 14:32:02 +02:00
187e6740bd feat: Allow users to construct query builders from database indexes 2019-04-29 14:32:02 +02:00
4b40d5b0d4 feat: Introduce the Index struct 2019-04-29 14:32:02 +02:00
ee2bad20c7 feat: Store the RankedMap into the inner sled tree 2019-04-29 14:32:02 +02:00
b7805fee93 feat: Store already opened indexes and word indexes 2019-04-29 14:32:02 +02:00
0104e93ba9 feat: Introduce index events to update the WordIndex 2019-04-29 14:32:02 +02:00
25a4961453 feat: Introduce the Indexer struct 2019-04-29 14:32:01 +02:00
7338e522bd squash-me: Add set/get/del_document_attribute to Index methods 2019-04-29 14:32:01 +02:00
58c020a2e1 feat: Store the word index into the database index 2019-04-29 14:32:01 +02:00
f7eced03fd chore: Using a fork of the fst library that support Arc<[u8]> 2019-04-29 14:32:01 +02:00
9be7c02461 chore: Update sled to 0.22.1 2019-04-29 14:32:01 +02:00
9483f2df60 feat: Introduce a custom Error type 2019-04-29 14:32:01 +02:00
f17a05c342 feat: Introduce the RankedMap type 2019-04-29 14:32:00 +02:00
e41c551757 feat: Introduce the Number type 2019-04-29 14:32:00 +02:00
95dfbd1fe0 feat: Introduce the meilidb-data schema module 2019-04-29 14:32:00 +02:00
287d5dee4d feat: Introduce the meilidb-data workspace member 2019-04-29 14:32:00 +02:00
77405cc103 chore: Remove the database module from meilidb 2019-04-29 14:32:00 +02:00
abf7191eec feat: Make the Tokenizer able to support tokenizing sequences 2019-04-29 14:32:00 +02:00
c6bb2b6f9c chore: Make the debug symbols available for release binaries 2019-04-29 14:31:59 +02:00
acede0f3e8 fix: Correctly assert the DocIndex memory size 2019-04-29 14:31:59 +02:00
e56106cbdc chore: Update the toml dependency 2019-04-29 14:31:59 +02:00
87f9528791 feat: Use the new Tokenizer 2019-04-29 14:31:59 +02:00
397522f277 fet: Move meilidb example into the meilidb workspace 2019-04-29 14:31:59 +02:00
a745819ddf feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-04-29 14:31:37 +02:00
5d5bcf7011 feat: Remove the FilterFunc alias type 2019-04-29 14:31:37 +02:00
19e67dcf0b feat: Move query splitting into the tokenizer workspace 2019-04-29 14:31:37 +02:00
1897da5348 feat: Move tokenizer things into the meilidb-tokenizer workspace 2019-04-29 14:31:37 +02:00
d8cbb03c42 chore: Update the .gitignore file 2019-04-29 14:31:36 +02:00
bc227bef21 chore: Add a nightly feature to meilidb-core 2019-04-29 14:31:36 +02:00
3bcb1dc802 chore: Allow the activation of the meilidb-core i128 feature 2019-04-29 14:31:36 +02:00
d0786b4156 chore: Move the SortByAttr into meilidb 2019-04-29 14:31:36 +02:00
14790eeae3 chore: Move index related things to the meilidb-core workspace member 2019-04-29 14:31:35 +02:00
3056b351fa Merge pull request #143 from ndudnicz/examples-movies
doc: add a new +19k movies example dataset
2019-04-15 10:11:38 +02:00
52fca57114 doc: add a new +19k movies example dataset 2019-04-13 21:11:28 +02:00
ee7a570b2f doc: Fix a little typo 2019-03-24 16:45:33 +01:00
61dcf72e04 Merge pull request #131 from meilisearch/update-readme
Add a Features section to the readme
2019-03-24 16:44:00 +01:00
bace8ad510 doc: Add a features section to the readme 2019-03-24 16:28:19 +01:00
e0b759839d Merge pull request #129 from meilisearch/ci-badge
Add CI badge
2019-03-10 22:46:57 +01:00
05b0a3e7d2 Add CI badge 2019-03-10 21:38:04 +01:00
2518037b91 Merge pull request #128 from meilisearch/azure-pipeline
Azure pipeline
2019-03-10 17:38:47 +01:00
3e452f362c Replace TravisCI by Azure CI 2019-03-10 15:46:59 +01:00
4900544574 Merge pull request #126 from Kerollmops/searchable-attributes
Searchable attributes
2019-03-05 17:11:15 +01:00
858589dc6b feat: Limit the QueryBuilder to search only into some attributes 2019-03-05 16:34:29 +01:00
915f2e70a3 Merge pull request #125 from Kerollmops/limit-memory-usage
Limit memory usage
2019-03-05 16:17:56 +01:00
aae301878c fix: Flush the database after each WriteBatch injected 2019-03-05 14:55:57 +01:00
383a49b44f fix: Compact the whole database for each WriteBatch injected 2019-03-05 14:55:57 +01:00
a45cc4b618 fix: Reduce the size of the DocIndex type 2019-03-05 14:55:57 +01:00
93 changed files with 26456 additions and 4845 deletions

.gitignore

@@ -1,6 +1,7 @@
/rocksdb
/target
/Cargo.lock
meilidb/Cargo.lock
meilidb-core/Cargo.lock
**/*.rs.bk
**/*.csv
**/*.json_lines


@@ -1,22 +0,0 @@
language: rust
cache: cargo
branches:
only:
- master
matrix:
fast_finish: true
include:
# Test crates on their minimum Rust versions.
- rust: 1.32.0
name: "meilidb on 1.32.0"
script: ./ci/meilidb.sh
# Test crates on nightly Rust.
- rust: nightly
name: "meilidb on nightly"
script: ./ci/meilidb.sh


@@ -1,55 +1,11 @@
[package]
edition = "2018"
name = "meilidb"
version = "0.3.2"
authors = ["Kerollmops <renault.cle@gmail.com>"]
[dependencies]
arc-swap = "0.3.7"
bincode = "1.1.2"
byteorder = "1.3.1"
fst = "0.3.3"
hashbrown = { version = "0.1.8", features = ["serde"] }
lazy_static = "1.2.0"
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
lockfree = "0.5.1"
log = "0.4.6"
rayon = "1.0.3"
sdset = "0.3.1"
serde = "1.0.88"
serde_derive = "1.0.88"
serde_json = { version = "1.0.38", features = ["preserve_order"] }
size_format = "1.0.2"
slice-group-by = "0.2.4"
unidecode = "0.3.0"
[dependencies.toml]
git = "https://github.com/Kerollmops/toml-rs.git"
features = ["preserve_order"]
rev = "0372ba6"
[dependencies.rocksdb]
git = "https://github.com/pingcap/rust-rocksdb.git"
rev = "306e201"
[features]
default = ["simd"]
i128 = ["bincode/i128", "byteorder/i128"]
portable = ["rocksdb/portable"]
simd = ["rocksdb/sse"]
nightly = ["hashbrown/nightly", "slice-group-by/nightly"]
[dev-dependencies]
csv = "1.0.5"
env_logger = "0.6.0"
jemallocator = "0.1.9"
quickcheck = "0.8.2"
rand = "0.6.5"
rand_xorshift = "0.1.1"
structopt = "0.2.14"
tempfile = "3.0.7"
termcolor = "1.0.4"
[workspace]
members = [
"meilidb",
"meilidb-core",
"meilidb-data",
"meilidb-schema",
"meilidb-tokenizer",
]
[profile.release]
debug = true


@@ -1,6 +1,6 @@
# MeiliDB
[![Build Status](https://travis-ci.org/Kerollmops/MeiliDB.svg?branch=master)](https://travis-ci.org/Kerollmops/MeiliDB)
[![Build Status](https://dev.azure.com/thomas0884/thomas/_apis/build/status/meilisearch.MeiliDB?branchName=master)](https://dev.azure.com/thomas0884/thomas/_build/latest?definitionId=1&branchName=master)
[![dependency status](https://deps.rs/repo/github/Kerollmops/MeiliDB/status.svg)](https://deps.rs/repo/github/Kerollmops/MeiliDB)
[![License](https://img.shields.io/github/license/Kerollmops/MeiliDB.svg)](https://github.com/Kerollmops/MeiliDB)
[![Rust 1.31+](https://img.shields.io/badge/rust-1.31+-lightgray.svg)](
@@ -8,15 +8,29 @@ https://www.rust-lang.org)
A _full-text search database_ using a key-value store internally.
It uses [RocksDB](https://github.com/facebook/rocksdb) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads.
## Features
- Provides [6 default ranking criteria](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/criterion/mod.rs#L95-L101) used to [bucket sort](https://en.wikipedia.org/wiki/Bucket_sort) documents
- Accepts [custom criteria](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/criterion/mod.rs#L22-L29) and can apply them in any custom order
- Supports [ranged queries](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L146), useful for paginating results
- Can [distinct](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L68) and [filter](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L57) returned documents based on context defined rules
- Can store complete documents or only [user schema specified fields](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/examples/movies/schema-movies.toml)
- The [default tokenizer](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-tokenizer/src/lib.rs#L99) can index latin and kanji based languages
- Returns [the matching text areas](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/lib.rs#L117-L120), useful to highlight matched words in results
- Accepts query time search config like the [searchable fields](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L79)
- Supports run time indexing (incremental indexing)
It uses [RocksDB](https://github.com/facebook/rocksdb) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads. The whole ranking system is [data oriented](https://github.com/meilisearch/MeiliDB/issues/82) and provides great performance.
You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries or you can take a look at the [typos and ranking rules](typos-ranking-rules.md) if you want to know the default rules used to sort the documents.
We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!
We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/meilisearch/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!
The project is only a library for now; no binary is provided yet. To get started, you can check the examples, which are made to work with the data located in the `misc/` folder.
MeiliDB will be a binary in a near future so you will be able to use it as a database out-of-the-box. We should be able to query it using a [to-be-defined](https://github.com/Kerollmops/MeiliDB/issues/38) protocol. This is our current goal, [see the milestones](https://github.com/Kerollmops/MeiliDB/milestones). In the end, the binary will be a bunch of network protocols and wrappers around the library - which will also be published on [crates.io](https://crates.io). Both the binary and the library will follow the same update cycle.
MeiliDB will be a binary in a near future so you will be able to use it as a database out-of-the-box. We should be able to query it using a [to-be-defined](https://github.com/meilisearch/MeiliDB/issues/38) protocol. This is our current goal, [see the milestones](https://github.com/meilisearch/MeiliDB/milestones). In the end, the binary will be a bunch of network protocols and wrappers around the library - which will also be published on [crates.io](https://crates.io). Both the binary and the library will follow the same update cycle.
@@ -45,16 +59,35 @@ We have seen much better performances when [using jemalloc as the global allocat
## Usage and examples
MeiliDB runs with an index like most search engines.
So to test the library you can create one by indexing a simple csv file.
You can try a little part of MeiliDB with the following commands.
It creates an index named _movies_ and inserts two great Tarantino movies into it.
```bash
cargo run --release --example create-database -- test.mdb misc/kaggle.csv --schema schema-example.toml
cargo run --release
curl -XPOST 'http://127.0.0.1:8000/movies' \
-d '
identifier = "id"
[attributes.id]
stored = true
[attributes.title]
stored = true
indexed = true
'
curl -H 'Content-Type: application/json' \
-XPUT 'http://127.0.0.1:8000/movies' \
-d '{ "id": 123, "title": "Inglorious Bastards" }'
curl -H 'Content-Type: application/json' \
-XPUT 'http://127.0.0.1:8000/movies' \
-d '{ "id": 456, "title": "Django Unchained" }'
```
Once the command is executed, the index should be in the `test.mdb` folder. You are now able to run the `query-database` example and play with MeiliDB.
Once the database is initialized you can query it by using the following command:
```bash
cargo run --release --example query-database -- test.mdb -n 10 id title
curl -XGET 'http://127.0.0.1:8000/movies/search?q=inglo'
```

azure-pipelines.yml

@@ -0,0 +1,47 @@
---
trigger:
branches:
include: [ master ]
pr: [ master ]
jobs:
- job: test
pool:
vmImage: 'Ubuntu 16.04'
container: tpayet/chiquitita:latest
steps:
- script: |
curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly
displayName: 'Install rustc'
- script: |
$HOME/.cargo/bin/cargo check
displayName: 'Check MeiliDB'
- script: |
$HOME/.cargo/bin/cargo test
displayName: 'Test MeiliDB'
- job: build
dependsOn:
- test
condition: succeeded()
pool:
vmImage: 'Ubuntu 16.04'
container: tpayet/chiquitita:latest
steps:
- script: |
curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly
displayName: 'Install rustc'
- script: |
$HOME/.cargo/bin/cargo build --release
displayName: 'Build MeiliDB'
- task: CopyFiles@2
inputs:
contents: '$(System.DefaultWorkingDirectory)/target/release/libmeilidb.rlib'
targetFolder: $(Build.ArtifactStagingDirectory)
displayName: 'Copy build'
- task: PublishBuildArtifacts@1
inputs:
artifactName: libmeilidb.rlib
displayName: 'Upload artifacts'


@@ -1,28 +1,22 @@
# A deep dive in MeiliDB
On the 9 of december 2018.
MeiliDB is a full text search engine based on a final state transducer named [fst](https://github.com/BurntSushi/fst) and a key-value store named [RocksDB](https://github.com/facebook/rocksdb). The goal of a search engine is to store data and to respond to queries as accurate and fast as possible. To achieve this it must save the data as an [inverted index](https://en.wikipedia.org/wiki/Inverted_index).
On the 15 of May 2019.
MeiliDB is a full text search engine based on a finite state transducer named [fst](https://github.com/BurntSushi/fst) and a key-value store named [sled](https://github.com/spacejam/sled). The goal of a search engine is to store data and to respond to queries as accurately and as fast as possible. To achieve this it must save the matching words in an [inverted index](https://en.wikipedia.org/wiki/Inverted_index).
<!-- MarkdownTOC autolink="true" -->
- [Where is the data stored?](#where-is-the-data-stored)
- [What does the key-value store contain?](#what-does-the-key-value-store-contain)
- [The blob type](#the-blob-type)
- [The inverted word index](#the-inverted-word-index)
- [A final state transducer](#a-final-state-transducer)
- [Document indexes](#document-indexes)
- [Document ids](#document-ids)
- [The schema](#the-schema)
- [Document attributes](#document-attributes)
- [How is an update handled?](#how-is-an-update-handled)
- [The merge operation is CPU consuming](#the-merge-operation-is-cpu-consuming)
- [How is a request processed?](#how-is-a-request-processed)
- [Query lexemes](#query-lexemes)
- [Automatons and query index](#automatons-and-query-index)
- [Sort by criteria](#sort-by-criteria)
- [Retrieve original documents](#retrieve-original-documents)
<!-- /MarkdownTOC -->
@@ -30,21 +24,17 @@ MeiliDB is a full text search engine based on a final state transducer named [fs
MeiliDB is entirely backed by a key-value store like any good database (i.e. Postgres, MySQL). This brings a great flexibility in the way documents can be stored and updates handled along time.
[RocksDB brings some](https://rocksdb.org/blog/2015/02/27/write-batch-with-index.html) of the [A.C.I.D. properties](https://en.wikipedia.org/wiki/ACID_(computer_science)) to help us be sure the saved data is consistent, for example we use SST files and the key-value store ability to load them in one time to manage updates.
Note that the SST file have the same restriction as the fst, it needs its keys to be added in order at creation.
[sled will bring some](https://github.com/spacejam/sled/tree/434533332a3f485e6d2e467023be0a0b55d3a1af#plans) of the [A.C.I.D. properties](https://en.wikipedia.org/wiki/ACID_(computer_science)) to help us be sure the saved data is consistent.
## What does the key-value store contain?
It contain the blob, the schema and the documents stored attributes.
It contains the inverted word index, the schema and the document fields.
### The blob type
### The inverted word index
[The Blob type](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/mod.rs#L16-L19) is a data structure that indicate if an update is a positive or a negative one. In the case where the update is considered positive, the blob will contain [an fst map and the document indexes](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/positive/blob.rs#L15-L18) associated. In the other case it will only contain [all the document ids](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/negative/blob.rs#L12-L14) that must be considered removed.
The Blob type [is stored under the "*data-index*" entry](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/update/positive/update.rs#L497-L499) and marked as [a merge operation](https://github.com/facebook/rocksdb/wiki/Merge-Operator-Implementation) in the key-value store.
[The inverted word index](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/words_index.rs) is a sled Tree dedicated to storing and giving access to all documents that contain a specific word. The information stored under the word is simply a big ordered array of where in the document the word has been found. In other words, a big list of [`DocIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/lib.rs#L35-L51).
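To make the structure more concrete, here is a minimal, self-contained Rust sketch of the idea (not code from the repository): a `DocIndex`-like record and a toy in-memory words index keeping one sorted array of hits per word. Field names and integer widths are approximations of the real type, and a `BTreeMap` stands in for the sled/RocksDB tree.

```rust
use std::collections::BTreeMap;

/// Hypothetical, simplified `DocIndex`: one place where a word was found.
/// Field names and integer widths approximate the real meilidb-core type.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct DocIndex {
    document_id: u64,
    attribute: u16,   // which field of the document
    word_index: u16,  // position of the word inside that field
    char_index: u32,  // character offset of the word
    char_length: u16, // character length of the word
}

/// Toy inverted word index: word -> sorted array of `DocIndex`.
/// MeiliDB keeps this in a dedicated key-value tree instead of a map.
#[derive(Default)]
struct WordsIndex(BTreeMap<String, Vec<DocIndex>>);

impl WordsIndex {
    fn insert(&mut self, word: &str, index: DocIndex) {
        let indexes = self.0.entry(word.to_owned()).or_default();
        indexes.push(index);
        indexes.sort_unstable(); // stored ordered, like the on-disk arrays
    }

    fn doc_indexes(&self, word: &str) -> &[DocIndex] {
        self.0.get(word).map(Vec::as_slice).unwrap_or(&[])
    }
}

fn main() {
    let mut index = WordsIndex::default();
    index.insert("django", DocIndex {
        document_id: 456, attribute: 1, word_index: 0, char_index: 0, char_length: 6,
    });
    assert_eq!(index.doc_indexes("django").len(), 1);
    assert!(index.doc_indexes("missing").is_empty());
}
```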
#### A final state transducer
@@ -52,89 +42,54 @@ _...also abbreviated fst_
This is the first entry point of the engine; you can read more about how it works in the beautiful blog post by @BurntSushi, [Index 1,600,000,000 Keys with Automata and Rust](https://blog.burntsushi.net/transducers/).
To make it short it is a powerful way to store all the words that are present in the indexed documents. You construct it by giving it all the words you want to index associated with a value that, for the moment, can only be an `u64`. When you want to search in it you can provide any automaton you want, in MeiliDB [a custom levenshtein automaton](https://github.com/tantivy-search/levenshtein-automata/) is used.
Note that the number under each word is auto-incremental, each new word have a new number that is greater than the prevous one.
Another powerful feature of `fst` is that it can nearly avoid using RAM and be streamed to disk for example, the problem is that the keys must be always added in lexicographic order, so you must sort them before, for the moment MeiliDB uses a [BTreeMap](https://github.com/Kerollmops/raptor-rs/blob/8abdb0a228e2808fe1814a6a0641a4b72d158579/src/metadata/doc_indexes.rs#L107-L112).
To make it short it is a powerful way to store all the words that are present in the indexed documents. You construct it by giving it all the words you want to index. When you want to search in it you can provide any automaton you want, in MeiliDB [a custom levenshtein automaton](https://github.com/tantivy-search/levenshtein-automata/) is used.
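As an illustration of that fst-plus-automaton flow, here is a hedged sketch assuming the `fst` 0.3 and `levenshtein_automata` crates with the `fst_automaton` feature enabled (as the old root Cargo.toml shown above did); it is not the engine's actual code.

```rust
use fst::{IntoStreamer, Set, Streamer};
use levenshtein_automata::LevenshteinAutomatonBuilder;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // An fst Set must be built from keys in lexicographic order.
    let words = Set::from_iter(vec!["inglorious", "inglourious", "unchained"])?;

    // A DFA accepting every word within edit distance 1 of the (misspelled) query.
    let lev_builder = LevenshteinAutomatonBuilder::new(1, true);
    let dfa = lev_builder.build_dfa("inglorous");

    // Stream every stored word matched by the automaton.
    let mut stream = words.search(dfa).into_stream();
    while let Some(word) = stream.next() {
        println!("matched: {}", String::from_utf8_lossy(word));
    }
    Ok(())
}
```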
#### Document indexes
As it has been specified, the `fst` can only store a number corresponding to a word, an `u64`, but the goal of the search engine is to retrieve a match in a document when a query is made. You want it to return some sort of position in an attribute in a document, an information about where the given word match.
The `fst` will only return the words that match the search automaton, but the goal of the search engine is to retrieve all matches in all the documents when a query is made. You want it to return some sort of position in an attribute in a document, information about where the given word matched.
To make it possible, a custom data structure has been developed, the document indexes is composed of two arrays, the ranges array and all the docindexes corresponding to a given range, each range identify the word number. The [DocIndexes](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/data/doc_indexes.rs#L23) type is designed to be streamed when constructed, consumming a minimum amount of ram like the fst. Another advantage is that the slices are accessible in `O(1)` when you know the word associated number.
#### Document ids
This is a simple ordered list of all documents ids which must be considered deleted. It is used with [the sdset library](https://docs.rs/sdset/0.3.0/sdset/duo/struct.DifferenceByKey.html), the docindexes and the `DifferenceByKey` operation builder when merging blobs.
When a blob represent a negative update it only contains this simple slice of deleted documents ids.
To make this possible we retrieve all of the `DocIndex` entries corresponding to the matching words in the fst: we use the [`WordsIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/words_index.rs#L11-L21) Tree to get the `DocIndexes` corresponding to the words.
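The union of those per-word arrays can be pictured with a small std-only sketch; the engine itself performs this kind of set operation with the sdset crate over its trees, so the `u64` items below merely stand in for `DocIndex` entries.

```rust
/// Union of several already-sorted hit lists into one sorted,
/// deduplicated list. Generic here; in the engine the items are
/// `DocIndex` values and the union is done with the sdset crate.
fn union_sorted<T: Ord + Clone>(lists: &[&[T]]) -> Vec<T> {
    let mut all: Vec<T> = lists.iter().flat_map(|list| list.iter().cloned()).collect();
    all.sort_unstable();
    all.dedup();
    all
}

fn main() {
    // Pretend these are the hits found under two words matched by the query.
    let first = [1u64, 3, 7];
    let second = [3u64, 4, 7, 9];
    assert_eq!(union_sorted(&[&first[..], &second[..]]), vec![1, 3, 4, 7, 9]);
}
```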
### The schema
The schema is a data struture that represents which documents attributes should be stored and which should be indexed. It is stored under the "_data-schema_" entry and given to MeiliDB only at the creation.
The schema is a data structure that represents which document attributes should be stored and which should be indexed. It is stored under the [`MainIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/main_index.rs#L12) Tree and given to MeiliDB only at the creation of an index.
Each document attribute is associated to a unique 32 bit number named `SchemaAttr`.
Each document attribute is associated with a unique 16-bit number named [`SchemaAttr`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/schema.rs#L186).
In the future this schema type could be given along with updates and probably be different from the original, the database could be able to handled this document structure and reindex it.
In the future, this schema type could be given along with updates; the database could then handle a new schema and reindex itself according to the new one.
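As a rough picture only, a schema of this kind could be modelled with types like the following. These are hypothetical stand-ins, not the meilidb-schema definitions; the DISPLAYED/INDEXED/RANKED properties mentioned in the commits above are shown as plain boolean flags.

```rust
use std::collections::HashMap;

/// Unique 16-bit identifier of a document attribute, mirroring `SchemaAttr`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct SchemaAttr(u16);

/// Hypothetical per-attribute flags; the real schema exposes the
/// DISPLAYED, INDEXED and RANKED properties through its own types.
#[derive(Debug, Clone, Copy, Default)]
struct AttrProps {
    displayed: bool,
    indexed: bool,
    ranked: bool,
}

/// Toy schema: the identifier field name plus attribute name -> (id, props).
#[derive(Debug, Default)]
struct Schema {
    identifier: String,
    attributes: HashMap<String, (SchemaAttr, AttrProps)>,
}

fn main() {
    let mut schema = Schema { identifier: "id".to_owned(), ..Default::default() };
    schema.attributes.insert(
        "title".to_owned(),
        (SchemaAttr(1), AttrProps { displayed: true, indexed: true, ranked: false }),
    );
    assert_eq!(schema.attributes["title"].0, SchemaAttr(1));
}
```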
### Document attributes
When the engine handle a query the result that the requester want is a document, not only the [match](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/lib.rs#L51-L79) associated to it, fields of the original document must be returned too.
When the engine handles a query, the result the requester wants is a document, not only the [`Matches`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/lib.rs#L62-L88) associated with it; the fields of the original document must be returned too.
So MeiliDB again uses the power of the underlying key-value store and save the documents attributes marked as _STORE_. The key is prefixed by "_doc_" followed by the 64 bit document id in bytes and the schema attribute number in bytes corresponding to the document attribute stored.
So MeiliDB again uses the power of the underlying key-value store and saves the document attributes marked as _STORE_ in the schema. The dedicated Tree for this information is the [`DocumentsIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/documents_index.rs#L11).
When a document field is saved in the key-value store its value is binary encoded using the [bincode](https://docs.rs/bincode/) library, so a document must be serializable using serde.
## How is an update handled?
First of all an update in MeiliDB is nothing more than [a RocksDB SST file](https://github.com/facebook/rocksdb/wiki/Creating-and-Ingesting-SST-files). It contains the blob and all the documents attributes binary encoded like described above. Note that the blob is stored under the "_data-index_" key marked as [a merge operation](https://github.com/facebook/rocksdb/wiki/Merge-Operator-Implementation).
### The merge operation is CPU consuming
When [the database ingest an update](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/mod.rs#L108-L145) it gives the SST file to the underlying RocksDB, once it has ingested it there is a "_data-index_" entry available, we can request it but the key-value store will call a function before, a merge operation is performed.
This merge operation is done on multiple blobs as you have understood and will compute a [PositiveBlob](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/positive/blob.rs#L15), this type contains the fst and document indexes structures allowing us to search for documents. This two data structures can be considered as the inverted index.
The computation time of this merge is important, RocksDB doesn't keep the previous merged result, it will call our merge operation each time until it decided to do a compaction. So [we must force this compaction earlier](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/mod.rs#L129-L131) when we receive an update to reduce this cost.
This way when we request the "_data-index_" value it will gives us the previously merged positive blob without any other merge overhead.
When a document field is saved in the key-value store, its value is binary encoded using [message pack](https://github.com/3Hren/msgpack-rust), so a document must be serializable with serde.
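A minimal sketch of that encoding step, assuming the `rmp-serde` crate for MessagePack and a hypothetical `(document id, attribute)` key layout (the real `DocumentsIndex` key format may differ):

```rust
use serde::Serialize;

/// Hypothetical key layout: document id then attribute number, big endian,
/// so every field of a document sits next to the others in the tree.
fn field_key(document_id: u64, attr: u16) -> [u8; 10] {
    let mut key = [0u8; 10];
    key[..8].copy_from_slice(&document_id.to_be_bytes());
    key[8..].copy_from_slice(&attr.to_be_bytes());
    key
}

/// Encode one field value as MessagePack, ready to be stored
/// as the value of the documents tree entry.
fn encode_field<T: Serialize>(value: &T) -> Result<Vec<u8>, rmp_serde::encode::Error> {
    rmp_serde::to_vec(value)
}

fn main() {
    let key = field_key(123, 1);
    let value = encode_field(&"Inglorious Bastards").unwrap();
    println!("{} key bytes, {} value bytes", key.len(), value.len());
}
```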
## How is a request processed?
Now that we have our "_data-index_" we are able to return results based on a query. In the MeiliDB universe a query is a string.
Now that we have our inverted index we are able to return results based on a query. In the MeiliDB universe a query is a simple string containing words.
### Query lexemes
The first step to be able to call the underlying structures is to split the query in words, for that we use a [custom tokenizer](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/tokenizer/mod.rs) that is not finished for the moment, [there is an open issue](https://github.com/Kerollmops/MeiliDB/issues/3). Note that a tokenizer is specialized for a human language, this is the hard part.
The first step to be able to call the underlying structures is to split the query in words, for that we use a [custom tokenizer](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-tokenizer/src/lib.rs#L82-L84). Note that a tokenizer is specialized for a human language, this is the hard part.
### Automatons and query index
So to query the fst we need an automaton, in MeiliDB we use a [levenshtein automaton](https://en.wikipedia.org/wiki/Levenshtein_automaton), this automaton is constructed using a string and a maximum distance. According to the [Algolia's blog post](https://blog.algolia.com/inside-the-algolia-engine-part-3-query-processing/#algolia%e2%80%99s-way-of-searching-for-alternatives) we [created the DFAs](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/automaton.rs#L62-L75) with different settings.
So to query the fst we need an automaton, in MeiliDB we use a [levenshtein automaton](https://en.wikipedia.org/wiki/Levenshtein_automaton), this automaton is constructed using a string and a maximum distance. According to the [Algolia's blog post](https://blog.algolia.com/inside-the-algolia-engine-part-3-query-processing/#algolia%e2%80%99s-way-of-searching-for-alternatives) we [created the DFAs](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/automaton.rs#L59-L78) with different settings.
Thanks to the power of the fst library [it is possible to union multiple automatons](https://docs.rs/fst/0.3.2/fst/map/struct.OpBuilder.html#method.union) on the same fst map, it will allow us to know which [automaton returns a word according to its index](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/metadata/ops.rs#L111). The `Stream` is able to return all the numbers associated to the words. We use these numbers to find the whole list of `DocIndexes` associated and do the union set operation.
Thanks to the power of the fst library [it is possible to union multiple automatons](https://docs.rs/fst/0.3.2/fst/map/struct.OpBuilder.html#method.union) on the same fst set. The `Stream` is able to return all the matching words. We use these words to find the whole list of `DocIndexes` associated.
With all these informations it is possible [to reconstruct a list of all the DocIndexes associated](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L62-L99) with the words queried.
With all this information it is possible [to reconstruct a list of all the `DocIndexes` associated](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/query_builder.rs#L103-L130) with the queried words.
### Sort by criteria
Now that we are able to get a big list of [DocIndexes](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/lib.rs#L21-L36) it is not enough to sort them by criteria, we need more informations like the levenshtein distance or the fact that a query word match exactly the word stored in the fst. So [we stuff it a little bit](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L86-L93), and aggregate all these [Matches](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/lib.rs#L47-L74) for each document. This way it will be easy to sort a simple vector of document using a bunch of functions.
With this big list of documents and associated matches [we are able to sort only the part of the slice that we want](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L108-L119) using bucket sorting. [Each criterion](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/criterion/mod.rs#L75-L87) is evaluated on each subslice without copy, thanks to [GroupByMut](https://github.com/Kerollmops/group-by/blob/cab857bae01463dbd0edb99b0e0d7f3624e6c6f5/src/lib.rs#L180-L185) which, I hope [will soon be merged](https://github.com/rust-lang/rfcs/pull/2477).
Note that it is possible to customize the criteria used by using the `QueryBuilder::with_criteria` constructor, this way you can implement some custom ranking based on the document attributes using the appropriate structure and the `retrieve_document` method.
### Retrieve original documents
The [DatabaseView](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/database_view.rs#L18-L24) structure that you must have created to be able to query the database have [two functions](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/database_view.rs#L60-L76) that allows you to retrieve a full (or not) document according to the schema you specified at creation time (i.e. the _STORED_ attributes).
As you can see, these functions force the created type `T` to implement [the serde Deserialize trait](https://docs.rs/serde/1.0.81/serde/trait.Deserialize.html), MeiliDB will use the `bincode::deserialise` function for each attribute to construct your type and return it to you.
With this big list of documents and associated matches [we are able to sort only the part of the slice that we want](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/query_builder.rs#L160-L188) using bucket sorting. [Each criterion](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/criterion/mod.rs#L95-L101) is evaluated on each subslice without copy, thanks to [GroupByMut](https://docs.rs/slice-group-by/0.2.4/slice_group_by/) which, I hope [will soon be merged](https://github.com/rust-lang/rfcs/pull/2477).
Note that it is possible to customize the criteria used by using the `QueryBuilder::with_criteria` constructor, this way you can implement some custom ranking based on the document attributes using the appropriate structure and the [`document` method](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/index.rs#L86).
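The bucket-sort idea can be sketched with the standard library alone: sort everything by the first criterion, then re-sort only each group of still-equal documents with the next one. The real code walks its criteria with `GroupByMut` without copying, and uses its own criteria rather than the hypothetical `typos`/`words` keys below.

```rust
/// One ranked document: its id plus two pre-computed ranking keys
/// (hypothetical stand-ins for criteria such as "number of typos"
/// and "number of matched words").
#[derive(Debug)]
struct RankedDoc {
    id: u64,
    typos: u32,
    words: u32,
}

/// Bucket sort: order by the first criterion, then break ties
/// group by group with the second one, never reordering across groups.
fn bucket_sort(docs: &mut [RankedDoc]) {
    docs.sort_unstable_by_key(|d| d.typos);

    let mut start = 0;
    while start < docs.len() {
        let key = docs[start].typos;
        let end = start + docs[start..].iter().take_while(|d| d.typos == key).count();
        // Fewer typos first, then more matched words first inside the group.
        docs[start..end].sort_unstable_by_key(|d| std::cmp::Reverse(d.words));
        start = end;
    }
}

fn main() {
    let mut docs = vec![
        RankedDoc { id: 1, typos: 1, words: 2 },
        RankedDoc { id: 2, typos: 0, words: 1 },
        RankedDoc { id: 3, typos: 0, words: 3 },
    ];
    bucket_sort(&mut docs);
    // Expected order: 3 (0 typos, 3 words), 2 (0 typos, 1 word), 1 (1 typo).
    assert_eq!(docs.iter().map(|d| d.id).collect::<Vec<_>>(), vec![3, 2, 1]);
}
```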
At this point, MeiliDB's work is over 🎉


@@ -1,137 +0,0 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::{HashMap, HashSet};
use std::io::{self, BufRead, BufReader};
use std::path::{Path, PathBuf};
use std::time::Instant;
use std::error::Error;
use std::borrow::Cow;
use std::fs::File;
use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt;
use meilidb::database::{Database, Schema};
use meilidb::tokenizer::DefaultBuilder;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created.
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// The csv file to index.
#[structopt(parse(from_os_str))]
pub csv_data_path: PathBuf,
/// The path to the schema.
#[structopt(long = "schema", parse(from_os_str))]
pub schema_path: PathBuf,
/// The path to the list of stop words (one by line).
#[structopt(long = "stop-words", parse(from_os_str))]
pub stop_words_path: Option<PathBuf>,
#[structopt(long = "update-group-size")]
pub update_group_size: Option<usize>,
}
#[derive(Serialize, Deserialize)]
struct Document<'a> (
#[serde(borrow)]
HashMap<Cow<'a, str>, Cow<'a, str>>
);
fn index(
schema: Schema,
database_path: &Path,
csv_data_path: &Path,
update_group_size: Option<usize>,
stop_words: &HashSet<String>,
) -> Result<Database, Box<Error>>
{
let database = Database::create(database_path)?;
database.create_index("default", &schema)?;
let mut rdr = csv::Reader::from_path(csv_data_path)?;
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();
let mut i = 0;
let mut end_of_file = false;
while !end_of_file {
let tokenizer_builder = DefaultBuilder::new();
let mut update = database.start_update("default")?;
loop {
end_of_file = !rdr.read_record(&mut raw_record)?;
if end_of_file { break }
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
}
};
update.update_document(&document, &tokenizer_builder, &stop_words)?;
print!("\rindexing document {}", i);
i += 1;
if let Some(group_size) = update_group_size {
if i % group_size == 0 { break }
}
}
println!();
println!("committing update...");
database.commit_update(update)?;
}
Ok(database)
}
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
let f = File::open(path)?;
let reader = BufReader::new(f);
let mut words = HashSet::new();
for line in reader.lines() {
let line = line?;
let word = line.trim().to_string();
words.insert(word);
}
Ok(words)
}
fn main() -> Result<(), Box<Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let schema = {
let file = File::open(&opt.schema_path)?;
Schema::from_toml(file)?
};
let stop_words = match opt.stop_words_path {
Some(ref path) => retrieve_stop_words(path)?,
None => HashSet::new(),
};
let start = Instant::now();
let result = index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words);
if let Err(e) = result {
return Err(e.into())
}
println!("database created in {:.2?} at: {:?}", start.elapsed(), opt.database_path);
Ok(())
}


@@ -5,15 +5,15 @@
identifier = "id"
[attributes.id]
stored = true
displayed = true
[attributes.title]
stored = true
displayed = true
indexed = true
[attributes.description]
stored = true
displayed = true
indexed = true
[attributes.image]
stored = true
displayed = true


@@ -0,0 +1 @@
_datas in movies.csv are from https://www.themoviedb.org/_

examples/movies/movies.csv
File diff suppressed because it is too large


@@ -0,0 +1,21 @@
# This schema has been generated ...
# The order in which the attributes are declared is important,
# it specify the attribute xxx...
identifier = "id"
[attributes.id]
displayed = true
[attributes.title]
displayed = true
indexed = true
[attributes.overview]
displayed = true
indexed = true
[attributes.release_date]
displayed = true
[attributes.poster]
displayed = true


@@ -1,210 +0,0 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::btree_map::{BTreeMap, Entry};
use std::iter::FromIterator;
use std::io::{self, Write};
use std::time::Instant;
use std::path::PathBuf;
use std::error::Error;
use hashbrown::{HashMap, HashSet};
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use structopt::StructOpt;
use meilidb::database::schema::SchemaAttr;
use meilidb::database::Database;
use meilidb::Match;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// Fields that must be displayed.
pub displayed_fields: Vec<String>,
/// The number of returned results
#[structopt(short = "n", long = "number-results", default_value = "10")]
pub number_results: usize,
/// The number of characters before and after the first match
#[structopt(short = "C", long = "context", default_value = "35")]
pub char_context: usize,
}
type Document = HashMap<String, String>;
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
let mut stdout = StandardStream::stdout(ColorChoice::Always);
let mut highlighted = false;
for range in ranges.windows(2) {
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
if highlighted {
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
}
write!(&mut stdout, "{}", &text[start..end])?;
stdout.reset()?;
highlighted = !highlighted;
}
Ok(())
}
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
let mut byte_index = 0;
let mut byte_length = 0;
for (n, (i, c)) in text.char_indices().enumerate() {
if n == index {
byte_index = i;
}
if n + 1 == index + length {
byte_length = i - byte_index + c.len_utf8();
break;
}
}
(byte_index, byte_length)
}
fn create_highlight_areas(text: &str, matches: &[Match]) -> Vec<usize> {
let mut byte_indexes = BTreeMap::new();
for match_ in matches {
let char_index = match_.char_index as usize;
let char_length = match_.char_length as usize;
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
match byte_indexes.entry(byte_index) {
Entry::Vacant(entry) => { entry.insert(byte_length); },
Entry::Occupied(mut entry) => {
if *entry.get() < byte_length {
entry.insert(byte_length);
}
},
}
}
let mut title_areas = Vec::new();
title_areas.push(0);
for (byte_index, length) in byte_indexes {
title_areas.push(byte_index);
title_areas.push(byte_index + length);
}
title_areas.push(text.len());
title_areas.sort_unstable();
title_areas
}
/// note: matches must have been sorted by `char_index` and `char_length` before being passed.
///
/// ```no_run
/// matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
///
/// let matches = matches.matches.iter().filter(|m| SchemaAttr::new(m.attribute) == attr).cloned();
///
/// let (text, matches) = crop_text(&text, matches, 35);
/// ```
fn crop_text(
text: &str,
matches: impl IntoIterator<Item=Match>,
context: usize,
) -> (String, Vec<Match>)
{
let mut matches = matches.into_iter().peekable();
let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0);
let start = char_index.saturating_sub(context);
let text = text.chars().skip(start).take(context * 2).collect();
let matches = matches
.take_while(|m| {
(m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
})
.map(|match_| {
Match { char_index: match_.char_index - start as u32, ..match_ }
})
.collect();
(text, matches)
}
fn main() -> Result<(), Box<Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let start = Instant::now();
let database = Database::open(&opt.database_path)?;
println!("database prepared for you in {:.2?}", start.elapsed());
let mut buffer = String::new();
let input = io::stdin();
loop {
print!("Searching for: ");
io::stdout().flush()?;
if input.read_line(&mut buffer)? == 0 { break }
let query = buffer.trim_end_matches('\n');
let view = database.view("default")?;
let schema = view.schema();
let start = Instant::now();
let builder = view.query_builder();
let documents = builder.query(query, 0..opt.number_results);
let number_of_documents = documents.len();
for mut doc in documents {
doc.matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
match view.document_by_id::<Document>(doc.id) {
Ok(document) => {
for name in &opt.displayed_fields {
let attr = match schema.attribute(name) {
Some(attr) => attr,
None => continue,
};
let text = match document.get(name) {
Some(text) => text,
None => continue,
};
print!("{}: ", name);
let matches = doc.matches.iter()
.filter(|m| SchemaAttr::new(m.attribute) == attr)
.cloned();
let (text, matches) = crop_text(&text, matches, opt.char_context);
let areas = create_highlight_areas(&text, &matches);
display_highlights(&text, &areas)?;
println!();
}
},
Err(e) => eprintln!("{}", e),
}
let mut matching_attributes = HashSet::new();
for _match in doc.matches {
let attr = SchemaAttr::new(_match.attribute);
let name = schema.attribute_name(attr);
matching_attributes.insert(name);
}
let matching_attributes = Vec::from_iter(matching_attributes);
println!("matching in: {:?}", matching_attributes);
println!();
}
eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start.elapsed());
buffer.clear();
}
Ok(())
}

34
meilidb-core/Cargo.toml Normal file
View File

@ -0,0 +1,34 @@
[package]
name = "meilidb-core"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
byteorder = "1.3.1"
deunicode = "1.0.0"
hashbrown = "0.2.2"
lazy_static = "1.2.0"
log = "0.4.6"
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
rayon = "1.2.0"
sdset = "0.3.2"
serde = { version = "1.0.88", features = ["derive"] }
slice-group-by = "0.2.6"
zerocopy = "0.2.2"
[dependencies.fst]
git = "https://github.com/Kerollmops/fst.git"
branch = "arc-byte-slice"
[dependencies.levenshtein_automata]
git = "https://github.com/Kerollmops/levenshtein-automata.git"
branch = "arc-byte-slice"
features = ["fst_automaton"]
[dev-dependencies]
assert_matches = "1.3"
[features]
i128 = ["byteorder/i128"]
nightly = ["hashbrown/nightly", "slice-group-by/nightly"]

View File

@ -0,0 +1,44 @@
use lazy_static::lazy_static;
use levenshtein_automata::{
LevenshteinAutomatonBuilder as LevBuilder,
DFA,
};
lazy_static! {
static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
}
#[derive(Copy, Clone)]
enum PrefixSetting {
Prefix,
NoPrefix,
}
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA {
use self::PrefixSetting::{Prefix, NoPrefix};
match query.len() {
0 ..= 4 => match setting {
Prefix => LEVDIST0.build_prefix_dfa(query),
NoPrefix => LEVDIST0.build_dfa(query),
},
5 ..= 8 => match setting {
Prefix => LEVDIST1.build_prefix_dfa(query),
NoPrefix => LEVDIST1.build_dfa(query),
},
_ => match setting {
Prefix => LEVDIST2.build_prefix_dfa(query),
NoPrefix => LEVDIST2.build_dfa(query),
},
}
}
pub fn build_prefix_dfa(query: &str) -> DFA {
build_dfa_with_setting(query, PrefixSetting::Prefix)
}
pub fn build_dfa(query: &str) -> DFA {
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
}
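As a side note (not part of the diff), here is a minimal sketch of how these helpers are meant to be used: build a DFA for the query word, then evaluate candidate words against it. `eval` and `Distance` come from the `levenshtein_automata` crate; calling these functions from outside the module assumes they are reachable, which this diff does not show.

use levenshtein_automata::Distance;

fn lookup(query: &str, candidate: &str) {
    // "restaurant" is longer than 8 characters, so up to 2 typos are allowed
    let dfa = build_dfa(query);
    match dfa.eval(candidate) {
        Distance::Exact(d) => println!("{:?} matches with {} typo(s)", candidate, d),
        Distance::AtLeast(_) => println!("{:?} is too far from the query", candidate),
    }
}

// lookup("restaurant", "restourant") would report a match with 1 typo.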

View File

@ -1,7 +1,6 @@
use std::cmp::Ordering;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;
#[derive(Debug, Clone, Copy)]
pub struct DocumentId;
@ -10,4 +9,8 @@ impl Criterion for DocumentId {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
lhs.id.cmp(&rhs.id)
}
fn name(&self) -> &'static str {
"DocumentId"
}
}

View File

@ -1,9 +1,7 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
@ -38,4 +36,30 @@ impl Criterion for Exact {
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &'static str {
"Exact"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "souliereres rouge"
#[test]
fn easy_case() {
let query_index0 = &[0];
let is_exact0 = &[true];
let query_index1 = &[0];
let is_exact1 = &[false];
let doc0 = number_exact_matches(query_index0, is_exact0);
let doc1 = number_exact_matches(query_index1, is_exact1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
}

View File

@ -4,11 +4,10 @@ mod words_proximity;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod exact;
mod sort_by_attr;
mod document_id;
use std::cmp::Ordering;
use crate::rank::RawDocument;
use crate::RawDocument;
pub use self::{
sum_of_typos::SumOfTypos,
@ -17,13 +16,14 @@ pub use self::{
sum_of_words_attribute::SumOfWordsAttribute,
sum_of_words_position::SumOfWordsPosition,
exact::Exact,
sort_by_attr::SortByAttr,
document_id::DocumentId,
};
pub trait Criterion: Send + Sync {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
fn name(&self) -> &'static str;
#[inline]
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
self.evaluate(lhs, rhs) == Ordering::Equal
@ -35,6 +35,10 @@ impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
(**self).evaluate(lhs, rhs)
}
fn name(&self) -> &'static str {
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
@ -45,6 +49,10 @@ impl<T: Criterion + ?Sized> Criterion for Box<T> {
(**self).evaluate(lhs, rhs)
}
fn name(&self) -> &'static str {
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
@ -105,7 +113,7 @@ impl<'a> Default for Criteria<'a> {
}
}
impl<'a> AsRef<[Box<Criterion + 'a>]> for Criteria<'a> {
impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
&self.inner
}

View File

@ -1,9 +1,7 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn number_of_query_words(query_index: &[u32]) -> usize {
@ -26,4 +24,8 @@ impl Criterion for NumberOfWords {
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &'static str {
"NumberOfWords"
}
}

View File

@ -2,8 +2,8 @@ use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;
// This function is an intentionally inaccurate base-10 logarithm.
// It is acceptable for it to panic on input numbers higher than 3,
@ -21,7 +21,7 @@ fn custom_log10(n: u8) -> f32 {
#[inline]
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
let mut number_words = 0;
let mut number_words: usize = 0;
let mut sum_typos = 0.0;
let mut index = 0;
@ -53,6 +53,10 @@ impl Criterion for SumOfTypos {
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &'static str {
"SumOfTypos"
}
}
#[cfg(test)]

View File

@ -1,9 +1,7 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
@ -37,4 +35,30 @@ impl Criterion for SumOfWordsAttribute {
lhs.cmp(&rhs)
}
fn name(&self) -> &'static str {
"SumOfWordsAttribute"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: { 0. "Soulier bleu", 1. "bla bla bla" }
// doc1: { 0. "Botte rouge", 1. "Soulier en cuir" }
#[test]
fn title_vs_description() {
let query_index0 = &[0];
let attribute0 = &[0];
let query_index1 = &[0];
let attribute1 = &[1];
let doc0 = sum_matches_attributes(query_index0, attribute0);
let doc1 = sum_matches_attributes(query_index1, attribute1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}

View File

@ -1,12 +1,10 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize {
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
let mut sum_word_index = 0;
let mut index = 0;
@ -37,4 +35,30 @@ impl Criterion for SumOfWordsPosition {
lhs.cmp(&rhs)
}
fn name(&self) -> &'static str {
"SumOfWordsPosition"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "Botte rouge et soulier noir"
#[test]
fn easy_case() {
let query_index0 = &[0];
let word_index0 = &[0];
let query_index1 = &[0];
let word_index1 = &[3];
let doc0 = sum_matches_attribute_index(query_index0, word_index0);
let doc1 = sum_matches_attribute_index(query_index1, word_index1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}

View File

@ -1,18 +1,16 @@
use std::cmp::{self, Ordering};
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
use crate::rank::criterion::Criterion;
use crate::rank::RawDocument;
const MAX_DISTANCE: u32 = 8;
const MAX_DISTANCE: u16 = 8;
#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
(a.clone(), b.clone())
}
fn index_proximity(lhs: u32, rhs: u32) -> u32 {
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE)
} else {
@ -20,13 +18,13 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
}
}
fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 {
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
if lattr != rattr { return MAX_DISTANCE }
index_proximity(lwi, rwi)
}
fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 {
let mut min_prox = u32::max_value();
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
let mut min_prox = u16::max_value();
for a in lattr.iter().zip(lwi) {
for b in rattr.iter().zip(rwi) {
@ -43,8 +41,8 @@ fn matches_proximity(
query_index: &[u32],
distance: &[u8],
attribute: &[u16],
word_index: &[u32],
) -> u32
word_index: &[u16],
) -> u16
{
let mut query_index_groups = query_index.linear_group();
let mut proximity = 0;
@ -100,6 +98,10 @@ impl Criterion for WordsProximity {
lhs.cmp(&rhs)
}
fn name(&self) -> &'static str {
"WordsProximity"
}
}
#[cfg(test)]

View File

@ -1,5 +1,4 @@
use std::hash::Hash;
use hashbrown::HashMap;
pub struct DistinctMap<K> {
@ -12,7 +11,7 @@ impl<K: Hash + Eq> DistinctMap<K> {
pub fn new(limit: usize) -> Self {
DistinctMap {
inner: HashMap::new(),
limit: limit,
limit,
len: 0,
}
}
@ -31,7 +30,7 @@ pub struct BufferedDistinctMap<'a, K> {
impl<'a, K: Hash + Eq> BufferedDistinctMap<'a, K> {
pub fn new(internal: &'a mut DistinctMap<K>) -> BufferedDistinctMap<'a, K> {
BufferedDistinctMap {
internal: internal,
internal,
inner: HashMap::new(),
len: 0,
}

144
meilidb-core/src/lib.rs Normal file
View File

@ -0,0 +1,144 @@
#![feature(checked_duration_since)]
#[cfg(test)]
#[macro_use] extern crate assert_matches;
mod automaton;
mod distinct_map;
mod query_builder;
mod query_enhancer;
mod raw_document;
mod reordered_attrs;
mod store;
pub mod criterion;
use serde::{Serialize, Deserialize};
use zerocopy::{AsBytes, FromBytes};
use self::raw_document::raw_documents_from;
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str};
pub use self::raw_document::RawDocument;
pub use self::store::Store;
/// Represents an internally generated, unique document identifier.
///
/// It is used to tell the database which document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(Serialize, Deserialize)]
#[derive(AsBytes, FromBytes)]
#[repr(C)]
pub struct DocumentId(pub u64);
/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[derive(AsBytes, FromBytes)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
pub word_index: u16,
/// The position in bytes where the word was found
/// along with the length of it.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
pub char_length: u16,
}
/// This structure represents a matching word with information
/// on the location of the word in the document.
///
/// The order of the fields is important because it defines
/// the way these structures are ordered between themselves.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Highlight {
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
/// The position in bytes where the word was found.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
/// The length in bytes of the found word.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_length: u16,
}
#[doc(hidden)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TmpMatch {
pub query_index: u32,
pub distance: u8,
pub attribute: u16,
pub word_index: u16,
pub is_exact: bool,
}
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Document {
pub id: DocumentId,
pub highlights: Vec<Highlight>,
#[cfg(test)]
pub matches: Vec<TmpMatch>,
}
impl Document {
#[cfg(not(test))]
fn from_raw(raw: RawDocument) -> Document {
Document { id: raw.id, highlights: raw.highlights }
}
#[cfg(test)]
fn from_raw(raw: RawDocument) -> Document {
let len = raw.query_index().len();
let mut matches = Vec::with_capacity(len);
let query_index = raw.query_index();
let distance = raw.distance();
let attribute = raw.attribute();
let word_index = raw.word_index();
let is_exact = raw.is_exact();
for i in 0..len {
let match_ = TmpMatch {
query_index: query_index[i],
distance: distance[i],
attribute: attribute[i],
word_index: word_index[i],
is_exact: is_exact[i],
};
matches.push(match_);
}
Document { id: raw.id, matches, highlights: raw.highlights }
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 16);
}
}
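A small sketch outside the diff: the `#[repr(C)]` and `AsBytes`/`FromBytes` derives are what let a `DocIndex` be written to the key-value store as raw bytes, which is exactly the 16-byte layout checked by the test above.

use zerocopy::AsBytes;
use meilidb_core::{DocIndex, DocumentId};

fn main() {
    let index = DocIndex {
        document_id: DocumentId(42),
        attribute: 1,
        word_index: 3,
        char_index: 17,
        char_length: 6,
    };
    // one u64 plus four u16 fields: 16 bytes, no serialization step needed
    let bytes: &[u8] = index.as_bytes();
    assert_eq!(bytes.len(), 16);
}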

File diff suppressed because it is too large

View File

@ -0,0 +1,398 @@
use std::ops::Range;
use std::cmp::Ordering::{Less, Greater, Equal};
/// Returns `true` if the specified range can accept the given replacement words.
/// Returns `false` if the replacement words are already present in the original query
/// or if there are fewer replacement words than the range to replace.
//
//
// ## Ignored because already present in original
//
// new york city subway
// -------- ^^^^
// / \
// [new york city]
//
//
// ## Ignored because smaller than the original
//
// new york city subway
// -------------
// \ /
// [new york]
//
//
// ## Accepted because bigger than the original
//
// NYC subway
// ---
// / \
// / \
// / \
// / \
// / \
// [new york city]
//
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
where S: AsRef<str>,
T: AsRef<str>,
{
if words.len() <= range.len() {
// there are fewer or as many replacement words
// as there already are in the replaced range
return false
}
// retrieve the part to rewrite but with the length
// of the replacement part
let original = query.iter().skip(range.start).take(words.len());
// check if the original query doesn't already contain
// the replacement words
!original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
}
type Origin = usize;
type RealLength = usize;
struct FakeIntervalTree {
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl FakeIntervalTree {
fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
FakeIntervalTree { intervals }
}
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
let element = self.intervals.binary_search_by(|(r, _)| {
if point >= r.start {
if point < r.end { Equal } else { Less }
} else { Greater }
});
let n = match element { Ok(n) => n, Err(n) => n };
match self.intervals.get(n) {
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
_otherwise => None,
}
}
}
pub struct QueryEnhancerBuilder<'a, S> {
query: &'a [S],
origins: Vec<usize>,
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
// we initialize origins query indices based on their positions
let origins: Vec<_> = (0..query.len() + 1).collect();
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
QueryEnhancerBuilder { query, origins, real_to_origin }
}
/// Update the final real to origin query indices mapping.
///
/// `range` is the range of original words that these `replacement` words replace
/// and `real` is the first real query index of these replacement words.
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
where T: AsRef<str>,
{
// check if the range of original words
// can be rewritten with the replacement words
if rewrite_range_with(self.query, range.clone(), replacement) {
// this range can be replaced so we need to
// modify the origins accordingly
let offset = replacement.len() - range.len();
let previous_padding = self.origins[range.end - 1];
let current_offset = (self.origins[range.end] - 1) - previous_padding;
let diff = offset.saturating_sub(current_offset);
self.origins[range.end] += diff;
for r in &mut self.origins[range.end + 1..] {
*r += diff;
}
}
// we need to store the real number and origins relations
// this way it will be possible to know by how many
// we need to pad real query indices
let real_range = real..real + replacement.len().max(range.len());
let real_length = replacement.len();
self.real_to_origin.push((real_range, (range.start, real_length)));
}
pub fn build(self) -> QueryEnhancer {
QueryEnhancer {
origins: self.origins,
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
}
}
}
pub struct QueryEnhancer {
origins: Vec<usize>,
real_to_origin: FakeIntervalTree,
}
impl QueryEnhancer {
/// Returns the query indices to use to replace this real query index.
pub fn replacement(&self, real: u32) -> Range<u32> {
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) =
self.real_to_origin
.query(real)
.expect("real has never been declared");
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 { new_origin = origin + i; break }
}
let n = real - range.start;
let start = self.origins[origin];
let end = self.origins[new_origin + 1];
let remaining = (end - start) - n;
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = self.origins[origin];
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn original_unmodified() {
let query = ["new", "york", "city", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..2); // york
assert_eq!(enhancer.replacement(2), 2..3); // city
assert_eq!(enhancer.replacement(3), 3..4); // subway
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
}
#[test]
fn simple_growing() {
let query = ["new", "york", "subway"];
// 0 1 2
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 3, &["new", "york", "city"]);
// ^ 3 4 5
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..3); // york
assert_eq!(enhancer.replacement(2), 3..4); // subway
assert_eq!(enhancer.replacement(3), 0..1); // new
assert_eq!(enhancer.replacement(4), 1..2); // york
assert_eq!(enhancer.replacement(5), 2..3); // city
}
#[test]
fn same_place_growings() {
let query = ["NY", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NY = new york
builder.declare(0..1, 2, &["new", "york"]);
// ^ 2 3
// NY = new york city
builder.declare(0..1, 4, &["new", "york", "city"]);
// ^ 4 5 6
// NY = NYC
builder.declare(0..1, 7, &["NYC"]);
// ^ 7
// NY = new york city
builder.declare(0..1, 8, &["new", "york", "city"]);
// ^ 8 9 10
// subway = underground train
builder.declare(1..2, 11, &["underground", "train"]);
// ^ 11 12
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NY
assert_eq!(enhancer.replacement(1), 3..5); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..3); // york
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
assert_eq!(enhancer.replacement(7), 0..3); // NYC
assert_eq!(enhancer.replacement(8), 0..1); // new
assert_eq!(enhancer.replacement(9), 1..2); // york
assert_eq!(enhancer.replacement(10), 2..3); // city
assert_eq!(enhancer.replacement(11), 3..4); // underground
assert_eq!(enhancer.replacement(12), 4..5); // train
}
#[test]
fn bigger_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(0..1, 2, &["new", "york", "city"]);
// ^ 2 3 4
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NYC
assert_eq!(enhancer.replacement(1), 3..4); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..2); // york
assert_eq!(enhancer.replacement(4), 2..3); // city
}
#[test]
fn middle_query_growing() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..6); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
}
#[test]
fn end_query_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(1..2, 2, &["underground", "train"]);
// ^ 2 3
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // NYC
assert_eq!(enhancer.replacement(1), 1..3); // subway
assert_eq!(enhancer.replacement(2), 1..2); // underground
assert_eq!(enhancer.replacement(3), 2..3); // train
}
#[test]
fn multiple_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
}
#[test]
fn multiple_probable_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
// great awesome = good
builder.declare(0..2, 9, &["good"]);
// ^ 9
// awesome NYC = NY
builder.declare(1..3, 10, &["NY"]);
// ^^ 10
// NYC subway = metro
builder.declare(2..4, 11, &["metro"]);
// ^^ 11
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
assert_eq!(enhancer.replacement(9), 0..2); // good
assert_eq!(enhancer.replacement(10), 1..5); // NY
assert_eq!(enhancer.replacement(11), 2..5); // metro
}
}

View File

@ -0,0 +1,141 @@
use std::sync::Arc;
use std::fmt;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::{TmpMatch, DocumentId, Highlight};
#[derive(Clone)]
pub struct RawDocument {
pub id: DocumentId,
pub matches: SharedMatches,
pub highlights: Vec<Highlight>,
}
impl RawDocument {
fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
RawDocument { id, matches, highlights }
}
pub fn query_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
}
pub fn distance(&self) -> &[u8] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
}
pub fn attribute(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
}
pub fn word_index(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
}
pub fn is_exact(&self) -> &[bool] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
}
}
impl fmt::Debug for RawDocument {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("RawDocument {\r\n")?;
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "query_index", self.query_index()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?;
f.write_str("}")?;
Ok(())
}
}
pub fn raw_documents_from(
matches: SetBuf<(DocumentId, TmpMatch)>,
highlights: SetBuf<(DocumentId, Highlight)>,
) -> Vec<RawDocument>
{
let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
let mut matches2 = Matches::with_capacity(matches.len());
let matches = matches.linear_group_by_key(|(id, _)| *id);
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
for (mgroup, hgroup) in matches.zip(highlights) {
debug_assert_eq!(mgroup[0].0, hgroup[0].0);
let document_id = mgroup[0].0;
let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
let end = start + mgroup.len();
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
docs_ranges.push((document_id, Range { start, end }, highlights));
matches2.extend_from_slice(mgroup);
}
let matches = Arc::new(matches2);
docs_ranges.into_iter().map(|(id, range, highlights)| {
let matches = SharedMatches { range, matches: matches.clone() };
RawDocument::new(id, matches, highlights)
}).collect()
}
#[derive(Debug, Copy, Clone)]
struct Range {
start: usize,
end: usize,
}
#[derive(Clone)]
pub struct SharedMatches {
range: Range,
matches: Arc<Matches>,
}
#[derive(Clone)]
struct Matches {
query_index: Vec<u32>,
distance: Vec<u8>,
attribute: Vec<u16>,
word_index: Vec<u16>,
is_exact: Vec<bool>,
}
impl Matches {
fn with_capacity(cap: usize) -> Matches {
Matches {
query_index: Vec::with_capacity(cap),
distance: Vec::with_capacity(cap),
attribute: Vec::with_capacity(cap),
word_index: Vec::with_capacity(cap),
is_exact: Vec::with_capacity(cap),
}
}
fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
for (_, match_) in matches {
self.query_index.push(match_.query_index);
self.distance.push(match_.distance);
self.attribute.push(match_.attribute);
self.word_index.push(match_.word_index);
self.is_exact.push(match_.is_exact);
}
}
}

View File

@ -0,0 +1,24 @@
#[derive(Default, Clone)]
pub struct ReorderedAttrs {
count: usize,
reorders: Vec<Option<u16>>,
}
impl ReorderedAttrs {
pub fn new() -> ReorderedAttrs {
ReorderedAttrs { count: 0, reorders: Vec::new() }
}
pub fn insert_attribute(&mut self, attribute: u16) {
self.reorders.resize(attribute as usize + 1, None);
self.reorders[attribute as usize] = Some(self.count as u16);
self.count += 1;
}
pub fn get(&self, attribute: u16) -> Option<u16> {
match self.reorders.get(attribute as usize) {
Some(Some(attribute)) => Some(*attribute),
_ => None,
}
}
}
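For illustration only (not in the diff): `ReorderedAttrs` maps original attribute numbers to their declaration order. Note that `insert_attribute` resizes the internal `Vec` to `attribute + 1`, so it assumes attributes are declared in increasing order.

fn main() {
    let mut reordered = ReorderedAttrs::new();
    reordered.insert_attribute(1); // first declared attribute
    reordered.insert_attribute(4); // second declared attribute

    assert_eq!(reordered.get(1), Some(0)); // remapped to position 0
    assert_eq!(reordered.get(4), Some(1)); // remapped to position 1
    assert_eq!(reordered.get(2), None);    // never declared
}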

34
meilidb-core/src/store.rs Normal file
View File

@ -0,0 +1,34 @@
use std::error::Error;
use fst::Set;
use sdset::SetBuf;
use crate::DocIndex;
pub trait Store {
type Error: Error;
fn words(&self) -> Result<&Set, Self::Error>;
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error>;
fn synonyms(&self) -> Result<&Set, Self::Error>;
fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error>;
}
impl<T> Store for &'_ T where T: Store {
type Error = T::Error;
fn words(&self) -> Result<&Set, Self::Error> {
(*self).words()
}
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
(*self).word_indexes(word)
}
fn synonyms(&self) -> Result<&Set, Self::Error> {
(*self).synonyms()
}
fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error> {
(*self).alternatives_to(word)
}
}
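To make the contract of this trait concrete, here is a minimal in-memory implementation sketched outside the diff; the `Infallible` error type and the `HashMap` backing are assumptions made for the example, not anything the crate provides.

use std::collections::HashMap;
use std::convert::Infallible;
use fst::Set;
use sdset::SetBuf;
use meilidb_core::{DocIndex, Store};

struct MemoryStore {
    words: Set,
    synonyms: Set,
    // the vectors are assumed to be kept sorted and deduplicated on insertion
    indexes: HashMap<Vec<u8>, Vec<DocIndex>>,
}

impl Store for MemoryStore {
    type Error = Infallible;

    fn words(&self) -> Result<&Set, Self::Error> {
        Ok(&self.words)
    }

    fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
        Ok(self.indexes.get(word).map(|v| SetBuf::new_unchecked(v.clone())))
    }

    fn synonyms(&self) -> Result<&Set, Self::Error> {
        Ok(&self.synonyms)
    }

    fn alternatives_to(&self, _word: &[u8]) -> Result<Option<Set>, Self::Error> {
        Ok(None)
    }
}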

39
meilidb-data/Cargo.toml Normal file
View File

@ -0,0 +1,39 @@
[package]
name = "meilidb-data"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
arc-swap = "0.4.2"
bincode = "1.1.4"
crossbeam-channel = "0.3.9"
deunicode = "1.0.0"
hashbrown = { version = "0.6.0", features = ["serde"] }
log = "0.4.6"
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
ordered-float = { version = "1.0.2", features = ["serde"] }
rocksdb = "0.12.3"
sdset = "0.3.2"
serde = { version = "1.0.99", features = ["derive"] }
serde_json = "1.0.40"
siphasher = "0.3.0"
zerocopy = "0.2.8"
[dependencies.rmp-serde]
git = "https://github.com/3Hren/msgpack-rust.git"
rev = "40b3d48"
[dependencies.rmpv]
git = "https://github.com/3Hren/msgpack-rust.git"
rev = "40b3d48"
features = ["with-serde"]
[dependencies.fst]
git = "https://github.com/Kerollmops/fst.git"
branch = "arc-byte-slice"
[dev-dependencies]
tempfile = "3.1.0"

113
meilidb-data/src/cf_tree.rs Normal file
View File

@ -0,0 +1,113 @@
use std::sync::Arc;
use crossbeam_channel::{unbounded, Sender, Receiver};
use rocksdb::{DBVector, IteratorMode, Direction};
use crate::RocksDbResult;
#[derive(Clone)]
pub struct CfTree {
index: Arc<CfTreeInner>,
sender: Option<Sender<()>>,
}
struct CfTreeInner {
db: Arc<rocksdb::DB>,
name: String,
}
impl CfTree {
pub fn create(db: Arc<rocksdb::DB>, name: String) -> RocksDbResult<CfTree> {
let mut options = rocksdb::Options::default();
options.create_missing_column_families(true);
let _cf = db.create_cf(&name, &options)?;
let index = Arc::new(CfTreeInner { db, name });
Ok(CfTree { index, sender: None })
}
pub fn create_with_subcription(
db: Arc<rocksdb::DB>,
name: String,
) -> RocksDbResult<(CfTree, Receiver<()>)>
{
let mut options = rocksdb::Options::default();
options.create_missing_column_families(true);
let _cf = db.create_cf(&name, &options)?;
let index = Arc::new(CfTreeInner { db, name });
let (sender, receiver) = unbounded();
Ok((CfTree { index, sender: Some(sender) }, receiver))
}
pub fn insert<K, V>(&self, key: K, value: V) -> RocksDbResult<()>
where K: AsRef<[u8]>,
V: AsRef<[u8]>,
{
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
let result = self.index.db.put_cf(cf, key, value);
if let Some(sender) = &self.sender {
let _err = sender.send(());
}
result
}
pub fn get<K>(&self, key: K) -> RocksDbResult<Option<DBVector>>
where K: AsRef<[u8]>,
{
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
self.index.db.get_cf(cf, key)
}
pub fn remove<K>(&self, key: K) -> RocksDbResult<()>
where K: AsRef<[u8]>
{
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
self.index.db.delete_cf(cf, key)
}
/// The key range is inclusive on both its start and end bounds.
pub fn range<KS, KE>(&self, start: KS, end: KE) -> RocksDbResult<CfIter>
where KS: AsRef<[u8]>,
KE: AsRef<[u8]>,
{
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
let mut iter = self.index.db.iterator_cf(cf, IteratorMode::Start)?;
iter.set_mode(IteratorMode::From(start.as_ref(), Direction::Forward));
let end_bound = Box::from(end.as_ref());
Ok(CfIter { iter, end_bound: Some(end_bound) })
}
pub fn iter(&self) -> RocksDbResult<CfIter> {
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
let iter = self.index.db.iterator_cf(cf, IteratorMode::Start)?;
Ok(CfIter { iter, end_bound: None })
}
pub fn last_key(&self) -> RocksDbResult<Option<Box<[u8]>>> {
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
let mut iter = self.index.db.iterator_cf(cf, IteratorMode::End)?;
Ok(iter.next().map(|(key, _)| key))
}
}
pub struct CfIter<'a> {
iter: rocksdb::DBIterator<'a>,
end_bound: Option<Box<[u8]>>,
}
impl Iterator for CfIter<'_> {
type Item = (Box<[u8]>, Box<[u8]>);
fn next(&mut self) -> Option<Self::Item> {
match (self.iter.next(), &self.end_bound) {
(Some((ref key, _)), Some(end_bound)) if key > end_bound => None,
(Some(entry), _) => Some(entry),
(None, _) => None,
}
}
}
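A hedged usage sketch, not part of the diff: assuming `RocksDbResult<T>` is an alias for `Result<T, rocksdb::Error>`, each `CfTree` wraps one column family of a shared RocksDB instance.

use std::sync::Arc;

fn main() -> Result<(), rocksdb::Error> {
    let mut options = rocksdb::Options::default();
    options.create_if_missing(true);
    let db = Arc::new(rocksdb::DB::open(&options, "/tmp/example-db")?);

    // every CfTree maps to its own RocksDB column family
    let tree = CfTree::create(db.clone(), "example".to_string())?;
    tree.insert(b"key-1", b"value-1")?;
    assert!(tree.get(b"key-1")?.is_some());

    // ranges are inclusive on both bounds
    for (key, value) in tree.range(b"key-0", b"key-9")? {
        println!("{:?} => {:?}", key, value);
    }
    Ok(())
}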

View File

@ -0,0 +1,73 @@
use std::{error, fmt};
use crate::serde::SerializerError;
#[derive(Debug)]
pub enum Error {
SchemaDiffer,
SchemaMissing,
WordIndexMissing,
MissingDocumentId,
RocksDbError(rocksdb::Error),
FstError(fst::Error),
RmpDecodeError(rmp_serde::decode::Error),
RmpEncodeError(rmp_serde::encode::Error),
BincodeError(bincode::Error),
SerializerError(SerializerError),
}
impl From<rocksdb::Error> for Error {
fn from(error: rocksdb::Error) -> Error {
Error::RocksDbError(error)
}
}
impl From<fst::Error> for Error {
fn from(error: fst::Error) -> Error {
Error::FstError(error)
}
}
impl From<rmp_serde::decode::Error> for Error {
fn from(error: rmp_serde::decode::Error) -> Error {
Error::RmpDecodeError(error)
}
}
impl From<rmp_serde::encode::Error> for Error {
fn from(error: rmp_serde::encode::Error) -> Error {
Error::RmpEncodeError(error)
}
}
impl From<bincode::Error> for Error {
fn from(error: bincode::Error) -> Error {
Error::BincodeError(error)
}
}
impl From<SerializerError> for Error {
fn from(error: SerializerError) -> Error {
Error::SerializerError(error)
}
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use self::Error::*;
match self {
SchemaDiffer => write!(f, "schemas differ"),
SchemaMissing => write!(f, "this index does not have a schema"),
WordIndexMissing => write!(f, "this index does not have a word index"),
MissingDocumentId => write!(f, "document id is missing"),
RocksDbError(e) => write!(f, "RocksDB error; {}", e),
FstError(e) => write!(f, "fst error; {}", e),
RmpDecodeError(e) => write!(f, "rmp decode error; {}", e),
RmpEncodeError(e) => write!(f, "rmp encode error; {}", e),
BincodeError(e) => write!(f, "bincode error; {}", e),
SerializerError(e) => write!(f, "serializer error; {}", e),
}
}
}
impl error::Error for Error { }

View File

@ -0,0 +1,12 @@
use std::ops::Deref;
#[derive(Clone)]
pub struct CustomSettingsIndex(pub(crate) crate::CfTree);
impl Deref for CustomSettingsIndex {
type Target = crate::CfTree;
fn deref(&self) -> &Self::Target {
&self.0
}
}

View File

@ -0,0 +1,33 @@
use std::sync::Arc;
use meilidb_core::DocumentId;
use crate::database::Error;
#[derive(Clone)]
pub struct DocsWordsIndex(pub crate::CfTree);
impl DocsWordsIndex {
pub fn doc_words(&self, id: DocumentId) -> Result<Option<fst::Set>, Error> {
let key = id.0.to_be_bytes();
match self.0.get(key)? {
Some(bytes) => {
let len = bytes.len();
let value = Arc::from(bytes.as_ref());
let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?;
Ok(Some(fst::Set::from(fst)))
},
None => Ok(None)
}
}
pub fn set_doc_words(&self, id: DocumentId, words: &fst::Set) -> Result<(), Error> {
let key = id.0.to_be_bytes();
self.0.insert(key, words.as_fst().as_bytes())?;
Ok(())
}
pub fn del_doc_words(&self, id: DocumentId) -> Result<(), Error> {
let key = id.0.to_be_bytes();
self.0.remove(key)?;
Ok(())
}
}

View File

@ -0,0 +1,90 @@
use std::convert::TryInto;
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use rocksdb::DBVector;
use crate::document_attr_key::DocumentAttrKey;
use crate::RocksDbResult;
fn document_fields_range(id: DocumentId) -> ([u8; 10], [u8; 10]) {
let start = DocumentAttrKey::new(id, SchemaAttr::min()).to_be_bytes();
let end = DocumentAttrKey::new(id, SchemaAttr::max()).to_be_bytes();
(start, end)
}
#[derive(Clone)]
pub struct DocumentsIndex(pub(crate) crate::CfTree);
impl DocumentsIndex {
pub fn document_field(&self, id: DocumentId, attr: SchemaAttr) -> RocksDbResult<Option<DBVector>> {
let key = DocumentAttrKey::new(id, attr).to_be_bytes();
self.0.get(key)
}
pub fn set_document_field(&self, id: DocumentId, attr: SchemaAttr, value: Vec<u8>) -> RocksDbResult<()> {
let key = DocumentAttrKey::new(id, attr).to_be_bytes();
self.0.insert(key, value)?;
Ok(())
}
pub fn del_document_field(&self, id: DocumentId, attr: SchemaAttr) -> RocksDbResult<()> {
let key = DocumentAttrKey::new(id, attr).to_be_bytes();
self.0.remove(key)?;
Ok(())
}
pub fn del_all_document_fields(&self, id: DocumentId) -> RocksDbResult<usize> {
let (start, end) = document_fields_range(id);
let mut count = 0;
for (key, _) in self.0.range(start, end)? {
self.0.remove(key)?;
count += 1;
}
Ok(count)
}
pub fn document_fields(&self, id: DocumentId) -> RocksDbResult<DocumentFieldsIter> {
let (start, end) = document_fields_range(id);
let iter = self.0.range(start, end)?;
Ok(DocumentFieldsIter(iter))
}
pub fn len(&self) -> RocksDbResult<u64> {
let mut last_document_id = None;
let mut count = 0;
for (key, _) in self.0.iter()? {
let array = key.as_ref().try_into().unwrap();
let document_id = DocumentAttrKey::from_be_bytes(array).document_id;
if Some(document_id) != last_document_id {
last_document_id = Some(document_id);
count += 1;
}
}
Ok(count)
}
}
pub struct DocumentFieldsIter<'a>(crate::CfIter<'a>);
impl Iterator for DocumentFieldsIter<'_> {
type Item = (SchemaAttr, Box<[u8]>);
fn next(&mut self) -> Option<Self::Item> {
match self.0.next() {
Some((key, value)) => {
let array = key.as_ref().try_into().unwrap();
let key = DocumentAttrKey::from_be_bytes(array);
Some((key.attribute, value))
},
None => None,
}
}
}

View File

@ -0,0 +1,102 @@
use std::sync::Arc;
use std::convert::TryInto;
use meilidb_schema::Schema;
use crate::ranked_map::RankedMap;
use crate::database::Error;
const SCHEMA_KEY: &str = "schema";
const WORDS_KEY: &str = "words";
const SYNONYMS_KEY: &str = "synonyms";
const RANKED_MAP_KEY: &str = "ranked-map";
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
#[derive(Clone)]
pub struct MainIndex(pub(crate) crate::CfTree);
impl MainIndex {
pub fn schema(&self) -> Result<Option<Schema>, Error> {
match self.0.get(SCHEMA_KEY)? {
Some(bytes) => {
let schema = Schema::read_from_bin(bytes.as_ref())?;
Ok(Some(schema))
},
None => Ok(None),
}
}
pub fn set_schema(&self, schema: &Schema) -> Result<(), Error> {
let mut bytes = Vec::new();
schema.write_to_bin(&mut bytes)?;
self.0.insert(SCHEMA_KEY, bytes)?;
Ok(())
}
pub fn words_set(&self) -> Result<Option<fst::Set>, Error> {
match self.0.get(WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let value = Arc::from(bytes.as_ref());
let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?;
Ok(Some(fst::Set::from(fst)))
},
None => Ok(None),
}
}
pub fn set_words_set(&self, value: &fst::Set) -> Result<(), Error> {
self.0.insert(WORDS_KEY, value.as_fst().as_bytes()).map(drop).map_err(Into::into)
}
pub fn synonyms_set(&self) -> Result<Option<fst::Set>, Error> {
match self.0.get(SYNONYMS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let value = Arc::from(bytes.as_ref());
let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?;
Ok(Some(fst::Set::from(fst)))
},
None => Ok(None),
}
}
pub fn set_synonyms_set(&self, value: &fst::Set) -> Result<(), Error> {
self.0.insert(SYNONYMS_KEY, value.as_fst().as_bytes()).map(drop).map_err(Into::into)
}
pub fn ranked_map(&self) -> Result<Option<RankedMap>, Error> {
match self.0.get(RANKED_MAP_KEY)? {
Some(bytes) => {
let ranked_map = RankedMap::read_from_bin(bytes.as_ref())?;
Ok(Some(ranked_map))
},
None => Ok(None),
}
}
pub fn set_ranked_map(&self, value: &RankedMap) -> Result<(), Error> {
let mut bytes = Vec::new();
value.write_to_bin(&mut bytes)?;
self.0.insert(RANKED_MAP_KEY, bytes)?;
Ok(())
}
pub fn number_of_documents(&self) -> Result<u64, Error> {
match self.0.get(NUMBER_OF_DOCUMENTS_KEY)? {
Some(bytes) => {
let array = (*bytes).try_into().unwrap();
Ok(u64::from_be_bytes(array))
},
None => Ok(0),
}
}
pub fn set_number_of_documents<F>(&self, f: F) -> Result<u64, Error>
where F: FnOnce(u64) -> u64,
{
let new = self.number_of_documents().map(f)?;
self.0.insert(NUMBER_OF_DOCUMENTS_KEY, new.to_be_bytes())?;
Ok(new)
}
}
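The closure-based counter is what lets the addition and deletion code paths keep the stored document count in sync atomically with respect to the previous value; a small sketch (these call sites are assumptions, they are not shown in this diff):

// after applying a documents addition
let inserted = 12u64;
let total = main_index.set_number_of_documents(|n| n + inserted)?;
println!("index now holds {} documents", total);

// after applying a documents deletion, saturate so the counter never underflows
let deleted = 3u64;
let total = main_index.set_number_of_documents(|n| n.saturating_sub(deleted))?;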

View File

@ -0,0 +1,487 @@
use std::collections::{HashSet, BTreeMap};
use std::convert::TryInto;
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use std::thread;
use std::time::{Duration, Instant};
use arc_swap::{ArcSwap, ArcSwapOption, Guard};
use crossbeam_channel::Receiver;
use meilidb_core::criterion::Criteria;
use meilidb_core::{DocIndex, Store, DocumentId, QueryBuilder};
use meilidb_schema::Schema;
use sdset::SetBuf;
use serde::{de, Serialize, Deserialize};
use crate::CfTree;
use crate::ranked_map::RankedMap;
use crate::serde::{Deserializer, DeserializerError};
pub use self::custom_settings_index::CustomSettingsIndex;
use self::docs_words_index::DocsWordsIndex;
use self::documents_index::DocumentsIndex;
use self::main_index::MainIndex;
use self::synonyms_index::SynonymsIndex;
use self::words_index::WordsIndex;
use crate::RocksDbResult;
use crate::database::{
Error,
DocumentsAddition, DocumentsDeletion,
SynonymsAddition, SynonymsDeletion,
apply_documents_addition, apply_documents_deletion,
apply_synonyms_addition, apply_synonyms_deletion,
};
mod custom_settings_index;
mod docs_words_index;
mod documents_index;
mod main_index;
mod synonyms_index;
mod words_index;
#[derive(Deserialize)]
enum UpdateOwned {
DocumentsAddition(Vec<rmpv::Value>),
DocumentsDeletion(Vec<DocumentId>),
SynonymsAddition(BTreeMap<String, Vec<String>>),
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
}
#[derive(Serialize)]
enum Update {
DocumentsAddition(Vec<rmpv::Value>),
DocumentsDeletion(Vec<DocumentId>),
SynonymsAddition(BTreeMap<String, Vec<String>>),
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
}
#[derive(Clone, Serialize, Deserialize)]
pub enum UpdateType {
DocumentsAddition { number: usize },
DocumentsDeletion { number: usize },
SynonymsAddition { number: usize },
SynonymsDeletion { number: usize },
}
#[derive(Clone, Serialize, Deserialize)]
pub struct DetailedDuration {
main: Duration,
}
#[derive(Clone, Serialize, Deserialize)]
pub struct UpdateStatus {
pub update_id: u64,
pub update_type: UpdateType,
pub result: Result<(), String>,
pub detailed_duration: DetailedDuration,
}
fn spawn_update_system(index: Index, subscription: Receiver<()>) -> thread::JoinHandle<()> {
thread::spawn(move || {
let mut subscription = subscription.into_iter();
loop {
while let Some((key, _)) = index.updates_index.iter().unwrap().next() {
let update_id = key.as_ref().try_into().map(u64::from_be_bytes).unwrap();
let updates = &index.updates_index;
let results = &index.updates_results_index;
let update = updates.get(&key).unwrap().unwrap();
let (update_type, result, duration) = match rmp_serde::from_read_ref(&update).unwrap() {
UpdateOwned::DocumentsAddition(documents) => {
let update_type = UpdateType::DocumentsAddition { number: documents.len() };
let ranked_map = index.cache.load().ranked_map.clone();
let start = Instant::now();
let result = apply_documents_addition(&index, ranked_map, documents);
(update_type, result, start.elapsed())
},
UpdateOwned::DocumentsDeletion(documents) => {
let update_type = UpdateType::DocumentsDeletion { number: documents.len() };
let ranked_map = index.cache.load().ranked_map.clone();
let start = Instant::now();
let result = apply_documents_deletion(&index, ranked_map, documents);
(update_type, result, start.elapsed())
},
UpdateOwned::SynonymsAddition(synonyms) => {
let update_type = UpdateType::SynonymsAddition { number: synonyms.len() };
let start = Instant::now();
let result = apply_synonyms_addition(&index, synonyms);
(update_type, result, start.elapsed())
},
UpdateOwned::SynonymsDeletion(synonyms) => {
let update_type = UpdateType::SynonymsDeletion { number: synonyms.len() };
let start = Instant::now();
let result = apply_synonyms_deletion(&index, synonyms);
(update_type, result, start.elapsed())
},
};
let detailed_duration = DetailedDuration { main: duration };
let status = UpdateStatus {
update_id,
update_type,
result: result.map_err(|e| e.to_string()),
detailed_duration,
};
if let Some(callback) = &*index.update_callback.load() {
(callback)(status.clone());
}
let value = bincode::serialize(&status).unwrap();
results.insert(&key, value).unwrap();
updates.remove(&key).unwrap();
}
// this subscription is just used to block
// the loop until a new update is inserted
subscription.next();
}
})
}
fn last_update_id(
update_index: &crate::CfTree,
update_results_index: &crate::CfTree,
) -> RocksDbResult<u64>
{
let uikey = match update_index.last_key()? {
Some(key) => Some(key.as_ref().try_into().map(u64::from_be_bytes).unwrap()),
None => None,
};
let urikey = match update_results_index.last_key()? {
Some(key) => Some(key.as_ref().try_into().map(u64::from_be_bytes).unwrap()),
None => None,
};
Ok(uikey.max(urikey).unwrap_or(0))
}
#[derive(Copy, Clone)]
pub struct IndexStats {
pub number_of_words: usize,
pub number_of_documents: u64,
pub number_attrs_in_ranked_map: usize,
}
#[derive(Clone)]
pub struct Index {
pub(crate) cache: Arc<ArcSwap<Cache>>,
// TODO this will be a snapshot in the future
main_index: MainIndex,
synonyms_index: SynonymsIndex,
words_index: WordsIndex,
docs_words_index: DocsWordsIndex,
documents_index: DocumentsIndex,
custom_settings_index: CustomSettingsIndex,
// used by the update system
updates_id: Arc<AtomicU64>,
updates_index: crate::CfTree,
updates_results_index: crate::CfTree,
update_callback: Arc<ArcSwapOption<Box<dyn Fn(UpdateStatus) + Send + Sync + 'static>>>,
}
pub(crate) struct Cache {
pub words: Arc<fst::Set>,
pub synonyms: Arc<fst::Set>,
pub schema: Schema,
pub ranked_map: RankedMap,
pub number_of_documents: u64,
}
impl Index {
pub fn new(db: Arc<rocksdb::DB>, name: &str) -> Result<Index, Error> {
Index::new_raw(db, name, None)
}
pub fn with_schema(db: Arc<rocksdb::DB>, name: &str, schema: Schema) -> Result<Index, Error> {
Index::new_raw(db, name, Some(schema))
}
fn new_raw(db: Arc<rocksdb::DB>, name: &str, schema: Option<Schema>) -> Result<Index, Error> {
let main_index = CfTree::create(db.clone(), name.to_string()).map(MainIndex)?;
let synonyms_index = CfTree::create(db.clone(), format!("{}-synonyms", name)).map(SynonymsIndex)?;
let words_index = CfTree::create(db.clone(), format!("{}-words", name)).map(WordsIndex)?;
let docs_words_index = CfTree::create(db.clone(), format!("{}-docs-words", name)).map(DocsWordsIndex)?;
let documents_index = CfTree::create(db.clone(), format!("{}-documents", name)).map(DocumentsIndex)?;
let custom_settings_index = CfTree::create(db.clone(), format!("{}-custom", name)).map(CustomSettingsIndex)?;
let (updates_index, subscription) = CfTree::create_with_subcription(db.clone(), format!("{}-updates", name))?;
let updates_results_index = CfTree::create(db.clone(), format!("{}-updates-results", name))?;
let words = match main_index.words_set()? {
Some(words) => Arc::new(words),
None => Arc::new(fst::Set::default()),
};
let synonyms = match main_index.synonyms_set()? {
Some(synonyms) => Arc::new(synonyms),
None => Arc::new(fst::Set::default()),
};
let schema = match (schema, main_index.schema()?) {
(Some(ref expected), Some(ref current)) if current != expected => {
return Err(Error::SchemaDiffer)
},
(Some(expected), Some(_)) => expected,
(Some(expected), None) => {
main_index.set_schema(&expected)?;
expected
},
(None, Some(current)) => current,
(None, None) => return Err(Error::SchemaMissing),
};
let ranked_map = match main_index.ranked_map()? {
Some(map) => map,
None => RankedMap::default(),
};
let number_of_documents = documents_index.len()?;
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
let cache = Arc::new(ArcSwap::from_pointee(cache));
let last_update_id = last_update_id(&updates_index, &updates_results_index)?;
let updates_id = Arc::new(AtomicU64::new(last_update_id + 1));
let index = Index {
cache,
main_index,
synonyms_index,
words_index,
docs_words_index,
documents_index,
custom_settings_index,
updates_id,
updates_index,
updates_results_index,
update_callback: Arc::new(ArcSwapOption::empty()),
};
let _handle = spawn_update_system(index.clone(), subscription);
Ok(index)
}
pub fn set_update_callback<F>(&self, callback: F)
where F: Fn(UpdateStatus) + Send + Sync + 'static
{
self.update_callback.store(Some(Arc::new(Box::new(callback))));
}
pub fn unset_update_callback(&self) {
self.update_callback.store(None);
}
pub fn stats(&self) -> RocksDbResult<IndexStats> {
let cache = self.cache.load();
Ok(IndexStats {
number_of_words: cache.words.len(),
number_of_documents: cache.number_of_documents,
number_attrs_in_ranked_map: cache.ranked_map.len(),
})
}
pub fn query_builder(&self) -> QueryBuilder<RefIndex> {
let ref_index = self.as_ref();
QueryBuilder::new(ref_index)
}
pub fn query_builder_with_criteria<'c>(
&self,
criteria: Criteria<'c>,
) -> QueryBuilder<'c, RefIndex>
{
let ref_index = self.as_ref();
QueryBuilder::with_criteria(ref_index, criteria)
}
pub fn as_ref(&self) -> RefIndex {
RefIndex {
cache: self.cache.load(),
main_index: &self.main_index,
synonyms_index: &self.synonyms_index,
words_index: &self.words_index,
docs_words_index: &self.docs_words_index,
documents_index: &self.documents_index,
custom_settings_index: &self.custom_settings_index,
}
}
pub fn schema(&self) -> Schema {
self.cache.load().schema.clone()
}
pub fn custom_settings(&self) -> CustomSettingsIndex {
self.custom_settings_index.clone()
}
pub fn number_of_documents(&self) -> u64 {
self.cache.load().number_of_documents
}
pub fn documents_addition<D>(&self) -> DocumentsAddition<D> {
DocumentsAddition::new(self)
}
pub fn documents_deletion(&self) -> DocumentsDeletion {
DocumentsDeletion::new(self)
}
pub fn synonyms_addition(&self) -> SynonymsAddition {
SynonymsAddition::new(self)
}
pub fn synonyms_deletion(&self) -> SynonymsDeletion {
SynonymsDeletion::new(self)
}
pub fn update_status(
&self,
update_id: u64,
) -> Result<Option<UpdateStatus>, Error>
{
let update_id = update_id.to_be_bytes();
match self.updates_results_index.get(update_id)? {
Some(value) => {
let value = bincode::deserialize(&value)?;
Ok(Some(value))
},
None => Ok(None),
}
}
pub fn update_status_blocking(
&self,
update_id: u64,
) -> Result<UpdateStatus, Error>
{
// if we find the update result return it now
if let Some(result) = self.update_status(update_id)? {
return Ok(result)
}
loop {
if self.updates_results_index.get(&update_id.to_be_bytes())?.is_some() { break }
std::thread::sleep(Duration::from_millis(300));
}
// the thread has been unblocked, it means that the update result
// has been inserted in the tree, retrieve it
Ok(self.update_status(update_id)?.unwrap())
}
pub fn document<T>(
&self,
fields: Option<&HashSet<&str>>,
id: DocumentId,
) -> Result<Option<T>, DeserializerError>
where T: de::DeserializeOwned,
{
let schema = self.schema();
let fields = match fields {
Some(fields) => fields.into_iter().map(|name| schema.attribute(name)).collect(),
None => None,
};
let mut deserializer = Deserializer {
document_id: id,
index: &self,
fields: fields.as_ref(),
};
// TODO: currently we return an error if all document fields are missing,
// returning None would have been better
T::deserialize(&mut deserializer).map(Some)
}
}
impl Index {
pub(crate) fn push_documents_addition<D>(&self, addition: Vec<D>) -> Result<u64, Error>
where D: serde::Serialize
{
let mut values = Vec::with_capacity(addition.len());
for add in addition {
let vec = rmp_serde::to_vec_named(&add)?;
let add = rmp_serde::from_read(&vec[..])?;
values.push(add);
}
let addition = Update::DocumentsAddition(values);
let update = rmp_serde::to_vec_named(&addition)?;
self.raw_push_update(update)
}
pub(crate) fn push_documents_deletion(
&self,
deletion: Vec<DocumentId>,
) -> Result<u64, Error>
{
let deletion = Update::DocumentsDeletion(deletion);
let update = rmp_serde::to_vec_named(&deletion)?;
self.raw_push_update(update)
}
pub(crate) fn push_synonyms_addition(
&self,
addition: BTreeMap<String, Vec<String>>,
) -> Result<u64, Error>
{
let addition = Update::SynonymsAddition(addition);
let update = rmp_serde::to_vec_named(&addition)?;
self.raw_push_update(update)
}
pub(crate) fn push_synonyms_deletion(
&self,
deletion: BTreeMap<String, Option<Vec<String>>>,
) -> Result<u64, Error>
{
let deletion = Update::SynonymsDeletion(deletion);
let update = rmp_serde::to_vec_named(&deletion)?;
self.raw_push_update(update)
}
fn raw_push_update(&self, raw_update: Vec<u8>) -> Result<u64, Error> {
let update_id = self.updates_id.fetch_add(1, Ordering::SeqCst);
let update_id_array = update_id.to_be_bytes();
self.updates_index.insert(update_id_array, raw_update)?;
Ok(update_id)
}
}
pub struct RefIndex<'a> {
pub(crate) cache: Guard<'static, Arc<Cache>>,
pub main_index: &'a MainIndex,
pub synonyms_index: &'a SynonymsIndex,
pub words_index: &'a WordsIndex,
pub docs_words_index: &'a DocsWordsIndex,
pub documents_index: &'a DocumentsIndex,
pub custom_settings_index: &'a CustomSettingsIndex,
}
impl Store for RefIndex<'_> {
type Error = Error;
fn words(&self) -> Result<&fst::Set, Self::Error> {
Ok(&self.cache.words)
}
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
Ok(self.words_index.doc_indexes(word)?)
}
fn synonyms(&self) -> Result<&fst::Set, Self::Error> {
Ok(&self.cache.synonyms)
}
fn alternatives_to(&self, word: &[u8]) -> Result<Option<fst::Set>, Self::Error> {
Ok(self.synonyms_index.alternatives_to(word)?)
}
}
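
For orientation, a hedged sketch of reading back through these accessors, assuming the same imports as the file above and a hypothetical Movie type (any serde::Deserialize type matching the schema's displayed attributes works):

#[derive(serde::Deserialize)]
struct Movie { id: String, title: String }

fn fetch_movie(index: &Index) -> Result<Option<Movie>, DeserializerError> {
    // index.number_of_documents() reflects the count maintained by the updates;
    // passing None below asks for every displayed attribute, while a
    // Some(&HashSet<&str>) would restrict which fields are deserialized
    index.document(None, DocumentId(0))
}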

View File

@ -0,0 +1,21 @@
use crate::RocksDbResult;
#[derive(Clone)]
pub struct SynonymsIndex(pub(crate) crate::CfTree);
impl SynonymsIndex {
pub fn alternatives_to(&self, word: &[u8]) -> RocksDbResult<Option<fst::Set>> {
match self.0.get(word)? {
Some(vector) => Ok(Some(fst::Set::from_bytes(vector.to_vec()).unwrap())),
None => Ok(None),
}
}
pub fn set_alternatives_to(&self, word: &[u8], value: Vec<u8>) -> RocksDbResult<()> {
self.0.insert(word, value).map(drop)
}
pub fn del_alternatives_of(&self, word: &[u8]) -> RocksDbResult<()> {
self.0.remove(word).map(drop)
}
}

View File

@ -0,0 +1,45 @@
use meilidb_core::DocIndex;
use sdset::{Set, SetBuf};
use zerocopy::{LayoutVerified, AsBytes};
use crate::RocksDbResult;
#[derive(Clone)]
pub struct WordsIndex(pub(crate) crate::CfTree);
impl WordsIndex {
pub fn doc_indexes(&self, word: &[u8]) -> RocksDbResult<Option<SetBuf<DocIndex>>> {
// we must force an allocation to make the memory aligned
match self.0.get(word)? {
Some(bytes) => {
let vec = match LayoutVerified::new_slice(bytes.as_ref()) {
Some(layout) => layout.into_slice().to_vec(),
None => {
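// the fetched bytes are not suitably aligned for DocIndex,
// so copy them into a freshly allocated (and therefore aligned) Vec<DocIndex>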
let len = bytes.as_ref().len();
let count = len / std::mem::size_of::<DocIndex>();
let mut buf: Vec<DocIndex> = Vec::with_capacity(count);
unsafe {
let src = bytes.as_ref().as_ptr();
let dst = buf.as_mut_ptr() as *mut u8;
std::ptr::copy_nonoverlapping(src, dst, len);
buf.set_len(count);
}
buf
}
};
let setbuf = SetBuf::new_unchecked(vec);
Ok(Some(setbuf))
},
None => Ok(None),
}
}
pub fn set_doc_indexes(&self, word: &[u8], set: &Set<DocIndex>) -> RocksDbResult<()> {
self.0.insert(word, set.as_bytes()).map(drop)
}
pub fn del_doc_indexes(&self, word: &[u8]) -> RocksDbResult<()> {
self.0.remove(word).map(drop)
}
}

View File

@ -0,0 +1,115 @@
use std::collections::hash_map::Entry;
use std::collections::{HashSet, HashMap};
use std::path::Path;
use std::sync::Arc;
use std::sync::RwLock;
use meilidb_schema::Schema;
mod error;
mod index;
mod update;
pub use self::error::Error;
pub use self::index::{Index, CustomSettingsIndex};
pub use self::update::DocumentsAddition;
pub use self::update::DocumentsDeletion;
pub use self::update::SynonymsAddition;
pub use self::update::SynonymsDeletion;
use self::update::apply_documents_addition;
use self::update::apply_documents_deletion;
use self::update::apply_synonyms_addition;
use self::update::apply_synonyms_deletion;
const INDEXES_KEY: &str = "indexes";
fn load_indexes(tree: &rocksdb::DB) -> Result<HashSet<String>, Error> {
match tree.get(INDEXES_KEY)? {
Some(bytes) => Ok(bincode::deserialize(&bytes)?),
None => Ok(HashSet::new())
}
}
pub struct Database {
cache: RwLock<HashMap<String, Index>>,
inner: Arc<rocksdb::DB>,
}
impl Database {
pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Error> {
let cache = RwLock::new(HashMap::new());
let mut options = rocksdb::Options::default();
options.create_if_missing(true);
let cfs = rocksdb::DB::list_cf(&options, &path).unwrap_or_default();
let inner = Arc::new(rocksdb::DB::open_cf(&options, path, cfs)?);
let indexes = load_indexes(&inner)?;
let database = Database { cache, inner };
for index in indexes {
database.open_index(&index)?;
}
Ok(database)
}
pub fn indexes(&self) -> Result<HashSet<String>, Error> {
load_indexes(&self.inner)
}
fn set_indexes(&self, value: &HashSet<String>) -> Result<(), Error> {
let bytes = bincode::serialize(value)?;
self.inner.put(INDEXES_KEY, bytes)?;
Ok(())
}
pub fn open_index(&self, name: &str) -> Result<Option<Index>, Error> {
{
let cache = self.cache.read().unwrap();
if let Some(index) = cache.get(name).cloned() {
return Ok(Some(index))
}
}
let mut cache = self.cache.write().unwrap();
let index = match cache.entry(name.to_string()) {
Entry::Occupied(occupied) => {
occupied.get().clone()
},
Entry::Vacant(vacant) => {
if !self.indexes()?.contains(name) {
return Ok(None)
}
let index = Index::new(self.inner.clone(), name)?;
vacant.insert(index).clone()
},
};
Ok(Some(index))
}
pub fn create_index(&self, name: &str, schema: Schema) -> Result<Index, Error> {
let mut cache = self.cache.write().unwrap();
let index = match cache.entry(name.to_string()) {
Entry::Occupied(occupied) => {
occupied.get().clone()
},
Entry::Vacant(vacant) => {
let index = Index::with_schema(self.inner.clone(), name, schema)?;
let mut indexes = self.indexes()?;
indexes.insert(name.to_string());
self.set_indexes(&indexes)?;
vacant.insert(index).clone()
},
};
Ok(index)
}
}
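
A rough usage sketch of this Database API, assuming a meilidb_schema::Schema value built elsewhere and the imports of the file above:

fn open_or_create(path: &str, schema: Schema) -> Result<Index, Error> {
    let database = Database::open(path)?;
    // open_index returns Ok(None) when the index was never created
    match database.open_index("movies")? {
        Some(index) => Ok(index),
        None => database.create_index("movies", schema),
    }
}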

View File

@ -0,0 +1,139 @@
use std::collections::HashSet;
use std::sync::Arc;
use fst::{SetBuilder, set::OpBuilder};
use sdset::{SetOperation, duo::Union};
use serde::Serialize;
use crate::RankedMap;
use crate::database::{Error, Index, index::Cache, apply_documents_deletion};
use crate::indexer::Indexer;
use crate::serde::{extract_document_id, Serializer, RamDocumentStore};
pub struct DocumentsAddition<'a, D> {
index: &'a Index,
documents: Vec<D>,
}
impl<'a, D> DocumentsAddition<'a, D> {
pub fn new(index: &'a Index) -> DocumentsAddition<'a, D> {
DocumentsAddition { index, documents: Vec::new() }
}
pub fn update_document(&mut self, document: D) {
self.documents.push(document);
}
pub fn finalize(self) -> Result<u64, Error>
where D: serde::Serialize
{
self.index.push_documents_addition(self.documents)
}
}
pub fn apply_documents_addition(
index: &Index,
mut ranked_map: RankedMap,
addition: Vec<rmpv::Value>,
) -> Result<(), Error>
{
let mut document_ids = HashSet::new();
let mut document_store = RamDocumentStore::new();
let mut indexer = Indexer::new();
let schema = &index.schema();
let identifier = schema.identifier_name();
for document in addition {
let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
// 1. store the document id for future deletion
document_ids.insert(document_id);
// 2. index the document fields in ram stores
let serializer = Serializer {
schema,
document_store: &mut document_store,
indexer: &mut indexer,
ranked_map: &mut ranked_map,
document_id,
};
document.serialize(serializer)?;
}
let ref_index = index.as_ref();
let docs_words = ref_index.docs_words_index;
let documents = ref_index.documents_index;
let main = ref_index.main_index;
let words = ref_index.words_index;
// 1. remove the previous documents match indexes
let documents_to_insert = document_ids.iter().cloned().collect();
apply_documents_deletion(index, ranked_map.clone(), documents_to_insert)?;
// 2. insert new document attributes in the database
for ((id, attr), value) in document_store.into_inner() {
documents.set_document_field(id, attr, value)?;
}
let indexed = indexer.build();
let mut delta_words_builder = SetBuilder::memory();
for (word, delta_set) in indexed.words_doc_indexes {
delta_words_builder.insert(&word).unwrap();
let set = match words.doc_indexes(&word)? {
Some(set) => Union::new(&set, &delta_set).into_set_buf(),
None => delta_set,
};
words.set_doc_indexes(&word, &set)?;
}
for (id, words) in indexed.docs_words {
docs_words.set_doc_words(id, &words)?;
}
let delta_words = delta_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
let words = match main.words_set()? {
Some(words) => {
let op = OpBuilder::new()
.add(words.stream())
.add(delta_words.stream())
.r#union();
let mut words_builder = SetBuilder::memory();
words_builder.extend_stream(op).unwrap();
words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
None => delta_words,
};
main.set_words_set(&words)?;
main.set_ranked_map(&ranked_map)?;
let inserted_documents_len = document_ids.len() as u64;
let number_of_documents = main.set_number_of_documents(|old| old + inserted_documents_len)?;
// update the "consistent" view of the Index
let cache = ref_index.cache;
let words = Arc::new(words);
let synonyms = cache.synonyms.clone();
let schema = cache.schema.clone();
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
index.cache.store(Arc::new(cache));
Ok(())
}
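
A hedged sketch of the builder side of this flow; the Movie type is hypothetical and its id field is assumed to be the schema identifier:

#[derive(serde::Serialize)]
struct Movie { id: u64, title: String }

fn add_movies(index: &Index) -> Result<u64, Error> {
    let mut addition = index.documents_addition();
    addition.update_document(Movie { id: 1, title: "Carol".into() });
    addition.update_document(Movie { id: 2, title: "Moana".into() });
    // finalize() only enqueues the update and returns its id;
    // apply_documents_addition above runs later on the update thread
    addition.finalize()
}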

View File

@ -0,0 +1,150 @@
use std::collections::{HashMap, HashSet, BTreeSet};
use std::sync::Arc;
use fst::{SetBuilder, Streamer};
use meilidb_core::DocumentId;
use sdset::{SetBuf, SetOperation, duo::DifferenceByKey};
use crate::RankedMap;
use crate::serde::extract_document_id;
use crate::database::{Index, Error, index::Cache};
pub struct DocumentsDeletion<'a> {
index: &'a Index,
documents: Vec<DocumentId>,
}
impl<'a> DocumentsDeletion<'a> {
pub fn new(index: &'a Index) -> DocumentsDeletion<'a> {
DocumentsDeletion { index, documents: Vec::new() }
}
pub fn delete_document_by_id(&mut self, document_id: DocumentId) {
self.documents.push(document_id);
}
pub fn delete_document<D>(&mut self, document: D) -> Result<(), Error>
where D: serde::Serialize,
{
let schema = self.index.schema();
let identifier = schema.identifier_name();
let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
self.delete_document_by_id(document_id);
Ok(())
}
pub fn finalize(self) -> Result<u64, Error> {
self.index.push_documents_deletion(self.documents)
}
}
impl Extend<DocumentId> for DocumentsDeletion<'_> {
fn extend<T: IntoIterator<Item=DocumentId>>(&mut self, iter: T) {
self.documents.extend(iter)
}
}
pub fn apply_documents_deletion(
index: &Index,
mut ranked_map: RankedMap,
deletion: Vec<DocumentId>,
) -> Result<(), Error>
{
let ref_index = index.as_ref();
let schema = index.schema();
let docs_words = ref_index.docs_words_index;
let documents = ref_index.documents_index;
let main = ref_index.main_index;
let words = ref_index.words_index;
let idset = SetBuf::from_dirty(deletion);
// collect the ranked attributes according to the schema
let ranked_attrs: Vec<_> = schema.iter()
.filter_map(|(_, attr, prop)| {
if prop.is_ranked() { Some(attr) } else { None }
})
.collect();
let mut words_document_ids = HashMap::new();
for id in idset {
// remove all the ranked attributes from the ranked_map
for ranked_attr in &ranked_attrs {
ranked_map.remove(id, *ranked_attr);
}
if let Some(words) = docs_words.doc_words(id)? {
let mut stream = words.stream();
while let Some(word) = stream.next() {
let word = word.to_vec();
words_document_ids.entry(word).or_insert_with(Vec::new).push(id);
}
}
}
let mut deleted_documents = HashSet::new();
let mut removed_words = BTreeSet::new();
for (word, document_ids) in words_document_ids {
let document_ids = SetBuf::from_dirty(document_ids);
if let Some(doc_indexes) = words.doc_indexes(&word)? {
let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id);
let doc_indexes = op.into_set_buf();
if !doc_indexes.is_empty() {
words.set_doc_indexes(&word, &doc_indexes)?;
} else {
words.del_doc_indexes(&word)?;
removed_words.insert(word);
}
}
for id in document_ids {
if documents.del_all_document_fields(id)? != 0 {
deleted_documents.insert(id);
}
docs_words.del_doc_words(id)?;
}
}
let removed_words = fst::Set::from_iter(removed_words).unwrap();
let words = match main.words_set()? {
Some(words_set) => {
let op = fst::set::OpBuilder::new()
.add(words_set.stream())
.add(removed_words.stream())
.difference();
let mut words_builder = SetBuilder::memory();
words_builder.extend_stream(op).unwrap();
words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
None => fst::Set::default(),
};
main.set_words_set(&words)?;
main.set_ranked_map(&ranked_map)?;
let deleted_documents_len = deleted_documents.len() as u64;
let number_of_documents = main.set_number_of_documents(|old| old - deleted_documents_len)?;
// update the "consistent" view of the Index
let cache = ref_index.cache;
let words = Arc::new(words);
let synonyms = cache.synonyms.clone();
let schema = cache.schema.clone();
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
index.cache.store(Arc::new(cache));
Ok(())
}
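
The matching deletion builder, sketched under the same assumptions:

fn remove_movies(index: &Index) -> Result<u64, Error> {
    let mut deletion = index.documents_deletion();
    // documents can also be removed by re-submitting a serializable document
    // through delete_document(); here raw ids are used
    deletion.delete_document_by_id(DocumentId(42));
    deletion.delete_document_by_id(DocumentId(43));
    deletion.finalize()
}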

View File

@ -0,0 +1,9 @@
mod documents_addition;
mod documents_deletion;
mod synonyms_addition;
mod synonyms_deletion;
pub use self::documents_addition::{DocumentsAddition, apply_documents_addition};
pub use self::documents_deletion::{DocumentsDeletion, apply_documents_deletion};
pub use self::synonyms_addition::{SynonymsAddition, apply_synonyms_addition};
pub use self::synonyms_deletion::{SynonymsDeletion, apply_synonyms_deletion};

View File

@ -0,0 +1,94 @@
use std::collections::BTreeMap;
use std::sync::Arc;
use fst::{SetBuilder, set::OpBuilder};
use meilidb_core::normalize_str;
use sdset::SetBuf;
use crate::database::{Error, Index,index::Cache};
pub struct SynonymsAddition<'a> {
index: &'a Index,
synonyms: BTreeMap<String, Vec<String>>,
}
impl<'a> SynonymsAddition<'a> {
pub fn new(index: &'a Index) -> SynonymsAddition<'a> {
SynonymsAddition { index, synonyms: BTreeMap::new() }
}
pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
where S: AsRef<str>,
T: AsRef<str>,
I: IntoIterator<Item=T>,
{
let synonym = normalize_str(synonym.as_ref());
let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
}
pub fn finalize(self) -> Result<u64, Error> {
self.index.push_synonyms_addition(self.synonyms)
}
}
pub fn apply_synonyms_addition(
index: &Index,
addition: BTreeMap<String, Vec<String>>,
) -> Result<(), Error>
{
let ref_index = index.as_ref();
let synonyms = ref_index.synonyms_index;
let main = ref_index.main_index;
let mut synonyms_builder = SetBuilder::memory();
for (synonym, alternatives) in addition {
synonyms_builder.insert(&synonym).unwrap();
let alternatives = {
let alternatives = SetBuf::from_dirty(alternatives);
let mut alternatives_builder = SetBuilder::memory();
alternatives_builder.extend_iter(alternatives).unwrap();
alternatives_builder.into_inner().unwrap()
};
synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?;
}
let delta_synonyms = synonyms_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
let synonyms = match main.synonyms_set()? {
Some(synonyms) => {
let op = OpBuilder::new()
.add(synonyms.stream())
.add(delta_synonyms.stream())
.r#union();
let mut synonyms_builder = SetBuilder::memory();
synonyms_builder.extend_stream(op).unwrap();
synonyms_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
None => delta_synonyms,
};
main.set_synonyms_set(&synonyms)?;
// update the "consistent" view of the Index
let cache = ref_index.cache;
let words = Arc::new(main.words_set()?.unwrap_or_default());
let ranked_map = cache.ranked_map.clone();
let synonyms = Arc::new(synonyms);
let schema = cache.schema.clone();
let number_of_documents = cache.number_of_documents;
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
index.cache.store(Arc::new(cache));
Ok(())
}
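
A short, hedged usage sketch; "NYC" and its alternatives are purely illustrative:

fn add_nyc_synonyms(index: &Index) -> Result<u64, Error> {
    let mut addition = index.synonyms_addition();
    // the synonym is normalized and the alternatives are lowercased by add_synonym
    addition.add_synonym("NYC", vec!["New York", "New York City"]);
    addition.finalize()
}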

View File

@ -0,0 +1,137 @@
use std::collections::BTreeMap;
use std::iter::FromIterator;
use std::sync::Arc;
use fst::{SetBuilder, set::OpBuilder};
use meilidb_core::normalize_str;
use sdset::SetBuf;
use crate::database::{Error, Index, index::Cache};
pub struct SynonymsDeletion<'a> {
index: &'a Index,
synonyms: BTreeMap<String, Option<Vec<String>>>,
}
impl<'a> SynonymsDeletion<'a> {
pub fn new(index: &'a Index) -> SynonymsDeletion<'a> {
SynonymsDeletion { index, synonyms: BTreeMap::new() }
}
pub fn delete_all_alternatives_of<S: AsRef<str>>(&mut self, synonym: S) {
let synonym = normalize_str(synonym.as_ref());
self.synonyms.insert(synonym, None);
}
pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I)
where S: AsRef<str>,
T: AsRef<str>,
I: Iterator<Item=T>,
{
let synonym = normalize_str(synonym.as_ref());
let value = self.synonyms.entry(synonym).or_insert(None);
let alternatives = alternatives.map(|s| s.as_ref().to_lowercase());
match value {
Some(v) => v.extend(alternatives),
None => *value = Some(Vec::from_iter(alternatives)),
}
}
pub fn finalize(self) -> Result<u64, Error> {
self.index.push_synonyms_deletion(self.synonyms)
}
}
pub fn apply_synonyms_deletion(
index: &Index,
deletion: BTreeMap<String, Option<Vec<String>>>,
) -> Result<(), Error>
{
let ref_index = index.as_ref();
let synonyms = ref_index.synonyms_index;
let main = ref_index.main_index;
let mut delete_whole_synonym_builder = SetBuilder::memory();
for (synonym, alternatives) in deletion {
match alternatives {
Some(alternatives) => {
let prev_alternatives = synonyms.alternatives_to(synonym.as_bytes())?;
let prev_alternatives = match prev_alternatives {
Some(alternatives) => alternatives,
None => continue,
};
let delta_alternatives = {
let alternatives = SetBuf::from_dirty(alternatives);
let mut builder = SetBuilder::memory();
builder.extend_iter(alternatives).unwrap();
builder.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
};
let op = OpBuilder::new()
.add(prev_alternatives.stream())
.add(delta_alternatives.stream())
.difference();
let (alternatives, empty_alternatives) = {
let mut builder = SetBuilder::memory();
let len = builder.get_ref().len();
builder.extend_stream(op).unwrap();
let is_empty = len == builder.get_ref().len();
let alternatives = builder.into_inner().unwrap();
(alternatives, is_empty)
};
if empty_alternatives {
delete_whole_synonym_builder.insert(synonym.as_bytes())?;
} else {
synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?;
}
},
None => {
delete_whole_synonym_builder.insert(&synonym).unwrap();
synonyms.del_alternatives_of(synonym.as_bytes())?;
}
}
}
let delta_synonyms = delete_whole_synonym_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
let synonyms = match main.synonyms_set()? {
Some(synonyms) => {
let op = OpBuilder::new()
.add(synonyms.stream())
.add(delta_synonyms.stream())
.difference();
let mut synonyms_builder = SetBuilder::memory();
synonyms_builder.extend_stream(op).unwrap();
synonyms_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
None => fst::Set::default(),
};
main.set_synonyms_set(&synonyms)?;
// update the "consistent" view of the Index
let cache = ref_index.cache;
let words = Arc::new(main.words_set()?.unwrap_or_default());
let ranked_map = cache.ranked_map.clone();
let synonyms = Arc::new(synonyms);
let schema = cache.schema.clone();
let number_of_documents = cache.number_of_documents;
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
index.cache.store(Arc::new(cache));
Ok(())
}
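
And the matching deletion builder, under the same illustrative names:

fn prune_synonyms(index: &Index) -> Result<u64, Error> {
    let mut deletion = index.synonyms_deletion();
    // remove every alternative of "NYC"...
    deletion.delete_all_alternatives_of("NYC");
    // ...or only some alternatives of another synonym
    deletion.delete_specific_alternatives_of("subway", vec!["metro"].into_iter());
    deletion.finalize()
}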

View File

@ -0,0 +1,69 @@
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct DocumentAttrKey {
pub document_id: DocumentId,
pub attribute: SchemaAttr,
}
impl DocumentAttrKey {
pub fn new(document_id: DocumentId, attribute: SchemaAttr) -> DocumentAttrKey {
DocumentAttrKey { document_id, attribute }
}
pub fn to_be_bytes(self) -> [u8; 10] {
let mut output = [0u8; 10];
let document_id = self.document_id.0.to_be_bytes();
let attribute = self.attribute.0.to_be_bytes();
unsafe {
use std::{mem::size_of, ptr::copy_nonoverlapping};
let output = output.as_mut_ptr();
copy_nonoverlapping(document_id.as_ptr(), output, size_of::<u64>());
let output = output.add(size_of::<u64>());
copy_nonoverlapping(attribute.as_ptr(), output, size_of::<u16>());
}
output
}
pub fn from_be_bytes(bytes: [u8; 10]) -> DocumentAttrKey {
let document_id;
let attribute;
unsafe {
use std::ptr::read_unaligned;
let pointer = bytes.as_ptr() as *const _;
let document_id_bytes = read_unaligned(pointer);
document_id = u64::from_be_bytes(document_id_bytes);
let pointer = pointer.add(1) as *const _;
let attribute_bytes = read_unaligned(pointer);
attribute = u16::from_be_bytes(attribute_bytes);
}
DocumentAttrKey {
document_id: DocumentId(document_id),
attribute: SchemaAttr(attribute),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn to_from_be_bytes() {
let document_id = DocumentId(67578308);
let schema_attr = SchemaAttr(3456);
let x = DocumentAttrKey::new(document_id, schema_attr);
assert_eq!(x, DocumentAttrKey::from_be_bytes(x.to_be_bytes()));
}
}
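
For reference, the same 10-byte packing can be written without unsafe; a sketch of an equivalent safe formulation (not part of the commit):

pub fn to_be_bytes(self) -> [u8; 10] {
    let mut output = [0u8; 10];
    output[0..8].copy_from_slice(&self.document_id.0.to_be_bytes());
    output[8..10].copy_from_slice(&self.attribute.0.to_be_bytes());
    output
}

pub fn from_be_bytes(bytes: [u8; 10]) -> DocumentAttrKey {
    let mut id = [0u8; 8];
    let mut attr = [0u8; 2];
    id.copy_from_slice(&bytes[0..8]);
    attr.copy_from_slice(&bytes[8..10]);
    DocumentAttrKey {
        document_id: DocumentId(u64::from_be_bytes(id)),
        attribute: SchemaAttr(u16::from_be_bytes(attr)),
    }
}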

meilidb-data/src/indexer.rs (new file, 208 lines)
View File

@ -0,0 +1,208 @@
use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;
use deunicode::deunicode_with_tofu;
use meilidb_core::{DocumentId, DocIndex};
use meilidb_schema::SchemaAttr;
use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
use sdset::SetBuf;
type Word = Vec<u8>; // TODO make it be a SmallVec
pub struct Indexer {
word_limit: usize, // the maximum number of indexed words
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
docs_words: HashMap<DocumentId, Vec<Word>>,
}
pub struct Indexed {
pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
pub docs_words: HashMap<DocumentId, fst::Set>,
}
impl Indexer {
pub fn new() -> Indexer {
Indexer::with_word_limit(1000)
}
pub fn with_word_limit(limit: usize) -> Indexer {
Indexer {
word_limit: limit,
words_doc_indexes: BTreeMap::new(),
docs_words: HashMap::new(),
}
}
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
let lowercase_text = text.to_lowercase();
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
// TODO compute the deunicoded version after the cjk check
let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded {
Some(deunicoded)
} else {
None
};
let iter = Some(lowercase_text).into_iter().chain(next);
for text in iter {
for token in Tokenizer::new(&text) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue { break }
}
}
}
pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
where I: IntoIterator<Item=&'a str, IntoIter=IT>,
IT: Iterator<Item = &'a str> + Clone,
{
// TODO: collapse this into a single pass of the SeqTokenizer loop
let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect();
let iter = lowercased.iter().map(|t| t.as_str());
for token in SeqTokenizer::new(iter) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue { break }
}
let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| {
if lowercase_text.contains(is_cjk) { return lowercase_text }
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
if lowercase_text != deunicoded { deunicoded } else { lowercase_text }
}).collect();
let iter = deunicoded.iter().map(|t| t.as_str());
for token in SeqTokenizer::new(iter) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue { break }
}
}
pub fn build(self) -> Indexed {
let words_doc_indexes = self.words_doc_indexes
.into_iter()
.map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
.collect();
let docs_words = self.docs_words
.into_iter()
.map(|(id, mut words)| {
words.sort_unstable();
words.dedup();
(id, fst::Set::from_iter(words).unwrap())
})
.collect();
Indexed { words_doc_indexes, docs_words }
}
}
fn index_token(
token: Token,
id: DocumentId,
attr: SchemaAttr,
word_limit: usize,
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool
{
if token.word_index >= word_limit { return false }
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
},
None => return false,
}
true
}
fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
let word_index = u16::try_from(token.word_index).ok()?;
let char_index = u16::try_from(token.char_index).ok()?;
let char_length = u16::try_from(token.word.chars().count()).ok()?;
let docindex = DocIndex {
document_id: id,
attribute: attr.0,
word_index,
char_index,
char_length,
};
Some(docindex)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn strange_apostrophe() {
let mut indexer = Indexer::new();
let docid = DocumentId(0);
let attr = SchemaAttr(0);
let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
indexer.index_text(docid, attr, text);
let Indexed { words_doc_indexes, .. } = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some());
}
#[test]
fn strange_apostrophe_in_sequence() {
let mut indexer = Indexer::new();
let docid = DocumentId(0);
let attr = SchemaAttr(0);
let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
indexer.index_text_seq(docid, attr, text);
let Indexed { words_doc_indexes, .. } = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some());
}
}
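
Stripped of the test scaffolding, a minimal usage sketch of the indexer itself:

let mut indexer = Indexer::new();
indexer.index_text(DocumentId(0), SchemaAttr(0), "hello world");
indexer.index_text_seq(DocumentId(0), SchemaAttr(1), vec!["several", "string", "values"]);
// words_doc_indexes maps each word to its sorted DocIndex hits,
// docs_words maps each document to the fst::Set of its words
let Indexed { words_doc_indexes, docs_words } = indexer.build();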

meilidb-data/src/lib.rs (new file, 15 lines)
View File

@ -0,0 +1,15 @@
mod cf_tree;
mod database;
mod document_attr_key;
mod indexer;
mod number;
mod ranked_map;
mod serde;
pub use self::cf_tree::{CfTree, CfIter};
pub use self::database::{Database, Index, CustomSettingsIndex};
pub use self::number::Number;
pub use self::ranked_map::RankedMap;
pub use self::serde::{compute_document_id, extract_document_id, value_to_string};
pub type RocksDbResult<T> = Result<T, rocksdb::Error>;

View File

@ -0,0 +1,55 @@
use std::num::{ParseIntError, ParseFloatError};
use std::str::FromStr;
use std::fmt;
use ordered_float::OrderedFloat;
use serde::{Serialize, Deserialize};
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Number {
Unsigned(u64),
Signed(i64),
Float(OrderedFloat<f64>),
}
impl FromStr for Number {
type Err = ParseNumberError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let uint_error = match u64::from_str(s) {
Ok(unsigned) => return Ok(Number::Unsigned(unsigned)),
Err(error) => error,
};
let int_error = match i64::from_str(s) {
Ok(signed) => return Ok(Number::Signed(signed)),
Err(error) => error,
};
let float_error = match f64::from_str(s) {
Ok(float) => return Ok(Number::Float(OrderedFloat(float))),
Err(error) => error,
};
Err(ParseNumberError { uint_error, int_error, float_error })
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseNumberError {
uint_error: ParseIntError,
int_error: ParseIntError,
float_error: ParseFloatError,
}
impl fmt::Display for ParseNumberError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if self.uint_error == self.int_error {
write!(f, "can not parse number: {}, {}", self.uint_error, self.float_error)
} else {
write!(f, "can not parse number: {}, {}, {}",
self.uint_error, self.int_error, self.float_error)
}
}
}
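
A few illustrative parses (std::str::FromStr assumed in scope):

assert_eq!(Number::from_str("42"), Ok(Number::Unsigned(42)));
assert_eq!(Number::from_str("-7"), Ok(Number::Signed(-7)));
assert_eq!(Number::from_str("1.5"), Ok(Number::Float(OrderedFloat(1.5))));
assert!(Number::from_str("not a number").is_err());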

View File

@ -0,0 +1,36 @@
use std::io::{Read, Write};
use hashbrown::HashMap;
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use crate::Number;
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>);
impl RankedMap {
pub fn len(&self) -> usize {
self.0.len()
}
pub fn insert(&mut self, document: DocumentId, attribute: SchemaAttr, number: Number) {
self.0.insert((document, attribute), number);
}
pub fn remove(&mut self, document: DocumentId, attribute: SchemaAttr) {
self.0.remove(&(document, attribute));
}
pub fn get(&self, document: DocumentId, attribute: SchemaAttr) -> Option<Number> {
self.0.get(&(document, attribute)).cloned()
}
pub fn read_from_bin<R: Read>(reader: R) -> bincode::Result<RankedMap> {
bincode::deserialize_from(reader).map(RankedMap)
}
pub fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
bincode::serialize_into(writer, &self.0)
}
}
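
A quick sketch of the in-memory operations and the binary round-trip, assuming the same crates as above:

let mut map = RankedMap::default();
map.insert(DocumentId(0), SchemaAttr(1), Number::Unsigned(2015));
assert_eq!(map.get(DocumentId(0), SchemaAttr(1)), Some(Number::Unsigned(2015)));

let mut buffer = Vec::new();
map.write_to_bin(&mut buffer).unwrap();
let decoded = RankedMap::read_from_bin(buffer.as_slice()).unwrap();
assert_eq!(map, decoded);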

View File

@ -1,12 +1,16 @@
use serde::Serialize;
use std::str::FromStr;
use ordered_float::OrderedFloat;
use serde::ser;
use serde::Serialize;
use crate::database::serde::SerializerError;
use super::SerializerError;
use crate::Number;
pub struct KeyToStringSerializer;
pub struct ConvertToNumber;
impl ser::Serializer for KeyToStringSerializer {
type Ok = String;
impl ser::Serializer for ConvertToNumber {
type Ok = Number;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
@ -16,48 +20,78 @@ impl ser::Serializer for KeyToStringSerializer {
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
fn serialize_bool(self, value: bool) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(u64::from(value)))
}
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
fn serialize_char(self, _value: char) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "char" })
}
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(i64::from(value)))
}
f32 => serialize_f32,
f64 => serialize_f64,
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(i64::from(value)))
}
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(i64::from(value)))
}
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(value))
}
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(u64::from(value)))
}
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(u64::from(value)))
}
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(u64::from(value)))
}
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(value))
}
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
Ok(Number::Float(OrderedFloat(f64::from(value))))
}
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
Ok(Number::Float(OrderedFloat(value)))
}
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
Ok(Number::from_str(value)?)
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
Err(SerializerError::UnrankableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
Err(SerializerError::UnrankableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
Err(SerializerError::UnrankableType { type_name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
Err(SerializerError::UnrankableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
Err(SerializerError::UnrankableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
@ -67,7 +101,7 @@ impl ser::Serializer for KeyToStringSerializer {
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
Err(SerializerError::UnrankableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
@ -89,15 +123,15 @@ impl ser::Serializer for KeyToStringSerializer {
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
Err(SerializerError::UnrankableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
Err(SerializerError::UnrankableType { type_name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
Err(SerializerError::UnrankableType { type_name: "tuple" })
}
fn serialize_tuple_struct(
@ -106,7 +140,7 @@ impl ser::Serializer for KeyToStringSerializer {
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
Err(SerializerError::UnrankableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
@ -117,11 +151,11 @@ impl ser::Serializer for KeyToStringSerializer {
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
Err(SerializerError::UnrankableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
Err(SerializerError::UnrankableType { type_name: "map" })
}
fn serialize_struct(
@ -130,7 +164,7 @@ impl ser::Serializer for KeyToStringSerializer {
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
Err(SerializerError::UnrankableType { type_name: "struct" })
}
fn serialize_struct_variant(
@ -141,6 +175,6 @@ impl ser::Serializer for KeyToStringSerializer {
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
Err(SerializerError::UnrankableType { type_name: "struct variant" })
}
}

View File

@ -1,15 +1,12 @@
use std::str::FromStr;
use serde::Serialize;
use serde::{ser, ser::Error};
use serde::ser;
use crate::database::serde::SerializerError;
use crate::database::Number;
use super::SerializerError;
pub struct ValueToNumberSerializer;
pub struct ConvertToString;
impl ser::Serializer for ValueToNumberSerializer {
type Ok = Number;
impl ser::Serializer for ConvertToString {
type Ok = String;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
@ -19,75 +16,78 @@ impl ser::Serializer for ValueToNumberSerializer {
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "boolean" })
}
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(value as i64))
Ok(value.to_string())
}
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(value as i64))
Ok(value.to_string())
}
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(value as i64))
Ok(value.to_string())
}
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(value as i64))
Ok(value.to_string())
}
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(value as u64))
Ok(value.to_string())
}
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(value as u64))
Ok(value.to_string())
}
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(value as u64))
Ok(value.to_string())
}
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(value as u64))
Ok(value.to_string())
}
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
Ok(Number::Float(value as f64))
Ok(value.to_string())
}
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
Ok(Number::Float(value))
Ok(value.to_string())
}
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
Number::from_str(value).map_err(SerializerError::custom)
Ok(value.to_string())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
Err(SerializerError::UnserializableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
Err(SerializerError::UnserializableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
@ -97,7 +97,7 @@ impl ser::Serializer for ValueToNumberSerializer {
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
Err(SerializerError::UnserializableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
@ -119,15 +119,15 @@ impl ser::Serializer for ValueToNumberSerializer {
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
Err(SerializerError::UnserializableType { type_name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
Err(SerializerError::UnserializableType { type_name: "tuple" })
}
fn serialize_tuple_struct(
@ -136,7 +136,7 @@ impl ser::Serializer for ValueToNumberSerializer {
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
@ -147,11 +147,11 @@ impl ser::Serializer for ValueToNumberSerializer {
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
Err(SerializerError::UnserializableType { type_name: "map" })
}
fn serialize_struct(
@ -160,7 +160,7 @@ impl ser::Serializer for ValueToNumberSerializer {
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
Err(SerializerError::UnserializableType { type_name: "struct" })
}
fn serialize_struct_variant(
@ -171,6 +171,6 @@ impl ser::Serializer for ValueToNumberSerializer {
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
Err(SerializerError::UnserializableType { type_name: "struct variant" })
}
}

View File

@ -0,0 +1,132 @@
use std::collections::HashSet;
use std::io::Cursor;
use std::{fmt, error::Error};
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader};
use rmp_serde::decode::{Error as RmpError};
use serde::{de, forward_to_deserialize_any};
use crate::database::Index;
#[derive(Debug)]
pub enum DeserializerError {
RmpError(RmpError),
RocksDbError(rocksdb::Error),
Custom(String),
}
impl de::Error for DeserializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
DeserializerError::Custom(msg.to_string())
}
}
impl fmt::Display for DeserializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DeserializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e),
DeserializerError::RocksDbError(e) => write!(f, "RocksDB related error: {}", e),
DeserializerError::Custom(s) => f.write_str(s),
}
}
}
impl Error for DeserializerError {}
impl From<RmpError> for DeserializerError {
fn from(error: RmpError) -> DeserializerError {
DeserializerError::RmpError(error)
}
}
impl From<rocksdb::Error> for DeserializerError {
fn from(error: rocksdb::Error) -> DeserializerError {
DeserializerError::RocksDbError(error)
}
}
pub struct Deserializer<'a> {
pub document_id: DocumentId,
pub index: &'a Index,
pub fields: Option<&'a HashSet<SchemaAttr>>,
}
impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a>
{
type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
self.deserialize_map(visitor)
}
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf option unit unit_struct newtype_struct seq tuple
tuple_struct struct enum identifier ignored_any
}
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
let schema = self.index.schema();
let documents = self.index.as_ref().documents_index;
let iter = documents
.document_fields(self.document_id)?
.filter_map(|(attr, value)| {
let is_displayed = schema.props(attr).is_displayed();
if is_displayed && self.fields.map_or(true, |f| f.contains(&attr)) {
let attribute_name = schema.attribute_name(attr);
Some((attribute_name, Value::new(value)))
} else {
None
}
});
let map_deserializer = de::value::MapDeserializer::new(iter);
let result = visitor.visit_map(map_deserializer).map_err(DeserializerError::from);
result
}
}
struct Value<A>(RmpDeserializer<ReadReader<Cursor<A>>>) where A: AsRef<[u8]>;
impl<A> Value<A> where A: AsRef<[u8]>
{
fn new(value: A) -> Value<A> {
Value(RmpDeserializer::new(Cursor::new(value)))
}
}
impl<'de, A> de::IntoDeserializer<'de, RmpError> for Value<A>
where A: AsRef<[u8]>,
{
type Deserializer = Self;
fn into_deserializer(self) -> Self::Deserializer {
self
}
}
impl<'de, 'a, A> de::Deserializer<'de> for Value<A>
where A: AsRef<[u8]>,
{
type Error = RmpError;
fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
self.0.deserialize_any(visitor)
}
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf option unit unit_struct newtype_struct seq tuple
tuple_struct map struct enum identifier ignored_any
}
}

View File

@ -1,23 +1,53 @@
use serde::Serialize;
use serde::ser;
use std::hash::{Hash, Hasher};
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::serde::{SerializerError, calculate_hash};
use crate::DocumentId;
use meilidb_core::DocumentId;
use serde::{ser, Serialize};
use serde_json::Value;
use siphasher::sip::SipHasher;
pub struct FindDocumentIdSerializer<'a> {
pub id_attribute_name: &'a str,
use super::{SerializerError, ConvertToString};
pub fn extract_document_id<D>(
identifier: &str,
document: &D,
) -> Result<Option<DocumentId>, SerializerError>
where D: serde::Serialize,
{
let serializer = ExtractDocumentId { identifier };
document.serialize(serializer)
}
impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
type Ok = DocumentId;
pub fn value_to_string(value: &Value) -> Option<String> {
match value {
Value::Null => None,
Value::Bool(_) => None,
Value::Number(value) => Some(value.to_string()),
Value::String(value) => Some(value.to_string()),
Value::Array(_) => None,
Value::Object(_) => None,
}
}
pub fn compute_document_id<H: Hash>(t: H) -> DocumentId {
let mut s = SipHasher::new();
t.hash(&mut s);
let hash = s.finish();
DocumentId(hash)
}
struct ExtractDocumentId<'a> {
identifier: &'a str,
}
impl<'a> ser::Serializer for ExtractDocumentId<'a> {
type Ok = Option<DocumentId>;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = FindDocumentIdMapSerializer<'a>;
type SerializeStruct = FindDocumentIdStructSerializer<'a>;
type SerializeMap = ExtractDocumentIdMapSerializer<'a>;
type SerializeStruct = ExtractDocumentIdStructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
@ -38,30 +68,30 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
fn serialize_str(self, _value: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
fn serialize_bytes(self, _value: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
Err(SerializerError::UnserializableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
Err(SerializerError::UnserializableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
@ -71,7 +101,7 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
Err(SerializerError::UnserializableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
@ -93,15 +123,15 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
Err(SerializerError::UnserializableType { type_name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
Err(SerializerError::UnserializableType { type_name: "tuple" })
}
fn serialize_tuple_struct(
@ -110,7 +140,7 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
@ -121,15 +151,17 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(FindDocumentIdMapSerializer {
id_attribute_name: self.id_attribute_name,
let serializer = ExtractDocumentIdMapSerializer {
identifier: self.identifier,
document_id: None,
current_key_name: None,
})
};
Ok(serializer)
}
fn serialize_struct(
@ -138,10 +170,12 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(FindDocumentIdStructSerializer {
id_attribute_name: self.id_attribute_name,
let serializer = ExtractDocumentIdStructSerializer {
identifier: self.identifier,
document_id: None,
})
};
Ok(serializer)
}
fn serialize_struct_variant(
@ -152,24 +186,24 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
Err(SerializerError::UnserializableType { type_name: "struct variant" })
}
}
pub struct FindDocumentIdMapSerializer<'a> {
id_attribute_name: &'a str,
pub struct ExtractDocumentIdMapSerializer<'a> {
identifier: &'a str,
document_id: Option<DocumentId>,
current_key_name: Option<String>,
}
impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> {
type Ok = DocumentId;
impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
type Ok = Option<DocumentId>;
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
let key = key.serialize(ConvertToString)?;
self.current_key_name = Some(key);
Ok(())
}
@ -188,33 +222,31 @@ impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> {
) -> Result<(), Self::Error>
where K: Serialize, V: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
let key = key.serialize(ConvertToString)?;
if self.id_attribute_name == key {
// TODO is it possible to have multiple ids?
let id = bincode::serialize(value).unwrap();
let hash = calculate_hash(&id);
self.document_id = Some(DocumentId(hash));
if self.identifier == key {
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
match value_to_string(&value).map(|s| compute_document_id(&s)) {
Some(document_id) => self.document_id = Some(document_id),
None => return Err(SerializerError::InvalidDocumentIdType),
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
match self.document_id {
Some(document_id) => Ok(document_id),
None => Err(SerializerError::DocumentIdNotFound)
}
Ok(self.document_id)
}
}
pub struct FindDocumentIdStructSerializer<'a> {
id_attribute_name: &'a str,
pub struct ExtractDocumentIdStructSerializer<'a> {
identifier: &'a str,
document_id: Option<DocumentId>,
}
impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
type Ok = DocumentId;
impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> {
type Ok = Option<DocumentId>;
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
@ -224,20 +256,18 @@ impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
) -> Result<(), Self::Error>
where T: Serialize,
{
if self.id_attribute_name == key {
// TODO can it be possible to have multiple ids?
let id = bincode::serialize(value).unwrap();
let hash = calculate_hash(&id);
self.document_id = Some(DocumentId(hash));
if self.identifier == key {
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
match value_to_string(&value).map(compute_document_id) {
Some(document_id) => self.document_id = Some(document_id),
None => return Err(SerializerError::InvalidDocumentIdType),
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
match self.document_id {
Some(document_id) => Ok(document_id),
None => Err(SerializerError::DocumentIdNotFound)
}
Ok(self.document_id)
}
}
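
A hedged end-to-end sketch of these helpers; the Movie type is hypothetical and "id" is assumed to be the schema identifier:

#[derive(serde::Serialize)]
struct Movie { id: String, title: String }

let movie = Movie { id: "tt0120338".into(), title: "Titanic".into() };
let document_id = extract_document_id("id", &movie).unwrap();
// the identifier value is stringified, then hashed with SipHasher
assert_eq!(document_id, Some(compute_document_id("tt0120338")));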

View File

@ -0,0 +1,336 @@
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use serde::ser;
use serde::Serialize;
use crate::indexer::Indexer as RawIndexer;
use super::{SerializerError, ConvertToString};
pub struct Indexer<'a> {
pub attribute: SchemaAttr,
pub indexer: &'a mut RawIndexer,
pub document_id: DocumentId,
}
impl<'a> ser::Serializer for Indexer<'a> {
type Ok = ();
type Error = SerializerError;
type SerializeSeq = SeqIndexer<'a>;
type SerializeTuple = TupleIndexer<'a>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapIndexer<'a>;
type SerializeStruct = StructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "boolean" })
}
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
self.indexer.index_text(self.document_id, self.attribute, text);
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.indexer.index_text(self.document_id, self.attribute, &text);
Ok(())
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
Err(SerializerError::UnindexableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
let indexer = SeqIndexer {
attribute: self.attribute,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
let indexer = TupleIndexer {
attribute: self.attribute,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
let indexer = MapIndexer {
attribute: self.attribute,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "struct variant" })
}
}
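// Hedged note (added for clarity, not part of the diff): this serializer never
// stores anything by itself, it only forwards the textual form of a value to the
// RawIndexer. That is why every numeric type is first converted to a string, and
// why sequences, tuples, maps and structs are flattened into a list of texts that
// is indexed through index_text_seq.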
pub struct SeqIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}
pub struct MapIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeMap for MapIndexer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let text = key.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}
pub struct StructSerializer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let key_text = key.to_owned();
let value_text = value.serialize(ConvertToString)?;
self.texts.push(key_text);
self.texts.push(value_text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}
pub struct TupleIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}


@ -0,0 +1,131 @@
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
// stringify! is needed here: a metavariable is not expanded inside a string literal
Err(SerializerError::UnserializableType { type_name: stringify!($ty) })
}
)*
}
}
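// Illustrative expansion (added for clarity, not part of the diff): for the pair
// `bool => serialize_bool`, the macro above generates a method equivalent to the
// following, where `stringify!(bool)` yields the string "bool".
//
// fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
//     Err(SerializerError::UnserializableType { type_name: "bool" })
// }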
mod convert_to_number;
mod convert_to_string;
mod deserializer;
mod extract_document_id;
mod indexer;
mod serializer;
pub use self::deserializer::{Deserializer, DeserializerError};
pub use self::extract_document_id::{extract_document_id, compute_document_id, value_to_string};
pub use self::convert_to_string::ConvertToString;
pub use self::convert_to_number::ConvertToNumber;
pub use self::indexer::Indexer;
pub use self::serializer::Serializer;
use std::collections::BTreeMap;
use std::{fmt, error::Error};
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use rmp_serde::encode::Error as RmpError;
use serde_json::Error as SerdeJsonError;
use serde::ser;
use crate::number::ParseNumberError;
#[derive(Debug)]
pub enum SerializerError {
DocumentIdNotFound,
InvalidDocumentIdType,
RmpError(RmpError),
RocksDbError(rocksdb::Error),
SerdeJsonError(SerdeJsonError),
ParseNumberError(ParseNumberError),
UnserializableType { type_name: &'static str },
UnindexableType { type_name: &'static str },
UnrankableType { type_name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::DocumentIdNotFound => {
write!(f, "serialized document does not have an id according to the schema")
},
SerializerError::InvalidDocumentIdType => {
write!(f, "document identifier can only be of type string or number")
},
SerializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e),
SerializerError::RocksDbError(e) => write!(f, "RocksDB related error: {}", e),
SerializerError::SerdeJsonError(e) => write!(f, "serde json error: {}", e),
SerializerError::ParseNumberError(e) => {
write!(f, "error while trying to parse a number: {}", e)
},
SerializerError::UnserializableType { type_name } => {
write!(f, "{} is not a serializable type", type_name)
},
SerializerError::UnindexableType { type_name } => {
write!(f, "{} is not an indexable type", type_name)
},
SerializerError::UnrankableType { type_name } => {
write!(f, "{} types cannot be used for ranking", type_name)
},
SerializerError::Custom(s) => f.write_str(s),
}
}
}
impl Error for SerializerError {}
impl From<String> for SerializerError {
fn from(value: String) -> SerializerError {
SerializerError::Custom(value)
}
}
impl From<RmpError> for SerializerError {
fn from(error: RmpError) -> SerializerError {
SerializerError::RmpError(error)
}
}
impl From<SerdeJsonError> for SerializerError {
fn from(error: SerdeJsonError) -> SerializerError {
SerializerError::SerdeJsonError(error)
}
}
impl From<rocksdb::Error> for SerializerError {
fn from(error: rocksdb::Error) -> SerializerError {
SerializerError::RocksDbError(error)
}
}
impl From<ParseNumberError> for SerializerError {
fn from(error: ParseNumberError) -> SerializerError {
SerializerError::ParseNumberError(error)
}
}
pub struct RamDocumentStore(BTreeMap<(DocumentId, SchemaAttr), Vec<u8>>);
impl RamDocumentStore {
pub fn new() -> RamDocumentStore {
RamDocumentStore(BTreeMap::new())
}
pub fn set_document_field(&mut self, id: DocumentId, attr: SchemaAttr, value: Vec<u8>) {
self.0.insert((id, attr), value);
}
pub fn into_inner(self) -> BTreeMap<(DocumentId, SchemaAttr), Vec<u8>> {
self.0
}
}
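// Hedged usage sketch (not part of the diff): the RamDocumentStore buffers the
// serialized fields of the documents of one update in memory before they are
// written to the key-value store.
//
// let mut store = RamDocumentStore::new();
// store.set_document_field(DocumentId(0), SchemaAttr::new(0), b"123".to_vec());
// store.set_document_field(DocumentId(0), SchemaAttr::new(1), b"\"hello\"".to_vec());
// assert_eq!(store.into_inner().len(), 2);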


@ -0,0 +1,287 @@
use meilidb_core::DocumentId;
use meilidb_schema::Schema;
use serde::ser;
use crate::indexer::Indexer as RawIndexer;
use crate::ranked_map::RankedMap;
use super::{RamDocumentStore, SerializerError, ConvertToString, ConvertToNumber, Indexer};
pub struct Serializer<'a> {
pub schema: &'a Schema,
pub document_store: &'a mut RamDocumentStore,
pub indexer: &'a mut RawIndexer,
pub ranked_map: &'a mut RankedMap,
pub document_id: DocumentId,
}
impl<'a> ser::Serializer for Serializer<'a> {
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapSerializer<'a>;
type SerializeStruct = StructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer {
schema: self.schema,
document_id: self.document_id,
document_store: self.document_store,
indexer: self.indexer,
ranked_map: self.ranked_map,
current_key_name: None,
})
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
document_id: self.document_id,
document_store: self.document_store,
indexer: self.indexer,
ranked_map: self.ranked_map,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "struct variant" })
}
}
pub struct MapSerializer<'a> {
schema: &'a Schema,
document_id: DocumentId,
document_store: &'a mut RamDocumentStore,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
current_key_name: Option<String>,
}
impl<'a> ser::SerializeMap for MapSerializer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let key = key.serialize(ConvertToString)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V,
) -> Result<(), Self::Error>
where K: ser::Serialize, V: ser::Serialize,
{
let key = key.serialize(ConvertToString)?;
serialize_value(
self.schema,
self.document_id,
self.document_store,
self.indexer,
self.ranked_map,
&key,
value,
)
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
pub struct StructSerializer<'a> {
schema: &'a Schema,
document_id: DocumentId,
document_store: &'a mut RamDocumentStore,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where T: ser::Serialize,
{
serialize_value(
self.schema,
self.document_id,
self.document_store,
self.indexer,
self.ranked_map,
key,
value,
)
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
fn serialize_value<T: ?Sized>(
schema: &Schema,
document_id: DocumentId,
document_store: &mut RamDocumentStore,
indexer: &mut RawIndexer,
ranked_map: &mut RankedMap,
key: &str,
value: &T,
) -> Result<(), SerializerError>
where T: ser::Serialize,
{
if let Some(attribute) = schema.attribute(key) {
let props = schema.props(attribute);
let serialized = rmp_serde::to_vec_named(value)?;
document_store.set_document_field(document_id, attribute, serialized);
if props.is_indexed() {
let indexer = Indexer { attribute, indexer, document_id };
value.serialize(indexer)?;
}
if props.is_ranked() {
let number = value.serialize(ConvertToNumber)?;
ranked_map.insert(document_id, attribute, number);
}
}
Ok(())
}
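// Hedged summary (added for clarity, not part of the diff): for each field of a
// document, serialize_value looks the key up in the schema and then
// - always stores the MessagePack-encoded value in the RamDocumentStore,
// - indexes its textual form when the attribute is INDEXED,
// - converts it to a number and records it in the RankedMap when it is RANKED.
// Fields that do not exist in the schema are silently ignored.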


@ -0,0 +1,96 @@
use std::sync::atomic::{AtomicBool, Ordering::Relaxed};
use std::sync::Arc;
use serde_json::json;
use meilidb_data::Database;
use meilidb_schema::{Schema, SchemaBuilder, DISPLAYED, INDEXED};
fn simple_schema() -> Schema {
let mut builder = SchemaBuilder::with_identifier("objectId");
builder.new_attribute("objectId", DISPLAYED | INDEXED);
builder.new_attribute("title", DISPLAYED | INDEXED);
builder.build()
}
#[test]
fn insert_delete_document() {
let tmp_dir = tempfile::tempdir().unwrap();
let database = Database::open(&tmp_dir).unwrap();
let has_been_updated = Arc::new(AtomicBool::new(false));
let schema = simple_schema();
let index = database.create_index("hello", schema).unwrap();
let has_been_updated_clone = has_been_updated.clone();
index.set_update_callback(move |_| has_been_updated_clone.store(true, Relaxed));
let doc1 = json!({ "objectId": 123, "title": "hello" });
let mut addition = index.documents_addition();
addition.update_document(&doc1);
let update_id = addition.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(has_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
assert_eq!(index.number_of_documents(), 1);
let docs = index.query_builder().query("hello", 0..10).unwrap();
assert_eq!(docs.len(), 1);
assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc1));
let mut deletion = index.documents_deletion();
deletion.delete_document(&doc1).unwrap();
let update_id = deletion.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(has_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
assert_eq!(index.number_of_documents(), 0);
let docs = index.query_builder().query("hello", 0..10).unwrap();
assert_eq!(docs.len(), 0);
}
#[test]
fn replace_document() {
let tmp_dir = tempfile::tempdir().unwrap();
let database = Database::open(&tmp_dir).unwrap();
let has_been_updated = Arc::new(AtomicBool::new(false));
let schema = simple_schema();
let index = database.create_index("hello", schema).unwrap();
let has_been_updated_clone = has_been_updated.clone();
index.set_update_callback(move |_| has_been_updated_clone.store(true, Relaxed));
let doc1 = json!({ "objectId": 123, "title": "hello" });
let doc2 = json!({ "objectId": 123, "title": "coucou" });
let mut addition = index.documents_addition();
addition.update_document(&doc1);
let update_id = addition.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(has_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
assert_eq!(index.number_of_documents(), 1);
let docs = index.query_builder().query("hello", 0..10).unwrap();
assert_eq!(docs.len(), 1);
assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc1));
let mut addition = index.documents_addition();
addition.update_document(&doc2);
let update_id = addition.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(has_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
assert_eq!(index.number_of_documents(), 1);
let docs = index.query_builder().query("hello", 0..10).unwrap();
assert_eq!(docs.len(), 0);
let docs = index.query_builder().query("coucou", 0..10).unwrap();
assert_eq!(docs.len(), 1);
assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc2));
}

12
meilidb-schema/Cargo.toml Normal file

@ -0,0 +1,12 @@
[package]
name = "meilidb-schema"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
bincode = "1.1.2"
linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
serde = { version = "1.0.91", features = ["derive"] }
serde_json = { version = "1.0.39", features = ["preserve_order"] }
toml = { version = "0.5.0", features = ["preserve_order"] }


@ -5,21 +5,17 @@ use std::{fmt, u16};
use std::ops::BitOr;
use std::sync::Arc;
use serde_derive::{Serialize, Deserialize};
use serde::{Serialize, Deserialize};
use linked_hash_map::LinkedHashMap;
use crate::database::serde::find_id::FindDocumentIdSerializer;
use crate::database::serde::SerializerError;
use crate::DocumentId;
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false };
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false };
pub const RANKED: SchemaProps = SchemaProps { stored: false, indexed: false, ranked: true };
pub const DISPLAYED: SchemaProps = SchemaProps { displayed: true, indexed: false, ranked: false };
pub const INDEXED: SchemaProps = SchemaProps { displayed: false, indexed: true, ranked: false };
pub const RANKED: SchemaProps = SchemaProps { displayed: false, indexed: false, ranked: true };
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps {
#[serde(default)]
stored: bool,
displayed: bool,
#[serde(default)]
indexed: bool,
@ -29,8 +25,8 @@ pub struct SchemaProps {
}
impl SchemaProps {
pub fn is_stored(self) -> bool {
self.stored
pub fn is_displayed(self) -> bool {
self.displayed
}
pub fn is_indexed(self) -> bool {
@ -47,7 +43,7 @@ impl BitOr for SchemaProps {
fn bitor(self, other: Self) -> Self::Output {
SchemaProps {
stored: self.stored | other.stored,
displayed: self.displayed | other.displayed,
indexed: self.indexed | other.indexed,
ranked: self.ranked | other.ranked,
}
@ -103,14 +99,14 @@ struct InnerSchema {
}
impl Schema {
pub fn from_toml<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
pub fn from_toml<R: Read>(mut reader: R) -> Result<Schema, Box<dyn Error>> {
let mut buffer = Vec::new();
reader.read_to_end(&mut buffer)?;
let builder: SchemaBuilder = toml::from_slice(&buffer)?;
Ok(builder.build())
}
pub fn to_toml<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
pub fn to_toml<W: Write>(&self, mut writer: W) -> Result<(), Box<dyn Error>> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
@ -121,14 +117,14 @@ impl Schema {
Ok(())
}
pub fn from_json<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
pub fn from_json<R: Read>(mut reader: R) -> Result<Schema, Box<dyn Error>> {
let mut buffer = Vec::new();
reader.read_to_end(&mut buffer)?;
let builder: SchemaBuilder = serde_json::from_slice(&buffer)?;
Ok(builder.build())
}
pub fn to_json<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
pub fn to_json<W: Write>(&self, mut writer: W) -> Result<(), Box<dyn Error>> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
@ -138,12 +134,12 @@ impl Schema {
Ok(())
}
pub(crate) fn read_from_bin<R: Read>(reader: R) -> bincode::Result<Schema> {
pub fn read_from_bin<R: Read>(reader: R) -> bincode::Result<Schema> {
let builder: SchemaBuilder = bincode::deserialize_from(reader)?;
Ok(builder.build())
}
pub(crate) fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
pub fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
@ -166,14 +162,6 @@ impl Schema {
attributes
}
pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
where T: serde::Serialize,
{
let id_attribute_name = &self.inner.identifier;
let serializer = FindDocumentIdSerializer { id_attribute_name };
document.serialize(serializer)
}
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
let (_, props) = self.inner.props[attr.0 as usize];
props
@ -191,19 +179,31 @@ impl Schema {
let (name, _) = &self.inner.props[attr.0 as usize];
name
}
pub fn iter<'a>(&'a self) -> impl Iterator<Item=(&str, SchemaAttr, SchemaProps)> + 'a {
self.inner.props.iter()
.map(move |(name, prop)| {
let attr = self.inner.attrs.get(name).unwrap();
(name.as_str(), *attr, *prop)
})
}
}
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct SchemaAttr(pub(crate) u16);
pub struct SchemaAttr(pub u16);
impl SchemaAttr {
pub fn new(value: u16) -> SchemaAttr {
pub const fn new(value: u16) -> SchemaAttr {
SchemaAttr(value)
}
pub fn min() -> SchemaAttr {
SchemaAttr(0)
pub const fn min() -> SchemaAttr {
SchemaAttr(u16::min_value())
}
pub const fn max() -> SchemaAttr {
SchemaAttr(u16::max_value())
}
pub fn next(self) -> Option<SchemaAttr> {
@ -213,10 +213,6 @@ impl SchemaAttr {
pub fn prev(self) -> Option<SchemaAttr> {
self.0.checked_sub(1).map(SchemaAttr)
}
pub fn max() -> SchemaAttr {
SchemaAttr(u16::MAX)
}
}
impl fmt::Display for SchemaAttr {
@ -233,8 +229,8 @@ mod tests {
#[test]
fn serialize_deserialize() -> bincode::Result<()> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
@ -249,10 +245,10 @@ mod tests {
}
#[test]
fn serialize_deserialize_toml() -> Result<(), Box<Error>> {
fn serialize_deserialize_toml() -> Result<(), Box<dyn Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
@ -266,10 +262,10 @@ mod tests {
identifier = "id"
[attributes."alpha"]
stored = true
displayed = true
[attributes."beta"]
stored = true
displayed = true
indexed = true
[attributes."gamma"]
@ -282,10 +278,10 @@ mod tests {
}
#[test]
fn serialize_deserialize_json() -> Result<(), Box<Error>> {
fn serialize_deserialize_json() -> Result<(), Box<dyn Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
@ -300,10 +296,10 @@ mod tests {
"identifier": "id",
"attributes": {
"alpha": {
"stored": true
"displayed": true
},
"beta": {
"stored": true,
"displayed": true,
"indexed": true
},
"gamma": {


@ -0,0 +1,8 @@
[package]
name = "meilidb-tokenizer"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
slice-group-by = "0.2.4"


@ -0,0 +1,295 @@
use std::iter::Peekable;
use slice_group_by::StrGroupBy;
use self::SeparatorCategory::*;
pub fn is_cjk(c: char) -> bool {
(c >= '\u{2e80}' && c <= '\u{2eff}') ||
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
(c >= '\u{3040}' && c <= '\u{309f}') ||
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
(c >= '\u{3100}' && c <= '\u{312f}') ||
(c >= '\u{3200}' && c <= '\u{32ff}') ||
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
(c >= '\u{f900}' && c <= '\u{faff}')
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum SeparatorCategory {
Soft,
Hard,
}
impl SeparatorCategory {
fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
if let (Soft, Soft) = (self, other) { Soft } else { Hard }
}
fn to_usize(self) -> usize {
match self {
Soft => 1,
Hard => 8,
}
}
}
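// Note (added for clarity, not part of the diff): a soft separator advances the
// word index by 1 and a hard one by 8, so two words split by a '.' end up eight
// word positions apart; the tests at the bottom of this file rely on these gaps.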
fn is_separator(c: char) -> bool {
classify_separator(c).is_some()
}
fn classify_separator(c: char) -> Option<SeparatorCategory> {
match c {
' ' | '\'' | '"' => Some(Soft),
'.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Hard),
_ => None,
}
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum CharCategory {
Separator(SeparatorCategory),
Cjk,
Other,
}
fn classify_char(c: char) -> CharCategory {
if let Some(category) = classify_separator(c) {
CharCategory::Separator(category)
} else if is_cjk(c) {
CharCategory::Cjk
} else {
CharCategory::Other
}
}
fn is_str_word(s: &str) -> bool {
!s.chars().any(is_separator)
}
fn same_group_category(a: char, b: char) -> bool {
match (classify_char(a), classify_char(b)) {
(CharCategory::Cjk, _) | (_, CharCategory::Cjk) => false,
(CharCategory::Separator(_), CharCategory::Separator(_)) => true,
(a, b) => a == b,
}
}
// fold the number of chars along with the index position
fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, usize) {
(n + 1, i + c.len_utf8())
}
pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
Tokenizer::new(query).map(|t| t.word)
}
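// Hedged usage sketch (not part of the diff): separators are dropped, only the
// words of the query remain.
//
// let words: Vec<&str> = split_query_string("salut, le monde !").collect();
// assert_eq!(words, vec!["salut", "le", "monde"]);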
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct Token<'a> {
pub word: &'a str,
pub word_index: usize,
pub char_index: usize,
}
pub struct Tokenizer<'a> {
inner: &'a str,
word_index: usize,
char_index: usize,
}
impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer {
// skip every leading separator and set `char_index`
// to the number of chars trimmed
let (count, index) = string.char_indices()
.take_while(|(_, c)| is_separator(*c))
.fold((0, 0), chars_count_index);
Tokenizer {
inner: &string[index..],
word_index: 0,
char_index: count,
}
}
}
impl<'a> Iterator for Tokenizer<'a> {
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
let mut iter = self.inner.linear_group_by(same_group_category).peekable();
while let (Some(string), next_string) = (iter.next(), iter.peek()) {
let (count, index) = string.char_indices().fold((0, 0), chars_count_index);
if !is_str_word(string) {
self.word_index += string.chars()
.filter_map(classify_separator)
.fold(Soft, |a, x| a.merge(x))
.to_usize();
self.char_index += count;
self.inner = &self.inner[index..];
continue;
}
let token = Token {
word: string,
word_index: self.word_index,
char_index: self.char_index,
};
if next_string.filter(|s| is_str_word(s)).is_some() {
self.word_index += 1;
}
self.char_index += count;
self.inner = &self.inner[index..];
return Some(token);
}
self.inner = "";
None
}
}
pub struct SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
{
inner: I,
current: Option<Peekable<Tokenizer<'a>>>,
word_offset: usize,
char_offset: usize,
}
impl<'a, I> SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
{
pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
let current = iter.next().map(|s| Tokenizer::new(s).peekable());
SeqTokenizer {
inner: iter,
current,
word_offset: 0,
char_offset: 0,
}
}
}
impl<'a, I> Iterator for SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
{
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
match &mut self.current {
Some(current) => {
match current.next() {
Some(token) => {
// we must apply the word and char offsets
// to the token before returning it
let token = Token {
word: token.word,
word_index: token.word_index + self.word_offset,
char_index: token.char_index + self.char_offset,
};
// if this is the last iteration on this text
// we must save the offsets for next texts
if current.peek().is_none() {
let hard_space = SeparatorCategory::Hard.to_usize();
self.word_offset = token.word_index + hard_space;
self.char_offset = token.char_index + hard_space;
}
Some(token)
},
None => {
// no more words in this text, we must
// start tokenizing the next one
self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
self.next()
},
}
},
// no more texts available
None => None,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn easy() {
let mut tokenizer = Tokenizer::new("salut");
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_kanjis() {
let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
assert_eq!(tokenizer.next(), None);
}
}

28
meilidb/Cargo.toml Normal file

@ -0,0 +1,28 @@
[package]
edition = "2018"
name = "meilidb"
version = "0.3.1"
authors = ["Kerollmops <renault.cle@gmail.com>"]
[dependencies]
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
meilidb-data = { path = "../meilidb-data", version = "0.1.0" }
meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
[dev-dependencies]
csv = "1.0.7"
diskus = "0.5.0"
env_logger = "0.6.1"
jemallocator = "0.1.9"
linked-hash-map = "0.5.2"
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
quickcheck = "0.8.2"
rand = "0.6.5"
rand_xorshift = "0.1.1"
rustyline = { version = "5.0.0", default-features = false }
serde = { version = "1.0.91" , features = ["derive"] }
serde_json = "1.0.39"
structopt = "0.2.15"
sysinfo = "0.8.4"
tempfile = "3.0.7"
termcolor = "1.0.4"


@ -0,0 +1,214 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::{HashMap, HashSet};
use std::io::{self, BufRead, BufReader};
use std::path::{Path, PathBuf};
use std::time::Instant;
use std::error::Error;
use std::fs::File;
use diskus::Walk;
use sysinfo::{SystemExt, ProcessExt};
use serde::{Serialize, Deserialize};
use structopt::StructOpt;
use meilidb_data::Database;
use meilidb_schema::Schema;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created.
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// The csv file to index.
#[structopt(parse(from_os_str))]
pub csv_data_path: PathBuf,
/// The path to the schema.
#[structopt(long = "schema", parse(from_os_str))]
pub schema_path: PathBuf,
/// The file with the synonyms.
#[structopt(long = "synonyms", parse(from_os_str))]
pub synonyms: Option<PathBuf>,
/// The path to the list of stop words (one per line).
#[structopt(long = "stop-words", parse(from_os_str))]
pub stop_words: Option<PathBuf>,
#[structopt(long = "update-group-size")]
pub update_group_size: Option<usize>,
}
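// Hypothetical invocation (added for clarity; the binary name and every path are
// placeholders, not taken from the diff):
//
// create-database ./movies.db ./movies.csv \
//     --schema ./schema.toml \
//     --stop-words ./stop-words.txt \
//     --update-group-size 10000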
#[derive(Serialize, Deserialize)]
struct Document (
HashMap<String, String>
);
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Synonym {
OneWay(SynonymOneWay),
MultiWay { synonyms: Vec<String> },
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SynonymOneWay {
pub search_terms: String,
pub synonyms: Synonyms,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Synonyms {
Multiple(Vec<String>),
Single(String),
}
fn read_synonyms(path: &Path) -> Result<Vec<Synonym>, Box<dyn Error>> {
let file = File::open(path)?;
let synonyms = serde_json::from_reader(file)?;
Ok(synonyms)
}
fn index(
schema: Schema,
database_path: &Path,
csv_data_path: &Path,
update_group_size: Option<usize>,
stop_words: &HashSet<String>,
synonyms: Vec<Synonym>,
) -> Result<Database, Box<dyn Error>>
{
let database = Database::open(database_path)?;
let mut wtr = csv::Writer::from_path("./stats.csv").unwrap();
wtr.write_record(&["NumberOfDocuments", "DiskUsed", "MemoryUsed"])?;
let mut system = sysinfo::System::new();
let index = database.create_index("test", schema.clone())?;
let mut synonyms_adder = index.synonyms_addition();
for synonym in synonyms {
match synonym {
Synonym::OneWay(SynonymOneWay { search_terms, synonyms }) => {
let alternatives = match synonyms {
Synonyms::Multiple(alternatives) => alternatives,
Synonyms::Single(alternative) => vec![alternative],
};
synonyms_adder.add_synonym(search_terms, alternatives);
},
Synonym::MultiWay { mut synonyms } => {
for _ in 0..synonyms.len() {
if let Some((synonym, alternatives)) = synonyms.split_first() {
synonyms_adder.add_synonym(synonym, alternatives);
}
synonyms.rotate_left(1);
}
},
}
}
synonyms_adder.finalize()?;
let mut rdr = csv::Reader::from_path(csv_data_path)?;
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();
let mut i = 0;
let mut end_of_file = false;
while !end_of_file {
let mut update = index.documents_addition();
loop {
end_of_file = !rdr.read_record(&mut raw_record)?;
if end_of_file { break }
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
}
};
update.update_document(document);
print!("\rindexing document {}", i);
i += 1;
if let Some(group_size) = update_group_size {
if i % group_size == 0 { break }
}
}
println!();
println!("committing update...");
update.finalize()?;
// write stats
let directory_size = Walk::new(&[database_path.to_owned()], 4).run();
system.refresh_all();
let memory = system.get_process(sysinfo::get_current_pid()).unwrap().memory(); // in kb
wtr.write_record(&[i.to_string(), directory_size.to_string(), memory.to_string()])?;
wtr.flush()?;
}
Ok(database)
}
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
let f = File::open(path)?;
let reader = BufReader::new(f);
let mut words = HashSet::new();
for line in reader.lines() {
let line = line?;
let word = line.trim().to_string();
words.insert(word);
}
Ok(words)
}
fn main() -> Result<(), Box<dyn Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let schema = {
let file = File::open(&opt.schema_path)?;
Schema::from_toml(file)?
};
let stop_words = match opt.stop_words {
Some(ref path) => retrieve_stop_words(path)?,
None => HashSet::new(),
};
let synonyms = match opt.synonyms {
Some(ref path) => read_synonyms(path)?,
None => Vec::new(),
};
let start = Instant::now();
let result = index(
schema,
&opt.database_path,
&opt.csv_data_path,
opt.update_group_size,
&stop_words,
synonyms,
);
if let Err(e) = result {
return Err(e.into())
}
println!("database created in {:.2?} at: {:?}", start.elapsed(), opt.database_path);
Ok(())
}


@ -0,0 +1,229 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::btree_map::{BTreeMap, Entry};
use std::collections::HashSet;
use std::error::Error;
use std::io::{self, Write};
use std::iter::FromIterator;
use std::path::PathBuf;
use std::time::{Instant, Duration};
use linked_hash_map::LinkedHashMap;
use rustyline::{Editor, Config};
use structopt::StructOpt;
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use meilidb_core::Highlight;
use meilidb_data::Database;
use meilidb_schema::SchemaAttr;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
#[structopt(long = "fetch-timeout-ms")]
pub fetch_timeout_ms: Option<u64>,
/// Fields that must be displayed.
pub displayed_fields: Vec<String>,
/// The number of returned results
#[structopt(short = "n", long = "number-results", default_value = "10")]
pub number_results: usize,
/// The number of characters before and after the first match
#[structopt(short = "C", long = "context", default_value = "35")]
pub char_context: usize,
}
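// Hypothetical invocation (added for clarity; the binary name, database path and
// field names are placeholders, not taken from the diff):
//
// query-database ./movies.db title overview --number-results 5
//
// Queries are then typed at the "Searching for: " prompt and the matching parts
// of the displayed fields are printed highlighted.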
type Document = LinkedHashMap<String, String>;
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
let mut stdout = StandardStream::stdout(ColorChoice::Always);
let mut highlighted = false;
for range in ranges.windows(2) {
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
if highlighted {
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
}
write!(&mut stdout, "{}", &text[start..end])?;
stdout.reset()?;
highlighted = !highlighted;
}
Ok(())
}
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
let mut byte_index = 0;
let mut byte_length = 0;
for (n, (i, c)) in text.char_indices().enumerate() {
if n == index {
byte_index = i;
}
if n + 1 == index + length {
byte_length = i - byte_index + c.len_utf8();
break;
}
}
(byte_index, byte_length)
}
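// Worked example (added for clarity, not part of the diff): on the text "héllo",
// char_to_byte_range(1, 2, "héllo") returns (1, 3), the byte range of "él",
// because 'é' takes two bytes in UTF-8.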
fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
let mut byte_indexes = BTreeMap::new();
for highlight in highlights {
let char_index = highlight.char_index as usize;
let char_length = highlight.char_length as usize;
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
match byte_indexes.entry(byte_index) {
Entry::Vacant(entry) => { entry.insert(byte_length); },
Entry::Occupied(mut entry) => {
if *entry.get() < byte_length {
entry.insert(byte_length);
}
},
}
}
let mut title_areas = Vec::new();
title_areas.push(0);
for (byte_index, length) in byte_indexes {
title_areas.push(byte_index);
title_areas.push(byte_index + length);
}
title_areas.push(text.len());
title_areas.sort_unstable();
title_areas
}
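// Worked example (added for clarity, not part of the diff): for the text
// "hello world" with one highlight at char_index 6 and char_length 5 ("world"),
// the returned areas are [0, 6, 11, 11]; display_highlights then walks consecutive
// pairs and colors every other range, so "world" is printed highlighted.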
/// note: matches must have been sorted by `char_index` and `char_length` before being passed.
///
/// ```no_run
/// matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
///
/// let matches = matches.matches.iter().filter(|m| SchemaAttr::new(m.attribute) == attr).cloned();
///
/// let (text, matches) = crop_text(&text, matches, 35);
/// ```
fn crop_text(
text: &str,
highlights: impl IntoIterator<Item=Highlight>,
context: usize,
) -> (String, Vec<Highlight>)
{
let mut highlights = highlights.into_iter().peekable();
let char_index = highlights.peek().map(|m| m.char_index as usize).unwrap_or(0);
let start = char_index.saturating_sub(context);
let text = text.chars().skip(start).take(context * 2).collect();
let highlights = highlights
.take_while(|m| {
(m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
})
.map(|highlight| {
Highlight { char_index: highlight.char_index - start as u16, ..highlight }
})
.collect();
(text, highlights)
}
fn main() -> Result<(), Box<dyn Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let start = Instant::now();
let database = Database::open(&opt.database_path)?;
let index = database.open_index("test")?.unwrap();
let schema = index.schema();
println!("database prepared for you in {:.2?}", start.elapsed());
let fields = opt.displayed_fields.iter().map(String::as_str);
let fields = HashSet::from_iter(fields);
let config = Config::builder().auto_add_history(true).build();
let mut readline = Editor::<()>::with_config(config);
let _ = readline.load_history("query-history.txt");
for result in readline.iter("Searching for: ") {
match result {
Ok(query) => {
let start_total = Instant::now();
let builder = match opt.fetch_timeout_ms {
Some(timeout_ms) => {
let timeout = Duration::from_millis(timeout_ms);
index.query_builder().with_fetch_timeout(timeout)
},
None => index.query_builder(),
};
let documents = builder.query(&query, 0..opt.number_results)?;
let mut retrieve_duration = Duration::default();
let number_of_documents = documents.len();
for mut doc in documents {
doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
let start_retrieve = Instant::now();
let result = index.document::<Document>(Some(&fields), doc.id);
retrieve_duration += start_retrieve.elapsed();
match result {
Ok(Some(document)) => {
for (name, text) in document {
print!("{}: ", name);
let attr = schema.attribute(&name).unwrap();
let highlights = doc.highlights.iter()
.filter(|m| SchemaAttr::new(m.attribute) == attr)
.cloned();
let (text, highlights) = crop_text(&text, highlights, opt.char_context);
let areas = create_highlight_areas(&text, &highlights);
display_highlights(&text, &areas)?;
println!();
}
},
Ok(None) => eprintln!("missing document"),
Err(e) => eprintln!("{}", e),
}
let mut matching_attributes = HashSet::new();
for highlight in doc.highlights {
let attr = SchemaAttr::new(highlight.attribute);
let name = schema.attribute_name(attr);
matching_attributes.insert(name);
}
let matching_attributes = Vec::from_iter(matching_attributes);
println!("matching in: {:?}", matching_attributes);
println!();
}
eprintln!("document field retrieve took {:.2?}", retrieve_duration);
eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
},
Err(err) => {
println!("Error: {:?}", err);
break
}
}
}
readline.save_history("query-history.txt").unwrap();
Ok(())
}

3
meilidb/src/lib.rs Normal file

@ -0,0 +1,3 @@
mod sort_by_attr;
pub use self::sort_by_attr::SortByAttr;


@ -2,10 +2,9 @@ use std::cmp::Ordering;
use std::error::Error;
use std::fmt;
use crate::database::schema::{Schema, SchemaAttr};
use crate::rank::criterion::Criterion;
use crate::database::RankedMap;
use crate::rank::RawDocument;
use meilidb_core::{criterion::Criterion, RawDocument};
use meilidb_data::RankedMap;
use meilidb_schema::{Schema, SchemaAttr};
/// A helper struct that permits sorting documents by
/// some of their stored attributes.
@ -88,8 +87,8 @@ impl<'a> SortByAttr<'a> {
impl<'a> Criterion for SortByAttr<'a> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = self.ranked_map.get(&(lhs.id, self.attr));
let rhs = self.ranked_map.get(&(rhs.id, self.attr));
let lhs = self.ranked_map.get(lhs.id, self.attr);
let rhs = self.ranked_map.get(rhs.id, self.attr);
match (lhs, rhs) {
(Some(lhs), Some(rhs)) => {
@ -101,6 +100,10 @@ impl<'a> Criterion for SortByAttr<'a> {
(None, None) => Ordering::Equal,
}
}
fn name(&self) -> &'static str {
"SortByAttr"
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]


@ -1,91 +0,0 @@
use fst::Automaton;
use lazy_static::lazy_static;
use levenshtein_automata::{
LevenshteinAutomatonBuilder as LevBuilder,
DFA, Distance,
};
lazy_static! {
static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
}
pub struct DfaExt {
query_len: usize,
automaton: DFA,
}
impl Automaton for DfaExt {
type State = <DFA as Automaton>::State;
fn start(&self) -> Self::State {
self.automaton.start()
}
fn is_match(&self, state: &Self::State) -> bool {
self.automaton.is_match(state)
}
fn can_match(&self, state: &Self::State) -> bool {
self.automaton.can_match(state)
}
fn will_always_match(&self, state: &Self::State) -> bool {
self.automaton.will_always_match(state)
}
fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
self.automaton.accept(state, byte)
}
}
impl AutomatonExt for DfaExt {
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
self.automaton.eval(s)
}
fn query_len(&self) -> usize {
self.query_len
}
}
#[derive(Copy, Clone)]
enum PrefixSetting {
Prefix,
NoPrefix,
}
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DfaExt {
use self::PrefixSetting::{Prefix, NoPrefix};
let dfa = match query.len() {
0 ..= 4 => match setting {
Prefix => LEVDIST0.build_prefix_dfa(query),
NoPrefix => LEVDIST0.build_dfa(query),
},
5 ..= 8 => match setting {
Prefix => LEVDIST1.build_prefix_dfa(query),
NoPrefix => LEVDIST1.build_dfa(query),
},
_ => match setting {
Prefix => LEVDIST2.build_prefix_dfa(query),
NoPrefix => LEVDIST2.build_dfa(query),
},
};
DfaExt { query_len: query.len(), automaton: dfa }
}
pub fn build_prefix_dfa(query: &str) -> DfaExt {
build_dfa_with_setting(query, PrefixSetting::Prefix)
}
pub fn build_dfa(query: &str) -> DfaExt {
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
}
pub trait AutomatonExt: Automaton {
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
fn query_len(&self) -> usize;
}


@ -1,26 +0,0 @@
use std::io::{self, BufReader, BufRead};
use std::collections::HashSet;
use std::path::Path;
use std::fs::File;
#[derive(Debug)]
pub struct CommonWords(HashSet<String>);
impl CommonWords {
pub fn from_file<P>(path: P) -> io::Result<Self>
where P: AsRef<Path>
{
let file = File::open(path)?;
let file = BufReader::new(file);
let mut set = HashSet::new();
for line in file.lines().filter_map(|l| l.ok()) {
let word = line.trim().to_owned();
set.insert(word);
}
Ok(CommonWords(set))
}
pub fn contains(&self, word: &str) -> bool {
self.0.contains(word)
}
}


@ -1,61 +0,0 @@
use std::slice::from_raw_parts;
use std::mem::size_of;
use std::error::Error;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use sdset::Set;
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::data::SharedData;
use crate::DocumentId;
use super::into_u8_slice;
#[derive(Default, Clone)]
pub struct DocIds(SharedData);
impl DocIds {
pub fn new(ids: &Set<DocumentId>) -> DocIds {
let bytes = unsafe { into_u8_slice(ids.as_slice()) };
let data = SharedData::from_bytes(bytes.to_vec());
DocIds(data)
}
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
pub fn as_bytes(&self) -> &[u8] {
&self.0
}
}
impl AsRef<Set<DocumentId>> for DocIds {
fn as_ref(&self) -> &Set<DocumentId> {
let slice = &self.0;
let ptr = slice.as_ptr() as *const DocumentId;
let len = slice.len() / size_of::<DocumentId>();
let slice = unsafe { from_raw_parts(ptr, len) };
Set::new_unchecked(slice)
}
}
impl FromSharedDataCursor for DocIds {
type Error = Box<Error>;
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIds, Self::Error> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let data = cursor.extract(len);
Ok(DocIds(data))
}
}
impl WriteToBytes for DocIds {
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let len = self.0.len() as u64;
bytes.write_u64::<LittleEndian>(len).unwrap();
bytes.extend_from_slice(&self.0);
}
}


@ -1,231 +0,0 @@
use std::io::{self, Write};
use std::slice::from_raw_parts;
use std::mem::size_of;
use std::ops::Index;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use sdset::Set;
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::data::SharedData;
use crate::DocIndex;
use super::into_u8_slice;
#[derive(Debug)]
#[repr(C)]
struct Range {
start: u64,
end: u64,
}
#[derive(Clone, Default)]
pub struct DocIndexes {
ranges: SharedData,
indexes: SharedData,
}
impl DocIndexes {
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
self.ranges().get(index).map(|Range { start, end }| {
let start = *start as usize;
let end = *end as usize;
let slice = &self.indexes()[start..end];
Set::new_unchecked(slice)
})
}
fn ranges(&self) -> &[Range] {
let slice = &self.ranges;
let ptr = slice.as_ptr() as *const Range;
let len = slice.len() / size_of::<Range>();
unsafe { from_raw_parts(ptr, len) }
}
fn indexes(&self) -> &[DocIndex] {
let slice = &self.indexes;
let ptr = slice.as_ptr() as *const DocIndex;
let len = slice.len() / size_of::<DocIndex>();
unsafe { from_raw_parts(ptr, len) }
}
}
impl Index<usize> for DocIndexes {
type Output = [DocIndex];
fn index(&self, index: usize) -> &Self::Output {
match self.get(index) {
Some(indexes) => indexes,
None => panic!("index {} out of range for a maximum of {} ranges", index, self.ranges().len()),
}
}
}
impl FromSharedDataCursor for DocIndexes {
type Error = io::Error;
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIndexes, Self::Error> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let ranges = cursor.extract(len);
let len = cursor.read_u64::<LittleEndian>()? as usize;
let indexes = cursor.extract(len);
Ok(DocIndexes { ranges, indexes })
}
}
impl WriteToBytes for DocIndexes {
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let ranges_len = self.ranges.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(ranges_len);
bytes.extend_from_slice(&self.ranges);
let indexes_len = self.indexes.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
bytes.extend_from_slice(&self.indexes);
}
}
pub struct DocIndexesBuilder<W> {
ranges: Vec<Range>,
indexes: Vec<DocIndex>,
wtr: W,
}
impl DocIndexesBuilder<Vec<u8>> {
pub fn memory() -> Self {
DocIndexesBuilder {
ranges: Vec::new(),
indexes: Vec::new(),
wtr: Vec::new(),
}
}
}
impl<W: Write> DocIndexesBuilder<W> {
pub fn new(wtr: W) -> Self {
DocIndexesBuilder {
ranges: Vec::new(),
indexes: Vec::new(),
wtr: wtr,
}
}
pub fn insert(&mut self, indexes: &Set<DocIndex>) {
let len = indexes.len() as u64;
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
let range = Range { start, end: start + len };
self.ranges.push(range);
self.indexes.extend_from_slice(indexes);
}
pub fn finish(self) -> io::Result<()> {
self.into_inner().map(drop)
}
pub fn into_inner(mut self) -> io::Result<W> {
let ranges = unsafe { into_u8_slice(&self.ranges) };
let len = ranges.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(ranges)?;
let indexes = unsafe { into_u8_slice(&self.indexes) };
let len = indexes.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(indexes)?;
Ok(self.wtr)
}
}
#[cfg(test)]
mod tests {
use std::error::Error;
use crate::DocumentId;
use super::*;
#[test]
fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex {
document_id: DocumentId(0),
attribute: 3,
word_index: 11,
char_index: 30,
char_length: 4,
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: 4,
word_index: 21,
char_index: 35,
char_length: 6,
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: 8,
word_index: 2,
char_index: 89,
char_length: 6,
};
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?);
let bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(bytes)?;
assert_eq!(docs.get(0), Some(Set::new(&[a])?));
assert_eq!(docs.get(1), Some(Set::new(&[a, b, c])?));
assert_eq!(docs.get(2), Some(Set::new(&[a, c])?));
assert_eq!(docs.get(3), None);
Ok(())
}
#[test]
fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex {
document_id: DocumentId(0),
attribute: 3,
word_index: 11,
char_index: 30,
char_length: 4,
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: 4,
word_index: 21,
char_index: 35,
char_length: 6,
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: 8,
word_index: 2,
char_index: 89,
char_length: 6,
};
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?);
let builder_bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
let mut bytes = Vec::new();
docs.write_to_bytes(&mut bytes);
assert_eq!(builder_bytes, bytes);
Ok(())
}
}


@ -1,16 +0,0 @@
mod doc_ids;
mod doc_indexes;
mod shared_data;
use std::slice::from_raw_parts;
use std::mem::size_of;
pub use self::doc_ids::DocIds;
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
pub use self::shared_data::SharedData;
unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
let ptr = slice.as_ptr() as *const u8;
let len = slice.len() * size_of::<T>();
from_raw_parts(ptr, len)
}


@ -1,48 +0,0 @@
use std::sync::Arc;
use std::ops::Deref;
#[derive(Default, Clone)]
pub struct SharedData {
pub bytes: Arc<Vec<u8>>,
pub offset: usize,
pub len: usize,
}
impl SharedData {
pub fn from_bytes(vec: Vec<u8>) -> SharedData {
let len = vec.len();
let bytes = Arc::from(vec);
SharedData::new(bytes, 0, len)
}
pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
SharedData { bytes, offset, len }
}
pub fn as_slice(&self) -> &[u8] {
&self.bytes[self.offset..self.offset + self.len]
}
pub fn range(&self, offset: usize, len: usize) -> SharedData {
assert!(offset + len <= self.len);
SharedData {
bytes: self.bytes.clone(),
offset: self.offset + offset,
len: len,
}
}
}
impl Deref for SharedData {
type Target = [u8];
fn deref(&self) -> &Self::Target {
self.as_slice()
}
}
impl AsRef<[u8]> for SharedData {
fn as_ref(&self) -> &[u8] {
self.as_slice()
}
}


@ -1,46 +0,0 @@
use std::collections::{HashSet, HashMap};
use serde_derive::{Serialize, Deserialize};
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RankingOrdering {
Asc,
Dsc
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AccessToken {
pub read_key: String,
pub write_key: String,
pub admin_key: String,
}
#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Config {
pub stop_words: Option<HashSet<String>>,
pub ranking_order: Option<Vec<String>>,
pub distinct_field: Option<String>,
pub ranking_rules: Option<HashMap<String, RankingOrdering>>,
pub access_token: Option<AccessToken>,
}
impl Config {
pub fn update_with(&mut self, new: Config) {
if let Some(stop_words) = new.stop_words {
self.stop_words = Some(stop_words);
};
if let Some(ranking_order) = new.ranking_order {
self.ranking_order = Some(ranking_order);
};
if let Some(distinct_field) = new.distinct_field {
self.distinct_field = Some(distinct_field);
};
if let Some(ranking_rules) = new.ranking_rules {
self.ranking_rules = Some(ranking_rules);
};
if let Some(access_token) = new.access_token {
self.access_token = Some(access_token);
};
}
}
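
A usage sketch of Config::update_with, assuming the type above is in scope: only the fields present in the partial configuration override the existing ones. The field values below are made up for illustration.

fn main() {
    let mut config = Config::default();
    config.distinct_field = Some("product_id".to_string()); // made-up field name

    // A partial configuration: only ranking_order is provided.
    let partial = Config {
        ranking_order: Some(vec!["timestamp".to_string(), "title".to_string()]),
        ..Config::default()
    };

    config.update_with(partial);
    assert_eq!(config.distinct_field.as_deref(), Some("product_id")); // left untouched
    assert!(config.ranking_order.is_some()); // overridden
}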

View File

@@ -1,149 +0,0 @@
use std::io::{Cursor, Read, Write};
use std::mem::size_of;
use std::fmt;
use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};
use crate::database::schema::SchemaAttr;
use crate::DocumentId;
const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>();
#[derive(Copy, Clone)]
pub struct DocumentKey([u8; DOC_KEY_LEN]);
impl DocumentKey {
pub fn new(id: DocumentId) -> DocumentKey {
let mut buffer = [0; DOC_KEY_LEN];
let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(b"doc-").unwrap();
wtr.write_u64::<BigEndian>(id.0).unwrap();
DocumentKey(buffer)
}
pub fn from_bytes(mut bytes: &[u8]) -> DocumentKey {
assert!(bytes.len() >= DOC_KEY_LEN);
assert_eq!(&bytes[..4], b"doc-");
let mut buffer = [0; DOC_KEY_LEN];
bytes.read_exact(&mut buffer).unwrap();
DocumentKey(buffer)
}
pub fn with_attribute(&self, attr: SchemaAttr) -> DocumentKeyAttr {
DocumentKeyAttr::new(self.document_id(), attr)
}
pub fn with_attribute_min(&self) -> DocumentKeyAttr {
DocumentKeyAttr::new(self.document_id(), SchemaAttr::min())
}
pub fn with_attribute_max(&self) -> DocumentKeyAttr {
DocumentKeyAttr::new(self.document_id(), SchemaAttr::max())
}
pub fn document_id(&self) -> DocumentId {
let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
DocumentId(id)
}
}
impl AsRef<[u8]> for DocumentKey {
fn as_ref(&self) -> &[u8] {
&self.0
}
}
impl fmt::Debug for DocumentKey {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("DocumentKey")
.field("document_id", &self.document_id())
.finish()
}
}
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct DocumentKeyAttr([u8; DOC_KEY_ATTR_LEN]);
impl DocumentKeyAttr {
pub fn new(id: DocumentId, attr: SchemaAttr) -> DocumentKeyAttr {
let mut buffer = [0; DOC_KEY_ATTR_LEN];
let DocumentKey(raw_key) = DocumentKey::new(id);
let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(&raw_key).unwrap();
wtr.write_all(b"-").unwrap();
wtr.write_u16::<BigEndian>(attr.0).unwrap();
DocumentKeyAttr(buffer)
}
pub fn with_attribute_min(id: DocumentId) -> DocumentKeyAttr {
DocumentKeyAttr::new(id, SchemaAttr::min())
}
pub fn with_attribute_max(id: DocumentId) -> DocumentKeyAttr {
DocumentKeyAttr::new(id, SchemaAttr::max())
}
pub fn from_bytes(mut bytes: &[u8]) -> DocumentKeyAttr {
assert!(bytes.len() >= DOC_KEY_ATTR_LEN);
assert_eq!(&bytes[..4], b"doc-");
let mut buffer = [0; DOC_KEY_ATTR_LEN];
bytes.read_exact(&mut buffer).unwrap();
DocumentKeyAttr(buffer)
}
pub fn document_id(&self) -> DocumentId {
let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
DocumentId(id)
}
pub fn attribute(&self) -> SchemaAttr {
let offset = 4 + size_of::<u64>() + 1;
let value = (&self.0[offset..]).read_u16::<BigEndian>().unwrap();
SchemaAttr::new(value)
}
pub fn into_document_key(self) -> DocumentKey {
DocumentKey::new(self.document_id())
}
}
impl AsRef<[u8]> for DocumentKeyAttr {
fn as_ref(&self) -> &[u8] {
&self.0
}
}
impl fmt::Debug for DocumentKeyAttr {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("DocumentKeyAttr")
.field("document_id", &self.document_id())
.field("attribute", &self.attribute().0)
.finish()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn keep_as_ref_order() {
for (a, b) in (0..).zip(1..).take(u16::max_value() as usize - 1) {
let id = DocumentId(0);
let a = DocumentKeyAttr::new(id, SchemaAttr(a));
let b = DocumentKeyAttr::new(id, SchemaAttr(b));
assert!(a < b);
assert!(a.as_ref() < b.as_ref());
}
}
}
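
The keys above are plain byte strings: b"doc-", the document id as a big-endian u64, then (for DocumentKeyAttr) b"-" and the attribute as a big-endian u16. Big-endian encoding is what keeps lexicographic byte order aligned with numeric order, as the keep_as_ref_order test checks. A standalone sketch of that layout using byteorder:

use std::io::{Cursor, Write};
use byteorder::{BigEndian, WriteBytesExt};

// Rebuild the DocumentKeyAttr layout by hand:
// b"doc-" + 8-byte big-endian document id + b"-" + 2-byte big-endian attribute.
fn raw_key(id: u64, attr: u16) -> [u8; 15] {
    let mut buffer = [0; 15];
    let mut wtr = Cursor::new(&mut buffer[..]);
    wtr.write_all(b"doc-").unwrap();
    wtr.write_u64::<BigEndian>(id).unwrap();
    wtr.write_all(b"-").unwrap();
    wtr.write_u16::<BigEndian>(attr).unwrap();
    buffer
}

fn main() {
    // Attributes of one document are contiguous and ordered, documents are ordered by id,
    // which is what lets a RocksDB range scan walk a single document attribute by attribute.
    assert!(raw_key(0, 1) < raw_key(0, 2));
    assert!(raw_key(1, 0) > raw_key(0, u16::max_value()));
}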

View File

@@ -1,175 +0,0 @@
use std::error::Error;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::{map, Map, IntoStreamer, Streamer};
use fst::raw::Fst;
use sdset::duo::{Union, DifferenceByKey};
use sdset::{Set, SetOperation};
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::data::{DocIndexes, DocIndexesBuilder};
use crate::{DocumentId, DocIndex};
#[derive(Default)]
pub struct Index {
pub map: Map,
pub indexes: DocIndexes,
}
impl Index {
pub fn remove_documents(&self, documents: &Set<DocumentId>) -> Index {
let mut buffer = Vec::new();
let mut builder = IndexBuilder::new();
let mut stream = self.into_stream();
while let Some((key, indexes)) = stream.next() {
buffer.clear();
let op = DifferenceByKey::new(indexes, documents, |x| x.document_id, |x| *x);
op.extend_vec(&mut buffer);
if !buffer.is_empty() {
let indexes = Set::new_unchecked(&buffer);
builder.insert(key, indexes).unwrap();
}
}
builder.build()
}
pub fn union(&self, other: &Index) -> Index {
let mut builder = IndexBuilder::new();
let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
let mut buffer = Vec::new();
while let Some((key, ivalues)) = stream.next() {
buffer.clear();
match ivalues {
[a, b] => {
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
let indexes = &indexes[a.value as usize];
let a = Set::new_unchecked(indexes);
let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
let indexes = &indexes[b.value as usize];
let b = Set::new_unchecked(indexes);
let op = Union::new(a, b);
op.extend_vec(&mut buffer);
},
[x] => {
let indexes = if x.index == 0 { &self.indexes } else { &other.indexes };
let indexes = &indexes[x.value as usize];
buffer.extend_from_slice(indexes)
},
_ => continue,
}
if !buffer.is_empty() {
let indexes = Set::new_unchecked(&buffer);
builder.insert(key, indexes).unwrap();
}
}
builder.build()
}
}
impl FromSharedDataCursor for Index {
type Error = Box<Error>;
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Index, Self::Error> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let data = cursor.extract(len);
let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
let map = Map::from(fst);
let indexes = DocIndexes::from_shared_data_cursor(cursor)?;
Ok(Index { map, indexes})
}
}
impl WriteToBytes for Index {
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let slice = self.map.as_fst().as_bytes();
let len = slice.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(len);
bytes.extend_from_slice(slice);
self.indexes.write_to_bytes(bytes);
}
}
impl<'m, 'a> IntoStreamer<'a> for &'m Index {
type Item = (&'a [u8], &'a Set<DocIndex>);
type Into = Stream<'m>;
fn into_stream(self) -> Self::Into {
Stream {
map_stream: self.map.into_stream(),
indexes: &self.indexes,
}
}
}
pub struct Stream<'m> {
map_stream: map::Stream<'m>,
indexes: &'m DocIndexes,
}
impl<'m, 'a> Streamer<'a> for Stream<'m> {
type Item = (&'a [u8], &'a Set<DocIndex>);
fn next(&'a mut self) -> Option<Self::Item> {
match self.map_stream.next() {
Some((input, index)) => {
let indexes = &self.indexes[index as usize];
let indexes = Set::new_unchecked(indexes);
Some((input, indexes))
},
None => None,
}
}
}
pub struct IndexBuilder {
map: fst::MapBuilder<Vec<u8>>,
indexes: DocIndexesBuilder<Vec<u8>>,
value: u64,
}
impl IndexBuilder {
pub fn new() -> Self {
IndexBuilder {
map: fst::MapBuilder::memory(),
indexes: DocIndexesBuilder::memory(),
value: 0,
}
}
/// If a key is inserted that is less than or equal to any previous key added,
/// then an error is returned. Similarly, if there was a problem writing
/// to the underlying writer, an error is returned.
// FIXME what if one write doesn't work but the other does?
pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> fst::Result<()>
where K: AsRef<[u8]>,
{
self.map.insert(key, self.value)?;
self.indexes.insert(indexes);
self.value += 1;
Ok(())
}
pub fn build(self) -> Index {
let map = self.map.into_inner().unwrap();
let indexes = self.indexes.into_inner().unwrap();
let map = Map::from_bytes(map).unwrap();
let indexes = DocIndexes::from_bytes(indexes).unwrap();
Index { map, indexes }
}
}
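
A usage sketch of the builder and of Index::union above, assuming the DocumentId, DocIndex, Index and IndexBuilder items of this crate are in scope: two word-to-postings maps built separately are merged key by key, with the postings of shared keys unioned.

use sdset::Set;

fn posting(word: &str, indexes: &[DocIndex]) -> Index {
    let mut builder = IndexBuilder::new();
    builder.insert(word, Set::new(indexes).unwrap()).unwrap();
    builder.build()
}

fn main() {
    let a = DocIndex { document_id: DocumentId(0), attribute: 0, word_index: 0, char_index: 0, char_length: 5 };
    let b = DocIndex { document_id: DocumentId(1), attribute: 0, word_index: 3, char_index: 12, char_length: 5 };

    let left = posting("hello", &[a]);
    let right = posting("hello", &[b]);

    // union walks both fsts and merges the postings of every key present on either side.
    let merged = left.union(&right);
    assert_eq!(merged.map.len(), 1);
}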

View File

@@ -1,909 +0,0 @@
use std::time::Instant;
use std::error::Error;
use std::ffi::OsStr;
use std::sync::Arc;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::ops::{Deref, DerefMut};
use rocksdb::rocksdb_options::{DBOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{Writable, Snapshot};
use rocksdb::{DB, MergeOperands};
use size_format::SizeFormatterBinary;
use arc_swap::ArcSwap;
use lockfree::map::Map;
use hashbrown::HashMap;
use log::{info, error, warn};
use crate::database::schema::SchemaAttr;
use crate::shared_data_cursor::FromSharedDataCursor;
use crate::write_to_bytes::WriteToBytes;
use crate::DocumentId;
use self::update::{ReadIndexEvent, ReadRankedMapEvent};
pub use self::config::Config;
pub use self::document_key::{DocumentKey, DocumentKeyAttr};
pub use self::view::{DatabaseView, DocumentIter};
pub use self::update::Update;
pub use self::serde::SerializerError;
pub use self::schema::Schema;
pub use self::index::Index;
pub use self::number::{Number, ParseNumberError};
pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>;
const DATA_INDEX: &[u8] = b"data-index";
const DATA_RANKED_MAP: &[u8] = b"data-ranked-map";
const DATA_SCHEMA: &[u8] = b"data-schema";
const CONFIG: &[u8] = b"config";
pub mod config;
pub mod schema;
pub(crate) mod index;
mod number;
mod document_key;
mod serde;
mod update;
mod view;
fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
where D: Deref<Target=DB>
{
match snapshot.get(DATA_SCHEMA)? {
Some(vector) => Ok(Schema::read_from_bin(&*vector)?),
None => Err(String::from("BUG: no schema found in the database").into()),
}
}
fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<Index, Box<Error>>
where D: Deref<Target=DB>
{
let start = Instant::now();
let vector = snapshot.get(DATA_INDEX)?;
info!("loading index from kv-store took {:.2?}", start.elapsed());
match vector {
Some(vector) => {
let start = Instant::now();
let bytes = vector.as_ref().to_vec();
info!("index size is {}B", SizeFormatterBinary::new(bytes.len() as u64));
let event = ReadIndexEvent::from_bytes(bytes)?;
let index = event.updated_documents().expect("BUG: invalid event deserialized");
info!("loading index from bytes took {:.2?}", start.elapsed());
Ok(index)
},
None => Ok(Index::default()),
}
}
fn retrieve_data_ranked_map<D>(snapshot: &Snapshot<D>) -> Result<RankedMap, Box<Error>>
where D: Deref<Target=DB>,
{
let start = Instant::now();
let vector = snapshot.get(DATA_RANKED_MAP)?;
info!("loading ranked map from kv-store took {:.2?}", start.elapsed());
match vector {
Some(vector) => {
let start = Instant::now();
let bytes = vector.as_ref().to_vec();
info!("ranked map size is {}B", SizeFormatterBinary::new(bytes.len() as u64));
let event = ReadRankedMapEvent::from_bytes(bytes)?;
let ranked_map = event.updated_documents().expect("BUG: invalid event deserialized");
info!("loading ranked map from bytes took {:.2?}", start.elapsed());
Ok(ranked_map)
},
None => Ok(RankedMap::new()),
}
}
fn retrieve_config<D>(snapshot: &Snapshot<D>) -> Result<Config, Box<Error>>
where D: Deref<Target=DB>,
{
match snapshot.get(CONFIG)? {
Some(vector) => Ok(bincode::deserialize(&*vector)?),
None => Ok(Config::default()),
}
}
fn merge_indexes(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
use self::update::ReadIndexEvent::{self, *};
use self::update::WriteIndexEvent;
let mut index = Index::default();
for bytes in existing.into_iter().chain(operands) {
match ReadIndexEvent::from_bytes(bytes.to_vec()).unwrap() {
RemovedDocuments(d) => index = index.remove_documents(d.as_ref()),
UpdatedDocuments(i) => index = index.union(&i),
}
}
WriteIndexEvent::UpdatedDocuments(&index).into_bytes()
}
fn merge_ranked_maps(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
use self::update::ReadRankedMapEvent::{self, *};
use self::update::WriteRankedMapEvent;
let mut ranked_map = RankedMap::default();
for bytes in existing.into_iter().chain(operands) {
match ReadRankedMapEvent::from_bytes(bytes.to_vec()).unwrap() {
RemovedDocuments(d) => ranked_map.retain(|(k, _), _| !d.as_ref().binary_search(k).is_ok()),
UpdatedDocuments(i) => ranked_map.extend(i),
}
}
WriteRankedMapEvent::UpdatedDocuments(&ranked_map).into_bytes()
}
fn merge_operator(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
match key {
DATA_INDEX => merge_indexes(existing, operands),
DATA_RANKED_MAP => merge_ranked_maps(existing, operands),
key => panic!("The merge operator does not support merging {:?}", key),
}
}
pub struct IndexUpdate {
index: String,
update: Update,
}
impl Deref for IndexUpdate {
type Target = Update;
fn deref(&self) -> &Update {
&self.update
}
}
impl DerefMut for IndexUpdate {
fn deref_mut(&mut self) -> &mut Update {
&mut self.update
}
}
struct DatabaseIndex {
db: Arc<DB>,
// This view is updated each time the DB ingests an update.
view: ArcSwap<DatabaseView<Arc<DB>>>,
// The path of the mdb folder stored on disk.
path: PathBuf,
// must_die is false by default; it must be set to true before the Index is dropped.
// It is used to erase the folder saved on disk when the user requests to delete an index.
must_die: AtomicBool,
}
impl DatabaseIndex {
fn create<P: AsRef<Path>>(path: P, schema: &Schema) -> Result<DatabaseIndex, Box<Error>> {
let path = path.as_ref();
if path.exists() {
return Err(format!("File already exists at path: {}, cannot create database.",
path.display()).into())
}
let path_lossy = path.to_string_lossy();
let mut opts = DBOptions::new();
opts.create_if_missing(true);
// opts.error_if_exists(true); // FIXME pull request that
let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data merge operator", merge_operator);
let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;
let mut schema_bytes = Vec::new();
schema.write_to_bin(&mut schema_bytes)?;
db.put(DATA_SCHEMA, &schema_bytes)?;
let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone());
let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?));
Ok(DatabaseIndex {
db: db,
view: view,
path: path.to_path_buf(),
must_die: AtomicBool::new(false)
})
}
fn open<P: AsRef<Path>>(path: P) -> Result<DatabaseIndex, Box<Error>> {
let path_lossy = path.as_ref().to_string_lossy();
let mut opts = DBOptions::new();
opts.create_if_missing(false);
let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data merge operator", merge_operator);
let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;
// FIXME create a generic function to do that!
let _schema = match db.get(DATA_SCHEMA)? {
Some(value) => Schema::read_from_bin(&*value)?,
None => return Err(String::from("Database does not contain a schema").into()),
};
let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone());
let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?));
Ok(DatabaseIndex {
db: db,
view: view,
path: path.as_ref().to_path_buf(),
must_die: AtomicBool::new(false)
})
}
fn must_die(&self) {
self.must_die.store(true, Ordering::Relaxed)
}
fn start_update(&self) -> Result<Update, Box<Error>> {
let schema = match self.db.get(DATA_SCHEMA)? {
Some(value) => Schema::read_from_bin(&*value)?,
None => panic!("Database does not contain a schema"),
};
Ok(Update::new(schema))
}
fn commit_update(&self, update: Update) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
let batch = update.build()?;
self.db.write(batch)?;
let snapshot = Snapshot::new(self.db.clone());
let view = Arc::new(DatabaseView::new(snapshot)?);
self.view.store(view.clone());
Ok(view)
}
fn view(&self) -> Arc<DatabaseView<Arc<DB>>> {
self.view.load()
}
fn get_config(&self) -> Config {
self.view().config().clone()
}
fn update_config(&self, config: Config) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>>{
let data = bincode::serialize(&config)?;
self.db.put(CONFIG, &data)?;
let snapshot = Snapshot::new(self.db.clone());
let view = Arc::new(DatabaseView::new(snapshot)?);
self.view.store(view.clone());
Ok(view)
}
fn path(&self) -> &Path {
self.path.as_path()
}
}
impl Drop for DatabaseIndex {
fn drop(&mut self) {
if self.must_die.load(Ordering::Relaxed) {
if let Err(err) = fs::remove_dir_all(&self.path) {
error!("Impossible to remove mdb when Database is dropped; {}", err);
}
}
}
}
pub struct Database {
indexes: Map<String, Arc<DatabaseIndex>>,
path: PathBuf,
}
impl Database {
pub fn create<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
Ok(Database {
indexes: Map::new(),
path: path.as_ref().to_path_buf(),
})
}
pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
let entries = fs::read_dir(&path)?;
let indexes = Map::new();
for entry in entries {
let path = match entry {
Ok(p) => p.path(),
Err(err) => {
warn!("Impossible to retrieve the path from an entry; {}", err);
continue
}
};
let name = match path.file_stem().and_then(OsStr::to_str) {
Some(name) => name.to_owned(),
None => continue
};
let db = match DatabaseIndex::open(path.clone()) {
Ok(db) => db,
Err(err) => {
warn!("Impossible to open the database; {}", err);
continue
}
};
info!("Load database {}", name);
indexes.insert(name, Arc::new(db));
}
Ok(Database {
indexes: indexes,
path: path.as_ref().to_path_buf(),
})
}
pub fn create_index(&self, name: &str, schema: &Schema) -> Result<(), Box<Error>> {
let index_path = self.path.join(name);
if index_path.exists() {
return Err("Index already exists".into());
}
let index = DatabaseIndex::create(index_path, schema)?;
self.indexes.insert(name.to_owned(), Arc::new(index));
Ok(())
}
pub fn delete_index(&self, name: &str) -> Result<(), Box<Error>> {
let index_guard = self.indexes.remove(name).ok_or("Index not found")?;
index_guard.val().must_die();
Ok(())
}
pub fn list_indexes(&self) -> Vec<String> {
self.indexes.iter().map(|g| g.key().clone()).collect()
}
pub fn start_update(&self, index: &str) -> Result<IndexUpdate, Box<Error>> {
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
let update = index_guard.val().start_update()?;
Ok(IndexUpdate { index: index.to_owned(), update })
}
pub fn commit_update(&self, update: IndexUpdate)-> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
let index_guard = self.indexes.get(&update.index).ok_or("Index not found")?;
index_guard.val().commit_update(update.update)
}
pub fn view(&self, index: &str) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
Ok(index_guard.val().view())
}
pub fn get_config(&self, index: &str) -> Result<Config, Box<Error>> {
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
Ok(index_guard.val().get_config())
}
pub fn update_config(&self, index: &str, config: Config) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>>{
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
Ok(index_guard.val().update_config(config)?)
}
pub fn path(&self) -> &Path {
self.path.as_path()
}
pub fn index_path(&self, index: &str) -> Result<PathBuf, Box<Error>> {
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
let path = index_guard.val().path();
Ok(path.to_path_buf())
}
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use std::error::Error;
use serde_derive::{Serialize, Deserialize};
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
use crate::tokenizer::DefaultBuilder;
use super::*;
#[test]
fn ingest_one_easy_update() -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let meilidb_path = dir.path().join("meilidb.mdb");
let meilidb_index_name = "default";
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct SimpleDoc {
id: u64,
title: String,
description: String,
timestamp: u64,
}
let schema = {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("id", STORED);
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED);
builder.build()
};
let database = Database::create(&meilidb_path)?;
database.create_index(meilidb_index_name, &schema)?;
let doc0 = SimpleDoc {
id: 0,
title: String::from("I am a title"),
description: String::from("I am a description"),
timestamp: 1234567,
};
let doc1 = SimpleDoc {
id: 1,
title: String::from("I am the second title"),
description: String::from("I am the second description"),
timestamp: 7654321,
};
let tokenizer_builder = DefaultBuilder::new();
let mut builder = database.start_update(meilidb_index_name)?;
let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
let view = database.commit_update(builder)?;
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1);
Ok(dir.close()?)
}
#[test]
fn ingest_two_easy_updates() -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let meilidb_path = dir.path().join("meilidb.mdb");
let meilidb_index_name = "default";
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct SimpleDoc {
id: u64,
title: String,
description: String,
timestamp: u64,
}
let schema = {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("id", STORED);
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED);
builder.build()
};
let database = Database::create(&meilidb_path)?;
database.create_index(meilidb_index_name, &schema)?;
let doc0 = SimpleDoc {
id: 0,
title: String::from("I am a title"),
description: String::from("I am a description"),
timestamp: 1234567,
};
let doc1 = SimpleDoc {
id: 1,
title: String::from("I am the second title"),
description: String::from("I am the second description"),
timestamp: 7654321,
};
let doc2 = SimpleDoc {
id: 2,
title: String::from("I am the third title"),
description: String::from("I am the third description"),
timestamp: 7654321,
};
let doc3 = SimpleDoc {
id: 3,
title: String::from("I am the fourth title"),
description: String::from("I am the fourth description"),
timestamp: 7654321,
};
let tokenizer_builder = DefaultBuilder::new();
let mut builder = database.start_update(meilidb_index_name)?;
let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
database.commit_update(builder)?;
let mut builder = database.start_update(meilidb_index_name)?;
let docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
let docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
let view = database.commit_update(builder)?;
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1);
let de_doc2: SimpleDoc = view.document_by_id(docid2)?;
let de_doc3: SimpleDoc = view.document_by_id(docid3)?;
assert_eq!(doc2, de_doc2);
assert_eq!(doc3, de_doc3);
Ok(dir.close()?)
}
}
#[cfg(all(feature = "nightly", test))]
mod bench {
extern crate test;
use std::collections::HashSet;
use std::error::Error;
use std::iter::repeat_with;
use self::test::Bencher;
use rand::distributions::Alphanumeric;
use rand_xorshift::XorShiftRng;
use rand::{Rng, SeedableRng};
use serde_derive::Serialize;
use rand::seq::SliceRandom;
use crate::tokenizer::DefaultBuilder;
use crate::database::schema::*;
use super::*;
fn random_sentences<R: Rng>(number: usize, rng: &mut R) -> String {
let mut words = String::new();
for i in 0..number {
let word_len = rng.gen_range(1, 12);
let iter = repeat_with(|| rng.sample(Alphanumeric)).take(word_len);
words.extend(iter);
if i == number - 1 { // last word
let final_ = [".", "?", "!", "..."].choose(rng).cloned();
words.extend(final_);
} else {
let middle = [",", ", "].choose(rng).cloned();
words.extend(middle);
}
}
words
}
#[bench]
fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..300 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
database.commit_update(builder)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..3000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
database.commit_update(builder)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
#[ignore]
fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..30_000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
database.commit_update(builder)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..300 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let view = database.commit_update(builder)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
#[bench]
fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..3000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let view = database.commit_update(builder)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
#[bench]
#[ignore]
fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..30_000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let view = database.commit_update(builder)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
}
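
The merge operator registered above never rewrites the whole index or ranked map on each write: every update is appended as an event and RocksDB folds the pending events into one value when the key is read or compacted. A simplified standalone restatement of that fold for the ranked-map case, with the real (DocumentId, SchemaAttr) -> Number entries shrunk to u64 -> i64 stand-ins:

use std::collections::HashMap;

// Simplified stand-ins: u64 for (DocumentId, SchemaAttr), i64 for Number.
enum RankedMapEvent {
    RemovedDocuments(Vec<u64>),
    UpdatedDocuments(HashMap<u64, i64>),
}

fn merge(existing: HashMap<u64, i64>, operands: Vec<RankedMapEvent>) -> HashMap<u64, i64> {
    let mut ranked_map = existing;
    for event in operands {
        match event {
            RankedMapEvent::RemovedDocuments(ids) => ranked_map.retain(|k, _| !ids.contains(k)),
            RankedMapEvent::UpdatedDocuments(map) => ranked_map.extend(map),
        }
    }
    ranked_map
}

fn main() {
    let mut existing = HashMap::new();
    existing.insert(1, 10);
    existing.insert(2, 20);

    let operands = vec![
        RankedMapEvent::RemovedDocuments(vec![1]),
        RankedMapEvent::UpdatedDocuments(HashMap::from([(3, 30)])),
    ];

    let merged = merge(existing, operands);
    assert_eq!(merged.get(&1), None);
    assert_eq!(merged.get(&3), Some(&30));
}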

View File

@@ -1,98 +0,0 @@
use std::cmp::Ordering;
use std::str::FromStr;
use std::fmt;
use serde_derive::{Serialize, Deserialize};
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone)]
pub enum Number {
Unsigned(u64),
Signed(i64),
Float(f64),
}
impl FromStr for Number {
type Err = ParseNumberError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
if let Ok(unsigned) = u64::from_str(s) {
return Ok(Number::Unsigned(unsigned))
}
if let Ok(signed) = i64::from_str(s) {
return Ok(Number::Signed(signed))
}
if let Ok(float) = f64::from_str(s) {
if float == 0.0 || float.is_normal() {
return Ok(Number::Float(float))
}
}
Err(ParseNumberError)
}
}
impl PartialOrd for Number {
fn partial_cmp(&self, other: &Number) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Number {
fn cmp(&self, other: &Number) -> Ordering {
use Number::*;
match (self, other) {
(Unsigned(s), Unsigned(o)) => s.cmp(o),
(Unsigned(s), Signed(o)) => {
let s = i128::from(*s);
let o = i128::from(*o);
s.cmp(&o)
},
(Unsigned(s), Float(o)) => {
let s = *s as f64;
s.partial_cmp(&o).unwrap_or(Ordering::Equal)
},
(Signed(s), Unsigned(o)) => {
let s = i128::from(*s);
let o = i128::from(*o);
s.cmp(&o)
},
(Signed(s), Signed(o)) => s.cmp(o),
(Signed(s), Float(o)) => {
let s = *s as f64;
s.partial_cmp(o).unwrap_or(Ordering::Equal)
},
(Float(s), Unsigned(o)) => {
let o = *o as f64;
s.partial_cmp(&o).unwrap_or(Ordering::Equal)
},
(Float(s), Signed(o)) => {
let o = *o as f64;
s.partial_cmp(&o).unwrap_or(Ordering::Equal)
},
(Float(s), Float(o)) => {
s.partial_cmp(o).unwrap_or(Ordering::Equal)
},
}
}
}
impl PartialEq for Number {
fn eq(&self, other: &Number) -> bool {
self.cmp(other) == Ordering::Equal
}
}
impl Eq for Number { }
pub struct ParseNumberError;
impl fmt::Display for ParseNumberError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("can not parse number")
}
}
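
A quick illustration of the cross-variant ordering above, assuming the Number type is in scope: unsigned/signed pairs are widened to i128, and any comparison involving a float falls back to f64.

fn main() {
    assert!(Number::Signed(-1) < Number::Unsigned(0)); // compared through i128
    assert!(Number::Unsigned(3) < Number::Float(3.5)); // compared through f64
    assert_eq!(Number::Float(2.0), Number::Signed(2)); // equality follows the same rule
}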

View File

@@ -1,186 +0,0 @@
use std::error::Error;
use std::ops::Deref;
use std::fmt;
use rocksdb::rocksdb::{DB, Snapshot, SeekKey};
use rocksdb::rocksdb_options::ReadOptions;
use serde::forward_to_deserialize_any;
use serde::de::value::MapDeserializer;
use serde::de::{self, Visitor, IntoDeserializer};
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::database::schema::Schema;
use crate::DocumentId;
pub struct Deserializer<'a, D>
where D: Deref<Target=DB>
{
snapshot: &'a Snapshot<D>,
schema: &'a Schema,
document_id: DocumentId,
}
impl<'a, D> Deserializer<'a, D>
where D: Deref<Target=DB>
{
pub fn new(snapshot: &'a Snapshot<D>, schema: &'a Schema, doc: DocumentId) -> Self {
Deserializer { snapshot, schema, document_id: doc }
}
}
impl<'de, 'a, 'b, D> de::Deserializer<'de> for &'b mut Deserializer<'a, D>
where D: Deref<Target=DB>
{
type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.deserialize_map(visitor)
}
forward_to_deserialize_any! {
bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq
bytes byte_buf unit_struct tuple_struct
identifier tuple ignored_any option newtype_struct enum struct
}
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
let mut options = ReadOptions::new();
let lower = DocumentKey::new(self.document_id);
let upper = lower.with_attribute_max();
options.set_iterate_lower_bound(lower.as_ref());
options.set_iterate_upper_bound(upper.as_ref());
let mut iter = self.snapshot.iter_opt(options);
iter.seek(SeekKey::Start);
if iter.kv().is_none() {
// FIXME return an error
}
let iter = iter.map(|(key, value)| {
// retrieve the schema attribute name
// from the schema attribute number
let document_key_attr = DocumentKeyAttr::from_bytes(&key);
let schema_attr = document_key_attr.attribute();
let attribute_name = self.schema.attribute_name(schema_attr);
(attribute_name, Value(value))
});
let map_deserializer = MapDeserializer::new(iter);
visitor.visit_map(map_deserializer)
}
}
struct Value(Vec<u8>);
impl<'de> IntoDeserializer<'de, DeserializerError> for Value {
type Deserializer = Self;
fn into_deserializer(self) -> Self::Deserializer {
self
}
}
macro_rules! forward_to_bincode_values {
($($ty:ident => $de_method:ident,)*) => {
$(
fn $de_method<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
match bincode::deserialize::<$ty>(&self.0) {
Ok(val) => val.into_deserializer().$de_method(visitor),
Err(e) => Err(de::Error::custom(e)),
}
}
)*
}
}
impl<'de, 'a> de::Deserializer<'de> for Value {
type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.0.into_deserializer().deserialize_any(visitor)
}
fn deserialize_str<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.deserialize_string(visitor)
}
fn deserialize_string<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
match bincode::deserialize::<String>(&self.0) {
Ok(val) => val.into_deserializer().deserialize_string(visitor),
Err(e) => Err(de::Error::custom(e)),
}
}
fn deserialize_bytes<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.deserialize_byte_buf(visitor)
}
fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
match bincode::deserialize::<Vec<u8>>(&self.0) {
Ok(val) => val.into_deserializer().deserialize_byte_buf(visitor),
Err(e) => Err(de::Error::custom(e)),
}
}
forward_to_bincode_values! {
char => deserialize_char,
bool => deserialize_bool,
u8 => deserialize_u8,
u16 => deserialize_u16,
u32 => deserialize_u32,
u64 => deserialize_u64,
i8 => deserialize_i8,
i16 => deserialize_i16,
i32 => deserialize_i32,
i64 => deserialize_i64,
f32 => deserialize_f32,
f64 => deserialize_f64,
}
forward_to_deserialize_any! {
unit seq map
unit_struct tuple_struct
identifier tuple ignored_any option newtype_struct enum struct
}
}
#[derive(Debug)]
pub enum DeserializerError {
Custom(String),
}
impl de::Error for DeserializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
DeserializerError::Custom(msg.to_string())
}
}
impl fmt::Display for DeserializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DeserializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for DeserializerError {}
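
The document deserializer above is essentially serde's MapDeserializer fed with (attribute name, stored value) pairs read from the snapshot. A standalone sketch of that pattern with made-up field names and values, using serde with its derive feature; the real code deserializes each value from bincode instead of handing string slices over directly.

use serde::Deserialize;
use serde::de::value::{Error, MapDeserializer};

#[derive(Deserialize, Debug)]
struct Movie {
    title: String,
    description: String,
}

fn main() -> Result<(), Error> {
    // (attribute name, value) pairs, in the order the key-value store yields them.
    let fields = vec![("title", "Hackers"), ("description", "A film from 1995")];

    let movie = Movie::deserialize(MapDeserializer::<_, Error>::new(fields.into_iter()))?;
    assert_eq!(movie.title, "Hackers");
    println!("{:?}", movie);
    Ok(())
}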

View File

@@ -1,194 +0,0 @@
use std::collections::HashSet;
use serde::Serialize;
use serde::ser;
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::Token;
use crate::{is_cjk, DocumentId, DocIndex};
pub struct IndexerSerializer<'a, 'b, B> {
pub tokenizer_builder: &'a B,
pub update: &'a mut DocumentUpdate<'b>,
pub document_id: DocumentId,
pub attribute: SchemaAttr,
pub stop_words: &'a HashSet<String>,
}
impl<'a, 'b, B> ser::Serializer for IndexerSerializer<'a, 'b, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for token in self.tokenizer_builder.build(v) {
let Token { word, word_index, char_index } = token;
let document_id = self.document_id;
// FIXME: must use u32::try_from instead
let attribute = self.attribute.0;
let word_index = word_index as u32;
// insert the exact representation
let word_lower = word.to_lowercase();
let length = word.chars().count() as u16;
if self.stop_words.contains(&word_lower) { continue }
// and the unidecoded lowercased version
if !word_lower.chars().any(is_cjk) {
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
let word_unidecoded = word_unidecoded.trim();
if word_lower != word_unidecoded {
let char_index = char_index as u32;
let char_length = length;
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
self.update.insert_doc_index(word_unidecoded.as_bytes().to_vec(), doc_index)?;
}
}
let char_index = char_index as u32;
let char_length = length;
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
self.update.insert_doc_index(word_lower.into_bytes(), doc_index)?;
}
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "seq" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
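
A simplified standalone restatement of the choice made in serialize_str above: words are indexed lowercased, stop words are skipped, and non-CJK words are additionally indexed under their deunicoded form so that "déjà" also matches "deja". The CJK check is left out for brevity; only the unidecode crate is assumed.

use std::collections::HashSet;
use unidecode::unidecode;

fn forms_to_index(word: &str, stop_words: &HashSet<String>) -> Vec<String> {
    let lowercased = word.to_lowercase();
    if stop_words.contains(&lowercased) {
        return Vec::new();
    }
    let mut forms = vec![lowercased.clone()];
    // stand-in for the "unidecoded lowercased version" branch above
    let deunicoded = unidecode(word).to_lowercase().trim().to_string();
    if deunicoded != lowercased {
        forms.push(deunicoded);
    }
    forms
}

fn main() {
    let stop_words: HashSet<String> = std::iter::once("the".to_string()).collect();
    assert_eq!(forms_to_index("Déjà", &stop_words), vec!["déjà".to_string(), "deja".to_string()]);
    assert!(forms_to_index("the", &stop_words).is_empty());
}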

View File

@@ -1,65 +0,0 @@
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::error::Error;
use std::fmt;
use serde::ser;
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "$ty" })
}
)*
}
}
pub mod find_id;
pub mod key_to_string;
pub mod value_to_number;
pub mod serializer;
pub mod indexer_serializer;
pub mod deserializer;
pub fn calculate_hash<T: Hash>(t: &T) -> u64 {
let mut s = DefaultHasher::new();
t.hash(&mut s);
s.finish()
}
#[derive(Debug)]
pub enum SerializerError {
DocumentIdNotFound,
UnserializableType { name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::DocumentIdNotFound => {
write!(f, "serialized document does not have an id according to the schema")
}
SerializerError::UnserializableType { name } => {
write!(f, "Only struct and map types are considered valid documents and
can be serialized, not {} types directly.", name)
},
SerializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for SerializerError {}
impl From<String> for SerializerError {
fn from(value: String) -> SerializerError {
SerializerError::Custom(value)
}
}

View File

@@ -1,296 +0,0 @@
use std::collections::HashSet;
use serde::Serialize;
use serde::ser;
use crate::database::serde::indexer_serializer::IndexerSerializer;
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::serde::value_to_number::ValueToNumberSerializer;
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::schema::Schema;
use crate::DocumentId;
pub struct Serializer<'a, 'b, B> {
pub schema: &'a Schema,
pub update: &'a mut DocumentUpdate<'b>,
pub document_id: DocumentId,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, 'b, B> ser::Serializer for Serializer<'a, 'b, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapSerializer<'a, 'b, B>;
type SerializeStruct = StructSerializer<'a, 'b, B>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer {
schema: self.schema,
document_id: self.document_id,
update: self.update,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
current_key_name: None,
})
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
document_id: self.document_id,
update: self.update,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
pub struct MapSerializer<'a, 'b, B> {
pub schema: &'a Schema,
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate<'b>,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
pub current_key_name: Option<String>,
}
impl<'a, 'b, B> ser::SerializeMap for MapSerializer<'a, 'b, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V,
) -> Result<(), Self::Error>
where K: Serialize, V: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
if let Some(attr) = self.schema.attribute(key) {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
self.update.insert_attribute_value(attr, &value)?;
}
if props.is_indexed() {
let serializer = IndexerSerializer {
update: self.update,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
if props.is_ranked() {
let number = value.serialize(ValueToNumberSerializer)?;
self.update.register_ranked_attribute(attr, number)?;
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
pub struct StructSerializer<'a, 'b, B> {
pub schema: &'a Schema,
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate<'b>,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, 'b, B> ser::SerializeStruct for StructSerializer<'a, 'b, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
if let Some(attr) = self.schema.attribute(key) {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
self.update.insert_attribute_value(attr, &value)?;
}
if props.is_indexed() {
let serializer = IndexerSerializer {
update: self.update,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
if props.is_ranked() {
let integer = value.serialize(ValueToNumberSerializer)?;
self.update.register_ranked_attribute(attr, integer)?;
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}

View File

@@ -1,55 +0,0 @@
use std::error::Error;
use byteorder::{ReadBytesExt, WriteBytesExt};
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::database::Index;
use crate::data::DocIds;
pub enum WriteIndexEvent<'a> {
RemovedDocuments(&'a DocIds),
UpdatedDocuments(&'a Index),
}
impl<'a> WriteToBytes for WriteIndexEvent<'a> {
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
match self {
WriteIndexEvent::RemovedDocuments(doc_ids) => {
let _ = bytes.write_u8(0);
doc_ids.write_to_bytes(bytes);
},
WriteIndexEvent::UpdatedDocuments(index) => {
let _ = bytes.write_u8(1);
index.write_to_bytes(bytes);
}
}
}
}
pub enum ReadIndexEvent {
RemovedDocuments(DocIds),
UpdatedDocuments(Index),
}
impl ReadIndexEvent {
pub fn updated_documents(self) -> Option<Index> {
use ReadIndexEvent::*;
match self {
RemovedDocuments(_) => None,
UpdatedDocuments(index) => Some(index),
}
}
}
impl FromSharedDataCursor for ReadIndexEvent {
type Error = Box<Error>;
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
match cursor.read_u8()? {
0 => DocIds::from_shared_data_cursor(cursor).map(ReadIndexEvent::RemovedDocuments),
1 => Index::from_shared_data_cursor(cursor).map(ReadIndexEvent::UpdatedDocuments),
_ => unreachable!(),
}
}
}
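
The event encoding above is a one-byte tag (0 for RemovedDocuments, 1 for UpdatedDocuments) followed by the payload. A standalone sketch of that framing with the payload reduced to raw bytes instead of DocIds/Index:

use std::io::{self, Cursor};
use byteorder::{ReadBytesExt, WriteBytesExt};

enum Event {
    Removed(Vec<u8>),
    Updated(Vec<u8>),
}

fn encode(event: &Event) -> Vec<u8> {
    let mut bytes = Vec::new();
    match event {
        Event::Removed(payload) => { bytes.write_u8(0).unwrap(); bytes.extend_from_slice(payload); }
        Event::Updated(payload) => { bytes.write_u8(1).unwrap(); bytes.extend_from_slice(payload); }
    }
    bytes
}

fn decode(bytes: &[u8]) -> io::Result<Event> {
    let mut cursor = Cursor::new(bytes);
    let tag = cursor.read_u8()?;
    let payload = bytes[cursor.position() as usize..].to_vec();
    match tag {
        0 => Ok(Event::Removed(payload)),
        1 => Ok(Event::Updated(payload)),
        tag => Err(io::Error::new(io::ErrorKind::InvalidData, format!("unknown event tag {}", tag))),
    }
}

fn main() -> io::Result<()> {
    let bytes = encode(&Event::Updated(vec![1, 2, 3]));
    match decode(&bytes)? {
        Event::Updated(payload) => assert_eq!(payload, vec![1, 2, 3]),
        Event::Removed(_) => unreachable!(),
    }
    Ok(())
}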

View File

@@ -1,239 +0,0 @@
use std::collections::{HashSet, BTreeMap};
use std::error::Error;
use rocksdb::rocksdb::{Writable, WriteBatch};
use hashbrown::hash_map::HashMap;
use sdset::{Set, SetBuf};
use serde::Serialize;
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::database::serde::serializer::Serializer;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::database::schema::Schema;
use crate::database::index::IndexBuilder;
use crate::database::{DATA_INDEX, DATA_RANKED_MAP};
use crate::database::{RankedMap, Number};
use crate::tokenizer::TokenizerBuilder;
use crate::write_to_bytes::WriteToBytes;
use crate::data::DocIds;
use crate::{DocumentId, DocIndex};
pub use self::index_event::{ReadIndexEvent, WriteIndexEvent};
pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent};
mod index_event;
mod ranked_map_event;
pub type Token = Vec<u8>; // TODO could be replaced by a SmallVec
pub struct Update {
schema: Schema,
raw_builder: RawUpdateBuilder,
}
impl Update {
pub(crate) fn new(schema: Schema) -> Update {
Update { schema, raw_builder: RawUpdateBuilder::new() }
}
pub fn update_document<T, B>(
&mut self,
document: T,
tokenizer_builder: &B,
stop_words: &HashSet<String>,
) -> Result<DocumentId, SerializerError>
where T: Serialize,
B: TokenizerBuilder,
{
let document_id = self.schema.document_id(&document)?;
let serializer = Serializer {
schema: &self.schema,
document_id: document_id,
tokenizer_builder: tokenizer_builder,
update: &mut self.raw_builder.document_update(document_id)?,
stop_words: stop_words,
};
document.serialize(serializer)?;
Ok(document_id)
}
pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
where T: Serialize,
{
let document_id = self.schema.document_id(&document)?;
self.raw_builder.document_update(document_id)?.remove()?;
Ok(document_id)
}
pub(crate) fn build(self) -> Result<WriteBatch, Box<Error>> {
self.raw_builder.build()
}
}
#[derive(Copy, Clone, PartialEq, Eq)]
enum UpdateType {
Updated,
Deleted,
}
use UpdateType::{Updated, Deleted};
pub struct RawUpdateBuilder {
documents_update: HashMap<DocumentId, UpdateType>,
documents_ranked_fields: RankedMap,
indexed_words: BTreeMap<Token, Vec<DocIndex>>,
batch: WriteBatch,
}
impl RawUpdateBuilder {
pub fn new() -> RawUpdateBuilder {
RawUpdateBuilder {
documents_update: HashMap::new(),
documents_ranked_fields: HashMap::new(),
indexed_words: BTreeMap::new(),
batch: WriteBatch::new(),
}
}
pub fn document_update(&mut self, document_id: DocumentId) -> Result<DocumentUpdate, SerializerError> {
use serde::ser::Error;
match self.documents_update.get(&document_id) {
Some(Deleted) | None => Ok(DocumentUpdate { document_id, inner: self }),
Some(Updated) => Err(SerializerError::custom(
"This document has already been removed and cannot be updated in the same update"
)),
}
}
pub fn build(self) -> Result<WriteBatch, Box<Error>> {
// create the list of all the removed documents
let removed_documents = {
let mut document_ids = Vec::new();
for (id, update_type) in self.documents_update {
if update_type == Deleted {
document_ids.push(id);
}
}
document_ids.sort_unstable();
let setbuf = SetBuf::new_unchecked(document_ids);
DocIds::new(&setbuf)
};
// create the Index of all the document updates
let index = {
let mut builder = IndexBuilder::new();
for (key, mut indexes) in self.indexed_words {
indexes.sort_unstable();
let indexes = Set::new_unchecked(&indexes);
builder.insert(key, indexes).unwrap();
}
builder.build()
};
// WARN: removed documents must absolutely
// be merged *before* document updates
// === index ===
if !removed_documents.is_empty() {
// remove the documents using the appropriate IndexEvent
let event_bytes = WriteIndexEvent::RemovedDocuments(&removed_documents).into_bytes();
self.batch.merge(DATA_INDEX, &event_bytes)?;
}
// update the documents using the appropriate IndexEvent
let event_bytes = WriteIndexEvent::UpdatedDocuments(&index).into_bytes();
self.batch.merge(DATA_INDEX, &event_bytes)?;
// === ranked map ===
if !removed_documents.is_empty() {
// update the ranked map using the appropriate RankedMapEvent
let event_bytes = WriteRankedMapEvent::RemovedDocuments(&removed_documents).into_bytes();
self.batch.merge(DATA_RANKED_MAP, &event_bytes)?;
}
// update the ranked map using the appropriate RankedMapEvent
let event_bytes = WriteRankedMapEvent::UpdatedDocuments(&self.documents_ranked_fields).into_bytes();
self.batch.merge(DATA_RANKED_MAP, &event_bytes)?;
Ok(self.batch)
}
}
pub struct DocumentUpdate<'a> {
document_id: DocumentId,
inner: &'a mut RawUpdateBuilder,
}
impl<'a> DocumentUpdate<'a> {
pub fn remove(&mut self) -> Result<(), SerializerError> {
use serde::ser::Error;
if let Updated = self.inner.documents_update.entry(self.document_id).or_insert(Deleted) {
return Err(SerializerError::custom(
"This document has already been updated and cannot be removed in the same update"
));
}
let start = DocumentKey::new(self.document_id).with_attribute_min();
let end = DocumentKey::new(self.document_id).with_attribute_max(); // FIXME max + 1
self.inner.batch.delete_range(start.as_ref(), end.as_ref())?;
Ok(())
}
pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: &[u8]) -> Result<(), SerializerError> {
use serde::ser::Error;
if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
return Err(SerializerError::custom(
"This document has already been deleted and cannot be updated in the same update"
));
}
let key = DocumentKeyAttr::new(self.document_id, attr);
self.inner.batch.put(key.as_ref(), &value)?;
Ok(())
}
pub fn insert_doc_index(&mut self, token: Token, doc_index: DocIndex) -> Result<(), SerializerError> {
use serde::ser::Error;
if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
return Err(SerializerError::custom(
"This document has already been deleted and cannot be updated in the same update"
));
}
self.inner.indexed_words.entry(token).or_insert_with(Vec::new).push(doc_index);
Ok(())
}
pub fn register_ranked_attribute(
&mut self,
attr: SchemaAttr,
number: Number,
) -> Result<(), SerializerError>
{
use serde::ser::Error;
if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
return Err(SerializerError::custom(
"This document has already been deleted, ranked attributes cannot be added in the same update"
));
}
self.inner.documents_ranked_fields.insert((self.document_id, attr), number);
Ok(())
}
}
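
A rough sketch of how this removed builder was driven from inside the crate (Update::new is pub(crate)); the schema, tokenizer_builder, stop_words, movie and old_movie values below are hypothetical, everything else comes from the module shown above:

// Sketch only: the named values are assumptions for illustration.
let mut update = Update::new(schema);

// serialize one document into the update, indexing its words on the fly
let document_id = update.update_document(&movie, &tokenizer_builder, &stop_words)?;

// schedule another document for removal in the same update
update.remove_document(&old_movie)?;

// produce the RocksDB WriteBatch carrying the merged index/ranked-map events
let batch: WriteBatch = update.build()?;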

View File

@@ -1,58 +0,0 @@
use std::error::Error;
use byteorder::{ReadBytesExt, WriteBytesExt};
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::database::RankedMap;
use crate::data::DocIds;
pub enum WriteRankedMapEvent<'a> {
RemovedDocuments(&'a DocIds),
UpdatedDocuments(&'a RankedMap),
}
impl<'a> WriteToBytes for WriteRankedMapEvent<'a> {
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
match self {
WriteRankedMapEvent::RemovedDocuments(doc_ids) => {
let _ = bytes.write_u8(0);
doc_ids.write_to_bytes(bytes);
},
WriteRankedMapEvent::UpdatedDocuments(ranked_map) => {
let _ = bytes.write_u8(1);
bincode::serialize_into(bytes, ranked_map).unwrap()
}
}
}
}
pub enum ReadRankedMapEvent {
RemovedDocuments(DocIds),
UpdatedDocuments(RankedMap),
}
impl ReadRankedMapEvent {
pub fn updated_documents(self) -> Option<RankedMap> {
use ReadRankedMapEvent::*;
match self {
RemovedDocuments(_) => None,
UpdatedDocuments(ranked_map) => Some(ranked_map),
}
}
}
impl FromSharedDataCursor for ReadRankedMapEvent {
type Error = Box<Error>;
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
match cursor.read_u8()? {
0 => DocIds::from_shared_data_cursor(cursor).map(ReadRankedMapEvent::RemovedDocuments),
1 => {
let ranked_map = bincode::deserialize_from(cursor)?;
Ok(ReadRankedMapEvent::UpdatedDocuments(ranked_map))
},
_ => unreachable!(),
}
}
}
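
These two event types round-trip through the WriteToBytes and FromSharedDataCursor traits shown further down in this diff; a minimal sketch, assuming an existing ranked_map: RankedMap:

// Sketch: encode an "updated documents" event, then decode it back.
let bytes = WriteRankedMapEvent::UpdatedDocuments(&ranked_map).into_bytes();

let event = ReadRankedMapEvent::from_bytes(bytes)?;
assert!(event.updated_documents().is_some());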

View File

@@ -1,201 +0,0 @@
use std::error::Error;
use std::path::Path;
use std::ops::Deref;
use std::{fmt, marker};
use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter};
use serde::de::DeserializeOwned;
use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config};
use crate::database::serde::deserializer::Deserializer;
use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::rank::{QueryBuilder, FilterFunc};
use crate::database::schema::Schema;
use crate::database::index::Index;
use crate::database::RankedMap;
use crate::database::Config;
use crate::DocumentId;
pub struct DatabaseView<D>
where D: Deref<Target=DB>
{
snapshot: Snapshot<D>,
index: Index,
ranked_map: RankedMap,
schema: Schema,
config: Config,
}
impl<D> DatabaseView<D>
where D: Deref<Target=DB>
{
pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> {
let schema = retrieve_data_schema(&snapshot)?;
let index = retrieve_data_index(&snapshot)?;
let ranked_map = retrieve_data_ranked_map(&snapshot)?;
let config = retrieve_config(&snapshot)?;
Ok(DatabaseView { snapshot, index, ranked_map, schema, config })
}
pub fn schema(&self) -> &Schema {
&self.schema
}
pub fn index(&self) -> &Index {
&self.index
}
pub fn ranked_map(&self) -> &RankedMap {
&self.ranked_map
}
pub fn into_snapshot(self) -> Snapshot<D> {
self.snapshot
}
pub fn snapshot(&self) -> &Snapshot<D> {
&self.snapshot
}
pub fn config(&self) -> &Config {
&self.config
}
pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
Ok(self.snapshot.get(key)?)
}
pub fn dump_all<P: AsRef<Path>>(&self, path: P) -> Result<(), Box<Error>> {
let path = path.as_ref().to_string_lossy();
let env_options = EnvOptions::new();
let column_family_options = ColumnFamilyOptions::new();
let mut file_writer = SstFileWriter::new(env_options, column_family_options);
file_writer.open(&path)?;
let mut iter = self.snapshot.iter();
iter.seek(SeekKey::Start);
for (key, value) in &mut iter {
file_writer.put(&key, &value)?;
}
file_writer.finish()?;
Ok(())
}
pub fn query_builder(&self) -> QueryBuilder<FilterFunc> {
QueryBuilder::new(self.index())
}
pub fn raw_field_by_document_id(
&self,
name: &str,
id: DocumentId
) -> Result<Option<Vec<u8>>, Box<Error>>
{
let attr = self.schema.attribute(name).ok_or("field not found")?;
let key = DocumentKeyAttr::new(id, attr);
let vector = self.snapshot.get(key.as_ref())?;
Ok(vector.map(|v| v.to_vec()))
}
pub fn document_by_id<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
where T: DeserializeOwned,
{
let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id);
Ok(T::deserialize(&mut deserializer)?)
}
pub fn documents_by_id<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter>
where T: DeserializeOwned,
I: IntoIterator<Item=DocumentId>,
{
DocumentIter {
database_view: self,
document_ids: ids.into_iter(),
_phantom: marker::PhantomData,
}
}
}
impl<D> fmt::Debug for DatabaseView<D>
where D: Deref<Target=DB>
{
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut options = ReadOptions::new();
let lower = DocumentKey::new(DocumentId(0));
options.set_iterate_lower_bound(lower.as_ref());
let mut iter = self.snapshot.iter_opt(options);
iter.seek(SeekKey::Start);
let iter = iter.map(|(key, _)| DocumentKeyAttr::from_bytes(&key));
if f.alternate() {
writeln!(f, "DatabaseView(")?;
} else {
write!(f, "DatabaseView(")?;
}
self.schema.fmt(f)?;
if f.alternate() {
writeln!(f, ",")?;
} else {
write!(f, ", ")?;
}
f.debug_list().entries(iter).finish()?;
write!(f, ")")
}
}
// TODO this is just an iter::Map !!!
pub struct DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>
{
database_view: &'a DatabaseView<D>,
document_ids: I,
_phantom: marker::PhantomData<T>,
}
impl<'a, D, T, I> Iterator for DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>,
T: DeserializeOwned,
I: Iterator<Item=DocumentId>,
{
type Item = Result<T, Box<Error>>;
fn size_hint(&self) -> (usize, Option<usize>) {
self.document_ids.size_hint()
}
fn next(&mut self) -> Option<Self::Item> {
match self.document_ids.next() {
Some(id) => Some(self.database_view.document_by_id(id)),
None => None
}
}
}
impl<'a, D, T, I> ExactSizeIterator for DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>,
T: DeserializeOwned,
I: ExactSizeIterator + Iterator<Item=DocumentId>,
{ }
impl<'a, D, T, I> DoubleEndedIterator for DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>,
T: DeserializeOwned,
I: DoubleEndedIterator + Iterator<Item=DocumentId>,
{
fn next_back(&mut self) -> Option<Self::Item> {
match self.document_ids.next_back() {
Some(id) => Some(self.database_view.document_by_id(id)),
None => None
}
}
}
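
A minimal sketch of how a DatabaseView was consumed, assuming an existing rocksdb snapshot, a DeserializeOwned Movie type matching the schema and some known document ids; only calls whose signatures appear above are used:

// Sketch: open a view over a snapshot and fetch documents back.
let view = DatabaseView::new(snapshot)?;
let movie: Movie = view.document_by_id(some_document_id)?;

// several documents can be fetched lazily through an iterator
for result in view.documents_by_id::<Movie, _>(vec![id_a, id_b]) {
    let _movie = result?;
}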

View File

@@ -1,136 +0,0 @@
#![cfg_attr(feature = "nightly", feature(test))]
pub mod automaton;
pub mod database;
pub mod data;
pub mod rank;
pub mod tokenizer;
mod common_words;
mod shared_data_cursor;
mod write_to_bytes;
use serde_derive::{Serialize, Deserialize};
pub use rocksdb;
pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;
pub fn is_cjk(c: char) -> bool {
(c >= '\u{2e80}' && c <= '\u{2eff}') ||
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
(c >= '\u{3040}' && c <= '\u{309f}') ||
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
(c >= '\u{3100}' && c <= '\u{312f}') ||
(c >= '\u{3200}' && c <= '\u{32ff}') ||
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
(c >= '\u{f900}' && c <= '\u{faff}')
}
/// Represents an internally generated, unique document identifier.
///
/// It is used to tell the database which document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64);
/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
pub word_index: u32,
/// The position in bytes where the word was found,
/// along with its length.
///
/// It locates the original word in the indexed text
/// without needing to run the tokenizer again.
pub char_index: u32,
pub char_length: u16,
}
/// This structure represents a matching word with information
/// about the location of the word in the document.
///
/// The order of the fields is important because it defines
/// the way these structures are ordered between themselves.
///
/// The word itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
/// The word index in the query sentence.
/// Same as the `attribute_index` but for the query words.
///
/// Used to retrieve the automaton that matches this word.
pub query_index: u32,
/// The distance the word has with the query word
/// (i.e. the Levenshtein distance).
pub distance: u8,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
pub word_index: u32,
/// Whether the word that match is an exact match or a prefix.
pub is_exact: bool,
/// The position in bytes where the word was found,
/// along with its length.
///
/// It locates the original word in the indexed text
/// without needing to run the tokenizer again.
pub char_index: u32,
pub char_length: u16,
}
impl Match {
pub fn zero() -> Self {
Match {
query_index: 0,
distance: 0,
attribute: 0,
word_index: 0,
is_exact: false,
char_index: 0,
char_length: 0,
}
}
pub fn max() -> Self {
Match {
query_index: u32::max_value(),
distance: u8::max_value(),
attribute: u16::max_value(),
word_index: u32::max_value(),
is_exact: true,
char_index: u32::max_value(),
char_length: u16::max_value(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 24);
}
}
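
Because Match derives Ord and its fields are declared in ranking order, two matches compare field by field, query_index first; a small sketch (the characters passed to is_cjk are arbitrary examples):

// Sketch: for the same query word, the closer (smaller-distance) match sorts first.
let close = Match { distance: 0, ..Match::zero() };
let far = Match { distance: 2, ..Match::zero() };
assert!(close < far);

// is_cjk drives the per-character word splitting done by the tokenizer
assert!(is_cjk('漢'));
assert!(!is_cjk('a'));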

View File

@@ -1,183 +0,0 @@
pub mod criterion;
mod query_builder;
mod distinct_map;
use std::sync::Arc;
use slice_group_by::GroupBy;
use rayon::slice::ParallelSliceMut;
use crate::{Match, DocumentId};
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Document {
pub id: DocumentId,
pub matches: Vec<Match>,
}
impl Document {
fn from_raw(raw: &RawDocument) -> Document {
let len = raw.matches.range.len();
let mut matches = Vec::with_capacity(len);
let query_index = raw.query_index();
let distance = raw.distance();
let attribute = raw.attribute();
let word_index = raw.word_index();
let is_exact = raw.is_exact();
let char_index = raw.char_index();
let char_length = raw.char_length();
for i in 0..len {
let match_ = Match {
query_index: query_index[i],
distance: distance[i],
attribute: attribute[i],
word_index: word_index[i],
is_exact: is_exact[i],
char_index: char_index[i],
char_length: char_length[i],
};
matches.push(match_);
}
Document { id: raw.id, matches }
}
}
#[derive(Clone)]
pub struct RawDocument {
pub id: DocumentId,
pub matches: SharedMatches,
}
impl RawDocument {
fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument {
RawDocument { id, matches: SharedMatches { range, matches } }
}
pub fn query_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
}
pub fn distance(&self) -> &[u8] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
}
pub fn attribute(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
}
pub fn word_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
}
pub fn is_exact(&self) -> &[bool] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
}
pub fn char_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) }
}
pub fn char_length(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) }
}
}
pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec<RawDocument> {
let mut docs_ranges = Vec::<(DocumentId, Range)>::new();
let mut matches2 = Matches::with_capacity(matches.len());
matches.par_sort_unstable();
for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
let id = group[0].0;
let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0);
let end = start + group.len();
docs_ranges.push((id, Range { start, end }));
matches2.extend_from_slice(group);
}
let matches = Arc::new(matches2);
docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect()
}
#[derive(Debug, Copy, Clone)]
struct Range {
start: usize,
end: usize,
}
impl Range {
fn len(self) -> usize {
self.end - self.start
}
}
#[derive(Clone)]
pub struct SharedMatches {
range: Range,
matches: Arc<Matches>,
}
#[derive(Clone)]
struct Matches {
query_index: Vec<u32>,
distance: Vec<u8>,
attribute: Vec<u16>,
word_index: Vec<u32>,
is_exact: Vec<bool>,
char_index: Vec<u32>,
char_length: Vec<u16>,
}
impl Matches {
fn with_capacity(cap: usize) -> Matches {
Matches {
query_index: Vec::with_capacity(cap),
distance: Vec::with_capacity(cap),
attribute: Vec::with_capacity(cap),
word_index: Vec::with_capacity(cap),
is_exact: Vec::with_capacity(cap),
char_index: Vec::with_capacity(cap),
char_length: Vec::with_capacity(cap),
}
}
fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) {
for (_, match_) in matches {
self.query_index.push(match_.query_index);
self.distance.push(match_.distance);
self.attribute.push(match_.attribute);
self.word_index.push(match_.word_index);
self.is_exact.push(match_.is_exact);
self.char_index.push(match_.char_index);
self.char_length.push(match_.char_length);
}
}
}
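
A sketch of how the (DocumentId, Match) pairs gathered at query time become the structure-of-arrays RawDocuments above; DocumentId's inner field is private, so this only compiles inside the crate, and the ids and matches are made up:

// Sketch (crate-internal): two matches for document 1, one for document 2.
let matches = vec![
    (DocumentId(1), Match { query_index: 0, ..Match::zero() }),
    (DocumentId(1), Match { query_index: 1, ..Match::zero() }),
    (DocumentId(2), Match { query_index: 0, ..Match::zero() }),
];

let raw_documents = raw_documents_from_matches(matches);
assert_eq!(raw_documents.len(), 2);

// each RawDocument exposes column slices over the shared Matches arrays
assert_eq!(raw_documents[0].query_index(), &[0, 1]);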

View File

@@ -1,332 +0,0 @@
use std::{cmp, mem};
use std::ops::Range;
use std::time::Instant;
use std::hash::Hash;
use std::rc::Rc;
use rayon::slice::ParallelSliceMut;
use slice_group_by::{GroupByMut, LinearStrGroupBy};
use hashbrown::HashMap;
use fst::Streamer;
use log::info;
use crate::automaton::{self, DfaExt, AutomatonExt};
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
use crate::rank::criterion::Criteria;
use crate::database::Index;
use crate::rank::{raw_documents_from_matches, RawDocument, Document};
use crate::{is_cjk, Match, DocumentId};
#[derive(Debug, PartialEq, Eq)]
enum CharCategory {
Space,
Cjk,
Other,
}
fn classify_char(c: char) -> CharCategory {
if c.is_whitespace() { CharCategory::Space }
else if is_cjk(c) { CharCategory::Cjk }
else { CharCategory::Other }
}
fn is_word(s: &&str) -> bool {
!s.chars().any(char::is_whitespace)
}
fn same_group_category(a: char, b: char) -> bool {
let ca = classify_char(a);
let cb = classify_char(b);
if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb }
}
fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let mut groups = LinearStrGroupBy::new(query, same_group_category)
.filter(is_word)
.map(str::to_lowercase)
.peekable();
let mut automatons = Vec::new();
while let Some(word) = groups.next() {
let has_following_word = groups.peek().is_some();
let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
automaton::build_dfa(&word)
} else {
automaton::build_prefix_dfa(&word)
};
automatons.push(lev);
}
automatons
}
pub type FilterFunc = fn(DocumentId) -> bool;
pub struct QueryBuilder<'i, 'c, FI> {
index: &'i Index,
criteria: Criteria<'c>,
filter: Option<FI>,
}
impl<'i, 'c> QueryBuilder<'i, 'c, FilterFunc> {
pub fn new(index: &'i Index) -> Self {
QueryBuilder::with_criteria(index, Criteria::default())
}
pub fn with_criteria(index: &'i Index, criteria: Criteria<'c>) -> Self {
QueryBuilder { index, criteria, filter: None }
}
}
impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
{
pub fn with_filter<F>(self, function: F) -> QueryBuilder<'i, 'c, F>
where F: Fn(DocumentId) -> bool,
{
QueryBuilder {
index: self.index,
criteria: self.criteria,
filter: Some(function)
}
}
pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'i, 'c, FI, F>
where F: Fn(DocumentId) -> Option<K>,
K: Hash + Eq,
{
DistinctQueryBuilder {
inner: self,
function: function,
size: size
}
}
fn query_all(&self, query: &str) -> Vec<RawDocument> {
let automatons = split_whitespace_automatons(query);
let mut stream = {
let mut op_builder = fst::map::OpBuilder::new();
for automaton in &automatons {
let stream = self.index.map.search(automaton);
op_builder.push(stream);
}
op_builder.union()
};
let mut matches = Vec::new();
while let Some((input, indexed_values)) = stream.next() {
for iv in indexed_values {
let automaton = &automatons[iv.index];
let distance = automaton.eval(input).to_u8();
let is_exact = distance == 0 && input.len() == automaton.query_len();
let doc_indexes = &self.index.indexes;
let doc_indexes = &doc_indexes[iv.value as usize];
for doc_index in doc_indexes {
let match_ = Match {
query_index: iv.index as u32,
distance: distance,
attribute: doc_index.attribute,
word_index: doc_index.word_index,
is_exact: is_exact,
char_index: doc_index.char_index,
char_length: doc_index.char_length,
};
matches.push((doc_index.document_id, match_));
}
}
}
let total_matches = matches.len();
let raw_documents = raw_documents_from_matches(matches);
info!("{} total documents to classify", raw_documents.len());
info!("{} total matches to classify", total_matches);
raw_documents
}
}
impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
where FI: Fn(DocumentId) -> bool,
{
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
// We delegate the filter work to the distinct query builder,
// specifying a distinct rule that has no effect.
if self.filter.is_some() {
let builder = self.with_distinct(|_| None as Option<()>, 1);
return builder.query(query, range);
}
let start = Instant::now();
let mut documents = self.query_all(query);
info!("query_all took {:.2?}", start.elapsed());
let mut groups = vec![documents.as_mut_slice()];
'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
for group in tmp_groups {
info!("criterion {}, documents group of size {}", ci, group.len());
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < range.start {
documents_seen += group.len();
groups.push(group);
continue;
}
let start = Instant::now();
group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
info!("criterion {} sort took {:.2?}", ci, start.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
documents_seen += group.len();
groups.push(group);
// we have sorted enough documents: if the last sorted document is past
// the end of the requested range, we can continue to the next criterion
if documents_seen >= range.end { continue 'criteria }
}
}
}
let offset = cmp::min(documents.len(), range.start);
let iter = documents.into_iter().skip(offset).take(range.len());
iter.map(|d| Document::from_raw(&d)).collect()
}
}
pub struct DistinctQueryBuilder<'i, 'c, FI, FD> {
inner: QueryBuilder<'i, 'c, FI>,
function: FD,
size: usize,
}
impl<'i, 'c, FI, FD> DistinctQueryBuilder<'i, 'c, FI, FD>
{
pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'i, 'c, F, FD>
where F: Fn(DocumentId) -> bool,
{
DistinctQueryBuilder {
inner: self.inner.with_filter(function),
function: self.function,
size: self.size
}
}
}
impl<'i, 'c, FI, FD, K> DistinctQueryBuilder<'i, 'c, FI, FD>
where FI: Fn(DocumentId) -> bool,
FD: Fn(DocumentId) -> Option<K>,
K: Hash + Eq,
{
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
let start = Instant::now();
let mut documents = self.inner.query_all(query);
info!("query_all took {:.2?}", start.elapsed());
let mut groups = vec![documents.as_mut_slice()];
let mut key_cache = HashMap::new();
let mut filter_map = HashMap::new();
// these two variables keep track of the current distinct map and
// of the raw offset of the start of the group where the
// range.start bound is located according to the distinct function
let mut distinct_map = DistinctMap::new(self.size);
let mut distinct_raw_offset = 0;
'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
let mut documents_seen = 0;
for group in tmp_groups {
info!("criterion {}, documents group of size {}", ci, group.len());
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < distinct_raw_offset {
documents_seen += group.len();
groups.push(group);
continue;
}
let start = Instant::now();
group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
info!("criterion {} sort took {:.2?}", ci, start.elapsed());
for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
// we must compute the real distinct length of this sub-group
for document in group.iter() {
let filter_accepted = match &self.inner.filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id))
},
None => true,
};
if filter_accepted {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (self.function)(document.id).map(Rc::new));
match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
};
}
// the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end { break }
}
documents_seen += group.len();
groups.push(group);
// if this sub-group does not overlap with the requested range
// we must update the distinct map and its start index
if buf_distinct.len() < range.start {
buf_distinct.transfert_to_internal();
distinct_raw_offset = documents_seen;
}
// we have sorted enough documents: if the last sorted document is past
// the end of the requested range, we can continue to the next criterion
if buf_distinct.len() >= range.end { continue 'criteria }
}
}
}
let mut out_documents = Vec::with_capacity(range.len());
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
for document in documents.into_iter().skip(distinct_raw_offset) {
let filter_accepted = match &self.inner.filter {
Some(_) => filter_map.remove(&document.id).expect("BUG: filtered not found"),
None => true,
};
if filter_accepted {
let key = key_cache.remove(&document.id).expect("BUG: cached key not found");
let distinct_accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
};
if distinct_accepted && seen.len() > range.start {
out_documents.push(Document::from_raw(&document));
if out_documents.len() == range.len() { break }
}
}
}
out_documents
}
}
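
A sketch of driving the builder above end to end; the index, the allowed_ids set and the user_id_for lookup are assumptions, and the 0..20 range follows the query(query, range) signature shown above:

// Sketch: a plain query, then the same query with a filter and a distinct rule.
let documents = QueryBuilder::new(&index).query("hello world", 0..20);

let filtered = QueryBuilder::new(&index)
    .with_filter(|id| allowed_ids.contains(&id))
    .with_distinct(|id| user_id_for(id), 1)
    .query("hello world", 0..20);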

View File

@@ -1,56 +0,0 @@
use std::io::{self, Read, Cursor, BufRead};
use std::sync::Arc;
use crate::data::SharedData;
pub struct SharedDataCursor(Cursor<SharedData>);
impl SharedDataCursor {
pub fn from_bytes(bytes: Vec<u8>) -> SharedDataCursor {
let len = bytes.len();
let bytes = Arc::new(bytes);
SharedDataCursor::from_shared_bytes(bytes, 0, len)
}
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedDataCursor {
let data = SharedData::new(bytes, offset, len);
let cursor = Cursor::new(data);
SharedDataCursor(cursor)
}
pub fn extract(&mut self, amt: usize) -> SharedData {
let offset = self.0.position() as usize;
let extracted = self.0.get_ref().range(offset, amt);
self.0.consume(amt);
extracted
}
}
impl Read for SharedDataCursor {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
self.0.read(buf)
}
}
impl BufRead for SharedDataCursor {
fn fill_buf(&mut self) -> io::Result<&[u8]> {
self.0.fill_buf()
}
fn consume(&mut self, amt: usize) {
self.0.consume(amt)
}
}
pub trait FromSharedDataCursor: Sized {
type Error;
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error>;
fn from_bytes(bytes: Vec<u8>) -> Result<Self, Self::Error> {
let mut cursor = SharedDataCursor::from_bytes(bytes);
Self::from_shared_data_cursor(&mut cursor)
}
}
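
Any type that can read itself out of a SharedDataCursor gets from_bytes for free; a minimal sketch with a made-up TwoBytes type (byteorder's ReadBytesExt applies because SharedDataCursor implements Read):

// Sketch: decode the two leading bytes of a shared buffer.
use byteorder::ReadBytesExt;

struct TwoBytes {
    first: u8,
    second: u8,
}

impl FromSharedDataCursor for TwoBytes {
    type Error = std::io::Error;

    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
        let first = cursor.read_u8()?;
        let second = cursor.read_u8()?;
        Ok(TwoBytes { first, second })
    }
}

let value = TwoBytes::from_bytes(vec![1, 2])?;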

View File

@@ -1,259 +0,0 @@
use std::mem;
use crate::is_cjk;
use self::Separator::*;
pub trait TokenizerBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
}
pub struct DefaultBuilder;
impl DefaultBuilder {
pub fn new() -> DefaultBuilder {
DefaultBuilder
}
}
#[derive(Debug, PartialEq, Eq)]
pub struct Token<'a> {
pub word: &'a str,
pub word_index: usize,
pub char_index: usize,
}
impl TokenizerBuilder for DefaultBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a> {
Box::new(Tokenizer::new(text))
}
}
pub struct Tokenizer<'a> {
word_index: usize,
char_index: usize,
inner: &'a str,
}
impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer {
let mut char_advance = 0;
let mut index_advance = 0;
for (n, (i, c)) in string.char_indices().enumerate() {
char_advance = n;
index_advance = i;
if detect_separator(c).is_none() { break }
}
Tokenizer {
word_index: 0,
char_index: char_advance,
inner: &string[index_advance..],
}
}
}
#[derive(Debug, Clone, Copy)]
enum Separator {
Short,
Long,
}
impl Separator {
fn add(self, add: Separator) -> Separator {
match (self, add) {
(_, Long) => Long,
(Short, Short) => Short,
(Long, Short) => Long,
}
}
fn to_usize(self) -> usize {
match self {
Short => 1,
Long => 8,
}
}
}
fn detect_separator(c: char) -> Option<Separator> {
match c {
'.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Long),
' ' | '\'' | '"' => Some(Short),
_ => None,
}
}
impl<'a> Iterator for Tokenizer<'a> {
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
let mut start_word = None;
let mut distance = None;
for (i, c) in self.inner.char_indices() {
match detect_separator(c) {
Some(sep) => {
if let Some(start_word) = start_word {
let (prefix, tail) = self.inner.split_at(i);
let (spaces, word) = prefix.split_at(start_word);
self.inner = tail;
self.char_index += spaces.chars().count();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
let token = Token {
word: word,
word_index: self.word_index,
char_index: self.char_index,
};
self.char_index += word.chars().count();
return Some(token)
}
distance = Some(distance.map_or(sep, |s| s.add(sep)));
},
None => {
// if this is a Chinese, a Japanese or a Korean character
// See <http://unicode-table.com>
if is_cjk(c) {
match start_word {
Some(start_word) => {
let (prefix, tail) = self.inner.split_at(i);
let (spaces, word) = prefix.split_at(start_word);
self.inner = tail;
self.char_index += spaces.chars().count();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
let token = Token {
word: word,
word_index: self.word_index,
char_index: self.char_index,
};
self.word_index += 1;
self.char_index += word.chars().count();
return Some(token)
},
None => {
let (prefix, tail) = self.inner.split_at(i + c.len_utf8());
let (spaces, word) = prefix.split_at(i);
self.inner = tail;
self.char_index += spaces.chars().count();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
let token = Token {
word: word,
word_index: self.word_index,
char_index: self.char_index,
};
if tail.chars().next().and_then(detect_separator).is_none() {
self.word_index += 1;
}
self.char_index += 1;
return Some(token)
}
}
}
if start_word.is_none() { start_word = Some(i) }
},
}
}
if let Some(start_word) = start_word {
let prefix = mem::replace(&mut self.inner, "");
let (spaces, word) = prefix.split_at(start_word);
let token = Token {
word: word,
word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),
char_index: self.char_index + spaces.chars().count(),
};
return Some(token)
}
None
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn easy() {
let mut tokenizer = Tokenizer::new("salut");
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_kanjis() {
let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
assert_eq!(tokenizer.next(), None);
}
}
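
The tests above double as documentation; a compact sketch of the two ways the tokenizer is used, directly or through the TokenizerBuilder trait as the indexer does:

// Sketch: direct iteration over the tokens of a string.
let tokens: Vec<_> = Tokenizer::new("hello, world").collect();
assert_eq!(tokens[0].word, "hello");

// Trait-object form: this is what the document serializer receives.
let builder = DefaultBuilder::new();
for token in builder.build("hello, world") {
    // word_index jumps by 8 across the long separator ','
    println!("{} (word {}, char {})", token.word, token.word_index, token.char_index);
}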

View File

@@ -1,9 +0,0 @@
pub trait WriteToBytes {
fn write_to_bytes(&self, bytes: &mut Vec<u8>);
fn into_bytes(&self) -> Vec<u8> {
let mut bytes = Vec::new();
self.write_to_bytes(&mut bytes);
bytes
}
}
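
A minimal sketch of implementing the trait for a made-up type, mirroring how the index and ranked-map events above get their into_bytes:

// Sketch: any type that appends itself to a byte buffer gets into_bytes for free.
struct ByteCount(u32);

impl WriteToBytes for ByteCount {
    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
        bytes.extend_from_slice(&self.0.to_be_bytes());
    }
}

let bytes = ByteCount(5).into_bytes();
assert_eq!(bytes, vec![0, 0, 0, 5]);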