Compare commits


330 Commits
v0.2 ... v0.4.0

Author SHA1 Message Date
aaeb25828f Merge pull request #183 from meilisearch/number-of-documents
Compute the number of documents on updates
2019-09-14 16:32:18 +02:00
af26c39482 test: Improve the tests of the number of documents counting 2019-09-14 15:29:46 +02:00
2006259a23 feat: Improve the number of documents counting 2019-09-14 15:26:41 +02:00
707e2f4d77 feat: Update the number of documents in the KV 2019-09-14 15:26:39 +02:00
8d8aed36a8 feat: Count the number of deleted/inserted documents 2019-09-14 15:24:39 +02:00
2658ef0176 Merge pull request #182 from meilisearch/replace-sled-by-rocksdb
Replace sled by RocksDB
2019-09-14 11:32:26 +02:00
400d542fef feat: Update the README to reflect the kv store update 2019-09-12 16:28:23 +02:00
f46868407c feat: Make RocksDB works seemlessly like sled 2019-09-05 18:43:10 +02:00
e3fa07077c feat: Introduce the CfTree and CfIter types 2019-09-05 14:53:09 +02:00
e5763e73eb chore: Prefer using const names to avoid typos 2019-09-05 13:22:53 +02:00
fd880e0a0e Merge pull request #175 from meilisearch/moving-back-to-sled
Moving back to sled
2019-09-05 13:14:48 +02:00
e33cc89846 feat: Introduce update callbacks 2019-09-05 11:48:26 +02:00
f40b373f9f feat: Introduce the UpdateStatus type 2019-09-05 11:48:26 +02:00
cd8535d410 feat: Introduce the update_status/_blocking functions 2019-09-05 11:48:25 +02:00
f07b99fe97 fix: Make the tests work with the new update system 2019-09-05 11:48:25 +02:00
f45a00df3b fix: Cloned ArcSwaps are unsynchronized versions 2019-09-05 11:46:02 +02:00
cd864c40bc feat: Make the update update serialization be based on message pack 2019-09-05 11:46:02 +02:00
91b44a2759 chore: Change the Box<Error> to be marked dyn 2019-09-05 11:46:01 +02:00
d8cd8c5def chore: Move the updates in their own module 2019-09-05 11:46:01 +02:00
b0be06540a chore: Simplify the update application 2019-09-05 11:46:01 +02:00
4deee93a55 feat: Introduce synonyms deletion using the update system 2019-09-05 11:33:11 +02:00
451c0a6d03 feat: Introduce synonyms addition using the update system 2019-09-05 11:33:10 +02:00
0db3e6c58c feat: Introduce documents deletion using the update system 2019-09-05 11:33:10 +02:00
f83d6df4ef feat: Introduce documents addition using the update system 2019-09-05 11:33:10 +02:00
5a9e25c315 feat: Introduce the UpdatesIndex type 2019-09-05 11:14:11 +02:00
50e3c2c3de chore: Upgrade the meilidb-data dependencies 2019-09-05 10:49:46 +02:00
093ee9732f Merge pull request #180 from meilisearch/store-every-document
Change the STORED attribute property by DISPLAYED
2019-09-04 14:45:00 +02:00
333189ee51 fix: Change every stored schema property by displayed 2019-09-04 11:16:36 +02:00
50b8a66794 feat: Change the STORED attribute property by DISPLAYED 2019-09-03 11:14:20 +02:00
8be3fc1a66 Merge pull request #179 from meilisearch/deunicode-before-tokenize
Improve the tokenizer by split after deunicode
2019-09-02 17:20:30 +02:00
b5503989f9 feat: Improve the tokenizer by split after deunicode 2019-09-02 16:54:54 +02:00
5b8bc09826 Merge pull request #176 from meilisearch/no-more-hanging-threads
Replace the rayon::scope by always checking time
2019-09-01 20:02:03 +02:00
c8ee21f227 feat: Replace the rayon::scope by always checking time 2019-09-01 18:52:38 +02:00
a420fbf1e8 Merge pull request #174 from meilisearch/arc-fst-sets
Do not clone probably large fst::Sets, Arc them
2019-08-30 14:52:28 +02:00
ca34c28335 feat: Do not clone probably large fst::Sets, Arc them 2019-08-30 14:37:28 +02:00
3e1b81c4ce Merge pull request #173 from meilisearch/fix-ranked-map-set
Use the right ranked-map key name
2019-08-30 14:21:14 +02:00
9b353dfda6 chore: Use const names to avoid typos 2019-08-30 12:36:10 +02:00
d8dcc6f34b fix: Use the right ranked-map key name 2019-08-30 12:21:00 +02:00
fba1272a3e Merge pull request #172 from meilisearch/expose-internal-functions
Expose some internal functions
2019-08-29 15:26:42 +02:00
e20a038970 fix: Expose some internal functions 2019-08-29 15:11:51 +02:00
6f34dccc89 Merge pull request #171 from meilisearch/stringify-document-id
Transform identifiers fields into a string before hashing it
2019-08-29 13:42:46 +02:00
f5b0eb044a fix: Transform the identifier value into a string before hashing it 2019-08-29 11:41:20 +02:00
bae86e978e Merge pull request #170 from meilisearch/async-word-index-fetching-with-rayon-scope
Async word index fetching with rayon scope
2019-08-28 14:37:38 +02:00
8030a822ab test: Add a way to setup the fetch timeout of the query-database example 2019-08-28 13:42:20 +02:00
9c5ec110e5 feat: Introduce a way to enable or disable query timeouts 2019-08-28 13:24:34 +02:00
67302d09f3 feat: Multiword rewrite while there is time 2019-08-19 11:12:23 +02:00
7dc9ea78fa feat: Make the automaton DFA construction lazy 2019-08-19 11:12:23 +02:00
0ee56314fb feat: Try to simplify Store trait bound with a rayon scope 2019-08-19 11:10:54 +02:00
b7b60b5fe5 feat: Introduce a new thread to avoid waiting on doc indexes fetchs 2019-08-16 16:35:19 +02:00
d9c9fafd78 feat: Fetch doc indexes while there is time 2019-08-16 15:01:25 +02:00
bb0a79c577 feat: Process automatons in the order they were sort 2019-08-16 12:25:35 +02:00
81d44a0854 feat: Order automatons by importance 2019-08-16 12:19:34 +02:00
ebc95cb8f2 feat: Display the documents fields in the order they were declared 2019-08-16 11:25:42 +02:00
a488c00a2e feat: Use RustyLine in the query-database example 2019-08-16 11:25:42 +02:00
bf3c2c3725 feat: Move the multi-word rewriting algorithm into its own function 2019-08-16 11:25:42 +02:00
89df496f0c feat: Separate highlights from matches to make the code easier to follow 2019-08-16 11:25:42 +02:00
9959f2e952 feat: Move the RawDocument type to its own module 2019-08-16 11:25:42 +02:00
795557c046 feat: Remove query splitting from the automaton generation 2019-08-16 11:25:42 +02:00
225a3bf184 test: Produce tests that work with the new cumulative word index system 2019-08-16 11:25:42 +02:00
e65d7418b7 feat: Remove the query index from the Automaton type 2019-08-16 11:25:42 +02:00
f478bbf826 feat: Introduce the QueryEnhancer in the query synonym system 2019-08-16 11:25:42 +02:00
5e691c2140 feat: Introduce the QueryEnhancer type 2019-08-16 11:25:42 +02:00
e0cadaa68d Merge pull request #165 from meilisearch/reorder-schema-attributes
Reorder schema attributes
2019-07-01 16:12:33 +02:00
9175e4686b feat: Collect TmpMatches only on tests, producing data useful for tests 2019-07-01 14:55:47 +02:00
e8afca614c chore: Little clean ups of meilidb-core 2019-07-01 14:34:06 +02:00
4f4b630ae9 fix: Make the examples compile with the new Highlight type 2019-07-01 12:06:17 +02:00
6b6db2f8e6 feat: Introduce the Highlight type to simplify the data oriented design 2019-07-01 12:06:16 +02:00
b7ed22bc59 feat: Introduce on the fly attributes reordering with meilidb-core 2019-07-01 12:03:31 +02:00
97cc3c7cce Merge pull request #166 from meilisearch/split-query-words
Split query words
2019-06-28 18:30:13 +02:00
f5d52396f5 feat: Support query words splits 2019-06-28 18:04:35 +02:00
9cc154da05 chore: Rewrite tests to use iterators and be easily testable 2019-06-28 18:04:35 +02:00
5aa49d232c feat: Rewrite Automaton generation related code 2019-06-28 18:04:35 +02:00
1cb42cbb30 Merge pull request #164 from meilisearch/concat-query-words
Support query words concatenation
2019-06-28 18:03:49 +02:00
9f320590d3 feat: Support query words concatenation 2019-06-27 10:14:17 +02:00
1b0fd2e0ba Merge pull request #160 from meilisearch/synonyms
Support all types of synonyms
2019-06-26 14:59:45 +02:00
b249b2a81b feat: Support removing specific synonym alternatives 2019-06-26 10:45:51 +02:00
0a5d4eb7ed feat: Normalize synonym strings and query strings to search for synonyms 2019-06-26 10:45:51 +02:00
3dcbc737f3 feat: Make synonyms be not considered like exact matches 2019-06-26 10:45:51 +02:00
43f11e929d fix: Do not trigger a synonym when its not the last word and is a prefix 2019-06-26 10:45:51 +02:00
8f2a551cca feat: Trigger synonym replacement only when the last word is tipped 2019-06-26 10:45:50 +02:00
8f044c6853 fix: Only create non-prefix DFA when generating synonyms alternatives 2019-06-26 10:45:50 +02:00
a76c00a787 feat: Create types to edit synonyms and keep them in the database 2019-06-26 10:45:50 +02:00
0633f16b4d feat: Make multi-word support multi-word synonyms 2019-06-26 10:45:50 +02:00
59fafb8b30 feat: Support one word has multi-word alternatives 2019-06-26 10:45:50 +02:00
d2bd99cc2a fix: Append DocIndexes when building InMemorySetStore from an Iterator 2019-06-26 10:45:50 +02:00
62930ecc4e feat: Deduplicate automatons when synonyms produce duplicated ones 2019-06-26 10:45:49 +02:00
6cb57aa8a4 feat: Unique word has multi-word synonyms basically work 2019-06-26 10:45:49 +02:00
9861c3878e tests: Add more tests about synonyms 2019-06-26 10:45:49 +02:00
707d7b062b feat: Made query handle synonyms via the Store 2019-06-26 10:45:49 +02:00
18736bdcd0 feat: Introduce the synonyms concept to the Store trait 2019-06-26 10:45:49 +02:00
e8b2e86007 feat: Introduce a basic way to handle synonyms 2019-06-26 10:45:48 +02:00
ae8b4f56f2 Merge pull request #163 from meilisearch/export-compute-docid
Expose a function to compute the DocumentId from an Hashable value
2019-06-25 12:25:38 +02:00
28a0074497 feat: Expose a function to compute the DocumentId from an Hashable value 2019-06-25 11:21:12 +02:00
71c039db09 Merge pull request #162 from meilisearch/trustful-hash
Prefer using a reliable SipHash to compute document ids
2019-06-22 11:51:52 +02:00
15646c258b fix: Prefer using a reliable SipHash to compute document ids 2019-06-22 11:22:21 +02:00
25a5605b35 Merge pull request #161 from meilisearch/remove-tide
Remove tide as it break compilation on the latest nightly
2019-06-18 14:04:47 +02:00
b630e32c6a fix: Remove tide as it break compilation on the latest nightly 2019-06-18 13:40:46 +02:00
c39254bf98 Merge pull request #159 from meilisearch/create-specific-schema-crate
Move the Schema to its own workspace crate
2019-06-03 09:17:14 +02:00
994a0e78f1 feat: Move the Schema to its own workspace crate 2019-05-29 15:37:28 +02:00
ab2ca15c5c Merge pull request #158 from meilisearch/moving-back-to-rocksdb
Moving back to RocksDB
2019-05-29 14:56:55 +02:00
07f447c457 feat: Force RocksDB compaction 2019-05-28 17:38:59 +02:00
62c8f1ba04 feat: Fix the index opening when index already exists 2019-05-26 11:36:47 +02:00
e08edc2d6b feat: Introduce some stats to ease debugging 2019-05-25 12:12:24 +02:00
a147c09b06 feat: Make more functions accessible on the custom settings 2019-05-24 14:37:04 +02:00
9fca74443e feat: Wrap the database index access to improve usability 2019-05-24 14:26:05 +02:00
6f258f71d5 feat: Implement some convenient accessors for custom settings 2019-05-23 15:43:41 +02:00
ce61c16dbe feat: Disable all the default RocksDB compression features 2019-05-23 15:35:53 +02:00
4c973238a1 feat: Introduce a basic RocksDB based version 2019-05-23 14:57:29 +02:00
3a8da82792 Merge pull request #157 from meilisearch/update-readme
Fix some badly spelled sentences
2019-05-22 14:01:33 +02:00
f10da122ff doc: Fix some badly spelled sentences 2019-05-22 11:41:03 +02:00
ec20a8cacb Merge pull request #156 from meilisearch/clippy-pass
Do a little clippy pass
2019-05-22 11:33:55 +02:00
102fb506db chore: Do a little clippy pass 2019-05-22 11:00:58 +02:00
34ba520f44 Merge pull request #155 from meilisearch/update-sdset
Use safest SetBuf constructor instead of new_unchecked
2019-05-21 18:23:39 +02:00
fa099555c0 feat: Use safest SetBuf constructor instead of new_unchecked 2019-05-21 18:15:48 +02:00
8387c5b14e Merge pull request #153 from meilisearch/example-expose-system-stats
Output more informations from the examples on document injection
2019-05-21 16:50:25 +02:00
5040095228 feat: Output more informations from the examples on document injection 2019-05-21 16:37:17 +02:00
788fae59a1 Merge pull request #154 from meilisearch/reintroduce-sort-by-attr
Reintroduce the `SortByAttr` custom criterion
2019-05-21 16:32:12 +02:00
e042f44e0d feat: Reintroduce the SortByAttr custom criterion 2019-05-21 16:22:23 +02:00
b1fc3e5cec Merge pull request #152 from meilisearch/documents-deletion-updates-ranked-map
Remove the documents from the ranked map on documents deletion
2019-05-21 13:59:21 +02:00
d7b1b7a2a9 feat: Remove the documents from the ranked map on documents deletion 2019-05-21 13:33:42 +02:00
97744ad24f Merge pull request #151 from meilisearch/expose-sled-compression-factor
Expose the sled compression setting
2019-05-20 15:03:43 +02:00
2e79b2a871 feat: Expose the sled compression setting 2019-05-20 14:41:15 +02:00
349f0f7068 Merge pull request #148 from meilisearch/split-fst-docindexes
Split fst doc-indexes
2019-05-20 14:24:48 +02:00
94f9587db1 feat: Implement Debug on RawDocument for more convenience 2019-05-20 11:21:41 +02:00
6df8f62022 test: Add more test to some criteria 2019-05-20 11:21:40 +02:00
8c71473498 feat: Introduce the Criterion::name to allow better debugging 2019-05-20 11:21:40 +02:00
08d89053da feat: Introduce a little simple http server for demo 2019-05-16 17:09:41 +02:00
4b36fa0739 test: Add tests about additions and deletions of documents 2019-05-16 13:44:21 +02:00
921b063a71 feat: Make the DocumentsDeletion public interface to take serde types 2019-05-16 12:04:08 +02:00
3de633c869 feat: Reexport sled to reduce user level library incompatibilities 2019-05-16 12:04:08 +02:00
021f0545eb doc: Update the deep-dive explanation text 2019-05-16 12:04:08 +02:00
b701eb85b8 doc: Update the README features links 2019-05-16 12:04:08 +02:00
4e80378a77 chore: Rename the ebay example into kaggle 2019-05-16 12:04:07 +02:00
830d2f28b9 feat: Introduce a custom tree for user custom settings 2019-05-16 12:04:07 +02:00
c5ba34d0b0 chore: Replace crate only public interface to be completely public 2019-05-16 12:04:07 +02:00
2e31bb519a chore: Split the database structure internal types 2019-05-16 12:04:07 +02:00
169bd4cb39 feat: Store all documents words by document rather than by attribute 2019-05-15 15:42:13 +02:00
aa90f22865 feat: Remove the Index dependency of the Serializer 2019-05-15 15:42:12 +02:00
9bba90c47e fix: Fix a bug in the Database open-index method 2019-05-15 15:42:12 +02:00
2844cb5bca fix: Make the examples compile 2019-05-15 15:42:12 +02:00
dff81bb161 feat: Prefer set/del methods instead of set with an Option type 2019-05-15 15:42:12 +02:00
1f2abce7c3 feat: Introduce the DocumentsDeletion type 2019-05-15 15:42:11 +02:00
e67ada8823 feat: Introduce the DocumentsAddition type 2019-05-15 15:42:11 +02:00
42e39f6eb5 feat: Introduce a simplified version of the Store trait 2019-05-15 15:42:11 +02:00
f317a7a322 feat: implement open/create_index on the Database type 2019-05-15 15:42:11 +02:00
8434ecbb43 feat: Introduce the RankedMap real type 2019-05-15 15:42:10 +02:00
0c18026240 feat: Introduce Tree wrappers for each index component 2019-05-15 15:42:10 +02:00
6eb25687f8 feat: Handle word doc-indexes sled tree errors 2019-05-15 15:42:10 +02:00
737db5668b chore: Remove the WriteToBytes trait 2019-05-15 15:42:10 +02:00
f16e0333e4 chore: Remove the SharedData/Cursor types 2019-05-15 15:42:09 +02:00
27ffcaabe9 chore: Remove the DocIndexes type 2019-05-15 15:42:09 +02:00
db031a5b95 chore: Remove the DocIds type 2019-05-15 15:42:09 +02:00
2e9fbd07cd chore: Remove most of the warnings 2019-05-15 15:42:09 +02:00
74acf83464 chore: Remove the NewIndexEvent type 2019-05-15 15:42:08 +02:00
3dc057ca9c feat: Introduce the new Index system 2019-05-15 15:42:08 +02:00
e142339106 Merge pull request #150 from felixonmars/patch-1
chore: Fix some typos
2019-05-06 15:00:53 +02:00
39038750a8 chore: Fix some typos 2019-05-06 20:12:33 +08:00
f68733bf11 Merge pull request #149 from meilisearch/ci-only-nightly
Update ci with rust nightly only
2019-05-02 15:43:53 +02:00
85edb3e90c Update ci with rust nightly only 2019-05-02 11:43:45 +02:00
d7ce6d016b Merge pull request #147 from meilisearch/moving-to-sled
Make the repository a workspace and move to sled
2019-04-29 15:21:02 +02:00
9023a12ad4 feat: Introduce the unrankable error variant 2019-04-29 14:32:04 +02:00
0547671246 feat: Take ranked attributes into account 2019-04-29 14:32:04 +02:00
068f1bc202 feat: Index unidecoded words 2019-04-29 14:32:04 +02:00
7035f76077 squash-me: Make better measurements of the retrieving spent time 2019-04-29 14:32:04 +02:00
f0268d49fe fix: Always lowercase indexed tokens 2019-04-29 14:32:04 +02:00
7dbf5d6319 fix: Make the examples build 2019-04-29 14:32:03 +02:00
ed6b6038ee feat: Finalize index merging on document insertion 2019-04-29 14:32:03 +02:00
ad24ef8a25 feat: Index words of structs, maps and tuples 2019-04-29 14:32:03 +02:00
645bab7748 feat: Index documents using the Serializer struct 2019-04-29 14:32:03 +02:00
abd7d1de48 feat: Introduce the extract_document_id function 2019-04-29 14:32:03 +02:00
ea0ee070ef feat: Introduce the Serializer
Which will serialize documents fields as message pack in the kv-store
2019-04-29 14:32:03 +02:00
2a69170f14 feat: Introduce the DocumentsDeletion type 2019-04-29 14:32:02 +02:00
725e7b4229 chore: Move the Deserializer into the the serde module 2019-04-29 14:32:02 +02:00
187e6740bd feat: Allow users to construct query builders from database indexes 2019-04-29 14:32:02 +02:00
4b40d5b0d4 feat: Introduce the Index struct 2019-04-29 14:32:02 +02:00
ee2bad20c7 feat: Store the RankedMap into the inner sled tree 2019-04-29 14:32:02 +02:00
b7805fee93 feat: Store already opened indexes and word indexes 2019-04-29 14:32:02 +02:00
0104e93ba9 feat: Introduce index events to update the WordIndex 2019-04-29 14:32:02 +02:00
25a4961453 feat: Introduce the Indexer struct 2019-04-29 14:32:01 +02:00
7338e522bd squash-me: Add set/get/del_document_attribute to Index methods 2019-04-29 14:32:01 +02:00
58c020a2e1 feat: Store the word index into the database index 2019-04-29 14:32:01 +02:00
f7eced03fd chore: Using a fork of the fst library that support Arc<[u8]> 2019-04-29 14:32:01 +02:00
9be7c02461 chore: Update sled to 0.22.1 2019-04-29 14:32:01 +02:00
9483f2df60 feat: Introduce a custom Error type 2019-04-29 14:32:01 +02:00
f17a05c342 feat: Introduce the RankedMap type 2019-04-29 14:32:00 +02:00
e41c551757 feat: Introduce the Number type 2019-04-29 14:32:00 +02:00
95dfbd1fe0 feat: Introduce the meilidb-data schema module 2019-04-29 14:32:00 +02:00
287d5dee4d feat: Introduce the meilidb-data workspace member 2019-04-29 14:32:00 +02:00
77405cc103 chore: Remove the database module from meilidb 2019-04-29 14:32:00 +02:00
abf7191eec feat: Make the Tokenizer able to support tokenizing sequences 2019-04-29 14:32:00 +02:00
c6bb2b6f9c chore: Make the debug symbols available for release binaries 2019-04-29 14:31:59 +02:00
acede0f3e8 fix: Correctly assert the DocIndex memory size 2019-04-29 14:31:59 +02:00
e56106cbdc chore: Update the toml dependency 2019-04-29 14:31:59 +02:00
87f9528791 feat: Use the new Tokenizer 2019-04-29 14:31:59 +02:00
397522f277 fet: Move meilidb example into the meilidb workspace 2019-04-29 14:31:59 +02:00
a745819ddf feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-04-29 14:31:37 +02:00
5d5bcf7011 feat: Remove the FilterFunc alias type 2019-04-29 14:31:37 +02:00
19e67dcf0b feat: Move query splitting into the tokenizer workspace 2019-04-29 14:31:37 +02:00
1897da5348 feat: Move tokenizer things into the meilidb-tokenizer workspace 2019-04-29 14:31:37 +02:00
d8cbb03c42 chore: Update the .gitignore file 2019-04-29 14:31:36 +02:00
bc227bef21 chore: Add a nightly feature to meilidb-core 2019-04-29 14:31:36 +02:00
3bcb1dc802 chore: Allow the activation of the meilidb-core i128 feature 2019-04-29 14:31:36 +02:00
d0786b4156 chore: Move the SortByAttr into meilidb 2019-04-29 14:31:36 +02:00
14790eeae3 chore: Move index related things to the meilidb-core workspace member 2019-04-29 14:31:35 +02:00
3056b351fa Merge pull request #143 from ndudnicz/examples-movies
doc: add a new +19k movies example dataset
2019-04-15 10:11:38 +02:00
52fca57114 doc: add a new +19k movies example dataset 2019-04-13 21:11:28 +02:00
ee7a570b2f doc: Fix a little typo 2019-03-24 16:45:33 +01:00
61dcf72e04 Merge pull request #131 from meilisearch/update-readme
Add a Features section to the readme
2019-03-24 16:44:00 +01:00
bace8ad510 doc: Add a features section to the readme 2019-03-24 16:28:19 +01:00
e0b759839d Merge pull request #129 from meilisearch/ci-badge
Add CI badge
2019-03-10 22:46:57 +01:00
05b0a3e7d2 Add CI badge 2019-03-10 21:38:04 +01:00
2518037b91 Merge pull request #128 from meilisearch/azure-pipeline
Azure pipeline
2019-03-10 17:38:47 +01:00
3e452f362c Replace TravisCI by Azure CI 2019-03-10 15:46:59 +01:00
4900544574 Merge pull request #126 from Kerollmops/searchable-attributes
Searchable attributes
2019-03-05 17:11:15 +01:00
858589dc6b feat: Limit the QueryBuilder to search only into some attributes 2019-03-05 16:34:29 +01:00
915f2e70a3 Merge pull request #125 from Kerollmops/limit-memory-usage
Limit memory usage
2019-03-05 16:17:56 +01:00
aae301878c fix: Flush the database after each WriteBatch injected 2019-03-05 14:55:57 +01:00
383a49b44f fix: Compact the whole database for each WriteBatch injected 2019-03-05 14:55:57 +01:00
a45cc4b618 fix: Reduce the size of the DocIndex type 2019-03-05 14:55:57 +01:00
aef7d7825f Merge pull request #124 from Kerollmops/version-bump
Bump version to 0.3.2
2019-02-25 14:22:02 +01:00
f28ce661af chore: Bump version to 0.3.2 2019-02-25 13:56:23 +01:00
74eb9c8d0f Merge pull request #122 from Kerollmops/query-builder-no-view-dep
Remove the DatabaseView dependencies from the QueryBuilder
2019-02-24 16:56:12 +01:00
d664221c64 feat: Remove the DatabaseView dependencies from the QueryBuilder 2019-02-24 16:25:28 +01:00
58bff3d4ac Merge pull request #123 from Kerollmops/update-deps
Update all the dependencies
2019-02-24 16:24:47 +01:00
2c206eb98c chore: Update all the dependencies 2019-02-24 16:00:03 +01:00
19724e5af9 Merge pull request #121 from Kerollmops/no-cjk-unidecode
Do not save unidecoded cjk kanjis
2019-02-23 22:34:47 +01:00
c9e0ad132c feat: Do not save unidecoded cjk kanjis 2019-02-23 19:11:54 +01:00
24f265a963 Merge pull request #120 from Kerollmops/custom-log10-function
Optimize the SumOfTypos criterion
2019-02-23 19:01:12 +01:00
f8a743ee00 feat: Optimize the SumOfTypos criterion 2019-02-23 18:36:45 +01:00
64971de7ed Merge pull request #119 from Kerollmops/dont-be-hurry
Fix the tokenizer (next time don't be so hurry to merge)
2019-02-23 17:07:42 +01:00
a960c325f3 feat: Make query strings support cjk kanjis 2019-02-23 14:57:13 +01:00
a799470997 fix: Change the tokenizer to mesure cjk chars positions 2019-02-22 23:06:42 +01:00
10414791a2 fix: Remove debug println from the tokenizer 2019-02-22 22:34:37 +01:00
743974e60d Merge pull request #118 from Kerollmops/tokenizer-support-kanjis
Make the Tokenizer support Kanjis
2019-02-22 20:16:55 +01:00
0e267cae4b feat: Make the Tokenizer support Kanjis 2019-02-22 19:37:19 +01:00
12a352ae2f Merge pull request #117 from Kerollmops/tokenizer-support-parentheses
Make the tokenizer support parentheses
2019-02-22 19:36:15 +01:00
5070b27728 feat: Make the tokenizer support parentheses
Interpreting them as hard ponctuation (like a dot).
2019-02-22 18:18:17 +01:00
7a6b734078 Merge pull request #116 from Kerollmops/raw-field-value-getter
Allow users to retrieve the raw field value of a document
2019-02-22 18:02:46 +01:00
24823da6f7 feat: Allow users to retrieve the raw field value of a document 2019-02-22 15:30:20 +01:00
8701cb3a8f Merge pull request #115 from qdequele/database-path
Add accessor for database path and index path
2019-02-22 15:11:40 +01:00
315fc1fbe3 feat: Add accessor for database and index path 2019-02-22 13:49:04 +01:00
23833bac10 Merge pull request #114 from Kerollmops/hot-fix-ranked-attribute
Do not error when an attribute is registered for ranking
2019-02-21 23:17:10 +01:00
8235b6efc9 fix: Do not error when an attribute is registered for ranking 2019-02-21 20:14:08 +01:00
7f937eea5a Merge pull request #113 from Kerollmops/hot-fix-query-builder
Remove the QueryBuilder boxed criteria default static restriction
2019-02-21 20:11:10 +01:00
a1cf634ac1 feat: Remove the QueryBuilder boxed criteria default static restriction 2019-02-21 19:26:22 +01:00
c86472e997 Merge pull request #112 from Kerollmops/bump-version
Bump version to 0.3.1
2019-02-21 15:18:37 +01:00
26cb398a6f chore: Bump version to 0.3.1 2019-02-21 14:52:40 +01:00
f6e664d298 Merge pull request #111 from qdequele/config
Add a config per index
2019-02-21 14:39:37 +01:00
9437cecf87 chore: Use Default derive on Config struct 2019-02-21 14:01:55 +01:00
13309511b3 chore: Use serde derive lowercase on RankingOrdering 2019-02-21 14:01:55 +01:00
1941cb16c0 feat: Add Config.update_with(_) method to merge 2 config 2019-02-21 14:01:55 +01:00
55823c5d5d feat: add admin key on config 2019-02-21 14:01:55 +01:00
4721da1679 feat: Add access key on config 2019-02-21 14:01:55 +01:00
482f750231 chore: Set config field pub 2019-02-21 14:01:55 +01:00
d5119db165 feat: Allow to retrieve config from Database and DatabaseView 2019-02-21 14:01:55 +01:00
37578ed74f feat: store config into database 2019-02-20 14:07:19 +01:00
f5992ce822 Merge pull request #109 from Kerollmops/implement-text-cropping
Introduce text cropping that shows the first matches
2019-02-18 19:40:30 +01:00
badb0035c5 feat: Introduce text cropping that shows the first match 2019-02-18 18:59:50 +01:00
4bc14aa261 Merge pull request #108 from Kerollmops/refactor-index
Refactor the Index and Updates
2019-02-18 18:59:20 +01:00
a0c4ec0be0 feat: Introduce the updated_documents methods 2019-02-18 18:01:40 +01:00
264fffa826 feat: Replace the elapsed dependency by std::time::Instant 2019-02-17 16:37:45 +01:00
bddb37e44f feat: Move SharedData to its own module 2019-02-17 16:37:45 +01:00
6393b0cbc0 feat: Prefer binary to exponential search 2019-02-17 16:37:45 +01:00
a8df438814 feat: Implement WriteToBytes/FromSharedDataCursor 2019-02-17 16:37:44 +01:00
8014857ebf feat: Introduce the WriteToBytes trait 2019-02-17 16:37:44 +01:00
9e7261a48f feat: Introduce the FromSharedDataCursor trait 2019-02-17 16:37:44 +01:00
c4e70d0475 feat: Introduce the SharedDataCursor type 2019-02-17 16:37:44 +01:00
cbb0aaa217 feat: Introduce the Index structure along with the Events types 2019-02-17 16:36:47 +01:00
ce50e74491 Merge pull request #107 from Kerollmops/update-dependencies
Update dependencies
2019-02-13 16:05:51 +01:00
e103e1c277 chore: Replace the crossbeam::ArcCell by arc-swap::ArcSwap 2019-02-13 15:19:02 +01:00
64929fe5dc chore: Update slice-group-by to 0.2 2019-02-13 15:06:34 +01:00
b108f1e6c9 Merge pull request #106 from Kerollmops/fix-criterion
Fix the SumOfTypos and WordsProximity criteria
2019-02-12 22:06:32 +01:00
58b417e045 feat: Replace the linear_group_by by the new linear_group method 2019-02-12 21:23:36 +01:00
2e5a616d8e fix: Compute the proximity on the words with the min distance 2019-02-12 21:22:45 +01:00
092d446a7e chore: Update the slice-group-by dependency 2019-02-12 21:22:45 +01:00
85a1f126bf fix: Make the SumOfTypos criterion use a more clever algorithm 2019-02-12 21:22:42 +01:00
cf58cf86da Merge pull request #105 from Kerollmops/custom-ranking-field-into-hashmap
Save the custom ranking field into an HashMap
2019-02-11 17:36:26 +01:00
db6210c7ee feat: Introduce the Number type 2019-02-11 16:58:44 +01:00
83cd071827 feat: Introduce the SortByAttr custom ranking helper 2019-02-11 16:55:31 +01:00
084c3a95b6 feat: Add a new ranked attribute to the schema 2019-02-11 16:55:30 +01:00
78908aa34e Merge pull request #103 from Kerollmops/ranking-typo-rules
Add a reading on the default typos and ranking rules
2019-02-11 15:05:04 +01:00
cf27706f91 doc: Add a reading on the default typos and ranking rules 2019-02-11 11:58:17 +01:00
d3f53a7fd6 Merge pull request #104 from Kerollmops/update-readme
Update the Redame wrk stats
2019-02-10 14:53:15 +01:00
508af5613f doc: Update the Redame wrk stats 2019-02-10 14:05:21 +01:00
c615c31016 Merge pull request #101 from Kerollmops/version-bump
Bump version to 0.3.0
2019-02-07 15:26:38 +01:00
908b28790b chore: Bump version to 0.3.0 2019-02-07 14:51:39 +01:00
4c0279729b Merge pull request #100 from qdequele/master
Allow users to manage multiple database indexes
2019-02-07 14:49:52 +01:00
96dfac5b33 feat: Allow users to manage multiple database indexes 2019-02-07 13:05:55 +01:00
8576218b51 Merge pull request #99 from Kerollmops/simplify-transactional-update
Remove the lifetime restriction for Database Updates
2019-02-06 18:19:45 +01:00
1c1f9201b8 feat: Remove the lifetime restriction for Database Updates 2019-02-06 18:03:41 +01:00
4398b88a3a Merge pull request #98 from Kerollmops/updates-with-transactions
Change updates to be handled using the RocksDB WriteBatch feature
2019-02-06 16:13:47 +01:00
73e79f5ca4 chore: Make travis build with Rust 1.32 2019-02-06 15:58:48 +01:00
1bfd51d6e9 feat: Change updates to be handled using the RocksDB WriteBatch feature 2019-02-06 15:58:47 +01:00
0d2daf27f2 Merge pull request #97 from Kerollmops/remove-hashbrown-stop-words
Remove the hashbrown dependency for library users
2019-02-03 17:31:08 +01:00
87f0d8cf3c feat: Remove the hashbrown dependency for library users 2019-02-03 12:22:50 +01:00
06d5a10902 Merge pull request #96 from Kerollmops/chore
Make some little changes
2019-02-03 11:55:06 +01:00
94b89c5439 chore: Make the Document from_raw method private 2019-02-03 11:24:44 +01:00
c5e951be09 chore: Move the deseserializer into the serde module 2019-02-03 11:24:44 +01:00
66ae5c8161 chore: Clarify some QueryBuilder comments 2019-02-03 11:24:44 +01:00
8438e2202f Merge pull request #95 from Kerollmops/fix-querybuilder-with-criteria
Make the QueryBuilder with_criteria use FilterFunc
2019-02-03 11:24:17 +01:00
7a6166d229 feat: Make the QueryBuilder with_criteria use FilterFunc 2019-02-03 10:55:16 +01:00
d46fa4b215 Merge pull request #94 from Kerollmops/data-oriented
Introduce Data Oriented design into the search algorithm
2019-02-02 15:40:10 +01:00
2bd5b4ab86 feat: Remove useless WordsProximity criterion benchmark 2019-02-02 15:12:54 +01:00
5efbc5ceb3 feat: Introduce the revisited SortBy criterion 2019-02-02 14:42:12 +01:00
2e905bac08 chore: Remove Attribute and WordArea structures 2019-02-02 14:40:15 +01:00
4c0ad5f964 feat: Simplify the Criterion Trait by removing the DatabaseView param 2019-02-02 14:40:15 +01:00
455cbf3bf4 feat: Make the search algorithm become fully data oriented 2019-02-02 14:40:14 +01:00
a3a28c56fa feat: Replace compressed Match fields by uncompressed ones 2019-02-02 14:40:14 +01:00
b0b3175641 Merge pull request #93 from Kerollmops/slice-group-by
Use the GroupBy/Mut Traits of the slice-group-by library
2019-01-30 17:52:27 +01:00
c2f0df3f73 feat: Use the GroupBy/Mut Traits of the slice-group-by library 2019-01-30 16:54:52 +01:00
820f1f9ac6 Merge pull request #91 from Kerollmops/warn-reused-document-id
Emit warnings when a document id is reused
2019-01-28 21:05:42 +01:00
337aee5b65 chore: Emit warnings when a document id is reused 2019-01-28 16:11:55 +01:00
810dfdf656 Merge pull request #90 from Kerollmops/version-bump
Bump version to 0.2.1
2019-01-25 17:08:53 +01:00
f016652fca chore: Bump version to 0.2.1 2019-01-25 16:41:08 +01:00
6c99ebe3fa Merge pull request #89 from Kerollmops/no-more-compaction
Remove the manual compaction triggering
2019-01-25 16:40:08 +01:00
94d357985f feat: Remove the manual compaction triggering 2019-01-25 16:05:56 +01:00
fbc698567a Merge pull request #87 from Kerollmops/measure-index-loading
Display index loading times
2019-01-24 14:07:11 +01:00
aa9db14c09 chore: Display index loading times 2019-01-23 11:19:44 +01:00
61e83a1c21 Merge pull request #86 from Kerollmops/measure-indexation
Display timings of indexation operations
2019-01-16 13:32:44 +01:00
1316be5b09 chore: Display timings of indexation operations 2019-01-16 11:45:33 +01:00
4e8b0383dd Merge pull request #85 from Kerollmops/debug-more-stats
Display more stats infos
2019-01-15 14:20:28 +01:00
4fa10753c1 chore: Display more stats infos 2019-01-14 21:18:46 +01:00
2473e289e8 Merge pull request #84 from qdequele/create-server-example
Example HTTP server example can use stopwords
2019-01-14 18:55:58 +01:00
e0e5e87ed3 feat: HTTP server example can use stopwords 2019-01-14 18:21:58 +01:00
b13e61f40a Merge pull request #83 from qdequele/create-server-example
Create an example of HTTP server managing multiple databases
2019-01-14 14:35:33 +01:00
c023cb3065 feat: Create an example for HTTP server managing multiple databases 2019-01-14 13:39:54 +01:00
0a3d069fbc Merge pull request #79 from qdequele/master
Schema can be de/serialized from a json format
2019-01-12 21:50:02 +01:00
fa062ce2cf feat: Schema can be de/serialized from a json format 2019-01-12 21:05:48 +01:00
cdc6e47bf5 Merge pull request #81 from Kerollmops/update-readme
Simplify the examples command lines
2019-01-12 13:43:42 +01:00
d5f44838be doc: Simplify the examples command lines 2019-01-12 12:56:11 +01:00
102 changed files with 27336 additions and 4904 deletions

.gitignore

@@ -1,6 +1,7 @@
/rocksdb
/target
/Cargo.lock
meilidb/Cargo.lock
meilidb-core/Cargo.lock
**/*.rs.bk
**/*.csv
**/*.json_lines

.travis.yml (deleted)

@@ -1,22 +0,0 @@
language: rust
cache: cargo

branches:
  only:
    - master

matrix:
  fast_finish: true
  include:
    # Test crates on their minimum Rust versions.
    - rust: 1.31.0
      name: "meilidb on 1.31.0"
      script: ./ci/meilidb.sh
    # Test crates on nightly Rust.
    - rust: nightly
      name: "meilidb on nightly"
      script: ./ci/meilidb.sh

Cargo.toml

@@ -1,55 +1,11 @@
[package]
edition = "2018"
name = "meilidb"
version = "0.2.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
[dependencies]
bincode = "1.0"
byteorder = "1.2"
crossbeam = "0.6"
fst = "0.3"
hashbrown = { version = "0.1", features = ["serde"] }
lazy_static = "1.1"
levenshtein_automata = { version = "0.1", features = ["fst_automaton"] }
linked-hash-map = { version = "0.5", features = ["serde_impl"] }
log = "0.4"
sdset = "0.3"
serde = "1.0"
serde_derive = "1.0"
unidecode = "0.3"
[dependencies.toml]
git = "https://github.com/Kerollmops/toml-rs.git"
features = ["preserve_order"]
rev = "0372ba6"
[dependencies.rocksdb]
git = "https://github.com/pingcap/rust-rocksdb.git"
rev = "306e201"
[dependencies.group-by]
git = "https://github.com/Kerollmops/group-by.git"
rev = "5a113fe"
[features]
default = ["simd"]
i128 = ["bincode/i128", "byteorder/i128"]
portable = ["rocksdb/portable"]
simd = ["rocksdb/sse"]
nightly = ["hashbrown/nightly", "group-by/nightly"]
[dev-dependencies]
csv = "1.0"
elapsed = "0.1"
env_logger = "0.6"
jemallocator = "0.1"
quickcheck = "0.8"
rand = "0.6"
rand_xorshift = "0.1"
structopt = "0.2"
tempfile = "3.0"
termcolor = "1.0"
[workspace]
members = [
"meilidb",
"meilidb-core",
"meilidb-data",
"meilidb-schema",
"meilidb-tokenizer",
]
[profile.release]
debug = true

README.md

@@ -1,6 +1,6 @@
# MeiliDB
[![Build Status](https://travis-ci.org/Kerollmops/MeiliDB.svg?branch=master)](https://travis-ci.org/Kerollmops/MeiliDB)
[![Build Status](https://dev.azure.com/thomas0884/thomas/_apis/build/status/meilisearch.MeiliDB?branchName=master)](https://dev.azure.com/thomas0884/thomas/_build/latest?definitionId=1&branchName=master)
[![dependency status](https://deps.rs/repo/github/Kerollmops/MeiliDB/status.svg)](https://deps.rs/repo/github/Kerollmops/MeiliDB)
[![License](https://img.shields.io/github/license/Kerollmops/MeiliDB.svg)](https://github.com/Kerollmops/MeiliDB)
[![Rust 1.31+](https://img.shields.io/badge/rust-1.31+-lightgray.svg)](
@@ -8,34 +8,48 @@ https://www.rust-lang.org)
A _full-text search database_ using a key-value store internally.
It uses [RocksDB](https://github.com/facebook/rocksdb) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads.
## Features
You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries.
- Provides [6 default ranking criteria](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/criterion/mod.rs#L95-L101) used to [bucket sort](https://en.wikipedia.org/wiki/Bucket_sort) documents
- Accepts [custom criteria](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/criterion/mod.rs#L22-L29) and can apply them in any custom order
- Supports [ranged queries](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L146), useful for paginating results
- Can [distinct](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L68) and [filter](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L57) returned documents based on context defined rules
- Can store complete documents or only [user schema specified fields](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/examples/movies/schema-movies.toml)
- The [default tokenizer](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-tokenizer/src/lib.rs#L99) can index latin and kanji based languages
- Returns [the matching text areas](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/lib.rs#L117-L120), useful to highlight matched words in results
- Accepts query time search config like the [searchable fields](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L79)
- Supports run time indexing (incremental indexing)
We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!
It uses [RocksDB](https://github.com/facebook/rocksdb) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads. The whole ranking system is [data oriented](https://github.com/meilisearch/MeiliDB/issues/82) and provides great performance.
You can [read the deep dive](deep-dive.md) if you want more information on the engine; it describes the whole process of generating updates and handling queries. You can also take a look at the [typos and ranking rules](typos-ranking-rules.md) if you want to know the default rules used to sort the documents.
We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/meilisearch/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!
The project is only a library for now; there is no binary provided yet. To get started, you can check the examples, which are made to work with the data located in the `misc/` folder.
MeiliDB will be a binary in a near future so you will be able to use it as a database out-of-the-box. We should be able to query it using a [to-be-defined](https://github.com/Kerollmops/MeiliDB/issues/38) protocol. This is our current goal, [see the milestones](https://github.com/Kerollmops/MeiliDB/milestones). In the end, the binary will be a bunch of network protocols and wrappers around the library - which will also be published on [crates.io](https://crates.io). Both the binary and the library will follow the same update cycle.
MeiliDB will be a binary in a near future so you will be able to use it as a database out-of-the-box. We should be able to query it using a [to-be-defined](https://github.com/meilisearch/MeiliDB/issues/38) protocol. This is our current goal, [see the milestones](https://github.com/meilisearch/MeiliDB/milestones). In the end, the binary will be a bunch of network protocols and wrappers around the library - which will also be published on [crates.io](https://crates.io). Both the binary and the library will follow the same update cycle.
## Performances
With a database composed of _100 353_ documents with _352_ attributes each and _90_ of them indexed.
So nearly _9 million_ fields indexed for _35 million_ stored we can handle more than _1.2k req/sec_ on an Intel i7-7700 (8) @ 4.2GHz.
With a database composed of _100 353_ documents with _352_ attributes each and _3_ of them indexed.
So more than _300 000_ fields indexed for _35 million_ stored we can handle more than _2.8k req/sec_ with an average response time of _9 ms_ on an Intel i7-7700 (8) @ 4.2GHz.
Requests are made using [wrk](https://github.com/wg/wrk) and scripted to generate real users queries.
Requests are made using [wrk](https://github.com/wg/wrk) and scripted to simulate real users queries.
```
Running 10s test @ http://localhost:2230
2 threads and 12 connections
2 threads and 25 connections
Thread Stats Avg Stdev Max +/- Stdev
Latency 18.86ms 49.39ms 614.89ms 95.23%
Req/Sec 620.41 59.53 790.00 65.00%
12359 requests in 10.00s, 3.26MB read
Requests/sec: 1235.54
Transfer/sec: 334.22KB
Latency 9.52ms 7.61ms 99.25ms 84.58%
Req/Sec 1.41k 119.11 1.78k 64.50%
28080 requests in 10.01s, 7.42MB read
Requests/sec: 2806.46
Transfer/sec: 759.17KB
```
### Notes
@@ -45,16 +59,35 @@ We have seen much better performances when [using jemalloc as the global allocat
## Usage and examples
MeiliDB runs with an index like most search engines.
So to test the library you can create one by indexing a simple csv file.
You can try a little part of MeiliDB with the following commands.
It creates an index named _movies_ and inserts two great Tarantino movies in it.
```bash
cargo run --release --example create-database -- test.mdb misc/kaggle.csv --schema schema-example.toml --stop-words misc/fr.stopwords.txt
cargo run --release
curl -XPOST 'http://127.0.0.1:8000/movies' \
-d '
identifier = "id"
[attributes.id]
stored = true
[attributes.title]
stored = true
indexed = true
'
curl -H 'Content-Type: application/json' \
-XPUT 'http://127.0.0.1:8000/movies' \
-d '{ "id": 123, "title": "Inglorious Bastards" }'
curl -H 'Content-Type: application/json' \
-XPUT 'http://127.0.0.1:8000/movies' \
-d '{ "id": 456, "title": "Django Unchained" }'
```
Once the command is executed, the index should be in the `test.mdb` folder. You are now able to run the `query-database` example and play with MeiliDB.
Once the database is initialized you can query it by using the following command:
```bash
cargo run --release --example query-database -- test.mdb -n 10 id title
curl -XGET 'http://127.0.0.1:8000/movies/search?q=inglo'
```

azure-pipelines.yml (new file)

@@ -0,0 +1,47 @@
---
trigger:
  branches:
    include: [ master ]
pr: [ master ]

jobs:
  - job: test
    pool:
      vmImage: 'Ubuntu 16.04'
    container: tpayet/chiquitita:latest
    steps:
      - script: |
          curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly
        displayName: 'Install rustc'
      - script: |
          $HOME/.cargo/bin/cargo check
        displayName: 'Check MeiliDB'
      - script: |
          $HOME/.cargo/bin/cargo test
        displayName: 'Test MeiliDB'
  - job: build
    dependsOn:
      - test
    condition: succeeded()
    pool:
      vmImage: 'Ubuntu 16.04'
    container: tpayet/chiquitita:latest
    steps:
      - script: |
          curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly
        displayName: 'Install rustc'
      - script: |
          $HOME/.cargo/bin/cargo build --release
        displayName: 'Build MeiliDB'
      - task: CopyFiles@2
        inputs:
          contents: '$(System.DefaultWorkingDirectory)/target/release/libmeilidb.rlib'
          targetFolder: $(Build.ArtifactStagingDirectory)
        displayName: 'Copy build'
      - task: PublishBuildArtifacts@1
        inputs:
          artifactName: libmeilidb.rlib
        displayName: 'Upload artifacts'

deep-dive.md

@@ -1,28 +1,22 @@
# A deep dive in MeiliDB
On the 9 of december 2018.
MeiliDB is a full text search engine based on a final state transducer named [fst](https://github.com/BurntSushi/fst) and a key-value store named [RocksDB](https://github.com/facebook/rocksdb). The goal of a search engine is to store data and to respond to queries as accurate and fast as possible. To achieve this it must save the data as an [inverted index](https://en.wikipedia.org/wiki/Inverted_index).
On the 15 of May 2019.
MeiliDB is a full text search engine based on a final state transducer named [fst](https://github.com/BurntSushi/fst) and a key-value store named [sled](https://github.com/spacejam/sled). The goal of a search engine is to store data and to respond to queries as accurate and fast as possible. To achieve this it must save the matching words in an [inverted index](https://en.wikipedia.org/wiki/Inverted_index).
<!-- MarkdownTOC autolink="true" -->
- [Where is the data stored?](#where-is-the-data-stored)
- [What does the key-value store contains?](#what-does-the-key-value-store-contains)
- [The blob type](#the-blob-type)
- [The inverted word index](#the-inverted-word-index)
- [A final state transducer](#a-final-state-transducer)
- [Document indexes](#document-indexes)
- [Document ids](#document-ids)
- [The schema](#the-schema)
- [Document attributes](#document-attributes)
- [How is an update handled?](#how-is-an-update-handled)
- [The merge operation is CPU consuming](#the-merge-operation-is-cpu-consuming)
- [How is a request processed?](#how-is-a-request-processed)
- [Query lexemes](#query-lexemes)
- [Automatons and query index](#automatons-and-query-index)
- [Sort by criteria](#sort-by-criteria)
- [Retrieve original documents](#retrieve-original-documents)
<!-- /MarkdownTOC -->
@@ -30,21 +24,17 @@ MeiliDB is a full text search engine based on a final state transducer named [fs
MeiliDB is entirely backed by a key-value store like any good database (e.g. Postgres, MySQL). This brings great flexibility in the way documents can be stored and updates handled over time.
[RocksDB brings some](https://rocksdb.org/blog/2015/02/27/write-batch-with-index.html) of the [A.C.I.D. properties](https://en.wikipedia.org/wiki/ACID_(computer_science)) to help us be sure the saved data is consistent, for example we use SST files and the key-value store ability to load them in one time to manage updates.
Note that the SST file have the same restriction as the fst, it needs its keys to be added in order at creation.
[sled will bring some](https://github.com/spacejam/sled/tree/434533332a3f485e6d2e467023be0a0b55d3a1af#plans) of the [A.C.I.D. properties](https://en.wikipedia.org/wiki/ACID_(computer_science)) to help us be sure the saved data is consistent.
## What does the key-value store contains?
It contains the blob, the schema and the documents' stored attributes.
It contains the inverted word index, the schema and the document fields.
### The blob type
### The inverted word index
[The Blob type](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/mod.rs#L16-L19) is a data structure that indicate if an update is a positive or a negative one. In the case where the update is considered positive, the blob will contain [an fst map and the document indexes](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/positive/blob.rs#L15-L18) associated. In the other case it will only contain [all the document ids](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/negative/blob.rs#L12-L14) that must be considered removed.
The Blob type [is stored under the "*data-index*" entry](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/update/positive/update.rs#L497-L499) and marked as [a merge operation](https://github.com/facebook/rocksdb/wiki/Merge-Operator-Implementation) in the key-value store.
[The inverted word index](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/words_index.rs) is a sled Tree dedicated to storing and giving access to all documents that contain a specific word. The information stored under the word is simply a big ordered array of where in the document the word has been found. In other words, a big list of [`DocIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/lib.rs#L35-L51).
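
To make this more concrete, here is a minimal, hypothetical sketch of such an entry and of a word lookup. The field names only approximate the real `DocIndex` type linked above, and the `BTreeMap` is a toy stand-in for the sled-backed Tree.

```rust
// Illustrative sketch only: field names approximate the real `DocIndex`
// type linked above, they are not copied from the source.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct DocumentId(u64);

#[derive(Debug, Clone, Copy)]
struct DocIndex {
    document_id: DocumentId, // which document contains the word
    attribute: u16,          // which schema attribute of that document
    word_index: u16,         // position of the word inside the attribute
    char_index: u16,         // offset of the match in the attribute text
    char_length: u16,        // length of the matched word
}

use std::collections::BTreeMap;

// The inverted index conceptually maps a word to a sorted list of DocIndex.
fn postings_for(word: &str, index: &BTreeMap<String, Vec<DocIndex>>) -> Vec<DocIndex> {
    index.get(word).cloned().unwrap_or_default()
}
```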
#### A final state transducer
@@ -52,89 +42,54 @@ _...also abbreviated fst_
This is the first entry point of the engine; you can read more about how it works in @BurntSushi's beautiful blog post, [Index 1,600,000,000 Keys with Automata and Rust](https://blog.burntsushi.net/transducers/).
To make it short it is a powerful way to store all the words that are present in the indexed documents. You construct it by giving it all the words you want to index associated with a value that, for the moment, can only be an `u64`. When you want to search in it you can provide any automaton you want, in MeiliDB [a custom levenshtein automaton](https://github.com/tantivy-search/levenshtein-automata/) is used.
Note that the number under each word is auto-incremental: each new word has a number greater than the previous one.
Another powerful feature of `fst` is that it can nearly avoid using RAM and be streamed to disk for example, the problem is that the keys must be always added in lexicographic order, so you must sort them before, for the moment MeiliDB uses a [BTreeMap](https://github.com/Kerollmops/raptor-rs/blob/8abdb0a228e2808fe1814a6a0641a4b72d158579/src/metadata/doc_indexes.rs#L107-L112).
To make it short it is a powerful way to store all the words that are present in the indexed documents. You construct it by giving it all the words you want to index. When you want to search in it you can provide any automaton you want, in MeiliDB [a custom levenshtein automaton](https://github.com/tantivy-search/levenshtein-automata/) is used.
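
As a rough illustration of that idea, the sketch below builds a small `fst::Set` and queries it with a Levenshtein automaton. It assumes the `fst` crate with its levenshtein feature enabled and is not MeiliDB's own automaton code.

```rust
use fst::{IntoStreamer, Set};
use fst::automaton::Levenshtein;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Keys must be inserted in lexicographic order.
    let set = Set::from_iter(vec!["django", "inglorious", "tarantino"])?;

    // Accept every word within an edit distance of 1 from the typo "tarentino".
    let lev = Levenshtein::new("tarentino", 1)?;

    let matches = set.search(&lev).into_stream().into_strs()?;
    assert_eq!(matches, vec!["tarantino".to_string()]);
    Ok(())
}
```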
#### Document indexes
As specified earlier, the `fst` can only store a number corresponding to a word, a `u64`, but the goal of the search engine is to retrieve a match in a document when a query is made. You want it to return some sort of position in an attribute in a document, information about where the given word matches.
The `fst` will only return the words that match the search automaton, but the goal of the search engine is to retrieve all matches in all the documents when a query is made. You want it to return some sort of position in an attribute in a document, information about where the given word matched.
To make it possible, a custom data structure has been developed: the document indexes are composed of two arrays, the ranges array and all the docindexes corresponding to a given range, where each range identifies the word number. The [DocIndexes](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/data/doc_indexes.rs#L23) type is designed to be streamed when constructed, consuming a minimum amount of RAM like the fst. Another advantage is that the slices are accessible in `O(1)` when you know the number associated with the word.
#### Document ids
This is a simple ordered list of all documents ids which must be considered deleted. It is used with [the sdset library](https://docs.rs/sdset/0.3.0/sdset/duo/struct.DifferenceByKey.html), the docindexes and the `DifferenceByKey` operation builder when merging blobs.
When a blob represents a negative update it only contains this simple slice of deleted document ids.
To make it possible we retrieve all of the `DocIndex` entries corresponding to all the matching words in the fst; we use the [`WordsIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/words_index.rs#L11-L21) Tree to get the `DocIndexes` corresponding to the words.
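
Continuing the toy types from the sketch above (it reuses `DocIndex`, `postings_for` and the `BTreeMap`-backed index, not the real sled-backed `WordsIndex`), a simplified retrieval step could look like this:

```rust
// Toy retrieval step: for every word matched by the automatons, collect its
// postings and keep the result sorted so later set operations stay cheap.
fn doc_indexes_for_matches(
    matching_words: &[String],
    index: &BTreeMap<String, Vec<DocIndex>>,
) -> Vec<DocIndex> {
    let mut result: Vec<DocIndex> = matching_words
        .iter()
        .flat_map(|w| postings_for(w, index))
        .collect();
    result.sort_by_key(|di| (di.document_id.0, di.attribute, di.word_index));
    result
}
```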
### The schema
The schema is a data structure that represents which document attributes should be stored and which should be indexed. It is stored under the "_data-schema_" entry and given to MeiliDB only at creation.
The schema is a data structure that represents which document attributes should be stored and which should be indexed. It is stored under the [`MainIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/main_index.rs#L12) Tree and given to MeiliDB only at the creation of an index.
Each document attribute is associated to a unique 32 bit number named `SchemaAttr`.
Each document attribute is associated to a unique 16 bit number named [`SchemaAttr`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/schema.rs#L186).
In the future this schema type could be given along with updates and probably differ from the original; the database could then handle this document structure and reindex it.
In the future, this schema type could be given along with updates; the database could then handle a new schema and reindex the database according to the new one.
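
For intuition, here is a minimal, hypothetical model of that name-to-attribute mapping; the real `Schema` and `SchemaAttr` types linked above carry more information (indexed/stored/ranked flags, serialization, and so on).

```rust
use std::collections::HashMap;

/// Illustrative stand-in for the 16-bit attribute identifier.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
struct SchemaAttr(u16);

/// Hypothetical per-attribute properties, loosely mirroring STORED/INDEXED.
#[derive(Debug, Clone, Copy, Default)]
struct AttrProps {
    stored: bool,
    indexed: bool,
}

#[derive(Debug, Default)]
struct Schema {
    attrs: HashMap<String, (SchemaAttr, AttrProps)>,
}

impl Schema {
    /// Register an attribute and give it the next available number.
    fn insert(&mut self, name: &str, props: AttrProps) -> SchemaAttr {
        let attr = SchemaAttr(self.attrs.len() as u16);
        self.attrs.insert(name.to_owned(), (attr, props));
        attr
    }

    /// Resolve an attribute name into its numeric identifier.
    fn attribute(&self, name: &str) -> Option<SchemaAttr> {
        self.attrs.get(name).map(|(attr, _)| *attr)
    }
}
```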
### Document attributes
When the engine handles a query, the result that the requester wants is a document, not only the [match](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/lib.rs#L51-L79) associated with it; fields of the original document must be returned too.
When the engine handles a query, the result that the requester wants is a document, not only the [`Matches`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/lib.rs#L62-L88) associated with it; fields of the original document must be returned too.
So MeiliDB again uses the power of the underlying key-value store and saves the document attributes marked as _STORE_. The key is prefixed by "_doc_" followed by the 64-bit document id in bytes and the schema attribute number in bytes corresponding to the stored document attribute.
So MeiliDB again uses the power of the underlying key-value store and saves the document attributes marked as _STORE_ in the schema. The dedicated Tree for this information is the [`DocumentsIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/documents_index.rs#L11).
When a document field is saved in the key-value store its value is binary encoded using the [bincode](https://docs.rs/bincode/) library, so a document must be serializable using serde.
## How is an update handled?
First of all an update in MeiliDB is nothing more than [a RocksDB SST file](https://github.com/facebook/rocksdb/wiki/Creating-and-Ingesting-SST-files). It contains the blob and all the documents attributes binary encoded like described above. Note that the blob is stored under the "_data-index_" key marked as [a merge operation](https://github.com/facebook/rocksdb/wiki/Merge-Operator-Implementation).
### The merge operation is CPU consuming
When [the database ingest an update](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/mod.rs#L108-L145) it gives the SST file to the underlying RocksDB, once it has ingested it there is a "_data-index_" entry available, we can request it but the key-value store will call a function before, a merge operation is performed.
This merge operation is done on multiple blobs as you have understood and will compute a [PositiveBlob](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/positive/blob.rs#L15), this type contains the fst and document indexes structures allowing us to search for documents. This two data structures can be considered as the inverted index.
The computation time of this merge is important, RocksDB doesn't keep the previous merged result, it will call our merge operation each time until it decided to do a compaction. So [we must force this compaction earlier](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/mod.rs#L129-L131) when we receive an update to reduce this cost.
This way when we request the "_data-index_" value it will gives us the previously merged positive blob without any other merge overhead.
When a document field is saved in the key-value store, its value is binary encoded using [message pack](https://github.com/3Hren/msgpack-rust), so a document must be serializable using serde.
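
As an illustration of that encoding step, the sketch below round-trips a document with the `rmp-serde` crate (one MessagePack implementation for serde); the actual key layout used in the store is not shown, and the `Movie` type is made up for the example.

```rust
use serde::{Deserialize, Serialize};

#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct Movie {
    id: u64,
    title: String,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let movie = Movie { id: 123, title: "Inglorious Bastards".to_owned() };

    // Encode the document as MessagePack bytes before writing it to the store.
    let bytes = rmp_serde::to_vec(&movie)?;

    // Decode it back when the document must be returned to the requester.
    let decoded: Movie = rmp_serde::from_slice(&bytes)?;
    assert_eq!(movie, decoded);
    Ok(())
}
```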
## How is a request processed?
Now that we have our "_data-index_" we are able to return results based on a query. In the MeiliDB universe a query is a string.
Now that we have our inverted index we are able to return results based on a query. In the MeiliDB universe a query is a simple string containing words.
### Query lexemes
The first step, in order to be able to call the underlying structures, is to split the query into words; for that we use a [custom tokenizer](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/tokenizer/mod.rs) that is not finished for the moment, [there is an open issue](https://github.com/Kerollmops/MeiliDB/issues/3). Note that a tokenizer is specialized for a human language, and this is the hard part.
The first step, in order to be able to call the underlying structures, is to split the query into words; for that we use a [custom tokenizer](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-tokenizer/src/lib.rs#L82-L84). Note that a tokenizer is specialized for a human language, and this is the hard part.
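As a rough sketch of the splitting idea only (not the meilidb-tokenizer itself): transliterate to ASCII with the deunicode crate, one of the project's dependencies, then split on non-alphanumeric characters. The real tokenizer additionally reports word indexes and character positions, which end up in the `DocIndex` entries.

```rust
use deunicode::deunicode;

// Naive word splitter: deunicode first, then split on non-alphanumeric characters.
fn simple_tokenize(text: &str) -> Vec<String> {
    deunicode(text)
        .split(|c: char| !c.is_ascii_alphanumeric())
        .filter(|word| !word.is_empty())
        .map(str::to_lowercase)
        .collect()
}

fn main() {
    assert_eq!(simple_tokenize("L'été à New-York!"), ["l", "ete", "a", "new", "york"]);
}
```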
### Automatons and query index
To query the fst we need an automaton; in MeiliDB we use a [levenshtein automaton](https://en.wikipedia.org/wiki/Levenshtein_automaton), which is constructed from a string and a maximum distance. Following [Algolia's blog post](https://blog.algolia.com/inside-the-algolia-engine-part-3-query-processing/#algolia%e2%80%99s-way-of-searching-for-alternatives) we [created the DFAs](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/automaton.rs#L62-L75) with different settings.
To query the fst we need an automaton; in MeiliDB we use a [levenshtein automaton](https://en.wikipedia.org/wiki/Levenshtein_automaton), which is constructed from a string and a maximum distance. Following [Algolia's blog post](https://blog.algolia.com/inside-the-algolia-engine-part-3-query-processing/#algolia%e2%80%99s-way-of-searching-for-alternatives) we [created the DFAs](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/automaton.rs#L59-L78) with different settings.
Thanks to the power of the fst library [it is possible to union multiple automatons](https://docs.rs/fst/0.3.2/fst/map/struct.OpBuilder.html#method.union) on the same fst map, which allows us to know which [automaton returned a word according to its index](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/metadata/ops.rs#L111). The `Stream` is able to return all the numbers associated with the words. We use these numbers to find the whole list of associated `DocIndexes` and do the union set operation.
Thanks to the power of the fst library [it is possible to union multiple automatons](https://docs.rs/fst/0.3.2/fst/map/struct.OpBuilder.html#method.union) on the same fst set. The `Stream` is able to return all the matching words. We use these words to find the whole list of associated `DocIndexes`.
With all this information it is possible [to reconstruct a list of all the DocIndexes associated](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L62-L99) with the queried words.
With all this information it is possible [to reconstruct a list of all the `DocIndexes` associated](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/query_builder.rs#L103-L130) with the queried words.
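As a sketch of that whole flow, and assuming the forked fst and levenshtein_automata crates pinned in the Cargo.toml below (with the `fst_automaton` feature so a DFA can drive the fst search): build one DFA per query word, union the search streams over the words set, and collect every matching word; those words are then used to fetch the associated `DocIndexes`. The helper name is hypothetical and error handling is omitted.

```rust
use fst::{Set, Streamer};
use fst::set::OpBuilder;
use levenshtein_automata::LevenshteinAutomatonBuilder;

// Find every word of the set reachable from the query words with at most one typo.
fn fuzzy_matching_words(words: &Set, query_words: &[&str]) -> Vec<String> {
    let lev = LevenshteinAutomatonBuilder::new(1, false);
    let dfas: Vec<_> = query_words.iter().map(|word| lev.build_dfa(word)).collect();

    let mut op = OpBuilder::new();
    for dfa in dfas {
        op.push(words.search(dfa));
    }

    let mut union = op.union();
    let mut matches = Vec::new();
    while let Some(word) = union.next() {
        matches.push(String::from_utf8_lossy(word).into_owned());
    }
    matches
}
```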
### Sort by criteria
Now that we are able to get a big list of [DocIndexes](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/lib.rs#L21-L36), it is not enough to sort them by criteria; we need more information, like the levenshtein distance or the fact that a query word matches exactly the word stored in the fst. So [we stuff it a little bit](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L86-L93) and aggregate all these [Matches](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/lib.rs#L47-L74) for each document. This way it is easy to sort a simple vector of documents using a bunch of functions.
With this big list of documents and associated matches, [we are able to sort only the part of the slice that we want](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L108-L119) using bucket sorting. [Each criterion](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/criterion/mod.rs#L75-L87) is evaluated on each subslice without copying, thanks to [GroupByMut](https://github.com/Kerollmops/group-by/blob/cab857bae01463dbd0edb99b0e0d7f3624e6c6f5/src/lib.rs#L180-L185) which, I hope, [will soon be merged](https://github.com/rust-lang/rfcs/pull/2477).
Note that it is possible to customize the criteria by using the `QueryBuilder::with_criteria` constructor; this way you can implement some custom ranking based on the document attributes, using the appropriate structure and the `retrieve_document` method.
### Retrieve original documents
The [DatabaseView](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/database_view.rs#L18-L24) structure that you must have created to be able to query the database has [two functions](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/database_view.rs#L60-L76) that allow you to retrieve a full (or partial) document according to the schema you specified at creation time (i.e. the _STORED_ attributes).
As you can see, these functions force the created type `T` to implement [the serde Deserialize trait](https://docs.rs/serde/1.0.81/serde/trait.Deserialize.html); MeiliDB will use the `bincode::deserialize` function for each attribute to construct your type and return it to you.
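For illustration, a minimal sketch of that retrieval flow, modeled on the old example binary shown in this diff; the `Movie` type, the path and the query string are made up.

```rust
use std::error::Error;
use std::path::Path;
use serde::Deserialize;
use meilidb::database::Database;

// Hypothetical STORED attributes of the indexed documents.
#[derive(Deserialize)]
struct Movie {
    id: String,
    title: String,
    description: String,
}

fn main() -> Result<(), Box<dyn Error>> {
    let database = Database::open(Path::new("movies.mdb"))?;
    let view = database.view();

    let builder = view.query_builder().unwrap();
    let documents = builder.query("harry potter", 0..10);

    for doc in documents {
        // deserialize the STORED attributes of this document into a Movie
        match view.document_by_id::<Movie>(doc.id) {
            Ok(movie) => println!("{} ({}): {}", movie.title, movie.id, movie.description),
            Err(e) => eprintln!("{}", e),
        }
    }

    Ok(())
}
```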
With this big list of documents and associated matches, [we are able to sort only the part of the slice that we want](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/query_builder.rs#L160-L188) using bucket sorting. [Each criterion](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/criterion/mod.rs#L95-L101) is evaluated on each subslice without copying, thanks to [GroupByMut](https://docs.rs/slice-group-by/0.2.4/slice_group_by/) which, I hope, [will soon be merged](https://github.com/rust-lang/rfcs/pull/2477).
Note that it is possible to customize the criteria by using the `QueryBuilder::with_criteria` constructor; this way you can implement some custom ranking based on the document attributes, using the appropriate structure and the [`document` method](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/index.rs#L86).
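To make that concrete, here is a sketch of a hypothetical extra criterion written against the `Criterion` trait and `CriteriaBuilder` defined later in this diff; the hand-off to `QueryBuilder::with_criteria` is only hinted at in a comment, since its exact signature is not shown here.

```rust
use std::cmp::Ordering;
use meilidb_core::RawDocument;
use meilidb_core::criterion::{CriteriaBuilder, Criterion, Exact, SumOfTypos};

// Hypothetical tie breaker: prefer documents with the highest internal id,
// i.e. the most recently inserted ones.
#[derive(Debug, Clone, Copy)]
struct NewestFirst;

impl Criterion for NewestFirst {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        lhs.id.cmp(&rhs.id).reverse()
    }

    fn name(&self) -> &'static str {
        "NewestFirst"
    }
}

fn main() {
    // Build a custom criteria list, to be handed to QueryBuilder::with_criteria.
    let criteria = CriteriaBuilder::with_capacity(3)
        .add(SumOfTypos)
        .add(Exact)
        .add(NewestFirst)
        .build();
    let _ = criteria;
}
```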
At this point, MeiliDB's work is over 🎉

View File

@ -1,138 +0,0 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::io::{self, BufRead, BufReader};
use std::path::{Path, PathBuf};
use std::error::Error;
use std::borrow::Cow;
use std::fs::File;
use hashbrown::{HashMap, HashSet};
use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt;
use meilidb::database::{Database, Schema, UpdateBuilder};
use meilidb::tokenizer::DefaultBuilder;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created.
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// The csv file to index.
#[structopt(parse(from_os_str))]
pub csv_data_path: PathBuf,
/// The path to the schema.
#[structopt(long = "schema", parse(from_os_str))]
pub schema_path: PathBuf,
/// The path to the list of stop words (one by line).
#[structopt(long = "stop-words", parse(from_os_str))]
pub stop_words_path: Option<PathBuf>,
#[structopt(long = "update-group-size")]
pub update_group_size: Option<usize>,
}
#[derive(Serialize, Deserialize)]
struct Document<'a> (
#[serde(borrow)]
HashMap<Cow<'a, str>, Cow<'a, str>>
);
fn index(
schema: Schema,
database_path: &Path,
csv_data_path: &Path,
update_group_size: Option<usize>,
stop_words: &HashSet<String>,
) -> Result<Database, Box<Error>>
{
let database = Database::create(database_path, &schema)?;
let mut rdr = csv::Reader::from_path(csv_data_path)?;
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();
let mut i = 0;
let mut end_of_file = false;
while !end_of_file {
let tokenizer_builder = DefaultBuilder::new();
let update_path = tempfile::NamedTempFile::new()?;
let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema.clone());
loop {
end_of_file = !rdr.read_record(&mut raw_record)?;
if end_of_file { break }
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
}
};
update.update_document(&document, &tokenizer_builder, &stop_words)?;
print!("\rindexing document {}", i);
i += 1;
if let Some(group_size) = update_group_size {
if i % group_size == 0 { break }
}
}
println!();
println!("building update...");
let update = update.build()?;
println!("ingesting update...");
database.ingest_update_file(update)?;
}
Ok(database)
}
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
let f = File::open(path)?;
let reader = BufReader::new(f);
let mut words = HashSet::new();
for line in reader.lines() {
let line = line?;
let word = line.trim().to_string();
words.insert(word);
}
Ok(words)
}
fn main() -> Result<(), Box<Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let schema = {
let file = File::open(&opt.schema_path)?;
Schema::from_toml(file)?
};
let stop_words = match opt.stop_words_path {
Some(ref path) => retrieve_stop_words(path)?,
None => HashSet::new(),
};
let (elapsed, result) = elapsed::measure_time(|| {
index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
});
if let Err(e) = result {
return Err(e.into())
}
println!("database created in {} at: {:?}", elapsed, opt.database_path);
Ok(())
}

View File

@ -5,15 +5,15 @@
identifier = "id"
[attributes.id]
stored = true
displayed = true
[attributes.title]
stored = true
displayed = true
indexed = true
[attributes.description]
stored = true
displayed = true
indexed = true
[attributes.image]
stored = true
displayed = true

View File

@ -0,0 +1 @@
_data in movies.csv are from https://www.themoviedb.org/_

19700
examples/movies/movies.csv Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
# This schema has been generated ...
# The order in which the attributes are declared is important,
# it specifies the attribute xxx...
identifier = "id"
[attributes.id]
displayed = true
[attributes.title]
displayed = true
indexed = true
[attributes.overview]
displayed = true
indexed = true
[attributes.release_date]
displayed = true
[attributes.poster]
displayed = true

View File

@ -1,170 +0,0 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::btree_map::{BTreeMap, Entry};
use std::iter::FromIterator;
use std::io::{self, Write};
use std::path::PathBuf;
use std::error::Error;
use hashbrown::{HashMap, HashSet};
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use structopt::StructOpt;
use meilidb::database::schema::SchemaAttr;
use meilidb::database::Database;
use meilidb::Match;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// Fields that must be displayed.
pub displayed_fields: Vec<String>,
/// The number of returned results
#[structopt(short = "n", long = "number-results", default_value = "10")]
pub number_results: usize,
}
type Document = HashMap<String, String>;
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
let mut stdout = StandardStream::stdout(ColorChoice::Always);
let mut highlighted = false;
for range in ranges.windows(2) {
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
if highlighted {
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
}
write!(&mut stdout, "{}", &text[start..end])?;
stdout.reset()?;
highlighted = !highlighted;
}
Ok(())
}
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
let mut byte_index = 0;
let mut byte_length = 0;
for (n, (i, c)) in text.char_indices().enumerate() {
if n == index {
byte_index = i;
}
if n + 1 == index + length {
byte_length = i - byte_index + c.len_utf8();
break;
}
}
(byte_index, byte_length)
}
fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec<usize> {
let mut byte_indexes = BTreeMap::new();
for match_ in matches {
let match_attribute = match_.attribute.attribute();
if SchemaAttr::new(match_attribute) == attribute {
let word_area = match_.word_area;
let char_index = word_area.char_index() as usize;
let char_length = word_area.length() as usize;
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
match byte_indexes.entry(byte_index) {
Entry::Vacant(entry) => { entry.insert(byte_length); },
Entry::Occupied(mut entry) => {
if *entry.get() < byte_length {
entry.insert(byte_length);
}
},
}
}
}
let mut title_areas = Vec::new();
title_areas.push(0);
for (byte_index, length) in byte_indexes {
title_areas.push(byte_index);
title_areas.push(byte_index + length);
}
title_areas.push(text.len());
title_areas.sort_unstable();
title_areas
}
fn main() -> Result<(), Box<Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let (elapsed, result) = elapsed::measure_time(|| Database::open(&opt.database_path));
let database = result?;
println!("database prepared for you in {}", elapsed);
let mut buffer = String::new();
let input = io::stdin();
loop {
print!("Searching for: ");
io::stdout().flush()?;
if input.read_line(&mut buffer)? == 0 { break }
let query = buffer.trim_end_matches('\n');
let view = database.view();
let schema = view.schema();
let (elapsed, documents) = elapsed::measure_time(|| {
let builder = view.query_builder().unwrap();
builder.query(query, 0..opt.number_results)
});
let number_of_documents = documents.len();
for doc in documents {
match view.document_by_id::<Document>(doc.id) {
Ok(document) => {
for name in &opt.displayed_fields {
let attr = match schema.attribute(name) {
Some(attr) => attr,
None => continue,
};
let text = match document.get(name) {
Some(text) => text,
None => continue,
};
print!("{}: ", name);
let areas = create_highlight_areas(&text, &doc.matches, attr);
display_highlights(&text, &areas)?;
println!();
}
},
Err(e) => eprintln!("{}", e),
}
let mut matching_attributes = HashSet::new();
for _match in doc.matches {
let attr = SchemaAttr::new(_match.attribute.attribute());
let name = schema.attribute_name(attr);
matching_attributes.insert(name);
}
let matching_attributes = Vec::from_iter(matching_attributes);
println!("matching in: {:?}", matching_attributes);
println!();
}
eprintln!("===== Found {} results in {} =====", number_of_documents, elapsed);
buffer.clear();
}
Ok(())
}

34
meilidb-core/Cargo.toml Normal file
View File

@ -0,0 +1,34 @@
[package]
name = "meilidb-core"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
byteorder = "1.3.1"
deunicode = "1.0.0"
hashbrown = "0.2.2"
lazy_static = "1.2.0"
log = "0.4.6"
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
rayon = "1.2.0"
sdset = "0.3.2"
serde = { version = "1.0.88", features = ["derive"] }
slice-group-by = "0.2.6"
zerocopy = "0.2.2"
[dependencies.fst]
git = "https://github.com/Kerollmops/fst.git"
branch = "arc-byte-slice"
[dependencies.levenshtein_automata]
git = "https://github.com/Kerollmops/levenshtein-automata.git"
branch = "arc-byte-slice"
features = ["fst_automaton"]
[dev-dependencies]
assert_matches = "1.3"
[features]
i128 = ["byteorder/i128"]
nightly = ["hashbrown/nightly", "slice-group-by/nightly"]

View File

@ -0,0 +1,44 @@
use lazy_static::lazy_static;
use levenshtein_automata::{
LevenshteinAutomatonBuilder as LevBuilder,
DFA,
};
lazy_static! {
static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
}
#[derive(Copy, Clone)]
enum PrefixSetting {
Prefix,
NoPrefix,
}
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA {
use self::PrefixSetting::{Prefix, NoPrefix};
match query.len() {
0 ..= 4 => match setting {
Prefix => LEVDIST0.build_prefix_dfa(query),
NoPrefix => LEVDIST0.build_dfa(query),
},
5 ..= 8 => match setting {
Prefix => LEVDIST1.build_prefix_dfa(query),
NoPrefix => LEVDIST1.build_dfa(query),
},
_ => match setting {
Prefix => LEVDIST2.build_prefix_dfa(query),
NoPrefix => LEVDIST2.build_dfa(query),
},
}
}
pub fn build_prefix_dfa(query: &str) -> DFA {
build_dfa_with_setting(query, PrefixSetting::Prefix)
}
pub fn build_dfa(query: &str) -> DFA {
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
}

View File

@ -0,0 +1,16 @@
use std::cmp::Ordering;
use crate::criterion::Criterion;
use crate::RawDocument;
#[derive(Debug, Clone, Copy)]
pub struct DocumentId;
impl Criterion for DocumentId {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
lhs.id.cmp(&rhs.id)
}
fn name(&self) -> &'static str {
"DocumentId"
}
}

View File

@ -0,0 +1,65 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
let mut count = 0;
let mut index = 0;
for group in query_index.linear_group() {
let len = group.len();
count += is_exact[index..index + len].contains(&true) as usize;
index += len;
}
count
}
#[derive(Debug, Clone, Copy)]
pub struct Exact;
impl Criterion for Exact {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let is_exact = lhs.is_exact();
number_exact_matches(query_index, is_exact)
};
let rhs = {
let query_index = rhs.query_index();
let is_exact = rhs.is_exact();
number_exact_matches(query_index, is_exact)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &'static str {
"Exact"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "souliereres rouge"
#[test]
fn easy_case() {
let query_index0 = &[0];
let is_exact0 = &[true];
let query_index1 = &[0];
let is_exact1 = &[false];
let doc0 = number_exact_matches(query_index0, is_exact0);
let doc1 = number_exact_matches(query_index1, is_exact1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
}

View File

@ -0,0 +1,120 @@
mod sum_of_typos;
mod number_of_words;
mod words_proximity;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod exact;
mod document_id;
use std::cmp::Ordering;
use crate::RawDocument;
pub use self::{
sum_of_typos::SumOfTypos,
number_of_words::NumberOfWords,
words_proximity::WordsProximity,
sum_of_words_attribute::SumOfWordsAttribute,
sum_of_words_position::SumOfWordsPosition,
exact::Exact,
document_id::DocumentId,
};
pub trait Criterion: Send + Sync {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
fn name(&self) -> &'static str;
#[inline]
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
self.evaluate(lhs, rhs) == Ordering::Equal
}
}
impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
(**self).evaluate(lhs, rhs)
}
fn name(&self) -> &'static str {
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
}
impl<T: Criterion + ?Sized> Criterion for Box<T> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
(**self).evaluate(lhs, rhs)
}
fn name(&self) -> &'static str {
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
}
#[derive(Default)]
pub struct CriteriaBuilder<'a> {
inner: Vec<Box<dyn Criterion + 'a>>
}
impl<'a> CriteriaBuilder<'a>
{
pub fn new() -> CriteriaBuilder<'a> {
CriteriaBuilder { inner: Vec::new() }
}
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
CriteriaBuilder { inner: Vec::with_capacity(capacity) }
}
pub fn reserve(&mut self, additional: usize) {
self.inner.reserve(additional)
}
pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
where C: Criterion,
{
self.push(criterion);
self
}
pub fn push<C: 'a>(&mut self, criterion: C)
where C: Criterion,
{
self.inner.push(Box::new(criterion));
}
pub fn build(self) -> Criteria<'a> {
Criteria { inner: self.inner }
}
}
pub struct Criteria<'a> {
inner: Vec<Box<dyn Criterion + 'a>>,
}
impl<'a> Default for Criteria<'a> {
fn default() -> Self {
CriteriaBuilder::with_capacity(7)
.add(SumOfTypos)
.add(NumberOfWords)
.add(WordsProximity)
.add(SumOfWordsAttribute)
.add(SumOfWordsPosition)
.add(Exact)
.add(DocumentId)
.build()
}
}
impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
&self.inner
}
}

View File

@ -0,0 +1,31 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn number_of_query_words(query_index: &[u32]) -> usize {
query_index.linear_group().count()
}
#[derive(Debug, Clone, Copy)]
pub struct NumberOfWords;
impl Criterion for NumberOfWords {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
number_of_query_words(query_index)
};
let rhs = {
let query_index = rhs.query_index();
number_of_query_words(query_index)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &'static str {
"NumberOfWords"
}
}

View File

@ -0,0 +1,116 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
// This function is a wrong logarithmic 10 function.
// It is safe to panic on input number higher than 3,
// the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
match n {
0 => 0.0, // log(1)
1 => 0.30102, // log(2)
2 => 0.47712, // log(3)
3 => 0.60205, // log(4)
_ => panic!("invalid number"),
}
}
#[inline]
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
let mut number_words: usize = 0;
let mut sum_typos = 0.0;
let mut index = 0;
for group in query_index.linear_group() {
sum_typos += custom_log10(distance[index]);
number_words += 1;
index += group.len();
}
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfTypos;
impl Criterion for SumOfTypos {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let distance = lhs.distance();
sum_matches_typos(query_index, distance)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
sum_matches_typos(query_index, distance)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &'static str {
"SumOfTypos"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "Geox CEO"
//
// doc0: "Geox SpA: CEO and Executive"
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
#[test]
fn one_typo_reference() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0, 1];
let distance1 = &[1, 0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchette"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn no_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchztte"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn one_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 1];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
}

View File

@ -0,0 +1,64 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
let mut sum_attributes = 0;
let mut index = 0;
for group in query_index.linear_group() {
sum_attributes += attribute[index] as usize;
index += group.len();
}
sum_attributes
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute;
impl Criterion for SumOfWordsAttribute {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let attribute = lhs.attribute();
sum_matches_attributes(query_index, attribute)
};
let rhs = {
let query_index = rhs.query_index();
let attribute = rhs.attribute();
sum_matches_attributes(query_index, attribute)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &'static str {
"SumOfWordsAttribute"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: { 0. "Soulier bleu", 1. "bla bla bla" }
// doc1: { 0. "Botte rouge", 1. "Soulier en cuir" }
#[test]
fn title_vs_description() {
let query_index0 = &[0];
let attribute0 = &[0];
let query_index1 = &[0];
let attribute1 = &[1];
let doc0 = sum_matches_attributes(query_index0, attribute0);
let doc1 = sum_matches_attributes(query_index1, attribute1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}

View File

@ -0,0 +1,64 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
let mut sum_word_index = 0;
let mut index = 0;
for group in query_index.linear_group() {
sum_word_index += word_index[index] as usize;
index += group.len();
}
sum_word_index
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition;
impl Criterion for SumOfWordsPosition {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let word_index = lhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let word_index = rhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &'static str {
"SumOfWordsPosition"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "Botte rouge et soulier noir"
#[test]
fn easy_case() {
let query_index0 = &[0];
let word_index0 = &[0];
let query_index1 = &[0];
let word_index1 = &[3];
let doc0 = sum_matches_attribute_index(query_index0, word_index0);
let doc1 = sum_matches_attribute_index(query_index1, word_index1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}

View File

@ -0,0 +1,155 @@
use std::cmp::{self, Ordering};
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
const MAX_DISTANCE: u16 = 8;
#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
(a.clone(), b.clone())
}
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE)
} else {
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
}
}
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
if lattr != rattr { return MAX_DISTANCE }
index_proximity(lwi, rwi)
}
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
let mut min_prox = u16::max_value();
for a in lattr.iter().zip(lwi) {
for b in rattr.iter().zip(rwi) {
let a = clone_tuple(a);
let b = clone_tuple(b);
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
}
}
min_prox
}
fn matches_proximity(
query_index: &[u32],
distance: &[u8],
attribute: &[u16],
word_index: &[u16],
) -> u16
{
let mut query_index_groups = query_index.linear_group();
let mut proximity = 0;
let mut index = 0;
let get_attr_wi = |index: usize, group_len: usize| {
// retrieve the first distance group (with the lowest values)
let len = distance[index..index + group_len].linear_group().next().unwrap().len();
let rattr = &attribute[index..index + len];
let rwi = &word_index[index..index + len];
(rattr, rwi)
};
let mut last = query_index_groups.next().map(|group| {
let attr_wi = get_attr_wi(index, group.len());
index += group.len();
attr_wi
});
// iter by windows of size 2
while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
let attr_wi = get_attr_wi(index, rhs.len());
proximity += min_proximity(lhs, attr_wi);
last = Some(attr_wi);
index += rhs.len();
}
proximity
}
#[derive(Debug, Clone, Copy)]
pub struct WordsProximity;
impl Criterion for WordsProximity {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let distance = lhs.distance();
let attribute = lhs.attribute();
let word_index = lhs.word_index();
matches_proximity(query_index, distance, attribute, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
let attribute = rhs.attribute();
let word_index = rhs.word_index();
matches_proximity(query_index, distance, attribute, word_index)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &'static str {
"WordsProximity"
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn three_different_attributes() {
// "soup" "of the" "the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 0 }
// { id: 2, attr: 1, attr_index: 1 }
// { id: 2, attr: 2, attr_index: 0 }
// { id: 3, attr: 3, attr_index: 1 }
let query_index = &[0, 1, 2, 2, 3];
let distance = &[0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 2, 3];
let word_index = &[0, 0, 1, 0, 1];
// soup -> of = 8
// + of -> the = 1
// + the -> day = 8 (not 1)
assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 17);
}
#[test]
fn two_different_attributes() {
// "soup day" "soup of the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 0, attr: 1, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 1 }
// { id: 2, attr: 1, attr_index: 2 }
// { id: 3, attr: 0, attr_index: 1 }
// { id: 3, attr: 1, attr_index: 3 }
let query_index = &[0, 0, 1, 2, 3, 3];
let distance = &[0, 0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 1, 0, 1];
let word_index = &[0, 0, 1, 2, 1, 3];
// soup -> of = 1
// + of -> the = 1
// + the -> day = 1
assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 3);
}
}

View File

@ -1,5 +1,4 @@
use std::hash::Hash;
use hashbrown::HashMap;
pub struct DistinctMap<K> {
@ -12,7 +11,7 @@ impl<K: Hash + Eq> DistinctMap<K> {
pub fn new(limit: usize) -> Self {
DistinctMap {
inner: HashMap::new(),
limit: limit,
limit,
len: 0,
}
}
@ -31,7 +30,7 @@ pub struct BufferedDistinctMap<'a, K> {
impl<'a, K: Hash + Eq> BufferedDistinctMap<'a, K> {
pub fn new(internal: &'a mut DistinctMap<K>) -> BufferedDistinctMap<'a, K> {
BufferedDistinctMap {
internal: internal,
internal,
inner: HashMap::new(),
len: 0,
}

144
meilidb-core/src/lib.rs Normal file
View File

@ -0,0 +1,144 @@
#![feature(checked_duration_since)]
#[cfg(test)]
#[macro_use] extern crate assert_matches;
mod automaton;
mod distinct_map;
mod query_builder;
mod query_enhancer;
mod raw_document;
mod reordered_attrs;
mod store;
pub mod criterion;
use serde::{Serialize, Deserialize};
use zerocopy::{AsBytes, FromBytes};
use self::raw_document::raw_documents_from;
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str};
pub use self::raw_document::RawDocument;
pub use self::store::Store;
/// Represent an internally generated document unique identifier.
///
/// It is used to inform the database the document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(Serialize, Deserialize)]
#[derive(AsBytes, FromBytes)]
#[repr(C)]
pub struct DocumentId(pub u64);
/// This structure represent the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[derive(AsBytes, FromBytes)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
pub word_index: u16,
/// The position in bytes where the word was found
/// along with the length of it.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
pub char_length: u16,
}
/// This structure represent a matching word with informations
/// on the location of the word in the document.
///
/// The order of the field is important because it defines
/// the way these structures are ordered between themselves.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Highlight {
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
/// The position in bytes where the word was found.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
/// The length in bytes of the found word.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_length: u16,
}
#[doc(hidden)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TmpMatch {
pub query_index: u32,
pub distance: u8,
pub attribute: u16,
pub word_index: u16,
pub is_exact: bool,
}
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Document {
pub id: DocumentId,
pub highlights: Vec<Highlight>,
#[cfg(test)]
pub matches: Vec<TmpMatch>,
}
impl Document {
#[cfg(not(test))]
fn from_raw(raw: RawDocument) -> Document {
Document { id: raw.id, highlights: raw.highlights }
}
#[cfg(test)]
fn from_raw(raw: RawDocument) -> Document {
let len = raw.query_index().len();
let mut matches = Vec::with_capacity(len);
let query_index = raw.query_index();
let distance = raw.distance();
let attribute = raw.attribute();
let word_index = raw.word_index();
let is_exact = raw.is_exact();
for i in 0..len {
let match_ = TmpMatch {
query_index: query_index[i],
distance: distance[i],
attribute: attribute[i],
word_index: word_index[i],
is_exact: is_exact[i],
};
matches.push(match_);
}
Document { id: raw.id, matches, highlights: raw.highlights }
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 16);
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,398 @@
use std::ops::Range;
use std::cmp::Ordering::{Less, Greater, Equal};
/// Return `true` if the specified range can accept the given replacements words.
/// Returns `false` if the replacements words are already present in the original query
/// or if there is fewer replacement words than the range to replace.
//
//
// ## Ignored because already present in original
//
// new york city subway
// -------- ^^^^
// / \
// [new york city]
//
//
// ## Ignored because smaller than the original
//
// new york city subway
// -------------
// \ /
// [new york]
//
//
// ## Accepted because bigger than the original
//
// NYC subway
// ---
// / \
// / \
// / \
// / \
// / \
// [new york city]
//
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
where S: AsRef<str>,
T: AsRef<str>,
{
if words.len() <= range.len() {
// there is fewer or equal replacement words
// than there is already in the replaced range
return false
}
// retrieve the part to rewrite but with the length
// of the replacement part
let original = query.iter().skip(range.start).take(words.len());
// check if the original query doesn't already contain
// the replacement words
!original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
}
type Origin = usize;
type RealLength = usize;
struct FakeIntervalTree {
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl FakeIntervalTree {
fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
FakeIntervalTree { intervals }
}
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
let element = self.intervals.binary_search_by(|(r, _)| {
if point >= r.start {
if point < r.end { Equal } else { Less }
} else { Greater }
});
let n = match element { Ok(n) => n, Err(n) => n };
match self.intervals.get(n) {
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
_otherwise => None,
}
}
}
pub struct QueryEnhancerBuilder<'a, S> {
query: &'a [S],
origins: Vec<usize>,
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
// we initialize origins query indices based on their positions
let origins: Vec<_> = (0..query.len() + 1).collect();
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
QueryEnhancerBuilder { query, origins, real_to_origin }
}
/// Update the final real to origin query indices mapping.
///
/// `range` is the original words range that this `replacement` words replace
/// and `real` is the first real query index of these replacement words.
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
where T: AsRef<str>,
{
// check if the range of original words
// can be rewritten with the replacement words
if rewrite_range_with(self.query, range.clone(), replacement) {
// this range can be replaced so we need to
// modify the origins accordingly
let offset = replacement.len() - range.len();
let previous_padding = self.origins[range.end - 1];
let current_offset = (self.origins[range.end] - 1) - previous_padding;
let diff = offset.saturating_sub(current_offset);
self.origins[range.end] += diff;
for r in &mut self.origins[range.end + 1..] {
*r += diff;
}
}
// we need to store the real number and origins relations
// this way it will be possible to know by how many
// we need to pad real query indices
let real_range = real..real + replacement.len().max(range.len());
let real_length = replacement.len();
self.real_to_origin.push((real_range, (range.start, real_length)));
}
pub fn build(self) -> QueryEnhancer {
QueryEnhancer {
origins: self.origins,
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
}
}
}
pub struct QueryEnhancer {
origins: Vec<usize>,
real_to_origin: FakeIntervalTree,
}
impl QueryEnhancer {
/// Returns the query indices to use to replace this real query index.
pub fn replacement(&self, real: u32) -> Range<u32> {
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) =
self.real_to_origin
.query(real)
.expect("real has never been declared");
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 { new_origin = origin + i; break }
}
let n = real - range.start;
let start = self.origins[origin];
let end = self.origins[new_origin + 1];
let remaining = (end - start) - n;
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = self.origins[origin];
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn original_unmodified() {
let query = ["new", "york", "city", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..2); // york
assert_eq!(enhancer.replacement(2), 2..3); // city
assert_eq!(enhancer.replacement(3), 3..4); // subway
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
}
#[test]
fn simple_growing() {
let query = ["new", "york", "subway"];
// 0 1 2
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 3, &["new", "york", "city"]);
// ^ 3 4 5
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..3); // york
assert_eq!(enhancer.replacement(2), 3..4); // subway
assert_eq!(enhancer.replacement(3), 0..1); // new
assert_eq!(enhancer.replacement(4), 1..2); // york
assert_eq!(enhancer.replacement(5), 2..3); // city
}
#[test]
fn same_place_growings() {
let query = ["NY", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NY = new york
builder.declare(0..1, 2, &["new", "york"]);
// ^ 2 3
// NY = new york city
builder.declare(0..1, 4, &["new", "york", "city"]);
// ^ 4 5 6
// NY = NYC
builder.declare(0..1, 7, &["NYC"]);
// ^ 7
// NY = new york city
builder.declare(0..1, 8, &["new", "york", "city"]);
// ^ 8 9 10
// subway = underground train
builder.declare(1..2, 11, &["underground", "train"]);
// ^ 11 12
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NY
assert_eq!(enhancer.replacement(1), 3..5); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..3); // york
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
assert_eq!(enhancer.replacement(7), 0..3); // NYC
assert_eq!(enhancer.replacement(8), 0..1); // new
assert_eq!(enhancer.replacement(9), 1..2); // york
assert_eq!(enhancer.replacement(10), 2..3); // city
assert_eq!(enhancer.replacement(11), 3..4); // underground
assert_eq!(enhancer.replacement(12), 4..5); // train
}
#[test]
fn bigger_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(0..1, 2, &["new", "york", "city"]);
// ^ 2 3 4
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NYC
assert_eq!(enhancer.replacement(1), 3..4); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..2); // york
assert_eq!(enhancer.replacement(4), 2..3); // city
}
#[test]
fn middle_query_growing() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..6); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
}
#[test]
fn end_query_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(1..2, 2, &["underground", "train"]);
// ^ 2 3
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // NYC
assert_eq!(enhancer.replacement(1), 1..3); // subway
assert_eq!(enhancer.replacement(2), 1..2); // underground
assert_eq!(enhancer.replacement(3), 2..3); // train
}
#[test]
fn multiple_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
}
#[test]
fn multiple_probable_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
// great awesome = good
builder.declare(0..2, 9, &["good"]);
// ^ 9
// awesome NYC = NY
builder.declare(1..3, 10, &["NY"]);
// ^^ 10
// NYC subway = metro
builder.declare(2..4, 11, &["metro"]);
// ^^ 11
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
assert_eq!(enhancer.replacement(9), 0..2); // good
assert_eq!(enhancer.replacement(10), 1..5); // NY
assert_eq!(enhancer.replacement(11), 2..5); // metro
}
}

View File

@ -0,0 +1,141 @@
use std::sync::Arc;
use std::fmt;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::{TmpMatch, DocumentId, Highlight};
#[derive(Clone)]
pub struct RawDocument {
pub id: DocumentId,
pub matches: SharedMatches,
pub highlights: Vec<Highlight>,
}
impl RawDocument {
fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
RawDocument { id, matches, highlights }
}
pub fn query_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
}
pub fn distance(&self) -> &[u8] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
}
pub fn attribute(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
}
pub fn word_index(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
}
pub fn is_exact(&self) -> &[bool] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
}
}
impl fmt::Debug for RawDocument {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("RawDocument {\r\n")?;
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "query_index", self.query_index()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?;
f.write_str("}")?;
Ok(())
}
}
pub fn raw_documents_from(
matches: SetBuf<(DocumentId, TmpMatch)>,
highlights: SetBuf<(DocumentId, Highlight)>,
) -> Vec<RawDocument>
{
let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
let mut matches2 = Matches::with_capacity(matches.len());
let matches = matches.linear_group_by_key(|(id, _)| *id);
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
for (mgroup, hgroup) in matches.zip(highlights) {
debug_assert_eq!(mgroup[0].0, hgroup[0].0);
let document_id = mgroup[0].0;
let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
let end = start + mgroup.len();
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
docs_ranges.push((document_id, Range { start, end }, highlights));
matches2.extend_from_slice(mgroup);
}
let matches = Arc::new(matches2);
docs_ranges.into_iter().map(|(id, range, highlights)| {
let matches = SharedMatches { range, matches: matches.clone() };
RawDocument::new(id, matches, highlights)
}).collect()
}
#[derive(Debug, Copy, Clone)]
struct Range {
start: usize,
end: usize,
}
#[derive(Clone)]
pub struct SharedMatches {
range: Range,
matches: Arc<Matches>,
}
#[derive(Clone)]
struct Matches {
query_index: Vec<u32>,
distance: Vec<u8>,
attribute: Vec<u16>,
word_index: Vec<u16>,
is_exact: Vec<bool>,
}
impl Matches {
fn with_capacity(cap: usize) -> Matches {
Matches {
query_index: Vec::with_capacity(cap),
distance: Vec::with_capacity(cap),
attribute: Vec::with_capacity(cap),
word_index: Vec::with_capacity(cap),
is_exact: Vec::with_capacity(cap),
}
}
fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
for (_, match_) in matches {
self.query_index.push(match_.query_index);
self.distance.push(match_.distance);
self.attribute.push(match_.attribute);
self.word_index.push(match_.word_index);
self.is_exact.push(match_.is_exact);
}
}
}

View File

@ -0,0 +1,24 @@
#[derive(Default, Clone)]
pub struct ReorderedAttrs {
count: usize,
reorders: Vec<Option<u16>>,
}
impl ReorderedAttrs {
pub fn new() -> ReorderedAttrs {
ReorderedAttrs { count: 0, reorders: Vec::new() }
}
pub fn insert_attribute(&mut self, attribute: u16) {
self.reorders.resize(attribute as usize + 1, None);
self.reorders[attribute as usize] = Some(self.count as u16);
self.count += 1;
}
pub fn get(&self, attribute: u16) -> Option<u16> {
match self.reorders.get(attribute as usize) {
Some(Some(attribute)) => Some(*attribute),
_ => None,
}
}
}

34
meilidb-core/src/store.rs Normal file
View File

@ -0,0 +1,34 @@
use std::error::Error;
use fst::Set;
use sdset::SetBuf;
use crate::DocIndex;
pub trait Store {
type Error: Error;
fn words(&self) -> Result<&Set, Self::Error>;
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error>;
fn synonyms(&self) -> Result<&Set, Self::Error>;
fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error>;
}
impl<T> Store for &'_ T where T: Store {
type Error = T::Error;
fn words(&self) -> Result<&Set, Self::Error> {
(*self).words()
}
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
(*self).word_indexes(word)
}
fn synonyms(&self) -> Result<&Set, Self::Error> {
(*self).synonyms()
}
fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error> {
(*self).alternatives_to(word)
}
}

39
meilidb-data/Cargo.toml Normal file
View File

@ -0,0 +1,39 @@
[package]
name = "meilidb-data"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
arc-swap = "0.4.2"
bincode = "1.1.4"
crossbeam-channel = "0.3.9"
deunicode = "1.0.0"
hashbrown = { version = "0.6.0", features = ["serde"] }
log = "0.4.6"
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
ordered-float = { version = "1.0.2", features = ["serde"] }
rocksdb = "0.12.3"
sdset = "0.3.2"
serde = { version = "1.0.99", features = ["derive"] }
serde_json = "1.0.40"
siphasher = "0.3.0"
zerocopy = "0.2.8"
[dependencies.rmp-serde]
git = "https://github.com/3Hren/msgpack-rust.git"
rev = "40b3d48"
[dependencies.rmpv]
git = "https://github.com/3Hren/msgpack-rust.git"
rev = "40b3d48"
features = ["with-serde"]
[dependencies.fst]
git = "https://github.com/Kerollmops/fst.git"
branch = "arc-byte-slice"
[dev-dependencies]
tempfile = "3.1.0"

113
meilidb-data/src/cf_tree.rs Normal file
View File

@ -0,0 +1,113 @@
use std::sync::Arc;
use crossbeam_channel::{unbounded, Sender, Receiver};
use rocksdb::{DBVector, IteratorMode, Direction};
use crate::RocksDbResult;
#[derive(Clone)]
pub struct CfTree {
index: Arc<CfTreeInner>,
sender: Option<Sender<()>>,
}
struct CfTreeInner {
db: Arc<rocksdb::DB>,
name: String,
}
impl CfTree {
pub fn create(db: Arc<rocksdb::DB>, name: String) -> RocksDbResult<CfTree> {
let mut options = rocksdb::Options::default();
options.create_missing_column_families(true);
let _cf = db.create_cf(&name, &options)?;
let index = Arc::new(CfTreeInner { db, name });
Ok(CfTree { index, sender: None })
}
pub fn create_with_subcription(
db: Arc<rocksdb::DB>,
name: String,
) -> RocksDbResult<(CfTree, Receiver<()>)>
{
let mut options = rocksdb::Options::default();
options.create_missing_column_families(true);
let _cf = db.create_cf(&name, &options)?;
let index = Arc::new(CfTreeInner { db, name });
let (sender, receiver) = unbounded();
Ok((CfTree { index, sender: Some(sender) }, receiver))
}
pub fn insert<K, V>(&self, key: K, value: V) -> RocksDbResult<()>
where K: AsRef<[u8]>,
V: AsRef<[u8]>,
{
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
let result = self.index.db.put_cf(cf, key, value);
if let Some(sender) = &self.sender {
let _err = sender.send(());
}
result
}
pub fn get<K>(&self, key: K) -> RocksDbResult<Option<DBVector>>
where K: AsRef<[u8]>,
{
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
self.index.db.get_cf(cf, key)
}
pub fn remove<K>(&self, key: K) -> RocksDbResult<()>
where K: AsRef<[u8]>
{
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
self.index.db.delete_cf(cf, key)
}
/// Start and end key range is inclusive on both bounds.
pub fn range<KS, KE>(&self, start: KS, end: KE) -> RocksDbResult<CfIter>
where KS: AsRef<[u8]>,
KE: AsRef<[u8]>,
{
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
let mut iter = self.index.db.iterator_cf(cf, IteratorMode::Start)?;
iter.set_mode(IteratorMode::From(start.as_ref(), Direction::Forward));
let end_bound = Box::from(end.as_ref());
Ok(CfIter { iter, end_bound: Some(end_bound) })
}
pub fn iter(&self) -> RocksDbResult<CfIter> {
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
let iter = self.index.db.iterator_cf(cf, IteratorMode::Start)?;
Ok(CfIter { iter, end_bound: None })
}
pub fn last_key(&self) -> RocksDbResult<Option<Box<[u8]>>> {
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
let mut iter = self.index.db.iterator_cf(cf, IteratorMode::End)?;
Ok(iter.next().map(|(key, _)| key))
}
}
pub struct CfIter<'a> {
iter: rocksdb::DBIterator<'a>,
end_bound: Option<Box<[u8]>>,
}
impl Iterator for CfIter<'_> {
type Item = (Box<[u8]>, Box<[u8]>);
fn next(&mut self) -> Option<Self::Item> {
match (self.iter.next(), &self.end_bound) {
(Some((ref key, _)), Some(end_bound)) if key > end_bound => None,
(Some(entry), _) => Some(entry),
(None, _) => None,
}
}
}

View File

@ -0,0 +1,73 @@
use std::{error, fmt};
use crate::serde::SerializerError;
#[derive(Debug)]
pub enum Error {
SchemaDiffer,
SchemaMissing,
WordIndexMissing,
MissingDocumentId,
RocksDbError(rocksdb::Error),
FstError(fst::Error),
RmpDecodeError(rmp_serde::decode::Error),
RmpEncodeError(rmp_serde::encode::Error),
BincodeError(bincode::Error),
SerializerError(SerializerError),
}
impl From<rocksdb::Error> for Error {
fn from(error: rocksdb::Error) -> Error {
Error::RocksDbError(error)
}
}
impl From<fst::Error> for Error {
fn from(error: fst::Error) -> Error {
Error::FstError(error)
}
}
impl From<rmp_serde::decode::Error> for Error {
fn from(error: rmp_serde::decode::Error) -> Error {
Error::RmpDecodeError(error)
}
}
impl From<rmp_serde::encode::Error> for Error {
fn from(error: rmp_serde::encode::Error) -> Error {
Error::RmpEncodeError(error)
}
}
impl From<bincode::Error> for Error {
fn from(error: bincode::Error) -> Error {
Error::BincodeError(error)
}
}
impl From<SerializerError> for Error {
fn from(error: SerializerError) -> Error {
Error::SerializerError(error)
}
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use self::Error::*;
match self {
SchemaDiffer => write!(f, "schemas differ"),
SchemaMissing => write!(f, "this index does not have a schema"),
WordIndexMissing => write!(f, "this index does not have a word index"),
MissingDocumentId => write!(f, "document id is missing"),
RocksDbError(e) => write!(f, "RocksDB error; {}", e),
FstError(e) => write!(f, "fst error; {}", e),
RmpDecodeError(e) => write!(f, "rmp decode error; {}", e),
RmpEncodeError(e) => write!(f, "rmp encode error; {}", e),
BincodeError(e) => write!(f, "bincode error; {}", e),
SerializerError(e) => write!(f, "serializer error; {}", e),
}
}
}
impl error::Error for Error { }

View File

@ -0,0 +1,12 @@
use std::ops::Deref;
#[derive(Clone)]
pub struct CustomSettingsIndex(pub(crate) crate::CfTree);
impl Deref for CustomSettingsIndex {
type Target = crate::CfTree;
fn deref(&self) -> &Self::Target {
&self.0
}
}

View File

@ -0,0 +1,33 @@
use std::sync::Arc;
use meilidb_core::DocumentId;
use crate::database::Error;
#[derive(Clone)]
pub struct DocsWordsIndex(pub crate::CfTree);
impl DocsWordsIndex {
pub fn doc_words(&self, id: DocumentId) -> Result<Option<fst::Set>, Error> {
let key = id.0.to_be_bytes();
match self.0.get(key)? {
Some(bytes) => {
let len = bytes.len();
let value = Arc::from(bytes.as_ref());
let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?;
Ok(Some(fst::Set::from(fst)))
},
None => Ok(None)
}
}
pub fn set_doc_words(&self, id: DocumentId, words: &fst::Set) -> Result<(), Error> {
let key = id.0.to_be_bytes();
self.0.insert(key, words.as_fst().as_bytes())?;
Ok(())
}
pub fn del_doc_words(&self, id: DocumentId) -> Result<(), Error> {
let key = id.0.to_be_bytes();
self.0.remove(key)?;
Ok(())
}
}
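
A small in-crate round-trip sketch (not in the changeset) for the store above; the key is the big-endian `u64` of the document id and the value is the raw bytes of the words `fst::Set`:

    use meilidb_core::DocumentId;

    fn roundtrip(index: &DocsWordsIndex) -> Result<(), Error> {
        let id = DocumentId(42);
        // from_iter requires lexicographically sorted input
        let words = fst::Set::from_iter(vec!["chat", "chien"]).unwrap();
        index.set_doc_words(id, &words)?;
        let read = index.doc_words(id)?.expect("the words were just written");
        assert_eq!(read.len(), 2);
        Ok(())
    }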

View File

@ -0,0 +1,90 @@
use std::convert::TryInto;
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use rocksdb::DBVector;
use crate::document_attr_key::DocumentAttrKey;
use crate::RocksDbResult;
fn document_fields_range(id: DocumentId) -> ([u8; 10], [u8; 10]) {
let start = DocumentAttrKey::new(id, SchemaAttr::min()).to_be_bytes();
let end = DocumentAttrKey::new(id, SchemaAttr::max()).to_be_bytes();
(start, end)
}
#[derive(Clone)]
pub struct DocumentsIndex(pub(crate) crate::CfTree);
impl DocumentsIndex {
pub fn document_field(&self, id: DocumentId, attr: SchemaAttr) -> RocksDbResult<Option<DBVector>> {
let key = DocumentAttrKey::new(id, attr).to_be_bytes();
self.0.get(key)
}
pub fn set_document_field(&self, id: DocumentId, attr: SchemaAttr, value: Vec<u8>) -> RocksDbResult<()> {
let key = DocumentAttrKey::new(id, attr).to_be_bytes();
self.0.insert(key, value)?;
Ok(())
}
pub fn del_document_field(&self, id: DocumentId, attr: SchemaAttr) -> RocksDbResult<()> {
let key = DocumentAttrKey::new(id, attr).to_be_bytes();
self.0.remove(key)?;
Ok(())
}
pub fn del_all_document_fields(&self, id: DocumentId) -> RocksDbResult<usize> {
let (start, end) = document_fields_range(id);
let mut count = 0;
for (key, _) in self.0.range(start, end)? {
self.0.remove(key)?;
count += 1;
}
Ok(count)
}
pub fn document_fields(&self, id: DocumentId) -> RocksDbResult<DocumentFieldsIter> {
let (start, end) = document_fields_range(id);
let iter = self.0.range(start, end)?;
Ok(DocumentFieldsIter(iter))
}
pub fn len(&self) -> RocksDbResult<u64> {
let mut last_document_id = None;
let mut count = 0;
for (key, _) in self.0.iter()? {
let array = key.as_ref().try_into().unwrap();
let document_id = DocumentAttrKey::from_be_bytes(array).document_id;
if Some(document_id) != last_document_id {
last_document_id = Some(document_id);
count += 1;
}
}
Ok(count)
}
}
pub struct DocumentFieldsIter<'a>(crate::CfIter<'a>);
impl Iterator for DocumentFieldsIter<'_> {
type Item = (SchemaAttr, Box<[u8]>);
fn next(&mut self) -> Option<Self::Item> {
match self.0.next() {
Some((key, value)) => {
let array = key.as_ref().try_into().unwrap();
let key = DocumentAttrKey::from_be_bytes(array);
Some((key.attribute, value))
},
None => None,
}
}
}

View File

@ -0,0 +1,102 @@
use std::sync::Arc;
use std::convert::TryInto;
use meilidb_schema::Schema;
use crate::ranked_map::RankedMap;
use crate::database::Error;
const SCHEMA_KEY: &str = "schema";
const WORDS_KEY: &str = "words";
const SYNONYMS_KEY: &str = "synonyms";
const RANKED_MAP_KEY: &str = "ranked-map";
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
#[derive(Clone)]
pub struct MainIndex(pub(crate) crate::CfTree);
impl MainIndex {
pub fn schema(&self) -> Result<Option<Schema>, Error> {
match self.0.get(SCHEMA_KEY)? {
Some(bytes) => {
let schema = Schema::read_from_bin(bytes.as_ref())?;
Ok(Some(schema))
},
None => Ok(None),
}
}
pub fn set_schema(&self, schema: &Schema) -> Result<(), Error> {
let mut bytes = Vec::new();
schema.write_to_bin(&mut bytes)?;
self.0.insert(SCHEMA_KEY, bytes)?;
Ok(())
}
pub fn words_set(&self) -> Result<Option<fst::Set>, Error> {
match self.0.get(WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let value = Arc::from(bytes.as_ref());
let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?;
Ok(Some(fst::Set::from(fst)))
},
None => Ok(None),
}
}
pub fn set_words_set(&self, value: &fst::Set) -> Result<(), Error> {
self.0.insert(WORDS_KEY, value.as_fst().as_bytes()).map(drop).map_err(Into::into)
}
pub fn synonyms_set(&self) -> Result<Option<fst::Set>, Error> {
match self.0.get(SYNONYMS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let value = Arc::from(bytes.as_ref());
let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?;
Ok(Some(fst::Set::from(fst)))
},
None => Ok(None),
}
}
pub fn set_synonyms_set(&self, value: &fst::Set) -> Result<(), Error> {
self.0.insert(SYNONYMS_KEY, value.as_fst().as_bytes()).map(drop).map_err(Into::into)
}
pub fn ranked_map(&self) -> Result<Option<RankedMap>, Error> {
match self.0.get(RANKED_MAP_KEY)? {
Some(bytes) => {
let ranked_map = RankedMap::read_from_bin(bytes.as_ref())?;
Ok(Some(ranked_map))
},
None => Ok(None),
}
}
pub fn set_ranked_map(&self, value: &RankedMap) -> Result<(), Error> {
let mut bytes = Vec::new();
value.write_to_bin(&mut bytes)?;
self.0.insert(RANKED_MAP_KEY, bytes)?;
Ok(())
}
pub fn number_of_documents(&self) -> Result<u64, Error> {
match self.0.get(NUMBER_OF_DOCUMENTS_KEY)? {
Some(bytes) => {
let array = (*bytes).try_into().unwrap();
Ok(u64::from_be_bytes(array))
},
None => Ok(0),
}
}
pub fn set_number_of_documents<F>(&self, f: F) -> Result<u64, Error>
where F: FnOnce(u64) -> u64,
{
let new = self.number_of_documents().map(f)?;
self.0.insert(NUMBER_OF_DOCUMENTS_KEY, new.to_be_bytes())?;
Ok(new)
}
}
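
A small in-crate sketch (not part of the changeset) of the read-modify-write counter above, mirroring how `apply_documents_addition` and `apply_documents_deletion` use it further down:

    fn record_insertions(main: &MainIndex, inserted: u64) -> Result<(), Error> {
        let before = main.number_of_documents()?; // 0 when the key has never been written
        let after = main.set_number_of_documents(|old| old + inserted)?;
        assert_eq!(after, before + inserted);
        Ok(())
    }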

View File

@ -0,0 +1,487 @@
use std::collections::{HashSet, BTreeMap};
use std::convert::TryInto;
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use std::thread;
use std::time::{Duration, Instant};
use arc_swap::{ArcSwap, ArcSwapOption, Guard};
use crossbeam_channel::Receiver;
use meilidb_core::criterion::Criteria;
use meilidb_core::{DocIndex, Store, DocumentId, QueryBuilder};
use meilidb_schema::Schema;
use sdset::SetBuf;
use serde::{de, Serialize, Deserialize};
use crate::CfTree;
use crate::ranked_map::RankedMap;
use crate::serde::{Deserializer, DeserializerError};
pub use self::custom_settings_index::CustomSettingsIndex;
use self::docs_words_index::DocsWordsIndex;
use self::documents_index::DocumentsIndex;
use self::main_index::MainIndex;
use self::synonyms_index::SynonymsIndex;
use self::words_index::WordsIndex;
use crate::RocksDbResult;
use crate::database::{
Error,
DocumentsAddition, DocumentsDeletion,
SynonymsAddition, SynonymsDeletion,
apply_documents_addition, apply_documents_deletion,
apply_synonyms_addition, apply_synonyms_deletion,
};
mod custom_settings_index;
mod docs_words_index;
mod documents_index;
mod main_index;
mod synonyms_index;
mod words_index;
#[derive(Deserialize)]
enum UpdateOwned {
DocumentsAddition(Vec<rmpv::Value>),
DocumentsDeletion(Vec<DocumentId>),
SynonymsAddition(BTreeMap<String, Vec<String>>),
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
}
#[derive(Serialize)]
enum Update {
DocumentsAddition(Vec<rmpv::Value>),
DocumentsDeletion(Vec<DocumentId>),
SynonymsAddition(BTreeMap<String, Vec<String>>),
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
}
#[derive(Clone, Serialize, Deserialize)]
pub enum UpdateType {
DocumentsAddition { number: usize },
DocumentsDeletion { number: usize },
SynonymsAddition { number: usize },
SynonymsDeletion { number: usize },
}
#[derive(Clone, Serialize, Deserialize)]
pub struct DetailedDuration {
main: Duration,
}
#[derive(Clone, Serialize, Deserialize)]
pub struct UpdateStatus {
pub update_id: u64,
pub update_type: UpdateType,
pub result: Result<(), String>,
pub detailed_duration: DetailedDuration,
}
fn spawn_update_system(index: Index, subscription: Receiver<()>) -> thread::JoinHandle<()> {
thread::spawn(move || {
let mut subscription = subscription.into_iter();
loop {
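// process pending updates in ascending id order; a fresh iterator is created on
// every pass, so updates queued while the previous one was applied are also seen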
while let Some((key, _)) = index.updates_index.iter().unwrap().next() {
let update_id = key.as_ref().try_into().map(u64::from_be_bytes).unwrap();
let updates = &index.updates_index;
let results = &index.updates_results_index;
let update = updates.get(&key).unwrap().unwrap();
let (update_type, result, duration) = match rmp_serde::from_read_ref(&update).unwrap() {
UpdateOwned::DocumentsAddition(documents) => {
let update_type = UpdateType::DocumentsAddition { number: documents.len() };
let ranked_map = index.cache.load().ranked_map.clone();
let start = Instant::now();
let result = apply_documents_addition(&index, ranked_map, documents);
(update_type, result, start.elapsed())
},
UpdateOwned::DocumentsDeletion(documents) => {
let update_type = UpdateType::DocumentsDeletion { number: documents.len() };
let ranked_map = index.cache.load().ranked_map.clone();
let start = Instant::now();
let result = apply_documents_deletion(&index, ranked_map, documents);
(update_type, result, start.elapsed())
},
UpdateOwned::SynonymsAddition(synonyms) => {
let update_type = UpdateType::SynonymsAddition { number: synonyms.len() };
let start = Instant::now();
let result = apply_synonyms_addition(&index, synonyms);
(update_type, result, start.elapsed())
},
UpdateOwned::SynonymsDeletion(synonyms) => {
let update_type = UpdateType::SynonymsDeletion { number: synonyms.len() };
let start = Instant::now();
let result = apply_synonyms_deletion(&index, synonyms);
(update_type, result, start.elapsed())
},
};
let detailed_duration = DetailedDuration { main: duration };
let status = UpdateStatus {
update_id,
update_type,
result: result.map_err(|e| e.to_string()),
detailed_duration,
};
if let Some(callback) = &*index.update_callback.load() {
(callback)(status.clone());
}
let value = bincode::serialize(&status).unwrap();
results.insert(&key, value).unwrap();
updates.remove(&key).unwrap();
}
// this subscription is just used to block
// the loop until a new update is inserted
subscription.next();
}
})
}
fn last_update_id(
update_index: &crate::CfTree,
update_results_index: &crate::CfTree,
) -> RocksDbResult<u64>
{
let uikey = match update_index.last_key()? {
Some(key) => Some(key.as_ref().try_into().map(u64::from_be_bytes).unwrap()),
None => None,
};
let urikey = match update_results_index.last_key()? {
Some(key) => Some(key.as_ref().try_into().map(u64::from_be_bytes).unwrap()),
None => None,
};
Ok(uikey.max(urikey).unwrap_or(0))
}
#[derive(Copy, Clone)]
pub struct IndexStats {
pub number_of_words: usize,
pub number_of_documents: u64,
pub number_attrs_in_ranked_map: usize,
}
#[derive(Clone)]
pub struct Index {
pub(crate) cache: Arc<ArcSwap<Cache>>,
// TODO this will be a snapshot in the future
main_index: MainIndex,
synonyms_index: SynonymsIndex,
words_index: WordsIndex,
docs_words_index: DocsWordsIndex,
documents_index: DocumentsIndex,
custom_settings_index: CustomSettingsIndex,
// used by the update system
updates_id: Arc<AtomicU64>,
updates_index: crate::CfTree,
updates_results_index: crate::CfTree,
update_callback: Arc<ArcSwapOption<Box<dyn Fn(UpdateStatus) + Send + Sync + 'static>>>,
}
pub(crate) struct Cache {
pub words: Arc<fst::Set>,
pub synonyms: Arc<fst::Set>,
pub schema: Schema,
pub ranked_map: RankedMap,
pub number_of_documents: u64,
}
impl Index {
pub fn new(db: Arc<rocksdb::DB>, name: &str) -> Result<Index, Error> {
Index::new_raw(db, name, None)
}
pub fn with_schema(db: Arc<rocksdb::DB>, name: &str, schema: Schema) -> Result<Index, Error> {
Index::new_raw(db, name, Some(schema))
}
fn new_raw(db: Arc<rocksdb::DB>, name: &str, schema: Option<Schema>) -> Result<Index, Error> {
let main_index = CfTree::create(db.clone(), name.to_string()).map(MainIndex)?;
let synonyms_index = CfTree::create(db.clone(), format!("{}-synonyms", name)).map(SynonymsIndex)?;
let words_index = CfTree::create(db.clone(), format!("{}-words", name)).map(WordsIndex)?;
let docs_words_index = CfTree::create(db.clone(), format!("{}-docs-words", name)).map(DocsWordsIndex)?;
let documents_index = CfTree::create(db.clone(), format!("{}-documents", name)).map(DocumentsIndex)?;
let custom_settings_index = CfTree::create(db.clone(), format!("{}-custom", name)).map(CustomSettingsIndex)?;
let (updates_index, subscription) = CfTree::create_with_subcription(db.clone(), format!("{}-updates", name))?;
let updates_results_index = CfTree::create(db.clone(), format!("{}-updates-results", name))?;
let words = match main_index.words_set()? {
Some(words) => Arc::new(words),
None => Arc::new(fst::Set::default()),
};
let synonyms = match main_index.synonyms_set()? {
Some(synonyms) => Arc::new(synonyms),
None => Arc::new(fst::Set::default()),
};
let schema = match (schema, main_index.schema()?) {
(Some(ref expected), Some(ref current)) if current != expected => {
return Err(Error::SchemaDiffer)
},
(Some(expected), Some(_)) => expected,
(Some(expected), None) => {
main_index.set_schema(&expected)?;
expected
},
(None, Some(current)) => current,
(None, None) => return Err(Error::SchemaMissing),
};
let ranked_map = match main_index.ranked_map()? {
Some(map) => map,
None => RankedMap::default(),
};
let number_of_documents = documents_index.len()?;
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
let cache = Arc::new(ArcSwap::from_pointee(cache));
let last_update_id = last_update_id(&updates_index, &updates_results_index)?;
let updates_id = Arc::new(AtomicU64::new(last_update_id + 1));
let index = Index {
cache,
main_index,
synonyms_index,
words_index,
docs_words_index,
documents_index,
custom_settings_index,
updates_id,
updates_index,
updates_results_index,
update_callback: Arc::new(ArcSwapOption::empty()),
};
let _handle = spawn_update_system(index.clone(), subscription);
Ok(index)
}
pub fn set_update_callback<F>(&self, callback: F)
where F: Fn(UpdateStatus) + Send + Sync + 'static
{
self.update_callback.store(Some(Arc::new(Box::new(callback))));
}
pub fn unset_update_callback(&self) {
self.update_callback.store(None);
}
pub fn stats(&self) -> RocksDbResult<IndexStats> {
let cache = self.cache.load();
Ok(IndexStats {
number_of_words: cache.words.len(),
number_of_documents: cache.number_of_documents,
number_attrs_in_ranked_map: cache.ranked_map.len(),
})
}
pub fn query_builder(&self) -> QueryBuilder<RefIndex> {
let ref_index = self.as_ref();
QueryBuilder::new(ref_index)
}
pub fn query_builder_with_criteria<'c>(
&self,
criteria: Criteria<'c>,
) -> QueryBuilder<'c, RefIndex>
{
let ref_index = self.as_ref();
QueryBuilder::with_criteria(ref_index, criteria)
}
pub fn as_ref(&self) -> RefIndex {
RefIndex {
cache: self.cache.load(),
main_index: &self.main_index,
synonyms_index: &self.synonyms_index,
words_index: &self.words_index,
docs_words_index: &self.docs_words_index,
documents_index: &self.documents_index,
custom_settings_index: &self.custom_settings_index,
}
}
pub fn schema(&self) -> Schema {
self.cache.load().schema.clone()
}
pub fn custom_settings(&self) -> CustomSettingsIndex {
self.custom_settings_index.clone()
}
pub fn number_of_documents(&self) -> u64 {
self.cache.load().number_of_documents
}
pub fn documents_addition<D>(&self) -> DocumentsAddition<D> {
DocumentsAddition::new(self)
}
pub fn documents_deletion(&self) -> DocumentsDeletion {
DocumentsDeletion::new(self)
}
pub fn synonyms_addition(&self) -> SynonymsAddition {
SynonymsAddition::new(self)
}
pub fn synonyms_deletion(&self) -> SynonymsDeletion {
SynonymsDeletion::new(self)
}
pub fn update_status(
&self,
update_id: u64,
) -> Result<Option<UpdateStatus>, Error>
{
let update_id = update_id.to_be_bytes();
match self.updates_results_index.get(update_id)? {
Some(value) => {
let value = bincode::deserialize(&value)?;
Ok(Some(value))
},
None => Ok(None),
}
}
pub fn update_status_blocking(
&self,
update_id: u64,
) -> Result<UpdateStatus, Error>
{
// if we find the update result return it now
if let Some(result) = self.update_status(update_id)? {
return Ok(result)
}
loop {
if self.updates_results_index.get(&update_id.to_be_bytes())?.is_some() { break }
std::thread::sleep(Duration::from_millis(300));
}
// the polling loop above has exited, which means the update result
// has been inserted in the tree; retrieve it
Ok(self.update_status(update_id)?.unwrap())
}
pub fn document<T>(
&self,
fields: Option<&HashSet<&str>>,
id: DocumentId,
) -> Result<Option<T>, DeserializerError>
where T: de::DeserializeOwned,
{
let schema = self.schema();
let fields = match fields {
Some(fields) => fields.into_iter().map(|name| schema.attribute(name)).collect(),
None => None,
};
let mut deserializer = Deserializer {
document_id: id,
index: &self,
fields: fields.as_ref(),
};
// TODO: currently we return an error if all document fields are missing,
// returning None would have been better
T::deserialize(&mut deserializer).map(Some)
}
}
impl Index {
pub(crate) fn push_documents_addition<D>(&self, addition: Vec<D>) -> Result<u64, Error>
where D: serde::Serialize
{
let mut values = Vec::with_capacity(addition.len());
for add in addition {
let vec = rmp_serde::to_vec_named(&add)?;
let add = rmp_serde::from_read(&vec[..])?;
values.push(add);
}
let addition = Update::DocumentsAddition(values);
let update = rmp_serde::to_vec_named(&addition)?;
self.raw_push_update(update)
}
pub(crate) fn push_documents_deletion(
&self,
deletion: Vec<DocumentId>,
) -> Result<u64, Error>
{
let deletion = Update::DocumentsDeletion(deletion);
let update = rmp_serde::to_vec_named(&deletion)?;
self.raw_push_update(update)
}
pub(crate) fn push_synonyms_addition(
&self,
addition: BTreeMap<String, Vec<String>>,
) -> Result<u64, Error>
{
let addition = Update::SynonymsAddition(addition);
let update = rmp_serde::to_vec_named(&addition)?;
self.raw_push_update(update)
}
pub(crate) fn push_synonyms_deletion(
&self,
deletion: BTreeMap<String, Option<Vec<String>>>,
) -> Result<u64, Error>
{
let deletion = Update::SynonymsDeletion(deletion);
let update = rmp_serde::to_vec_named(&deletion)?;
self.raw_push_update(update)
}
fn raw_push_update(&self, raw_update: Vec<u8>) -> Result<u64, Error> {
let update_id = self.updates_id.fetch_add(1, Ordering::SeqCst);
let update_id_array = update_id.to_be_bytes();
self.updates_index.insert(update_id_array, raw_update)?;
Ok(update_id)
}
}
pub struct RefIndex<'a> {
pub(crate) cache: Guard<'static, Arc<Cache>>,
pub main_index: &'a MainIndex,
pub synonyms_index: &'a SynonymsIndex,
pub words_index: &'a WordsIndex,
pub docs_words_index: &'a DocsWordsIndex,
pub documents_index: &'a DocumentsIndex,
pub custom_settings_index: &'a CustomSettingsIndex,
}
impl Store for RefIndex<'_> {
type Error = Error;
fn words(&self) -> Result<&fst::Set, Self::Error> {
Ok(&self.cache.words)
}
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
Ok(self.words_index.doc_indexes(word)?)
}
fn synonyms(&self) -> Result<&fst::Set, Self::Error> {
Ok(&self.cache.synonyms)
}
fn alternatives_to(&self, word: &[u8]) -> Result<Option<fst::Set>, Self::Error> {
Ok(self.synonyms_index.alternatives_to(word)?)
}
}
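
A hedged caller-side sketch (not part of the changeset) of the update pipeline defined above, written as in-crate code since `Error` and `UpdateStatus` are not re-exported at the crate root; the `Movie` struct and its `id` identifier attribute are illustrative assumptions about the schema:

    #[derive(serde::Serialize)]
    struct Movie {
        id: u64, // assumed to be the schema identifier attribute
        title: String,
    }

    fn add_and_wait(index: &Index) -> Result<(), Error> {
        // invoked by the update thread once an update has been applied
        index.set_update_callback(|status: UpdateStatus| {
            println!("update {} done: {:?}", status.update_id, status.result);
        });

        let mut addition = index.documents_addition();
        addition.update_document(Movie { id: 1, title: "Hello".into() });
        let update_id = addition.finalize()?; // enqueues the update and returns its id

        // polls the updates-results column family every 300ms until the status is written
        let status = index.update_status_blocking(update_id)?;
        assert_eq!(status.update_id, update_id);
        Ok(())
    }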

View File

@ -0,0 +1,21 @@
use crate::RocksDbResult;
#[derive(Clone)]
pub struct SynonymsIndex(pub(crate) crate::CfTree);
impl SynonymsIndex {
pub fn alternatives_to(&self, word: &[u8]) -> RocksDbResult<Option<fst::Set>> {
match self.0.get(word)? {
Some(vector) => Ok(Some(fst::Set::from_bytes(vector.to_vec()).unwrap())),
None => Ok(None),
}
}
pub fn set_alternatives_to(&self, word: &[u8], value: Vec<u8>) -> RocksDbResult<()> {
self.0.insert(word, value).map(drop)
}
pub fn del_alternatives_of(&self, word: &[u8]) -> RocksDbResult<()> {
self.0.remove(word).map(drop)
}
}

View File

@ -0,0 +1,45 @@
use meilidb_core::DocIndex;
use sdset::{Set, SetBuf};
use zerocopy::{LayoutVerified, AsBytes};
use crate::RocksDbResult;
#[derive(Clone)]
pub struct WordsIndex(pub(crate) crate::CfTree);
impl WordsIndex {
pub fn doc_indexes(&self, word: &[u8]) -> RocksDbResult<Option<SetBuf<DocIndex>>> {
// we must force an allocation to make the memory aligned
match self.0.get(word)? {
Some(bytes) => {
let vec = match LayoutVerified::new_slice(bytes.as_ref()) {
Some(layout) => layout.into_slice().to_vec(),
None => {
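// the bytes returned by RocksDB are not guaranteed to be aligned for DocIndex:
// copy them into a freshly allocated Vec<DocIndex> (the length is always a
// multiple of size_of::<DocIndex>() since the value was written by set_doc_indexes)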
let len = bytes.as_ref().len();
let count = len / std::mem::size_of::<DocIndex>();
let mut buf: Vec<DocIndex> = Vec::with_capacity(count);
unsafe {
let src = bytes.as_ref().as_ptr();
let dst = buf.as_mut_ptr() as *mut u8;
std::ptr::copy_nonoverlapping(src, dst, len);
buf.set_len(count);
}
buf
}
};
let setbuf = SetBuf::new_unchecked(vec);
Ok(Some(setbuf))
},
None => Ok(None),
}
}
pub fn set_doc_indexes(&self, word: &[u8], set: &Set<DocIndex>) -> RocksDbResult<()> {
self.0.insert(word, set.as_bytes()).map(drop)
}
pub fn del_doc_indexes(&self, word: &[u8]) -> RocksDbResult<()> {
self.0.remove(word).map(drop)
}
}

View File

@ -0,0 +1,115 @@
use std::collections::hash_map::Entry;
use std::collections::{HashSet, HashMap};
use std::path::Path;
use std::sync::Arc;
use std::sync::RwLock;
use meilidb_schema::Schema;
mod error;
mod index;
mod update;
pub use self::error::Error;
pub use self::index::{Index, CustomSettingsIndex};
pub use self::update::DocumentsAddition;
pub use self::update::DocumentsDeletion;
pub use self::update::SynonymsAddition;
pub use self::update::SynonymsDeletion;
use self::update::apply_documents_addition;
use self::update::apply_documents_deletion;
use self::update::apply_synonyms_addition;
use self::update::apply_synonyms_deletion;
const INDEXES_KEY: &str = "indexes";
fn load_indexes(tree: &rocksdb::DB) -> Result<HashSet<String>, Error> {
match tree.get(INDEXES_KEY)? {
Some(bytes) => Ok(bincode::deserialize(&bytes)?),
None => Ok(HashSet::new())
}
}
pub struct Database {
cache: RwLock<HashMap<String, Index>>,
inner: Arc<rocksdb::DB>,
}
impl Database {
pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Error> {
let cache = RwLock::new(HashMap::new());
let mut options = rocksdb::Options::default();
options.create_if_missing(true);
let cfs = rocksdb::DB::list_cf(&options, &path).unwrap_or_default();
let inner = Arc::new(rocksdb::DB::open_cf(&options, path, cfs)?);
let indexes = load_indexes(&inner)?;
let database = Database { cache, inner };
for index in indexes {
database.open_index(&index)?;
}
Ok(database)
}
pub fn indexes(&self) -> Result<HashSet<String>, Error> {
load_indexes(&self.inner)
}
fn set_indexes(&self, value: &HashSet<String>) -> Result<(), Error> {
let bytes = bincode::serialize(value)?;
self.inner.put(INDEXES_KEY, bytes)?;
Ok(())
}
pub fn open_index(&self, name: &str) -> Result<Option<Index>, Error> {
{
let cache = self.cache.read().unwrap();
if let Some(index) = cache.get(name).cloned() {
return Ok(Some(index))
}
}
let mut cache = self.cache.write().unwrap();
let index = match cache.entry(name.to_string()) {
Entry::Occupied(occupied) => {
occupied.get().clone()
},
Entry::Vacant(vacant) => {
if !self.indexes()?.contains(name) {
return Ok(None)
}
let index = Index::new(self.inner.clone(), name)?;
vacant.insert(index).clone()
},
};
Ok(Some(index))
}
pub fn create_index(&self, name: &str, schema: Schema) -> Result<Index, Error> {
let mut cache = self.cache.write().unwrap();
let index = match cache.entry(name.to_string()) {
Entry::Occupied(occupied) => {
occupied.get().clone()
},
Entry::Vacant(vacant) => {
let index = Index::with_schema(self.inner.clone(), name, schema)?;
let mut indexes = self.indexes()?;
indexes.insert(name.to_string());
self.set_indexes(&indexes)?;
vacant.insert(index).clone()
},
};
Ok(index)
}
}
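
A short usage sketch (not in the changeset): callers typically combine `open_index` and `create_index`, since `open_index` returns `Ok(None)` for a name that was never registered under the "indexes" key. The `Schema` value is assumed to be built with meilidb-schema, which is outside this diff:

    fn get_or_create(database: &Database, name: &str, schema: Schema) -> Result<Index, Error> {
        match database.open_index(name)? {
            // the name is registered: the Index (and its update thread) is now cached
            Some(index) => Ok(index),
            // unknown name: create the column families, register the name and cache the Index
            None => database.create_index(name, schema),
        }
    }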

View File

@ -0,0 +1,139 @@
use std::collections::HashSet;
use std::sync::Arc;
use fst::{SetBuilder, set::OpBuilder};
use sdset::{SetOperation, duo::Union};
use serde::Serialize;
use crate::RankedMap;
use crate::database::{Error, Index, index::Cache, apply_documents_deletion};
use crate::indexer::Indexer;
use crate::serde::{extract_document_id, Serializer, RamDocumentStore};
pub struct DocumentsAddition<'a, D> {
index: &'a Index,
documents: Vec<D>,
}
impl<'a, D> DocumentsAddition<'a, D> {
pub fn new(index: &'a Index) -> DocumentsAddition<'a, D> {
DocumentsAddition { index, documents: Vec::new() }
}
pub fn update_document(&mut self, document: D) {
self.documents.push(document);
}
pub fn finalize(self) -> Result<u64, Error>
where D: serde::Serialize
{
self.index.push_documents_addition(self.documents)
}
}
pub fn apply_documents_addition(
index: &Index,
mut ranked_map: RankedMap,
addition: Vec<rmpv::Value>,
) -> Result<(), Error>
{
let mut document_ids = HashSet::new();
let mut document_store = RamDocumentStore::new();
let mut indexer = Indexer::new();
let schema = &index.schema();
let identifier = schema.identifier_name();
for document in addition {
let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
// 1. store the document id for future deletion
document_ids.insert(document_id);
// 2. index the document fields in ram stores
let serializer = Serializer {
schema,
document_store: &mut document_store,
indexer: &mut indexer,
ranked_map: &mut ranked_map,
document_id,
};
document.serialize(serializer)?;
}
let ref_index = index.as_ref();
let docs_words = ref_index.docs_words_index;
let documents = ref_index.documents_index;
let main = ref_index.main_index;
let words = ref_index.words_index;
// 1. remove the match indexes of the previous versions of these documents
let documents_to_delete = document_ids.iter().cloned().collect();
apply_documents_deletion(index, ranked_map.clone(), documents_to_delete)?;
// 2. insert new document attributes in the database
for ((id, attr), value) in document_store.into_inner() {
documents.set_document_field(id, attr, value)?;
}
let indexed = indexer.build();
let mut delta_words_builder = SetBuilder::memory();
for (word, delta_set) in indexed.words_doc_indexes {
delta_words_builder.insert(&word).unwrap();
let set = match words.doc_indexes(&word)? {
Some(set) => Union::new(&set, &delta_set).into_set_buf(),
None => delta_set,
};
words.set_doc_indexes(&word, &set)?;
}
for (id, words) in indexed.docs_words {
docs_words.set_doc_words(id, &words)?;
}
let delta_words = delta_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
let words = match main.words_set()? {
Some(words) => {
let op = OpBuilder::new()
.add(words.stream())
.add(delta_words.stream())
.r#union();
let mut words_builder = SetBuilder::memory();
words_builder.extend_stream(op).unwrap();
words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
None => delta_words,
};
main.set_words_set(&words)?;
main.set_ranked_map(&ranked_map)?;
let inserted_documents_len = document_ids.len() as u64;
let number_of_documents = main.set_number_of_documents(|old| old + inserted_documents_len)?;
// update the "consistent" view of the Index
let cache = ref_index.cache;
let words = Arc::new(words);
let synonyms = cache.synonyms.clone();
let schema = cache.schema.clone();
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
index.cache.store(Arc::new(cache));
Ok(())
}

View File

@ -0,0 +1,150 @@
use std::collections::{HashMap, HashSet, BTreeSet};
use std::sync::Arc;
use fst::{SetBuilder, Streamer};
use meilidb_core::DocumentId;
use sdset::{SetBuf, SetOperation, duo::DifferenceByKey};
use crate::RankedMap;
use crate::serde::extract_document_id;
use crate::database::{Index, Error, index::Cache};
pub struct DocumentsDeletion<'a> {
index: &'a Index,
documents: Vec<DocumentId>,
}
impl<'a> DocumentsDeletion<'a> {
pub fn new(index: &'a Index) -> DocumentsDeletion<'a> {
DocumentsDeletion { index, documents: Vec::new() }
}
pub fn delete_document_by_id(&mut self, document_id: DocumentId) {
self.documents.push(document_id);
}
pub fn delete_document<D>(&mut self, document: D) -> Result<(), Error>
where D: serde::Serialize,
{
let schema = self.index.schema();
let identifier = schema.identifier_name();
let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
self.delete_document_by_id(document_id);
Ok(())
}
pub fn finalize(self) -> Result<u64, Error> {
self.index.push_documents_deletion(self.documents)
}
}
impl Extend<DocumentId> for DocumentsDeletion<'_> {
fn extend<T: IntoIterator<Item=DocumentId>>(&mut self, iter: T) {
self.documents.extend(iter)
}
}
pub fn apply_documents_deletion(
index: &Index,
mut ranked_map: RankedMap,
deletion: Vec<DocumentId>,
) -> Result<(), Error>
{
let ref_index = index.as_ref();
let schema = index.schema();
let docs_words = ref_index.docs_words_index;
let documents = ref_index.documents_index;
let main = ref_index.main_index;
let words = ref_index.words_index;
let idset = SetBuf::from_dirty(deletion);
// collect the ranked attributes according to the schema
let ranked_attrs: Vec<_> = schema.iter()
.filter_map(|(_, attr, prop)| {
if prop.is_ranked() { Some(attr) } else { None }
})
.collect();
let mut words_document_ids = HashMap::new();
for id in idset {
// remove all the ranked attributes from the ranked_map
for ranked_attr in &ranked_attrs {
ranked_map.remove(id, *ranked_attr);
}
if let Some(words) = docs_words.doc_words(id)? {
let mut stream = words.stream();
while let Some(word) = stream.next() {
let word = word.to_vec();
words_document_ids.entry(word).or_insert_with(Vec::new).push(id);
}
}
}
let mut deleted_documents = HashSet::new();
let mut removed_words = BTreeSet::new();
for (word, document_ids) in words_document_ids {
let document_ids = SetBuf::from_dirty(document_ids);
if let Some(doc_indexes) = words.doc_indexes(&word)? {
let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id);
let doc_indexes = op.into_set_buf();
if !doc_indexes.is_empty() {
words.set_doc_indexes(&word, &doc_indexes)?;
} else {
words.del_doc_indexes(&word)?;
removed_words.insert(word);
}
}
for id in document_ids {
if documents.del_all_document_fields(id)? != 0 {
deleted_documents.insert(id);
}
docs_words.del_doc_words(id)?;
}
}
let removed_words = fst::Set::from_iter(removed_words).unwrap();
let words = match main.words_set()? {
Some(words_set) => {
let op = fst::set::OpBuilder::new()
.add(words_set.stream())
.add(removed_words.stream())
.difference();
let mut words_builder = SetBuilder::memory();
words_builder.extend_stream(op).unwrap();
words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
None => fst::Set::default(),
};
main.set_words_set(&words)?;
main.set_ranked_map(&ranked_map)?;
let deleted_documents_len = deleted_documents.len() as u64;
let number_of_documents = main.set_number_of_documents(|old| old - deleted_documents_len)?;
// update the "consistent" view of the Index
let cache = ref_index.cache;
let words = Arc::new(words);
let synonyms = cache.synonyms.clone();
let schema = cache.schema.clone();
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
index.cache.store(Arc::new(cache));
Ok(())
}

View File

@ -0,0 +1,9 @@
mod documents_addition;
mod documents_deletion;
mod synonyms_addition;
mod synonyms_deletion;
pub use self::documents_addition::{DocumentsAddition, apply_documents_addition};
pub use self::documents_deletion::{DocumentsDeletion, apply_documents_deletion};
pub use self::synonyms_addition::{SynonymsAddition, apply_synonyms_addition};
pub use self::synonyms_deletion::{SynonymsDeletion, apply_synonyms_deletion};

View File

@ -0,0 +1,94 @@
use std::collections::BTreeMap;
use std::sync::Arc;
use fst::{SetBuilder, set::OpBuilder};
use meilidb_core::normalize_str;
use sdset::SetBuf;
use crate::database::{Error, Index, index::Cache};
pub struct SynonymsAddition<'a> {
index: &'a Index,
synonyms: BTreeMap<String, Vec<String>>,
}
impl<'a> SynonymsAddition<'a> {
pub fn new(index: &'a Index) -> SynonymsAddition<'a> {
SynonymsAddition { index, synonyms: BTreeMap::new() }
}
pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
where S: AsRef<str>,
T: AsRef<str>,
I: IntoIterator<Item=T>,
{
let synonym = normalize_str(synonym.as_ref());
let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
}
pub fn finalize(self) -> Result<u64, Error> {
self.index.push_synonyms_addition(self.synonyms)
}
}
pub fn apply_synonyms_addition(
index: &Index,
addition: BTreeMap<String, Vec<String>>,
) -> Result<(), Error>
{
let ref_index = index.as_ref();
let synonyms = ref_index.synonyms_index;
let main = ref_index.main_index;
let mut synonyms_builder = SetBuilder::memory();
for (synonym, alternatives) in addition {
synonyms_builder.insert(&synonym).unwrap();
let alternatives = {
let alternatives = SetBuf::from_dirty(alternatives);
let mut alternatives_builder = SetBuilder::memory();
alternatives_builder.extend_iter(alternatives).unwrap();
alternatives_builder.into_inner().unwrap()
};
synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?;
}
let delta_synonyms = synonyms_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
let synonyms = match main.synonyms_set()? {
Some(synonyms) => {
let op = OpBuilder::new()
.add(synonyms.stream())
.add(delta_synonyms.stream())
.r#union();
let mut synonyms_builder = SetBuilder::memory();
synonyms_builder.extend_stream(op).unwrap();
synonyms_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
None => delta_synonyms,
};
main.set_synonyms_set(&synonyms)?;
// update the "consistent" view of the Index
let cache = ref_index.cache;
let words = Arc::new(main.words_set()?.unwrap_or_default());
let ranked_map = cache.ranked_map.clone();
let synonyms = Arc::new(synonyms);
let schema = cache.schema.clone();
let number_of_documents = cache.number_of_documents;
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
index.cache.store(Arc::new(cache));
Ok(())
}
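
A hypothetical caller-side sketch (not part of the changeset) queuing a synonyms update through the types above, written as in-crate code:

    fn register_synonyms(index: &Index) -> Result<u64, Error> {
        let mut addition = index.synonyms_addition();
        // the synonym is normalized, the alternatives are lowercased
        addition.add_synonym("street", vec!["avenue", "boulevard"]);
        addition.add_synonym("NYC", vec!["new york", "big apple"]);
        // returns the update id; the update thread applies it asynchronously
        addition.finalize()
    }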

View File

@ -0,0 +1,137 @@
use std::collections::BTreeMap;
use std::iter::FromIterator;
use std::sync::Arc;
use fst::{SetBuilder, set::OpBuilder};
use meilidb_core::normalize_str;
use sdset::SetBuf;
use crate::database::{Error, Index, index::Cache};
pub struct SynonymsDeletion<'a> {
index: &'a Index,
synonyms: BTreeMap<String, Option<Vec<String>>>,
}
impl<'a> SynonymsDeletion<'a> {
pub fn new(index: &'a Index) -> SynonymsDeletion<'a> {
SynonymsDeletion { index, synonyms: BTreeMap::new() }
}
pub fn delete_all_alternatives_of<S: AsRef<str>>(&mut self, synonym: S) {
let synonym = normalize_str(synonym.as_ref());
self.synonyms.insert(synonym, None);
}
pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I)
where S: AsRef<str>,
T: AsRef<str>,
I: Iterator<Item=T>,
{
let synonym = normalize_str(synonym.as_ref());
let value = self.synonyms.entry(synonym).or_insert(None);
let alternatives = alternatives.map(|s| s.as_ref().to_lowercase());
match value {
Some(v) => v.extend(alternatives),
None => *value = Some(Vec::from_iter(alternatives)),
}
}
pub fn finalize(self) -> Result<u64, Error> {
self.index.push_synonyms_deletion(self.synonyms)
}
}
pub fn apply_synonyms_deletion(
index: &Index,
deletion: BTreeMap<String, Option<Vec<String>>>,
) -> Result<(), Error>
{
let ref_index = index.as_ref();
let synonyms = ref_index.synonyms_index;
let main = ref_index.main_index;
let mut delete_whole_synonym_builder = SetBuilder::memory();
for (synonym, alternatives) in deletion {
match alternatives {
Some(alternatives) => {
let prev_alternatives = synonyms.alternatives_to(synonym.as_bytes())?;
let prev_alternatives = match prev_alternatives {
Some(alternatives) => alternatives,
None => continue,
};
let delta_alternatives = {
let alternatives = SetBuf::from_dirty(alternatives);
let mut builder = SetBuilder::memory();
builder.extend_iter(alternatives).unwrap();
builder.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
};
let op = OpBuilder::new()
.add(prev_alternatives.stream())
.add(delta_alternatives.stream())
.difference();
let (alternatives, empty_alternatives) = {
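// the builder's byte length is compared before and after streaming the difference
// to decide whether this synonym still has any alternatives left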
let mut builder = SetBuilder::memory();
let len = builder.get_ref().len();
builder.extend_stream(op).unwrap();
let is_empty = len == builder.get_ref().len();
let alternatives = builder.into_inner().unwrap();
(alternatives, is_empty)
};
if empty_alternatives {
delete_whole_synonym_builder.insert(synonym.as_bytes())?;
} else {
synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?;
}
},
None => {
delete_whole_synonym_builder.insert(&synonym).unwrap();
synonyms.del_alternatives_of(synonym.as_bytes())?;
}
}
}
let delta_synonyms = delete_whole_synonym_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
let synonyms = match main.synonyms_set()? {
Some(synonyms) => {
let op = OpBuilder::new()
.add(synonyms.stream())
.add(delta_synonyms.stream())
.difference();
let mut synonyms_builder = SetBuilder::memory();
synonyms_builder.extend_stream(op).unwrap();
synonyms_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
None => fst::Set::default(),
};
main.set_synonyms_set(&synonyms)?;
// update the "consistent" view of the Index
let cache = ref_index.cache;
let words = Arc::new(main.words_set()?.unwrap_or_default());
let ranked_map = cache.ranked_map.clone();
let synonyms = Arc::new(synonyms);
let schema = cache.schema.clone();
let number_of_documents = cache.number_of_documents;
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
index.cache.store(Arc::new(cache));
Ok(())
}

View File

@ -0,0 +1,69 @@
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct DocumentAttrKey {
pub document_id: DocumentId,
pub attribute: SchemaAttr,
}
impl DocumentAttrKey {
pub fn new(document_id: DocumentId, attribute: SchemaAttr) -> DocumentAttrKey {
DocumentAttrKey { document_id, attribute }
}
pub fn to_be_bytes(self) -> [u8; 10] {
let mut output = [0u8; 10];
let document_id = self.document_id.0.to_be_bytes();
let attribute = self.attribute.0.to_be_bytes();
unsafe {
use std::{mem::size_of, ptr::copy_nonoverlapping};
let output = output.as_mut_ptr();
copy_nonoverlapping(document_id.as_ptr(), output, size_of::<u64>());
let output = output.add(size_of::<u64>());
copy_nonoverlapping(attribute.as_ptr(), output, size_of::<u16>());
}
output
}
pub fn from_be_bytes(bytes: [u8; 10]) -> DocumentAttrKey {
let document_id;
let attribute;
unsafe {
use std::ptr::read_unaligned;
let pointer = bytes.as_ptr() as *const _;
let document_id_bytes = read_unaligned(pointer);
document_id = u64::from_be_bytes(document_id_bytes);
let pointer = pointer.add(1) as *const _;
let attribute_bytes = read_unaligned(pointer);
attribute = u16::from_be_bytes(attribute_bytes);
}
DocumentAttrKey {
document_id: DocumentId(document_id),
attribute: SchemaAttr(attribute),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn to_from_be_bytes() {
let document_id = DocumentId(67578308);
let schema_attr = SchemaAttr(3456);
let x = DocumentAttrKey::new(document_id, schema_attr);
assert_eq!(x, DocumentAttrKey::from_be_bytes(x.to_be_bytes()));
}
}
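
The two conversions above rely on raw pointer copies; this safe in-crate equivalent (a sketch, not part of the changeset) makes the 10-byte layout explicit, an 8-byte big-endian document id followed by a 2-byte big-endian attribute:

    use std::convert::TryInto;

    fn to_be_bytes_safe(key: DocumentAttrKey) -> [u8; 10] {
        let mut output = [0u8; 10];
        output[..8].copy_from_slice(&key.document_id.0.to_be_bytes());
        output[8..].copy_from_slice(&key.attribute.0.to_be_bytes());
        output
    }

    fn from_be_bytes_safe(bytes: [u8; 10]) -> DocumentAttrKey {
        let document_id = u64::from_be_bytes(bytes[..8].try_into().unwrap());
        let attribute = u16::from_be_bytes(bytes[8..].try_into().unwrap());
        DocumentAttrKey::new(DocumentId(document_id), SchemaAttr(attribute))
    }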

208
meilidb-data/src/indexer.rs Normal file
View File

@ -0,0 +1,208 @@
use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;
use deunicode::deunicode_with_tofu;
use meilidb_core::{DocumentId, DocIndex};
use meilidb_schema::SchemaAttr;
use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
use sdset::SetBuf;
type Word = Vec<u8>; // TODO make it a SmallVec
pub struct Indexer {
word_limit: usize, // the maximum number of indexed words
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
docs_words: HashMap<DocumentId, Vec<Word>>,
}
pub struct Indexed {
pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
pub docs_words: HashMap<DocumentId, fst::Set>,
}
impl Indexer {
pub fn new() -> Indexer {
Indexer::with_word_limit(1000)
}
pub fn with_word_limit(limit: usize) -> Indexer {
Indexer {
word_limit: limit,
words_doc_indexes: BTreeMap::new(),
docs_words: HashMap::new(),
}
}
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
let lowercase_text = text.to_lowercase();
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
// TODO compute the deunicoded version after the cjk check
let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded {
Some(deunicoded)
} else {
None
};
let iter = Some(lowercase_text).into_iter().chain(next);
for text in iter {
for token in Tokenizer::new(&text) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue { break }
}
}
}
pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
where I: IntoIterator<Item=&'a str, IntoIter=IT>,
IT: Iterator<Item = &'a str> + Clone,
{
// TODO serialize this to one call to the SeqTokenizer loop
let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect();
let iter = lowercased.iter().map(|t| t.as_str());
for token in SeqTokenizer::new(iter) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue { break }
}
let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| {
if lowercase_text.contains(is_cjk) { return lowercase_text }
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
if lowercase_text != deunicoded { deunicoded } else { lowercase_text }
}).collect();
let iter = deunicoded.iter().map(|t| t.as_str());
for token in SeqTokenizer::new(iter) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue { break }
}
}
pub fn build(self) -> Indexed {
let words_doc_indexes = self.words_doc_indexes
.into_iter()
.map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
.collect();
let docs_words = self.docs_words
.into_iter()
.map(|(id, mut words)| {
words.sort_unstable();
words.dedup();
(id, fst::Set::from_iter(words).unwrap())
})
.collect();
Indexed { words_doc_indexes, docs_words }
}
}
fn index_token(
token: Token,
id: DocumentId,
attr: SchemaAttr,
word_limit: usize,
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool
{
if token.word_index >= word_limit { return false }
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
},
None => return false,
}
true
}
fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
let word_index = u16::try_from(token.word_index).ok()?;
let char_index = u16::try_from(token.char_index).ok()?;
let char_length = u16::try_from(token.word.chars().count()).ok()?;
let docindex = DocIndex {
document_id: id,
attribute: attr.0,
word_index,
char_index,
char_length,
};
Some(docindex)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn strange_apostrophe() {
let mut indexer = Indexer::new();
let docid = DocumentId(0);
let attr = SchemaAttr(0);
let text = "Zut, laspirateur, jai oublié de léteindre !";
indexer.index_text(docid, attr, text);
let Indexed { words_doc_indexes, .. } = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes.get(&"léteindre".to_owned().into_bytes()).is_some());
}
#[test]
fn strange_apostrophe_in_sequence() {
let mut indexer = Indexer::new();
let docid = DocumentId(0);
let attr = SchemaAttr(0);
let text = vec!["Zut, laspirateur, jai oublié de léteindre !"];
indexer.index_text_seq(docid, attr, text);
let Indexed { words_doc_indexes, .. } = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes.get(&"léteindre".to_owned().into_bytes()).is_some());
}
}

15
meilidb-data/src/lib.rs Normal file
View File

@ -0,0 +1,15 @@
mod cf_tree;
mod database;
mod document_attr_key;
mod indexer;
mod number;
mod ranked_map;
mod serde;
pub use self::cf_tree::{CfTree, CfIter};
pub use self::database::{Database, Index, CustomSettingsIndex};
pub use self::number::Number;
pub use self::ranked_map::RankedMap;
pub use self::serde::{compute_document_id, extract_document_id, value_to_string};
pub type RocksDbResult<T> = Result<T, rocksdb::Error>;

View File

@ -0,0 +1,55 @@
use std::num::{ParseIntError, ParseFloatError};
use std::str::FromStr;
use std::fmt;
use ordered_float::OrderedFloat;
use serde::{Serialize, Deserialize};
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Number {
Unsigned(u64),
Signed(i64),
Float(OrderedFloat<f64>),
}
impl FromStr for Number {
type Err = ParseNumberError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let uint_error = match u64::from_str(s) {
Ok(unsigned) => return Ok(Number::Unsigned(unsigned)),
Err(error) => error,
};
let int_error = match i64::from_str(s) {
Ok(signed) => return Ok(Number::Signed(signed)),
Err(error) => error,
};
let float_error = match f64::from_str(s) {
Ok(float) => return Ok(Number::Float(OrderedFloat(float))),
Err(error) => error,
};
Err(ParseNumberError { uint_error, int_error, float_error })
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseNumberError {
uint_error: ParseIntError,
int_error: ParseIntError,
float_error: ParseFloatError,
}
impl fmt::Display for ParseNumberError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if self.uint_error == self.int_error {
write!(f, "can not parse number: {}, {}", self.uint_error, self.float_error)
} else {
write!(f, "can not parse number: {}, {}, {}",
self.uint_error, self.int_error, self.float_error)
}
}
}
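
A quick sketch of the fallback order implemented above (u64 first, then i64, then f64); it could live in a test module next to this code:

    #[cfg(test)]
    mod parse_tests {
        use super::*;
        use std::str::FromStr;
        use ordered_float::OrderedFloat;

        #[test]
        fn parse_order() {
            assert_eq!(Number::from_str("42"), Ok(Number::Unsigned(42)));
            assert_eq!(Number::from_str("-7"), Ok(Number::Signed(-7)));
            assert_eq!(Number::from_str("3.5"), Ok(Number::Float(OrderedFloat(3.5))));
            assert!(Number::from_str("hello").is_err());
        }
    }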

View File

@ -0,0 +1,36 @@
use std::io::{Read, Write};
use hashbrown::HashMap;
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use crate::Number;
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>);
impl RankedMap {
pub fn len(&self) -> usize {
self.0.len()
}
pub fn insert(&mut self, document: DocumentId, attribute: SchemaAttr, number: Number) {
self.0.insert((document, attribute), number);
}
pub fn remove(&mut self, document: DocumentId, attribute: SchemaAttr) {
self.0.remove(&(document, attribute));
}
pub fn get(&self, document: DocumentId, attribute: SchemaAttr) -> Option<Number> {
self.0.get(&(document, attribute)).cloned()
}
pub fn read_from_bin<R: Read>(reader: R) -> bincode::Result<RankedMap> {
bincode::deserialize_from(reader).map(RankedMap)
}
pub fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
bincode::serialize_into(writer, &self.0)
}
}

View File

@ -0,0 +1,180 @@
use std::str::FromStr;
use ordered_float::OrderedFloat;
use serde::ser;
use serde::Serialize;
use super::SerializerError;
use crate::Number;
pub struct ConvertToNumber;
impl ser::Serializer for ConvertToNumber {
type Ok = Number;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, value: bool) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(u64::from(value)))
}
fn serialize_char(self, _value: char) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "char" })
}
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(i64::from(value)))
}
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(i64::from(value)))
}
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(i64::from(value)))
}
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(value))
}
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(u64::from(value)))
}
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(u64::from(value)))
}
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(u64::from(value)))
}
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(value))
}
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
Ok(Number::Float(OrderedFloat(f64::from(value))))
}
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
Ok(Number::Float(OrderedFloat(value)))
}
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
Ok(Number::from_str(value)?)
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnrankableType { type_name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnrankableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "struct variant" })
}
}

View File

@ -1,11 +1,11 @@
use serde::Serialize;
use serde::ser;
use crate::database::serde::SerializerError;
use super::SerializerError;
pub struct KeyToStringSerializer;
pub struct ConvertToString;
impl ser::Serializer for KeyToStringSerializer {
impl ser::Serializer for ConvertToString {
type Ok = String;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
@ -16,22 +16,52 @@ impl ser::Serializer for KeyToStringSerializer {
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "boolean" })
}
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
f32 => serialize_f32,
f64 => serialize_f64,
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
@ -39,25 +69,25 @@ impl ser::Serializer for KeyToStringSerializer {
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
Err(SerializerError::UnserializableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
Err(SerializerError::UnserializableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
@ -67,7 +97,7 @@ impl ser::Serializer for KeyToStringSerializer {
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
Err(SerializerError::UnserializableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
@ -89,15 +119,15 @@ impl ser::Serializer for KeyToStringSerializer {
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
Err(SerializerError::UnserializableType { type_name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
Err(SerializerError::UnserializableType { type_name: "tuple" })
}
fn serialize_tuple_struct(
@ -106,7 +136,7 @@ impl ser::Serializer for KeyToStringSerializer {
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
@ -117,11 +147,11 @@ impl ser::Serializer for KeyToStringSerializer {
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
Err(SerializerError::UnserializableType { type_name: "map" })
}
fn serialize_struct(
@ -130,7 +160,7 @@ impl ser::Serializer for KeyToStringSerializer {
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
Err(SerializerError::UnserializableType { type_name: "struct" })
}
fn serialize_struct_variant(
@ -141,6 +171,6 @@ impl ser::Serializer for KeyToStringSerializer {
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
Err(SerializerError::UnserializableType { type_name: "struct variant" })
}
}
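
Taken together, the methods above mean this serializer only turns primitive values into owned strings and refuses every compound type. A rough sketch of that contract, using the `ConvertToString` name the rest of the new code relies on (the helper function itself is hypothetical):

```rust
use serde::Serialize;

// Hypothetical helper: ConvertToString is assumed to be the unit-struct serializer above.
fn to_key_string<T: Serialize>(value: &T) -> Result<String, SerializerError> {
    value.serialize(ConvertToString)
}

fn demo() {
    // Numbers become their decimal representation...
    assert_eq!(to_key_string(&42u32).unwrap(), "42");
    // ...while sequences, maps and other compound types are rejected.
    assert!(to_key_string(&vec![1, 2, 3]).is_err());
}
```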


@ -0,0 +1,132 @@
use std::collections::HashSet;
use std::io::Cursor;
use std::{fmt, error::Error};
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader};
use rmp_serde::decode::{Error as RmpError};
use serde::{de, forward_to_deserialize_any};
use crate::database::Index;
#[derive(Debug)]
pub enum DeserializerError {
RmpError(RmpError),
RocksDbError(rocksdb::Error),
Custom(String),
}
impl de::Error for DeserializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
DeserializerError::Custom(msg.to_string())
}
}
impl fmt::Display for DeserializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DeserializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e),
DeserializerError::RocksDbError(e) => write!(f, "RocksDB related error: {}", e),
DeserializerError::Custom(s) => f.write_str(s),
}
}
}
impl Error for DeserializerError {}
impl From<RmpError> for DeserializerError {
fn from(error: RmpError) -> DeserializerError {
DeserializerError::RmpError(error)
}
}
impl From<rocksdb::Error> for DeserializerError {
fn from(error: rocksdb::Error) -> DeserializerError {
DeserializerError::RocksDbError(error)
}
}
pub struct Deserializer<'a> {
pub document_id: DocumentId,
pub index: &'a Index,
pub fields: Option<&'a HashSet<SchemaAttr>>,
}
impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a>
{
type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
self.deserialize_map(visitor)
}
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf option unit unit_struct newtype_struct seq tuple
tuple_struct struct enum identifier ignored_any
}
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
let schema = self.index.schema();
let documents = self.index.as_ref().documents_index;
let iter = documents
.document_fields(self.document_id)?
.filter_map(|(attr, value)| {
let is_displayed = schema.props(attr).is_displayed();
if is_displayed && self.fields.map_or(true, |f| f.contains(&attr)) {
let attribute_name = schema.attribute_name(attr);
Some((attribute_name, Value::new(value)))
} else {
None
}
});
let map_deserializer = de::value::MapDeserializer::new(iter);
let result = visitor.visit_map(map_deserializer).map_err(DeserializerError::from);
result
}
}
struct Value<A>(RmpDeserializer<ReadReader<Cursor<A>>>) where A: AsRef<[u8]>;
impl<A> Value<A> where A: AsRef<[u8]>
{
fn new(value: A) -> Value<A> {
Value(RmpDeserializer::new(Cursor::new(value)))
}
}
impl<'de, A> de::IntoDeserializer<'de, RmpError> for Value<A>
where A: AsRef<[u8]>,
{
type Deserializer = Self;
fn into_deserializer(self) -> Self::Deserializer {
self
}
}
impl<'de, 'a, A> de::Deserializer<'de> for Value<A>
where A: AsRef<[u8]>,
{
type Error = RmpError;
fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
self.0.deserialize_any(visitor)
}
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf option unit unit_struct newtype_struct seq tuple
tuple_struct map struct enum identifier ignored_any
}
}
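
Put together, this Deserializer walks `documents_index` for a single document, keeps only the displayed fields (optionally restricted through `fields`), and exposes them to serde as a map whose values are decoded from MessagePack on the fly. A rough sketch of how it could be driven; obtaining the `Index` handle and the document id is assumed here, not shown:

```rust
use serde::Deserialize;
use serde_json::Value;

// Sketch only: `index` would come from the database (an opened index) and
// `id` from a previous search or documents addition.
fn fetch_document(index: &Index, id: DocumentId) -> Result<Value, DeserializerError> {
    let mut deserializer = Deserializer {
        document_id: id,
        index,
        fields: None, // None means "all displayed fields"
    };
    Value::deserialize(&mut deserializer)
}
```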


@ -1,23 +1,53 @@
use serde::Serialize;
use serde::ser;
use std::hash::{Hash, Hasher};
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::serde::{SerializerError, calculate_hash};
use crate::DocumentId;
use meilidb_core::DocumentId;
use serde::{ser, Serialize};
use serde_json::Value;
use siphasher::sip::SipHasher;
pub struct FindDocumentIdSerializer<'a> {
pub id_attribute_name: &'a str,
use super::{SerializerError, ConvertToString};
pub fn extract_document_id<D>(
identifier: &str,
document: &D,
) -> Result<Option<DocumentId>, SerializerError>
where D: serde::Serialize,
{
let serializer = ExtractDocumentId { identifier };
document.serialize(serializer)
}
impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
type Ok = DocumentId;
pub fn value_to_string(value: &Value) -> Option<String> {
match value {
Value::Null => None,
Value::Bool(_) => None,
Value::Number(value) => Some(value.to_string()),
Value::String(value) => Some(value.to_string()),
Value::Array(_) => None,
Value::Object(_) => None,
}
}
pub fn compute_document_id<H: Hash>(t: H) -> DocumentId {
let mut s = SipHasher::new();
t.hash(&mut s);
let hash = s.finish();
DocumentId(hash)
}
struct ExtractDocumentId<'a> {
identifier: &'a str,
}
impl<'a> ser::Serializer for ExtractDocumentId<'a> {
type Ok = Option<DocumentId>;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = FindDocumentIdMapSerializer<'a>;
type SerializeStruct = FindDocumentIdStructSerializer<'a>;
type SerializeMap = ExtractDocumentIdMapSerializer<'a>;
type SerializeStruct = ExtractDocumentIdStructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
@ -38,30 +68,30 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
fn serialize_str(self, _value: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
fn serialize_bytes(self, _value: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
Err(SerializerError::UnserializableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
Err(SerializerError::UnserializableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
@ -71,7 +101,7 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
Err(SerializerError::UnserializableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
@ -93,15 +123,15 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
Err(SerializerError::UnserializableType { type_name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
Err(SerializerError::UnserializableType { type_name: "tuple" })
}
fn serialize_tuple_struct(
@ -110,7 +140,7 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
@ -121,15 +151,17 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(FindDocumentIdMapSerializer {
id_attribute_name: self.id_attribute_name,
let serializer = ExtractDocumentIdMapSerializer {
identifier: self.identifier,
document_id: None,
current_key_name: None,
})
};
Ok(serializer)
}
fn serialize_struct(
@ -138,10 +170,12 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(FindDocumentIdStructSerializer {
id_attribute_name: self.id_attribute_name,
let serializer = ExtractDocumentIdStructSerializer {
identifier: self.identifier,
document_id: None,
})
};
Ok(serializer)
}
fn serialize_struct_variant(
@ -152,24 +186,24 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
Err(SerializerError::UnserializableType { type_name: "struct variant" })
}
}
pub struct FindDocumentIdMapSerializer<'a> {
id_attribute_name: &'a str,
pub struct ExtractDocumentIdMapSerializer<'a> {
identifier: &'a str,
document_id: Option<DocumentId>,
current_key_name: Option<String>,
}
impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> {
type Ok = DocumentId;
impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
type Ok = Option<DocumentId>;
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
let key = key.serialize(ConvertToString)?;
self.current_key_name = Some(key);
Ok(())
}
@ -188,33 +222,31 @@ impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> {
) -> Result<(), Self::Error>
where K: Serialize, V: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
let key = key.serialize(ConvertToString)?;
if self.id_attribute_name == key {
// TODO is it possible to have multiple ids?
let id = bincode::serialize(value).unwrap();
let hash = calculate_hash(&id);
self.document_id = Some(DocumentId(hash));
if self.identifier == key {
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
match value_to_string(&value).map(|s| compute_document_id(&s)) {
Some(document_id) => self.document_id = Some(document_id),
None => return Err(SerializerError::InvalidDocumentIdType),
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
match self.document_id {
Some(document_id) => Ok(document_id),
None => Err(SerializerError::DocumentIdNotFound)
}
Ok(self.document_id)
}
}
pub struct FindDocumentIdStructSerializer<'a> {
id_attribute_name: &'a str,
pub struct ExtractDocumentIdStructSerializer<'a> {
identifier: &'a str,
document_id: Option<DocumentId>,
}
impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
type Ok = DocumentId;
impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> {
type Ok = Option<DocumentId>;
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
@ -224,20 +256,18 @@ impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
) -> Result<(), Self::Error>
where T: Serialize,
{
if self.id_attribute_name == key {
// TODO can it be possible to have multiple ids?
let id = bincode::serialize(value).unwrap();
let hash = calculate_hash(&id);
self.document_id = Some(DocumentId(hash));
if self.identifier == key {
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
match value_to_string(&value).map(compute_document_id) {
Some(document_id) => self.document_id = Some(document_id),
None => return Err(SerializerError::InvalidDocumentIdType),
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
match self.document_id {
Some(document_id) => Ok(document_id),
None => Err(SerializerError::DocumentIdNotFound)
}
Ok(self.document_id)
}
}
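
In short, the document is serialized once, the identifier field's value is stringified, and the SipHash of that string becomes the DocumentId; booleans, arrays and objects are rejected as identifiers. A small sketch, reusing the "objectId" identifier from the tests further below:

```rust
use serde_json::json;

fn demo() {
    let doc = json!({ "objectId": 123, "title": "hello" });

    // The extracted id is the hash of the stringified identifier value.
    let id = extract_document_id("objectId", &doc).unwrap();
    assert_eq!(id, Some(compute_document_id("123")));

    // A boolean (or array/object) identifier is an InvalidDocumentIdType error.
    let bad = json!({ "objectId": true });
    assert!(extract_document_id("objectId", &bad).is_err());
}
```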


@ -0,0 +1,336 @@
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use serde::ser;
use serde::Serialize;
use crate::indexer::Indexer as RawIndexer;
use super::{SerializerError, ConvertToString};
pub struct Indexer<'a> {
pub attribute: SchemaAttr,
pub indexer: &'a mut RawIndexer,
pub document_id: DocumentId,
}
impl<'a> ser::Serializer for Indexer<'a> {
type Ok = ();
type Error = SerializerError;
type SerializeSeq = SeqIndexer<'a>;
type SerializeTuple = TupleIndexer<'a>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapIndexer<'a>;
type SerializeStruct = StructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "boolean" })
}
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
self.indexer.index_text(self.document_id, self.attribute, text);
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.indexer.index_text(self.document_id, self.attribute, &text);
Ok(())
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
Err(SerializerError::UnindexableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
let indexer = SeqIndexer {
attribute: self.attribute,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
let indexer = TupleIndexer {
attribute: self.attribute,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
let indexer = MapIndexer {
attribute: self.attribute,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "struct variant" })
}
}
pub struct SeqIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}
pub struct MapIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeMap for MapIndexer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let text = key.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}
pub struct StructSerializer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let key_text = key.to_owned();
let value_text = value.serialize(ConvertToString)?;
self.texts.push(key_text);
self.texts.push(value_text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}
pub struct TupleIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}


@ -0,0 +1,131 @@
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "$ty" })
}
)*
}
}
mod convert_to_number;
mod convert_to_string;
mod deserializer;
mod extract_document_id;
mod indexer;
mod serializer;
pub use self::deserializer::{Deserializer, DeserializerError};
pub use self::extract_document_id::{extract_document_id, compute_document_id, value_to_string};
pub use self::convert_to_string::ConvertToString;
pub use self::convert_to_number::ConvertToNumber;
pub use self::indexer::Indexer;
pub use self::serializer::Serializer;
use std::collections::BTreeMap;
use std::{fmt, error::Error};
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use rmp_serde::encode::Error as RmpError;
use serde_json::Error as SerdeJsonError;
use serde::ser;
use crate::number::ParseNumberError;
#[derive(Debug)]
pub enum SerializerError {
DocumentIdNotFound,
InvalidDocumentIdType,
RmpError(RmpError),
RocksDbError(rocksdb::Error),
SerdeJsonError(SerdeJsonError),
ParseNumberError(ParseNumberError),
UnserializableType { type_name: &'static str },
UnindexableType { type_name: &'static str },
UnrankableType { type_name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::DocumentIdNotFound => {
write!(f, "serialized document does not have an id according to the schema")
},
SerializerError::InvalidDocumentIdType => {
write!(f, "document identifier can only be of type string or number")
},
SerializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e),
SerializerError::RocksDbError(e) => write!(f, "RocksDB related error: {}", e),
SerializerError::SerdeJsonError(e) => write!(f, "serde json error: {}", e),
SerializerError::ParseNumberError(e) => {
write!(f, "error while trying to parse a number: {}", e)
},
SerializerError::UnserializableType { type_name } => {
write!(f, "{} are not a serializable type", type_name)
},
SerializerError::UnindexableType { type_name } => {
write!(f, "{} are not an indexable type", type_name)
},
SerializerError::UnrankableType { type_name } => {
write!(f, "{} types can not be used for ranking", type_name)
},
SerializerError::Custom(s) => f.write_str(s),
}
}
}
impl Error for SerializerError {}
impl From<String> for SerializerError {
fn from(value: String) -> SerializerError {
SerializerError::Custom(value)
}
}
impl From<RmpError> for SerializerError {
fn from(error: RmpError) -> SerializerError {
SerializerError::RmpError(error)
}
}
impl From<SerdeJsonError> for SerializerError {
fn from(error: SerdeJsonError) -> SerializerError {
SerializerError::SerdeJsonError(error)
}
}
impl From<rocksdb::Error> for SerializerError {
fn from(error: rocksdb::Error) -> SerializerError {
SerializerError::RocksDbError(error)
}
}
impl From<ParseNumberError> for SerializerError {
fn from(error: ParseNumberError) -> SerializerError {
SerializerError::ParseNumberError(error)
}
}
pub struct RamDocumentStore(BTreeMap<(DocumentId, SchemaAttr), Vec<u8>>);
impl RamDocumentStore {
pub fn new() -> RamDocumentStore {
RamDocumentStore(BTreeMap::new())
}
pub fn set_document_field(&mut self, id: DocumentId, attr: SchemaAttr, value: Vec<u8>) {
self.0.insert((id, attr), value);
}
pub fn into_inner(self) -> BTreeMap<(DocumentId, SchemaAttr), Vec<u8>> {
self.0
}
}
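
RamDocumentStore is just an ordered, in-memory buffer keyed by (document id, attribute): during an update every serialized field is pushed into it, and the whole map is drained afterwards with into_inner. A minimal sketch:

```rust
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;

fn demo() {
    let mut store = RamDocumentStore::new();

    // Buffer one MessagePack-encoded field for document 1, attribute 0.
    let encoded = rmp_serde::to_vec_named(&"hello").unwrap();
    store.set_document_field(DocumentId(1), SchemaAttr::new(0), encoded);

    // The accumulated fields are then taken out in one batch.
    let fields = store.into_inner();
    assert_eq!(fields.len(), 1);
}
```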


@ -0,0 +1,287 @@
use meilidb_core::DocumentId;
use meilidb_schema::Schema;
use serde::ser;
use crate::indexer::Indexer as RawIndexer;
use crate::ranked_map::RankedMap;
use super::{RamDocumentStore, SerializerError, ConvertToString, ConvertToNumber, Indexer};
pub struct Serializer<'a> {
pub schema: &'a Schema,
pub document_store: &'a mut RamDocumentStore,
pub indexer: &'a mut RawIndexer,
pub ranked_map: &'a mut RankedMap,
pub document_id: DocumentId,
}
impl<'a> ser::Serializer for Serializer<'a> {
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapSerializer<'a>;
type SerializeStruct = StructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer {
schema: self.schema,
document_id: self.document_id,
document_store: self.document_store,
indexer: self.indexer,
ranked_map: self.ranked_map,
current_key_name: None,
})
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
document_id: self.document_id,
document_store: self.document_store,
indexer: self.indexer,
ranked_map: self.ranked_map,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "struct variant" })
}
}
pub struct MapSerializer<'a> {
schema: &'a Schema,
document_id: DocumentId,
document_store: &'a mut RamDocumentStore,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
current_key_name: Option<String>,
}
impl<'a> ser::SerializeMap for MapSerializer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let key = key.serialize(ConvertToString)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V,
) -> Result<(), Self::Error>
where K: ser::Serialize, V: ser::Serialize,
{
let key = key.serialize(ConvertToString)?;
serialize_value(
self.schema,
self.document_id,
self.document_store,
self.indexer,
self.ranked_map,
&key,
value,
)
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
pub struct StructSerializer<'a> {
schema: &'a Schema,
document_id: DocumentId,
document_store: &'a mut RamDocumentStore,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where T: ser::Serialize,
{
serialize_value(
self.schema,
self.document_id,
self.document_store,
self.indexer,
self.ranked_map,
key,
value,
)
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
fn serialize_value<T: ?Sized>(
schema: &Schema,
document_id: DocumentId,
document_store: &mut RamDocumentStore,
indexer: &mut RawIndexer,
ranked_map: &mut RankedMap,
key: &str,
value: &T,
) -> Result<(), SerializerError>
where T: ser::Serialize,
{
if let Some(attribute) = schema.attribute(key) {
let props = schema.props(attribute);
let serialized = rmp_serde::to_vec_named(value)?;
document_store.set_document_field(document_id, attribute, serialized);
if props.is_indexed() {
let indexer = Indexer { attribute, indexer, document_id };
value.serialize(indexer)?;
}
if props.is_ranked() {
let number = value.serialize(ConvertToNumber)?;
ranked_map.insert(document_id, attribute, number);
}
}
Ok(())
}
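
serialize_value is where the schema decides a field's fate: any attribute known to the schema is stored as MessagePack, indexed attributes additionally go through the Indexer, ranked attributes are converted to a Number for the RankedMap, and unknown fields are silently skipped. A small sketch of the property checks driving that routing, with a schema similar to the tests further below:

```rust
use meilidb_schema::{SchemaBuilder, DISPLAYED, INDEXED, RANKED};

fn demo() {
    let mut builder = SchemaBuilder::with_identifier("objectId");
    builder.new_attribute("title", DISPLAYED | INDEXED);
    builder.new_attribute("popularity", RANKED);
    let schema = builder.build();

    // Mirrors the checks made in serialize_value above.
    let attr = schema.attribute("title").unwrap();
    let props = schema.props(attr);
    assert!(props.is_indexed() && !props.is_ranked());

    // A field that is not declared in the schema is ignored entirely.
    assert!(schema.attribute("unknown").is_none());
}
```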


@ -0,0 +1,96 @@
use std::sync::atomic::{AtomicBool, Ordering::Relaxed};
use std::sync::Arc;
use serde_json::json;
use meilidb_data::Database;
use meilidb_schema::{Schema, SchemaBuilder, DISPLAYED, INDEXED};
fn simple_schema() -> Schema {
let mut builder = SchemaBuilder::with_identifier("objectId");
builder.new_attribute("objectId", DISPLAYED | INDEXED);
builder.new_attribute("title", DISPLAYED | INDEXED);
builder.build()
}
#[test]
fn insert_delete_document() {
let tmp_dir = tempfile::tempdir().unwrap();
let database = Database::open(&tmp_dir).unwrap();
let as_been_updated = Arc::new(AtomicBool::new(false));
let schema = simple_schema();
let index = database.create_index("hello", schema).unwrap();
let as_been_updated_clone = as_been_updated.clone();
index.set_update_callback(move |_| as_been_updated_clone.store(true, Relaxed));
let doc1 = json!({ "objectId": 123, "title": "hello" });
let mut addition = index.documents_addition();
addition.update_document(&doc1);
let update_id = addition.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(as_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
assert_eq!(index.number_of_documents(), 1);
let docs = index.query_builder().query("hello", 0..10).unwrap();
assert_eq!(docs.len(), 1);
assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc1));
let mut deletion = index.documents_deletion();
deletion.delete_document(&doc1).unwrap();
let update_id = deletion.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(as_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
assert_eq!(index.number_of_documents(), 0);
let docs = index.query_builder().query("hello", 0..10).unwrap();
assert_eq!(docs.len(), 0);
}
#[test]
fn replace_document() {
let tmp_dir = tempfile::tempdir().unwrap();
let database = Database::open(&tmp_dir).unwrap();
let as_been_updated = Arc::new(AtomicBool::new(false));
let schema = simple_schema();
let index = database.create_index("hello", schema).unwrap();
let as_been_updated_clone = as_been_updated.clone();
index.set_update_callback(move |_| as_been_updated_clone.store(true, Relaxed));
let doc1 = json!({ "objectId": 123, "title": "hello" });
let doc2 = json!({ "objectId": 123, "title": "coucou" });
let mut addition = index.documents_addition();
addition.update_document(&doc1);
let update_id = addition.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(as_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
assert_eq!(index.number_of_documents(), 1);
let docs = index.query_builder().query("hello", 0..10).unwrap();
assert_eq!(docs.len(), 1);
assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc1));
let mut addition = index.documents_addition();
addition.update_document(&doc2);
let update_id = addition.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(as_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
assert_eq!(index.number_of_documents(), 1);
let docs = index.query_builder().query("hello", 0..10).unwrap();
assert_eq!(docs.len(), 0);
let docs = index.query_builder().query("coucou", 0..10).unwrap();
assert_eq!(docs.len(), 1);
assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc2));
}

meilidb-schema/Cargo.toml (new file, 12 lines)

@ -0,0 +1,12 @@
[package]
name = "meilidb-schema"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
bincode = "1.1.2"
linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
serde = { version = "1.0.91", features = ["derive"] }
serde_json = { version = "1.0.39", features = ["preserve_order"] }
toml = { version = "0.5.0", features = ["preserve_order"] }


@ -5,34 +5,37 @@ use std::{fmt, u16};
use std::ops::BitOr;
use std::sync::Arc;
use serde_derive::{Serialize, Deserialize};
use serde::{Serialize, Deserialize};
use linked_hash_map::LinkedHashMap;
use serde::Serialize;
use crate::database::serde::find_id::FindDocumentIdSerializer;
use crate::database::serde::SerializerError;
use crate::DocumentId;
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false };
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true };
pub const DISPLAYED: SchemaProps = SchemaProps { displayed: true, indexed: false, ranked: false };
pub const INDEXED: SchemaProps = SchemaProps { displayed: false, indexed: true, ranked: false };
pub const RANKED: SchemaProps = SchemaProps { displayed: false, indexed: false, ranked: true };
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps {
#[serde(default)]
stored: bool,
displayed: bool,
#[serde(default)]
indexed: bool,
#[serde(default)]
ranked: bool,
}
impl SchemaProps {
pub fn is_stored(self) -> bool {
self.stored
pub fn is_displayed(self) -> bool {
self.displayed
}
pub fn is_indexed(self) -> bool {
self.indexed
}
pub fn is_ranked(self) -> bool {
self.ranked
}
}
impl BitOr for SchemaProps {
@ -40,8 +43,9 @@ impl BitOr for SchemaProps {
fn bitor(self, other: Self) -> Self::Output {
SchemaProps {
stored: self.stored | other.stored,
displayed: self.displayed | other.displayed,
indexed: self.indexed | other.indexed,
ranked: self.ranked | other.ranked,
}
}
}
@ -95,14 +99,14 @@ struct InnerSchema {
}
impl Schema {
pub fn from_toml<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
pub fn from_toml<R: Read>(mut reader: R) -> Result<Schema, Box<dyn Error>> {
let mut buffer = Vec::new();
reader.read_to_end(&mut buffer)?;
let builder: SchemaBuilder = toml::from_slice(&buffer)?;
Ok(builder.build())
}
pub fn to_toml<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
pub fn to_toml<W: Write>(&self, mut writer: W) -> Result<(), Box<dyn Error>> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
@ -113,12 +117,29 @@ impl Schema {
Ok(())
}
pub(crate) fn read_from_bin<R: Read>(reader: R) -> bincode::Result<Schema> {
pub fn from_json<R: Read>(mut reader: R) -> Result<Schema, Box<dyn Error>> {
let mut buffer = Vec::new();
reader.read_to_end(&mut buffer)?;
let builder: SchemaBuilder = serde_json::from_slice(&buffer)?;
Ok(builder.build())
}
pub fn to_json<W: Write>(&self, mut writer: W) -> Result<(), Box<dyn Error>> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
let string = serde_json::to_string_pretty(&builder)?;
writer.write_all(string.as_bytes())?;
Ok(())
}
pub fn read_from_bin<R: Read>(reader: R) -> bincode::Result<Schema> {
let builder: SchemaBuilder = bincode::deserialize_from(reader)?;
Ok(builder.build())
}
pub(crate) fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
pub fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
@ -141,14 +162,6 @@ impl Schema {
attributes
}
pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
where T: Serialize,
{
let id_attribute_name = &self.inner.identifier;
let serializer = FindDocumentIdSerializer { id_attribute_name };
document.serialize(serializer)
}
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
let (_, props) = self.inner.props[attr.0 as usize];
props
@ -166,18 +179,31 @@ impl Schema {
let (name, _) = &self.inner.props[attr.0 as usize];
name
}
pub fn iter<'a>(&'a self) -> impl Iterator<Item=(&str, SchemaAttr, SchemaProps)> + 'a {
self.inner.props.iter()
.map(move |(name, prop)| {
let attr = self.inner.attrs.get(name).unwrap();
(name.as_str(), *attr, *prop)
})
}
}
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
pub struct SchemaAttr(pub(crate) u16);
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct SchemaAttr(pub u16);
impl SchemaAttr {
pub fn new(value: u16) -> SchemaAttr {
pub const fn new(value: u16) -> SchemaAttr {
SchemaAttr(value)
}
pub fn min() -> SchemaAttr {
SchemaAttr(0)
pub const fn min() -> SchemaAttr {
SchemaAttr(u16::min_value())
}
pub const fn max() -> SchemaAttr {
SchemaAttr(u16::max_value())
}
pub fn next(self) -> Option<SchemaAttr> {
@ -187,10 +213,6 @@ impl SchemaAttr {
pub fn prev(self) -> Option<SchemaAttr> {
self.0.checked_sub(1).map(SchemaAttr)
}
pub fn max() -> SchemaAttr {
SchemaAttr(u16::MAX)
}
}
impl fmt::Display for SchemaAttr {
@ -207,8 +229,8 @@ mod tests {
#[test]
fn serialize_deserialize() -> bincode::Result<()> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
@ -223,10 +245,10 @@ mod tests {
}
#[test]
fn serialize_deserialize_toml() -> Result<(), Box<Error>> {
fn serialize_deserialize_toml() -> Result<(), Box<dyn Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
@ -240,10 +262,10 @@ mod tests {
identifier = "id"
[attributes."alpha"]
stored = true
displayed = true
[attributes."beta"]
stored = true
displayed = true
indexed = true
[attributes."gamma"]
@ -254,4 +276,40 @@ mod tests {
Ok(())
}
#[test]
fn serialize_deserialize_json() -> Result<(), Box<dyn Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut buffer = Vec::new();
schema.to_json(&mut buffer)?;
let schema2 = Schema::from_json(buffer.as_slice())?;
assert_eq!(schema, schema2);
let data = r#"
{
"identifier": "id",
"attributes": {
"alpha": {
"displayed": true
},
"beta": {
"displayed": true,
"indexed": true
},
"gamma": {
"indexed": true
}
}
}"#;
let schema2 = Schema::from_json(data.as_bytes())?;
assert_eq!(schema, schema2);
Ok(())
}
}
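
Two details worth noting from the new API, as a sketch under the assumption (implied by the props indexing above) that attributes are numbered in declaration order: the now-public SchemaAttr(u16) is that declaration index, and iter() walks the attributes back in the same order with their properties.

```rust
use meilidb_schema::{SchemaAttr, SchemaBuilder, DISPLAYED, INDEXED, RANKED};

fn demo() {
    let mut builder = SchemaBuilder::with_identifier("id");
    builder.new_attribute("alpha", DISPLAYED);
    builder.new_attribute("beta", DISPLAYED | INDEXED);
    builder.new_attribute("gamma", RANKED);
    let schema = builder.build();

    // Attribute numbers follow the declaration order, starting at 0.
    assert_eq!(schema.attribute("alpha"), Some(SchemaAttr::new(0)));
    assert_eq!(schema.attribute("gamma"), Some(SchemaAttr::new(2)));

    // iter() yields (name, attr, props) in that same order.
    let names: Vec<_> = schema.iter().map(|(name, _, _)| name).collect();
    assert_eq!(names, ["alpha", "beta", "gamma"]);
}
```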


@ -0,0 +1,8 @@
[package]
name = "meilidb-tokenizer"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
slice-group-by = "0.2.4"


@ -0,0 +1,295 @@
use std::iter::Peekable;
use slice_group_by::StrGroupBy;
use self::SeparatorCategory::*;
pub fn is_cjk(c: char) -> bool {
(c >= '\u{2e80}' && c <= '\u{2eff}') ||
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
(c >= '\u{3040}' && c <= '\u{309f}') ||
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
(c >= '\u{3100}' && c <= '\u{312f}') ||
(c >= '\u{3200}' && c <= '\u{32ff}') ||
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
(c >= '\u{f900}' && c <= '\u{faff}')
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum SeparatorCategory {
Soft,
Hard,
}
impl SeparatorCategory {
fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
if let (Soft, Soft) = (self, other) { Soft } else { Hard }
}
fn to_usize(self) -> usize {
match self {
Soft => 1,
Hard => 8,
}
}
}
fn is_separator(c: char) -> bool {
classify_separator(c).is_some()
}
fn classify_separator(c: char) -> Option<SeparatorCategory> {
match c {
' ' | '\'' | '"' => Some(Soft),
'.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Hard),
_ => None,
}
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum CharCategory {
Separator(SeparatorCategory),
Cjk,
Other,
}
fn classify_char(c: char) -> CharCategory {
if let Some(category) = classify_separator(c) {
CharCategory::Separator(category)
} else if is_cjk(c) {
CharCategory::Cjk
} else {
CharCategory::Other
}
}
fn is_str_word(s: &str) -> bool {
!s.chars().any(is_separator)
}
fn same_group_category(a: char, b: char) -> bool {
match (classify_char(a), classify_char(b)) {
(CharCategory::Cjk, _) | (_, CharCategory::Cjk) => false,
(CharCategory::Separator(_), CharCategory::Separator(_)) => true,
(a, b) => a == b,
}
}
// fold the number of chars along with the index position
fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, usize) {
(n + 1, i + c.len_utf8())
}
pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
Tokenizer::new(query).map(|t| t.word)
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct Token<'a> {
pub word: &'a str,
pub word_index: usize,
pub char_index: usize,
}
pub struct Tokenizer<'a> {
inner: &'a str,
word_index: usize,
char_index: usize,
}
impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer {
// skip every separator and set `char_index`
// to the number of char trimmed
let (count, index) = string.char_indices()
.take_while(|(_, c)| is_separator(*c))
.fold((0, 0), chars_count_index);
Tokenizer {
inner: &string[index..],
word_index: 0,
char_index: count,
}
}
}
impl<'a> Iterator for Tokenizer<'a> {
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
let mut iter = self.inner.linear_group_by(same_group_category).peekable();
while let (Some(string), next_string) = (iter.next(), iter.peek()) {
let (count, index) = string.char_indices().fold((0, 0), chars_count_index);
if !is_str_word(string) {
self.word_index += string.chars()
.filter_map(classify_separator)
.fold(Soft, |a, x| a.merge(x))
.to_usize();
self.char_index += count;
self.inner = &self.inner[index..];
continue;
}
let token = Token {
word: string,
word_index: self.word_index,
char_index: self.char_index,
};
if next_string.filter(|s| is_str_word(s)).is_some() {
self.word_index += 1;
}
self.char_index += count;
self.inner = &self.inner[index..];
return Some(token);
}
self.inner = "";
None
}
}
pub struct SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
{
inner: I,
current: Option<Peekable<Tokenizer<'a>>>,
word_offset: usize,
char_offset: usize,
}
impl<'a, I> SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
{
pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
let current = iter.next().map(|s| Tokenizer::new(s).peekable());
SeqTokenizer {
inner: iter,
current: current,
word_offset: 0,
char_offset: 0,
}
}
}
impl<'a, I> Iterator for SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
{
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
match &mut self.current {
Some(current) => {
match current.next() {
Some(token) => {
// we must apply the word and char offsets
// to the token before returning it
let token = Token {
word: token.word,
word_index: token.word_index + self.word_offset,
char_index: token.char_index + self.char_offset,
};
// if this is the last iteration on this text
// we must save the offsets for next texts
if current.peek().is_none() {
let hard_space = SeparatorCategory::Hard.to_usize();
self.word_offset = token.word_index + hard_space;
self.char_offset = token.char_index + hard_space;
}
Some(token)
},
None => {
// no more words in this text we must
// start tokenizing the next text
self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
self.next()
},
}
},
// no more texts available
None => None,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn easy() {
let mut tokenizer = Tokenizer::new("salut");
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_kanjis() {
let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
assert_eq!(tokenizer.next(), None);
}
}
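
The tests above only exercise Tokenizer; SeqTokenizer is the multi-text variant: it carries the word and char offsets across texts and separates consecutive texts by a hard-separator gap of 8 word positions. A small sketch:

```rust
fn demo() {
    let texts = vec!["hello world", "bonjour"];
    let tokens: Vec<_> = SeqTokenizer::new(texts.into_iter()).collect();

    assert_eq!(tokens[1].word, "world");
    assert_eq!(tokens[1].word_index, 1);

    // "bonjour" starts the next text: its word index jumps by a hard separator (8).
    assert_eq!(tokens[2].word, "bonjour");
    assert_eq!(tokens[2].word_index, 1 + 8);
}
```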

meilidb/Cargo.toml (new file, 28 lines)

@ -0,0 +1,28 @@
[package]
edition = "2018"
name = "meilidb"
version = "0.3.1"
authors = ["Kerollmops <renault.cle@gmail.com>"]
[dependencies]
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
meilidb-data = { path = "../meilidb-data", version = "0.1.0" }
meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
[dev-dependencies]
csv = "1.0.7"
diskus = "0.5.0"
env_logger = "0.6.1"
jemallocator = "0.1.9"
linked-hash-map = "0.5.2"
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
quickcheck = "0.8.2"
rand = "0.6.5"
rand_xorshift = "0.1.1"
rustyline = { version = "5.0.0", default-features = false }
serde = { version = "1.0.91" , features = ["derive"] }
serde_json = "1.0.39"
structopt = "0.2.15"
sysinfo = "0.8.4"
tempfile = "3.0.7"
termcolor = "1.0.4"


@ -0,0 +1,214 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::{HashMap, HashSet};
use std::io::{self, BufRead, BufReader};
use std::path::{Path, PathBuf};
use std::time::Instant;
use std::error::Error;
use std::fs::File;
use diskus::Walk;
use sysinfo::{SystemExt, ProcessExt};
use serde::{Serialize, Deserialize};
use structopt::StructOpt;
use meilidb_data::Database;
use meilidb_schema::Schema;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created.
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// The csv file to index.
#[structopt(parse(from_os_str))]
pub csv_data_path: PathBuf,
/// The path to the schema.
#[structopt(long = "schema", parse(from_os_str))]
pub schema_path: PathBuf,
/// The file with the synonyms.
#[structopt(long = "synonyms", parse(from_os_str))]
pub synonyms: Option<PathBuf>,
/// The path to the list of stop words (one by line).
#[structopt(long = "stop-words", parse(from_os_str))]
pub stop_words: Option<PathBuf>,
#[structopt(long = "update-group-size")]
pub update_group_size: Option<usize>,
}
#[derive(Serialize, Deserialize)]
struct Document (
HashMap<String, String>
);
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Synonym {
OneWay(SynonymOneWay),
MultiWay { synonyms: Vec<String> },
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SynonymOneWay {
pub search_terms: String,
pub synonyms: Synonyms,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Synonyms {
Multiple(Vec<String>),
Single(String),
}
fn read_synomys(path: &Path) -> Result<Vec<Synonym>, Box<dyn Error>> {
let file = File::open(path)?;
let synonyms = serde_json::from_reader(file)?;
Ok(synonyms)
}
fn index(
schema: Schema,
database_path: &Path,
csv_data_path: &Path,
update_group_size: Option<usize>,
stop_words: &HashSet<String>,
synonyms: Vec<Synonym>,
) -> Result<Database, Box<dyn Error>>
{
let database = Database::open(database_path)?;
let mut wtr = csv::Writer::from_path("./stats.csv").unwrap();
wtr.write_record(&["NumberOfDocuments", "DiskUsed", "MemoryUsed"])?;
let mut system = sysinfo::System::new();
let index = database.create_index("test", schema.clone())?;
let mut synonyms_adder = index.synonyms_addition();
for synonym in synonyms {
match synonym {
Synonym::OneWay(SynonymOneWay { search_terms, synonyms }) => {
let alternatives = match synonyms {
Synonyms::Multiple(alternatives) => alternatives,
Synonyms::Single(alternative) => vec![alternative],
};
synonyms_adder.add_synonym(search_terms, alternatives);
},
Synonym::MultiWay { mut synonyms } => {
for _ in 0..synonyms.len() {
if let Some((synonym, alternatives)) = synonyms.split_first() {
synonyms_adder.add_synonym(synonym, alternatives);
}
synonyms.rotate_left(1);
}
},
}
}
synonyms_adder.finalize()?;
let mut rdr = csv::Reader::from_path(csv_data_path)?;
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();
let mut i = 0;
let mut end_of_file = false;
while !end_of_file {
let mut update = index.documents_addition();
loop {
end_of_file = !rdr.read_record(&mut raw_record)?;
if end_of_file { break }
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
}
};
update.update_document(document);
print!("\rindexing document {}", i);
i += 1;
if let Some(group_size) = update_group_size {
if i % group_size == 0 { break }
}
}
println!();
println!("committing update...");
update.finalize()?;
// write stats
let directory_size = Walk::new(&[database_path.to_owned()], 4).run();
system.refresh_all();
let memory = system.get_process(sysinfo::get_current_pid()).unwrap().memory(); // in kb
wtr.write_record(&[i.to_string(), directory_size.to_string(), memory.to_string()])?;
wtr.flush()?;
}
Ok(database)
}
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
let f = File::open(path)?;
let reader = BufReader::new(f);
let mut words = HashSet::new();
for line in reader.lines() {
let line = line?;
let word = line.trim().to_string();
words.insert(word);
}
Ok(words)
}
fn main() -> Result<(), Box<dyn Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let schema = {
let file = File::open(&opt.schema_path)?;
Schema::from_toml(file)?
};
let stop_words = match opt.stop_words {
Some(ref path) => retrieve_stop_words(path)?,
None => HashSet::new(),
};
let synonyms = match opt.synonyms {
Some(ref path) => read_synomys(path)?,
None => Vec::new(),
};
let start = Instant::now();
let result = index(
schema,
&opt.database_path,
&opt.csv_data_path,
opt.update_group_size,
&stop_words,
synonyms,
);
if let Err(e) = result {
return Err(e.into())
}
println!("database created in {:.2?} at: {:?}", start.elapsed(), opt.database_path);
Ok(())
}
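
For reference, the file given to --synonyms is parsed into the untagged Synonym enum above, so a one-way entry (searchTerms plus synonyms) and a plain multi-way list can be mixed in the same JSON array. A hedged sketch of such a file, checked against the same derives (the example terms are made up):

```rust
fn demo() {
    // Hypothetical content for the file passed through --synonyms.
    let data = r#"[
        { "searchTerms": "nyc", "synonyms": ["new york", "new york city"] },
        { "synonyms": ["street", "st"] }
    ]"#;

    let synonyms: Vec<Synonym> = serde_json::from_str(data).unwrap();
    assert_eq!(synonyms.len(), 2);
    match &synonyms[0] {
        Synonym::OneWay(one_way) => assert_eq!(one_way.search_terms, "nyc"),
        _ => panic!("expected a one-way synonym"),
    }
}
```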


@ -0,0 +1,229 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::btree_map::{BTreeMap, Entry};
use std::collections::HashSet;
use std::error::Error;
use std::io::{self, Write};
use std::iter::FromIterator;
use std::path::PathBuf;
use std::time::{Instant, Duration};
use linked_hash_map::LinkedHashMap;
use rustyline::{Editor, Config};
use structopt::StructOpt;
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use meilidb_core::Highlight;
use meilidb_data::Database;
use meilidb_schema::SchemaAttr;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
#[structopt(long = "fetch-timeout-ms")]
pub fetch_timeout_ms: Option<u64>,
/// Fields that must be displayed.
pub displayed_fields: Vec<String>,
/// The number of returned results
#[structopt(short = "n", long = "number-results", default_value = "10")]
pub number_results: usize,
/// The number of characters before and after the first match
#[structopt(short = "C", long = "context", default_value = "35")]
pub char_context: usize,
}
type Document = LinkedHashMap<String, String>;
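/// Write `text` to stdout, coloring in yellow every other area delimited by
/// the byte offsets in `ranges`.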
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
let mut stdout = StandardStream::stdout(ColorChoice::Always);
let mut highlighted = false;
for range in ranges.windows(2) {
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
if highlighted {
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
}
write!(&mut stdout, "{}", &text[start..end])?;
stdout.reset()?;
highlighted = !highlighted;
}
Ok(())
}
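/// Convert a character-based `(index, length)` range into the corresponding
/// byte-based `(index, length)` range inside `text`.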
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
let mut byte_index = 0;
let mut byte_length = 0;
for (n, (i, c)) in text.char_indices().enumerate() {
if n == index {
byte_index = i;
}
if n + 1 == index + length {
byte_length = i - byte_index + c.len_utf8();
break;
}
}
(byte_index, byte_length)
}
fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
let mut byte_indexes = BTreeMap::new();
for highlight in highlights {
let char_index = highlight.char_index as usize;
let char_length = highlight.char_length as usize;
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
match byte_indexes.entry(byte_index) {
Entry::Vacant(entry) => { entry.insert(byte_length); },
Entry::Occupied(mut entry) => {
if *entry.get() < byte_length {
entry.insert(byte_length);
}
},
}
}
let mut title_areas = Vec::new();
title_areas.push(0);
for (byte_index, length) in byte_indexes {
title_areas.push(byte_index);
title_areas.push(byte_index + length);
}
title_areas.push(text.len());
title_areas.sort_unstable();
title_areas
}
/// note: the highlights must have been sorted by `char_index` and `char_length` before being passed.
///
/// ```no_run
/// doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
///
/// let highlights = doc.highlights.iter().filter(|m| SchemaAttr::new(m.attribute) == attr).cloned();
///
/// let (text, highlights) = crop_text(&text, highlights, 35);
/// ```
fn crop_text(
text: &str,
highlights: impl IntoIterator<Item=Highlight>,
context: usize,
) -> (String, Vec<Highlight>)
{
let mut highlights = highlights.into_iter().peekable();
let char_index = highlights.peek().map(|m| m.char_index as usize).unwrap_or(0);
let start = char_index.saturating_sub(context);
let text = text.chars().skip(start).take(context * 2).collect();
let highlights = highlights
.take_while(|m| {
(m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
})
.map(|highlight| {
Highlight { char_index: highlight.char_index - start as u16, ..highlight }
})
.collect();
(text, highlights)
}
fn main() -> Result<(), Box<dyn Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let start = Instant::now();
let database = Database::open(&opt.database_path)?;
let index = database.open_index("test")?.unwrap();
let schema = index.schema();
println!("database prepared for you in {:.2?}", start.elapsed());
let fields = opt.displayed_fields.iter().map(String::as_str);
let fields = HashSet::from_iter(fields);
let config = Config::builder().auto_add_history(true).build();
let mut readline = Editor::<()>::with_config(config);
let _ = readline.load_history("query-history.txt");
for result in readline.iter("Searching for: ") {
match result {
Ok(query) => {
let start_total = Instant::now();
let builder = match opt.fetch_timeout_ms {
Some(timeout_ms) => {
let timeout = Duration::from_millis(timeout_ms);
index.query_builder().with_fetch_timeout(timeout)
},
None => index.query_builder(),
};
let documents = builder.query(&query, 0..opt.number_results)?;
let mut retrieve_duration = Duration::default();
let number_of_documents = documents.len();
for mut doc in documents {
doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
let start_retrieve = Instant::now();
let result = index.document::<Document>(Some(&fields), doc.id);
retrieve_duration += start_retrieve.elapsed();
match result {
Ok(Some(document)) => {
for (name, text) in document {
print!("{}: ", name);
let attr = schema.attribute(&name).unwrap();
let highlights = doc.highlights.iter()
.filter(|m| SchemaAttr::new(m.attribute) == attr)
.cloned();
let (text, highlights) = crop_text(&text, highlights, opt.char_context);
let areas = create_highlight_areas(&text, &highlights);
display_highlights(&text, &areas)?;
println!();
}
},
Ok(None) => eprintln!("missing document"),
Err(e) => eprintln!("{}", e),
}
let mut matching_attributes = HashSet::new();
for highlight in doc.highlights {
let attr = SchemaAttr::new(highlight.attribute);
let name = schema.attribute_name(attr);
matching_attributes.insert(name);
}
let matching_attributes = Vec::from_iter(matching_attributes);
println!("matching in: {:?}", matching_attributes);
println!();
}
eprintln!("document field retrieve took {:.2?}", retrieve_duration);
eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
},
Err(err) => {
println!("Error: {:?}", err);
break
}
}
}
readline.save_history("query-history.txt").unwrap();
Ok(())
}

meilidb/src/lib.rs Normal file

@ -0,0 +1,3 @@
mod sort_by_attr;
pub use self::sort_by_attr::SortByAttr;

meilidb/src/sort_by_attr.rs Normal file

@ -0,0 +1,125 @@
use std::cmp::Ordering;
use std::error::Error;
use std::fmt;
use meilidb_core::{criterion::Criterion, RawDocument};
use meilidb_data::RankedMap;
use meilidb_schema::{Schema, SchemaAttr};
/// A helper struct that permits sorting documents by
/// one of their stored attributes.
///
/// # Note
///
/// If a document cannot be deserialized it will be considered [`None`][].
///
/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`,
/// so you should check how [`Ord`] is implemented for `Option`.
///
/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None
/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
///
/// # Example
///
/// ```ignore
/// use serde_derive::Deserialize;
/// use meilidb::rank::criterion::*;
///
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
///
/// let builder = CriteriaBuilder::with_capacity(8)
/// .add(SumOfTypos)
/// .add(NumberOfWords)
/// .add(WordsProximity)
/// .add(SumOfWordsAttribute)
/// .add(SumOfWordsPosition)
/// .add(Exact)
/// .add(custom_ranking)
/// .add(DocumentId);
///
/// let criterion = builder.build();
///
/// ```
pub struct SortByAttr<'a> {
ranked_map: &'a RankedMap,
attr: SchemaAttr,
reversed: bool,
}
impl<'a> SortByAttr<'a> {
pub fn lower_is_better(
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
) -> Result<SortByAttr<'a>, SortByAttrError>
{
SortByAttr::new(ranked_map, schema, attr_name, false)
}
pub fn higher_is_better(
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
) -> Result<SortByAttr<'a>, SortByAttrError>
{
SortByAttr::new(ranked_map, schema, attr_name, true)
}
fn new(
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
reversed: bool,
) -> Result<SortByAttr<'a>, SortByAttrError>
{
let attr = match schema.attribute(attr_name) {
Some(attr) => attr,
None => return Err(SortByAttrError::AttributeNotFound),
};
if !schema.props(attr).is_ranked() {
return Err(SortByAttrError::AttributeNotRegisteredForRanking);
}
Ok(SortByAttr { ranked_map, attr, reversed })
}
}
impl<'a> Criterion for SortByAttr<'a> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = self.ranked_map.get(lhs.id, self.attr);
let rhs = self.ranked_map.get(rhs.id, self.attr);
match (lhs, rhs) {
(Some(lhs), Some(rhs)) => {
let order = lhs.cmp(&rhs);
if self.reversed { order.reverse() } else { order }
},
(None, Some(_)) => Ordering::Greater,
(Some(_), None) => Ordering::Less,
(None, None) => Ordering::Equal,
}
}
fn name(&self) -> &'static str {
"SortByAttr"
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SortByAttrError {
AttributeNotFound,
AttributeNotRegisteredForRanking,
}
impl fmt::Display for SortByAttrError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use SortByAttrError::*;
match self {
AttributeNotFound => f.write_str("attribute not found in the schema"),
AttributeNotRegisteredForRanking => f.write_str("attribute not registered for ranking"),
}
}
}
impl Error for SortByAttrError { }


@ -1,105 +0,0 @@
use std::fmt;
/// Represents an attribute number along with the word index
/// according to the tokenizer used.
///
/// It can accept up to 1024 attributes and word positions
/// can be at most 2^22.
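///
/// A rough illustration of the packing (the values are arbitrary):
///
/// ```ignore
/// // the attribute number occupies the 10 highest bits,
/// // the word index the 22 lowest ones
/// if let Ok(attr) = Attribute::new(3, 11) {
///     assert_eq!(attr.attribute(), 3);
///     assert_eq!(attr.word_index(), 11);
/// }
/// ```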
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Attribute(u32);
impl Attribute {
/// Construct an `Attribute` from an attribute number and
/// the word position of a match according to the tokenizer used.
pub(crate) fn new(attribute: u16, index: u32) -> Result<Attribute, AttributeError> {
if attribute & 0b1111_1100_0000_0000 != 0 {
return Err(AttributeError::AttributeTooBig)
}
if index & 0b1111_1111_1100_0000_0000_0000_0000_0000 != 0 {
return Err(AttributeError::IndexTooBig)
}
let attribute = u32::from(attribute) << 22;
Ok(Attribute(attribute | index))
}
/// Construct an `Attribute` from an attribute number and
/// the word position of a match according to the tokenizer used.
///
/// # Panics
///
/// The attribute must be lower than 1024
/// and the word index lower than 2^22.
pub(crate) fn new_faillible(attribute: u16, index: u32) -> Attribute {
match Attribute::new(attribute, index) {
Ok(attribute) => attribute,
Err(AttributeError::AttributeTooBig) => {
panic!("attribute must not be greater than 1024")
},
Err(AttributeError::IndexTooBig) => {
panic!("attribute word index must not be greater than 2^22")
},
}
}
pub(crate) fn max_value() -> Attribute {
Attribute(u32::max_value())
}
#[inline]
pub fn attribute(self) -> u16 {
(self.0 >> 22) as u16
}
#[inline]
pub fn word_index(self) -> u32 {
self.0 & 0b0000_0000_0011_1111_1111_1111_1111_1111
}
}
impl fmt::Debug for Attribute {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("Attribute")
.field("attribute", &self.attribute())
.field("word_index", &self.word_index())
.finish()
}
}
pub enum AttributeError {
AttributeTooBig,
IndexTooBig,
}
#[cfg(test)]
mod tests {
use super::*;
use quickcheck::{quickcheck, TestResult};
quickcheck! {
fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult {
if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) {
return TestResult::discard()
}
let attribute = Attribute::new_faillible(gen_attr, gen_index);
let valid_attribute = attribute.attribute() == gen_attr;
let valid_index = attribute.word_index() == gen_index;
TestResult::from_bool(valid_attribute && valid_index)
}
fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult {
if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) {
return TestResult::discard()
}
let a = Attribute::new_faillible(gen_attr, gen_index);
let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1);
TestResult::from_bool(a < b)
}
}
}


@ -1,91 +0,0 @@
use fst::Automaton;
use lazy_static::lazy_static;
use levenshtein_automata::{
LevenshteinAutomatonBuilder as LevBuilder,
DFA, Distance,
};
lazy_static! {
static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
}
pub struct DfaExt {
query_len: usize,
automaton: DFA,
}
impl Automaton for DfaExt {
type State = <DFA as Automaton>::State;
fn start(&self) -> Self::State {
self.automaton.start()
}
fn is_match(&self, state: &Self::State) -> bool {
self.automaton.is_match(state)
}
fn can_match(&self, state: &Self::State) -> bool {
self.automaton.can_match(state)
}
fn will_always_match(&self, state: &Self::State) -> bool {
self.automaton.will_always_match(state)
}
fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
self.automaton.accept(state, byte)
}
}
impl AutomatonExt for DfaExt {
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
self.automaton.eval(s)
}
fn query_len(&self) -> usize {
self.query_len
}
}
#[derive(Copy, Clone)]
enum PrefixSetting {
Prefix,
NoPrefix,
}
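// Pick the allowed edit distance from the query length (in bytes):
// 0 typos up to 4 bytes, 1 typo for 5 to 8 bytes, 2 typos beyond.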
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DfaExt {
use self::PrefixSetting::{Prefix, NoPrefix};
let dfa = match query.len() {
0 ..= 4 => match setting {
Prefix => LEVDIST0.build_prefix_dfa(query),
NoPrefix => LEVDIST0.build_dfa(query),
},
5 ..= 8 => match setting {
Prefix => LEVDIST1.build_prefix_dfa(query),
NoPrefix => LEVDIST1.build_dfa(query),
},
_ => match setting {
Prefix => LEVDIST2.build_prefix_dfa(query),
NoPrefix => LEVDIST2.build_dfa(query),
},
};
DfaExt { query_len: query.len(), automaton: dfa }
}
pub fn build_prefix_dfa(query: &str) -> DfaExt {
build_dfa_with_setting(query, PrefixSetting::Prefix)
}
pub fn build_dfa(query: &str) -> DfaExt {
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
}
pub trait AutomatonExt: Automaton {
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
fn query_len(&self) -> usize;
}


@ -1,26 +0,0 @@
use std::io::{self, BufReader, BufRead};
use std::collections::HashSet;
use std::path::Path;
use std::fs::File;
#[derive(Debug)]
pub struct CommonWords(HashSet<String>);
impl CommonWords {
pub fn from_file<P>(path: P) -> io::Result<Self>
where P: AsRef<Path>
{
let file = File::open(path)?;
let file = BufReader::new(file);
let mut set = HashSet::new();
for line in file.lines().filter_map(|l| l.ok()) {
let word = line.trim().to_owned();
set.insert(word);
}
Ok(CommonWords(set))
}
pub fn contains(&self, word: &str) -> bool {
self.0.contains(word)
}
}


@ -1,54 +0,0 @@
use std::io::{self, Cursor, BufRead};
use std::slice::from_raw_parts;
use std::mem::size_of;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use sdset::Set;
use crate::DocumentId;
use crate::data::SharedData;
use super::into_u8_slice;
#[derive(Default, Clone)]
pub struct DocIds(SharedData);
impl DocIds {
pub fn new(ids: &Set<DocumentId>) -> DocIds {
let bytes = unsafe { into_u8_slice(ids.as_slice()) };
let data = SharedData::from_bytes(bytes.to_vec());
DocIds(data)
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIds> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let doc_ids = cursor.get_ref().range(offset, len);
cursor.consume(len);
Ok(DocIds(doc_ids))
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let len = self.0.len() as u64;
bytes.write_u64::<LittleEndian>(len).unwrap();
bytes.extend_from_slice(&self.0);
}
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
pub fn as_bytes(&self) -> &[u8] {
&self.0
}
}
impl AsRef<Set<DocumentId>> for DocIds {
fn as_ref(&self) -> &Set<DocumentId> {
let slice = &self.0;
let ptr = slice.as_ptr() as *const DocumentId;
let len = slice.len() / size_of::<DocumentId>();
let slice = unsafe { from_raw_parts(ptr, len) };
Set::new_unchecked(slice)
}
}


@ -1,226 +0,0 @@
use std::io::{self, Write, Cursor, BufRead};
use std::slice::from_raw_parts;
use std::mem::size_of;
use std::ops::Index;
use std::sync::Arc;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use sdset::Set;
use crate::DocIndex;
use crate::data::SharedData;
use super::into_u8_slice;
#[derive(Debug)]
#[repr(C)]
struct Range {
start: u64,
end: u64,
}
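// `ranges` is reinterpreted as a packed `[Range]` slice: `ranges[i]` delimits
// the slice of the packed `[DocIndex]` data in `indexes` that belongs to the
// i-th inserted set.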
#[derive(Clone, Default)]
pub struct DocIndexes {
ranges: SharedData,
indexes: SharedData,
}
impl DocIndexes {
pub fn from_bytes(bytes: Vec<u8>) -> io::Result<DocIndexes> {
let bytes = Arc::new(bytes);
let len = bytes.len();
let data = SharedData::new(bytes, 0, len);
let mut cursor = Cursor::new(data);
DocIndexes::from_cursor(&mut cursor)
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIndexes> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let ranges = cursor.get_ref().range(offset, len);
cursor.consume(len);
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let indexes = cursor.get_ref().range(offset, len);
cursor.consume(len);
Ok(DocIndexes { ranges, indexes })
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let ranges_len = self.ranges.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(ranges_len);
bytes.extend_from_slice(&self.ranges);
let indexes_len = self.indexes.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
bytes.extend_from_slice(&self.indexes);
}
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
self.ranges().get(index).map(|Range { start, end }| {
let start = *start as usize;
let end = *end as usize;
let slice = &self.indexes()[start..end];
Set::new_unchecked(slice)
})
}
fn ranges(&self) -> &[Range] {
let slice = &self.ranges;
let ptr = slice.as_ptr() as *const Range;
let len = slice.len() / size_of::<Range>();
unsafe { from_raw_parts(ptr, len) }
}
fn indexes(&self) -> &[DocIndex] {
let slice = &self.indexes;
let ptr = slice.as_ptr() as *const DocIndex;
let len = slice.len() / size_of::<DocIndex>();
unsafe { from_raw_parts(ptr, len) }
}
}
impl Index<usize> for DocIndexes {
type Output = [DocIndex];
fn index(&self, index: usize) -> &Self::Output {
match self.get(index) {
Some(indexes) => indexes,
None => panic!("index {} out of range for a maximum of {} ranges", index, self.ranges().len()),
}
}
}
pub struct DocIndexesBuilder<W> {
ranges: Vec<Range>,
indexes: Vec<DocIndex>,
wtr: W,
}
impl DocIndexesBuilder<Vec<u8>> {
pub fn memory() -> Self {
DocIndexesBuilder {
ranges: Vec::new(),
indexes: Vec::new(),
wtr: Vec::new(),
}
}
}
impl<W: Write> DocIndexesBuilder<W> {
pub fn new(wtr: W) -> Self {
DocIndexesBuilder {
ranges: Vec::new(),
indexes: Vec::new(),
wtr: wtr,
}
}
pub fn insert(&mut self, indexes: &Set<DocIndex>) {
let len = indexes.len() as u64;
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
let range = Range { start, end: start + len };
self.ranges.push(range);
self.indexes.extend_from_slice(indexes);
}
pub fn finish(self) -> io::Result<()> {
self.into_inner().map(drop)
}
pub fn into_inner(mut self) -> io::Result<W> {
let ranges = unsafe { into_u8_slice(&self.ranges) };
let len = ranges.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(ranges)?;
let indexes = unsafe { into_u8_slice(&self.indexes) };
let len = indexes.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(indexes)?;
Ok(self.wtr)
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
use crate::{Attribute, WordArea};
use crate::DocumentId;
#[test]
fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex {
document_id: DocumentId(0),
attribute: Attribute::new_faillible(3, 11),
word_area: WordArea::new_faillible(30, 4)
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: Attribute::new_faillible(4, 21),
word_area: WordArea::new_faillible(35, 6)
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: Attribute::new_faillible(8, 2),
word_area: WordArea::new_faillible(89, 6)
};
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?);
let bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(bytes)?;
assert_eq!(docs.get(0), Some(Set::new(&[a])?));
assert_eq!(docs.get(1), Some(Set::new(&[a, b, c])?));
assert_eq!(docs.get(2), Some(Set::new(&[a, c])?));
assert_eq!(docs.get(3), None);
Ok(())
}
#[test]
fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex {
document_id: DocumentId(0),
attribute: Attribute::new_faillible(3, 11),
word_area: WordArea::new_faillible(30, 4)
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: Attribute::new_faillible(4, 21),
word_area: WordArea::new_faillible(35, 6)
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: Attribute::new_faillible(8, 2),
word_area: WordArea::new_faillible(89, 6)
};
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?);
let builder_bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
let mut bytes = Vec::new();
docs.write_to_bytes(&mut bytes);
assert_eq!(builder_bytes, bytes);
Ok(())
}
}


@ -1,58 +0,0 @@
mod doc_ids;
mod doc_indexes;
use std::slice::from_raw_parts;
use std::mem::size_of;
use std::ops::Deref;
use std::sync::Arc;
pub use self::doc_ids::DocIds;
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
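/// A cheaply cloneable window (`offset` + `len`) over a reference-counted
/// byte buffer.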
#[derive(Default, Clone)]
pub struct SharedData {
pub bytes: Arc<Vec<u8>>,
pub offset: usize,
pub len: usize,
}
impl SharedData {
pub fn from_bytes(vec: Vec<u8>) -> SharedData {
let len = vec.len();
let bytes = Arc::new(vec);
SharedData::new(bytes, 0, len)
}
pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
SharedData { bytes, offset, len }
}
pub fn range(&self, offset: usize, len: usize) -> SharedData {
assert!(offset + len <= self.len);
SharedData {
bytes: self.bytes.clone(),
offset: self.offset + offset,
len: len,
}
}
}
impl Deref for SharedData {
type Target = [u8];
fn deref(&self) -> &Self::Target {
self.as_ref()
}
}
impl AsRef<[u8]> for SharedData {
fn as_ref(&self) -> &[u8] {
&self.bytes[self.offset..self.offset + self.len]
}
}
unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
let ptr = slice.as_ptr() as *const u8;
let len = slice.len() * size_of::<T>();
from_raw_parts(ptr, len)
}


@ -1,186 +0,0 @@
use std::error::Error;
use std::ops::Deref;
use std::fmt;
use rocksdb::rocksdb::{DB, Snapshot, SeekKey};
use rocksdb::rocksdb_options::ReadOptions;
use serde::forward_to_deserialize_any;
use serde::de::value::MapDeserializer;
use serde::de::{self, Visitor, IntoDeserializer};
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::database::schema::Schema;
use crate::DocumentId;
pub struct Deserializer<'a, D>
where D: Deref<Target=DB>
{
snapshot: &'a Snapshot<D>,
schema: &'a Schema,
document_id: DocumentId,
}
impl<'a, D> Deserializer<'a, D>
where D: Deref<Target=DB>
{
pub fn new(snapshot: &'a Snapshot<D>, schema: &'a Schema, doc: DocumentId) -> Self {
Deserializer { snapshot, schema, document_id: doc }
}
}
impl<'de, 'a, 'b, D> de::Deserializer<'de> for &'b mut Deserializer<'a, D>
where D: Deref<Target=DB>
{
type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.deserialize_map(visitor)
}
forward_to_deserialize_any! {
bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq
bytes byte_buf unit_struct tuple_struct
identifier tuple ignored_any option newtype_struct enum struct
}
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
let mut options = ReadOptions::new();
let lower = DocumentKey::new(self.document_id);
let upper = lower.with_attribute_max();
options.set_iterate_lower_bound(lower.as_ref());
options.set_iterate_upper_bound(upper.as_ref());
let mut iter = self.snapshot.iter_opt(options);
iter.seek(SeekKey::Start);
if iter.kv().is_none() {
// FIXME return an error
}
let iter = iter.map(|(key, value)| {
// retrieve the schema attribute name
// from the schema attribute number
let document_key_attr = DocumentKeyAttr::from_bytes(&key);
let schema_attr = document_key_attr.attribute();
let attribute_name = self.schema.attribute_name(schema_attr);
(attribute_name, Value(value))
});
let map_deserializer = MapDeserializer::new(iter);
visitor.visit_map(map_deserializer)
}
}
struct Value(Vec<u8>);
impl<'de> IntoDeserializer<'de, DeserializerError> for Value {
type Deserializer = Self;
fn into_deserializer(self) -> Self::Deserializer {
self
}
}
macro_rules! forward_to_bincode_values {
($($ty:ident => $de_method:ident,)*) => {
$(
fn $de_method<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
match bincode::deserialize::<$ty>(&self.0) {
Ok(val) => val.into_deserializer().$de_method(visitor),
Err(e) => Err(de::Error::custom(e)),
}
}
)*
}
}
impl<'de, 'a> de::Deserializer<'de> for Value {
type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.0.into_deserializer().deserialize_any(visitor)
}
fn deserialize_str<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.deserialize_string(visitor)
}
fn deserialize_string<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
match bincode::deserialize::<String>(&self.0) {
Ok(val) => val.into_deserializer().deserialize_string(visitor),
Err(e) => Err(de::Error::custom(e)),
}
}
fn deserialize_bytes<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.deserialize_byte_buf(visitor)
}
fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
match bincode::deserialize::<Vec<u8>>(&self.0) {
Ok(val) => val.into_deserializer().deserialize_byte_buf(visitor),
Err(e) => Err(de::Error::custom(e)),
}
}
forward_to_bincode_values! {
char => deserialize_char,
bool => deserialize_bool,
u8 => deserialize_u8,
u16 => deserialize_u16,
u32 => deserialize_u32,
u64 => deserialize_u64,
i8 => deserialize_i8,
i16 => deserialize_i16,
i32 => deserialize_i32,
i64 => deserialize_i64,
f32 => deserialize_f32,
f64 => deserialize_f64,
}
forward_to_deserialize_any! {
unit seq map
unit_struct tuple_struct
identifier tuple ignored_any option newtype_struct enum struct
}
}
#[derive(Debug)]
pub enum DeserializerError {
Custom(String),
}
impl de::Error for DeserializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
DeserializerError::Custom(msg.to_string())
}
}
impl fmt::Display for DeserializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DeserializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for DeserializerError {}


@ -1,145 +0,0 @@
use std::io::{Cursor, Read, Write};
use std::mem::size_of;
use std::fmt;
use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};
use crate::database::schema::SchemaAttr;
use crate::DocumentId;
const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>();
#[derive(Copy, Clone)]
pub struct DocumentKey([u8; DOC_KEY_LEN]);
impl DocumentKey {
pub fn new(id: DocumentId) -> DocumentKey {
let mut buffer = [0; DOC_KEY_LEN];
let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(b"doc-").unwrap();
wtr.write_u64::<BigEndian>(id.0).unwrap();
DocumentKey(buffer)
}
pub fn from_bytes(mut bytes: &[u8]) -> DocumentKey {
assert!(bytes.len() >= DOC_KEY_LEN);
assert_eq!(&bytes[..4], b"doc-");
let mut buffer = [0; DOC_KEY_LEN];
bytes.read_exact(&mut buffer).unwrap();
DocumentKey(buffer)
}
pub fn with_attribute(&self, attr: SchemaAttr) -> DocumentKeyAttr {
DocumentKeyAttr::new(self.document_id(), attr)
}
pub fn with_attribute_max(&self) -> DocumentKeyAttr {
DocumentKeyAttr::new(self.document_id(), SchemaAttr::max())
}
pub fn document_id(&self) -> DocumentId {
let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
DocumentId(id)
}
}
impl AsRef<[u8]> for DocumentKey {
fn as_ref(&self) -> &[u8] {
&self.0
}
}
impl fmt::Debug for DocumentKey {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("DocumentKey")
.field("document_id", &self.document_id())
.finish()
}
}
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct DocumentKeyAttr([u8; DOC_KEY_ATTR_LEN]);
impl DocumentKeyAttr {
pub fn new(id: DocumentId, attr: SchemaAttr) -> DocumentKeyAttr {
let mut buffer = [0; DOC_KEY_ATTR_LEN];
let DocumentKey(raw_key) = DocumentKey::new(id);
let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(&raw_key).unwrap();
wtr.write_all(b"-").unwrap();
wtr.write_u16::<BigEndian>(attr.0).unwrap();
DocumentKeyAttr(buffer)
}
pub fn with_attribute_min(id: DocumentId) -> DocumentKeyAttr {
DocumentKeyAttr::new(id, SchemaAttr::min())
}
pub fn with_attribute_max(id: DocumentId) -> DocumentKeyAttr {
DocumentKeyAttr::new(id, SchemaAttr::max())
}
pub fn from_bytes(mut bytes: &[u8]) -> DocumentKeyAttr {
assert!(bytes.len() >= DOC_KEY_ATTR_LEN);
assert_eq!(&bytes[..4], b"doc-");
let mut buffer = [0; DOC_KEY_ATTR_LEN];
bytes.read_exact(&mut buffer).unwrap();
DocumentKeyAttr(buffer)
}
pub fn document_id(&self) -> DocumentId {
let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
DocumentId(id)
}
pub fn attribute(&self) -> SchemaAttr {
let offset = 4 + size_of::<u64>() + 1;
let value = (&self.0[offset..]).read_u16::<BigEndian>().unwrap();
SchemaAttr::new(value)
}
pub fn into_document_key(self) -> DocumentKey {
DocumentKey::new(self.document_id())
}
}
impl AsRef<[u8]> for DocumentKeyAttr {
fn as_ref(&self) -> &[u8] {
&self.0
}
}
impl fmt::Debug for DocumentKeyAttr {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("DocumentKeyAttr")
.field("document_id", &self.document_id())
.field("attribute", &self.attribute().0)
.finish()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn keep_as_ref_order() {
for (a, b) in (0..).zip(1..).take(u16::max_value() as usize - 1) {
let id = DocumentId(0);
let a = DocumentKeyAttr::new(id, SchemaAttr(a));
let b = DocumentKeyAttr::new(id, SchemaAttr(b));
assert!(a < b);
assert!(a.as_ref() < b.as_ref());
}
}
}


@ -1,82 +0,0 @@
mod negative;
mod positive;
pub(crate) use self::negative::Negative;
pub(crate) use self::positive::{Positive, PositiveBuilder};
use std::error::Error;
use std::io::Cursor;
use std::sync::Arc;
use fst::{IntoStreamer, Streamer};
use sdset::duo::DifferenceByKey;
use sdset::{Set, SetOperation};
use fst::Map;
use crate::data::{SharedData, DocIndexes};
#[derive(Default)]
pub struct Index {
pub(crate) negative: Negative,
pub(crate) positive: Positive,
}
impl Index {
pub fn from_bytes(bytes: Vec<u8>) -> Result<Index, Box<Error>> {
let len = bytes.len();
Index::from_shared_bytes(Arc::new(bytes), 0, len)
}
pub fn from_shared_bytes(
bytes: Arc<Vec<u8>>,
offset: usize,
len: usize,
) -> Result<Index, Box<Error>>
{
let data = SharedData::new(bytes, offset, len);
let mut cursor = Cursor::new(data);
let negative = Negative::from_cursor(&mut cursor)?;
let positive = Positive::from_cursor(&mut cursor)?;
Ok(Index { negative, positive })
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
self.negative.write_to_bytes(bytes);
self.positive.write_to_bytes(bytes);
}
pub fn merge(&self, other: &Index) -> Result<Index, Box<Error>> {
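// Apply `other`'s negative part (the deleted documents) to our positive index,
// then union the result with `other`'s positive part.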
if other.negative.is_empty() {
let negative = Negative::default();
let positive = self.positive.union(&other.positive)?;
return Ok(Index { negative, positive })
}
let mut buffer = Vec::new();
let mut builder = PositiveBuilder::memory();
let mut stream = self.positive.into_stream();
while let Some((key, indexes)) = stream.next() {
let op = DifferenceByKey::new(indexes, &other.negative, |x| x.document_id, |x| *x);
buffer.clear();
op.extend_vec(&mut buffer);
if !buffer.is_empty() {
let indexes = Set::new_unchecked(&buffer);
builder.insert(key, indexes)?;
}
}
let positive = {
let (map, indexes) = builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Positive::new(map, indexes)
};
let negative = Negative::default();
let positive = positive.union(&other.positive)?;
Ok(Index { negative, positive })
}
}


@ -1,43 +0,0 @@
use std::error::Error;
use std::io::Cursor;
use std::ops::Deref;
use sdset::Set;
use byteorder::{LittleEndian, WriteBytesExt};
use crate::data::SharedData;
use crate::data::DocIds;
use crate::DocumentId;
#[derive(Default)]
pub struct Negative(DocIds);
impl Negative {
pub fn new(doc_ids: DocIds) -> Negative {
Negative(doc_ids)
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Negative, Box<Error>> {
let doc_ids = DocIds::from_cursor(cursor)?;
Ok(Negative(doc_ids))
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let slice = self.0.as_bytes();
let len = slice.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(len);
bytes.extend_from_slice(slice);
}
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
}
impl Deref for Negative {
type Target = Set<DocumentId>;
fn deref(&self) -> &Self::Target {
self.0.as_ref()
}
}


@ -1,166 +0,0 @@
use std::io::{Write, BufRead, Cursor};
use std::error::Error;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::{map, Map, Streamer, IntoStreamer};
use sdset::{Set, SetOperation};
use sdset::duo::Union;
use fst::raw::Fst;
use crate::data::{DocIndexes, DocIndexesBuilder};
use crate::data::SharedData;
use crate::DocIndex;
#[derive(Default)]
pub struct Positive {
map: Map,
indexes: DocIndexes,
}
impl Positive {
pub fn new(map: Map, indexes: DocIndexes) -> Positive {
Positive { map, indexes }
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Positive, Box<Error>> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let data = cursor.get_ref().range(offset, len);
let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
let map = Map::from(fst);
cursor.consume(len);
let indexes = DocIndexes::from_cursor(cursor)?;
Ok(Positive { map, indexes})
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let slice = self.map.as_fst().as_bytes();
let len = slice.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(len);
bytes.extend_from_slice(slice);
self.indexes.write_to_bytes(bytes);
}
pub fn map(&self) -> &Map {
&self.map
}
pub fn indexes(&self) -> &DocIndexes {
&self.indexes
}
pub fn union(&self, other: &Positive) -> Result<Positive, Box<Error>> {
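// Stream the union of both word maps; when a word appears in both indexes
// its two DocIndex sets are merged, otherwise the single set is copied as-is.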
let mut builder = PositiveBuilder::memory();
let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
let mut buffer = Vec::new();
while let Some((key, ivalues)) = stream.next() {
buffer.clear();
match ivalues {
[a, b] => {
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
let a = Set::new_unchecked(indexes);
let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
let indexes = indexes.get(b.value as usize).ok_or(format!("index not found"))?;
let b = Set::new_unchecked(indexes);
let op = Union::new(a, b);
op.extend_vec(&mut buffer);
},
[a] => {
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
buffer.extend_from_slice(indexes)
},
_ => continue,
}
if !buffer.is_empty() {
let indexes = Set::new_unchecked(&buffer);
builder.insert(key, indexes)?;
}
}
let (map, indexes) = builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Ok(Positive { map, indexes })
}
}
impl<'m, 'a> IntoStreamer<'a> for &'m Positive {
type Item = (&'a [u8], &'a Set<DocIndex>);
/// The type of the stream to be constructed.
type Into = Stream<'m>;
/// Construct a stream from `Self`.
fn into_stream(self) -> Self::Into {
Stream {
map_stream: self.map.into_stream(),
indexes: &self.indexes,
}
}
}
pub struct Stream<'m> {
map_stream: map::Stream<'m>,
indexes: &'m DocIndexes,
}
impl<'m, 'a> Streamer<'a> for Stream<'m> {
type Item = (&'a [u8], &'a Set<DocIndex>);
fn next(&'a mut self) -> Option<Self::Item> {
match self.map_stream.next() {
Some((input, index)) => {
let indexes = &self.indexes[index as usize];
let indexes = Set::new_unchecked(indexes);
Some((input, indexes))
},
None => None,
}
}
}
pub struct PositiveBuilder<W, X> {
map: fst::MapBuilder<W>,
indexes: DocIndexesBuilder<X>,
value: u64,
}
impl PositiveBuilder<Vec<u8>, Vec<u8>> {
pub fn memory() -> Self {
PositiveBuilder {
map: fst::MapBuilder::memory(),
indexes: DocIndexesBuilder::memory(),
value: 0,
}
}
}
impl<W: Write, X: Write> PositiveBuilder<W, X> {
/// If a key is inserted that is less than or equal to any previous key added,
/// then an error is returned. Similarly, if there was a problem writing
/// to the underlying writer, an error is returned.
// FIXME what if one write doesn't work but the other do ?
pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
where K: AsRef<[u8]>,
{
self.map.insert(key, self.value)?;
self.indexes.insert(indexes);
self.value += 1;
Ok(())
}
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
let map = self.map.into_inner()?;
let indexes = self.indexes.into_inner()?;
Ok((map, indexes))
}
}


@ -1,684 +0,0 @@
use std::sync::{Arc, Mutex};
use std::error::Error;
use std::ops::Deref;
use std::path::Path;
use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{Writable, Snapshot};
use rocksdb::{DB, DBVector, MergeOperands};
use crossbeam::atomic::ArcCell;
use log::debug;
pub use self::document_key::{DocumentKey, DocumentKeyAttr};
pub use self::view::{DatabaseView, DocumentIter};
pub use self::update::{Update, UpdateBuilder};
pub use self::serde::SerializerError;
pub use self::schema::Schema;
pub use self::index::Index;
const DATA_INDEX: &[u8] = b"data-index";
const DATA_SCHEMA: &[u8] = b"data-schema";
pub mod schema;
pub(crate) mod index;
mod deserializer;
mod document_key;
mod serde;
mod update;
mod view;
fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
where D: Deref<Target=DB>
{
match snapshot.get(DATA_SCHEMA)? {
Some(vector) => Ok(Schema::read_from_bin(&*vector)?),
None => Err(String::from("BUG: no schema found in the database").into()),
}
}
fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<Index, Box<Error>>
where D: Deref<Target=DB>
{
let index = match snapshot.get(DATA_INDEX)? {
Some(vector) => {
let bytes = vector.as_ref().to_vec();
Index::from_bytes(bytes)?
},
None => Index::default(),
};
Ok(index)
}
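// RocksDB merge operator for the "data-index" key: deserializes the existing
// value and every pending operand and folds them into a single serialized Index.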
fn merge_indexes(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
assert_eq!(key, DATA_INDEX, "The merge operator only supports \"data-index\" merging");
let mut index: Option<Index> = None;
for bytes in existing.into_iter().chain(operands) {
let operand = Index::from_bytes(bytes.to_vec()).unwrap();
let merged = match index {
Some(ref index) => index.merge(&operand).unwrap(),
None => operand,
};
index.replace(merged);
}
let index = index.unwrap_or_default();
let mut bytes = Vec::new();
index.write_to_bytes(&mut bytes);
bytes
}
pub struct Database {
// The DB is kept under a Mutex to synchronize update ingestions and to keep
// DB update locking separate from DatabaseView acquisition, in other words:
// "block readers for the minimum possible amount of time"
db: Mutex<Arc<DB>>,
// This view is updated each time the DB ingests an update
view: ArcCell<DatabaseView<Arc<DB>>>,
}
impl Database {
pub fn create<P: AsRef<Path>>(path: P, schema: &Schema) -> Result<Database, Box<Error>> {
let path = path.as_ref();
if path.exists() {
return Err(format!("File already exists at path: {}, cannot create database.",
path.display()).into())
}
let path = path.to_string_lossy();
let mut opts = DBOptions::new();
opts.create_if_missing(true);
// opts.error_if_exists(true); // FIXME pull request that
let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
let mut schema_bytes = Vec::new();
schema.write_to_bin(&mut schema_bytes)?;
db.put(DATA_SCHEMA, &schema_bytes)?;
let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone());
let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
Ok(Database { db: Mutex::new(db), view })
}
pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
let path = path.as_ref().to_string_lossy();
let mut opts = DBOptions::new();
opts.create_if_missing(false);
let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
// FIXME create a generic function to do that !
let _schema = match db.get(DATA_SCHEMA)? {
Some(value) => Schema::read_from_bin(&*value)?,
None => return Err(String::from("Database does not contain a schema").into()),
};
let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone());
let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
Ok(Database { db: Mutex::new(db), view })
}
pub fn ingest_update_file(&self, update: Update) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
let snapshot = {
// We must have a mutex here to ensure that update ingestions and compactions
// are done atomically and in the right order.
// This way update ingestions will block other update ingestions without blocking view
// creations while doing the "data-index" compaction
let db = match self.db.lock() {
Ok(db) => db,
Err(e) => return Err(e.to_string().into()),
};
let path = update.path().to_string_lossy();
let options = IngestExternalFileOptions::new();
// options.move_files(move_update);
debug!("ingest update file");
let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
db.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;
debug!("compacting index range");
// Compact now so the merge operator is triggered only once, while ingesting
// the update, instead of on every search
db.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
Snapshot::new(db.clone())
};
let view = Arc::new(DatabaseView::new(snapshot)?);
self.view.set(view.clone());
Ok(view)
}
pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
self.view().get(key)
}
pub fn flush(&self) -> Result<(), Box<Error>> {
match self.db.lock() {
Ok(db) => Ok(db.flush(true)?),
Err(e) => Err(e.to_string().into()),
}
}
pub fn view(&self) -> Arc<DatabaseView<Arc<DB>>> {
self.view.get()
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
use serde_derive::{Serialize, Deserialize};
use hashbrown::HashSet;
use tempfile::tempdir;
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
use crate::database::update::UpdateBuilder;
use crate::tokenizer::DefaultBuilder;
#[test]
fn ingest_one_update_file() -> Result<(), Box<Error>> {
let dir = tempdir()?;
let stop_words = HashSet::new();
let rocksdb_path = dir.path().join("rocksdb.rdb");
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct SimpleDoc {
id: u64,
title: String,
description: String,
timestamp: u64,
}
let schema = {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("id", STORED);
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED);
builder.build()
};
let database = Database::create(&rocksdb_path, &schema)?;
let update_path = dir.path().join("update.sst");
let doc0 = SimpleDoc {
id: 0,
title: String::from("I am a title"),
description: String::from("I am a description"),
timestamp: 1234567,
};
let doc1 = SimpleDoc {
id: 1,
title: String::from("I am the second title"),
description: String::from("I am the second description"),
timestamp: 7654321,
};
let docid0;
let docid1;
let update = {
let tokenizer_builder = DefaultBuilder::new();
let mut builder = UpdateBuilder::new(update_path, schema);
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
builder.build()?
};
database.ingest_update_file(update)?;
let view = database.view();
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1);
Ok(dir.close()?)
}
#[test]
fn ingest_two_update_files() -> Result<(), Box<Error>> {
let dir = tempdir()?;
let stop_words = HashSet::new();
let rocksdb_path = dir.path().join("rocksdb.rdb");
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct SimpleDoc {
id: u64,
title: String,
description: String,
timestamp: u64,
}
let schema = {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("id", STORED);
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED);
builder.build()
};
let database = Database::create(&rocksdb_path, &schema)?;
let doc0 = SimpleDoc {
id: 0,
title: String::from("I am a title"),
description: String::from("I am a description"),
timestamp: 1234567,
};
let doc1 = SimpleDoc {
id: 1,
title: String::from("I am the second title"),
description: String::from("I am the second description"),
timestamp: 7654321,
};
let doc2 = SimpleDoc {
id: 2,
title: String::from("I am the third title"),
description: String::from("I am the third description"),
timestamp: 7654321,
};
let doc3 = SimpleDoc {
id: 3,
title: String::from("I am the fourth title"),
description: String::from("I am the fourth description"),
timestamp: 7654321,
};
let docid0;
let docid1;
let update1 = {
let tokenizer_builder = DefaultBuilder::new();
let update_path = dir.path().join("update-000.sst");
let mut builder = UpdateBuilder::new(update_path, schema.clone());
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
builder.build()?
};
let docid2;
let docid3;
let update2 = {
let tokenizer_builder = DefaultBuilder::new();
let update_path = dir.path().join("update-001.sst");
let mut builder = UpdateBuilder::new(update_path, schema);
docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
builder.build()?
};
database.ingest_update_file(update1)?;
database.ingest_update_file(update2)?;
let view = database.view();
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1);
let de_doc2: SimpleDoc = view.document_by_id(docid2)?;
let de_doc3: SimpleDoc = view.document_by_id(docid3)?;
assert_eq!(doc2, de_doc2);
assert_eq!(doc3, de_doc3);
Ok(dir.close()?)
}
}
#[cfg(all(feature = "nightly", test))]
mod bench {
extern crate test;
use super::*;
use std::error::Error;
use std::iter::repeat_with;
use self::test::Bencher;
use rand::distributions::Alphanumeric;
use rand_xorshift::XorShiftRng;
use rand::{Rng, SeedableRng};
use serde_derive::Serialize;
use rand::seq::SliceRandom;
use hashbrown::HashSet;
use crate::tokenizer::DefaultBuilder;
use crate::database::update::UpdateBuilder;
use crate::database::schema::*;
fn random_sentences<R: Rng>(number: usize, rng: &mut R) -> String {
let mut words = String::new();
for i in 0..number {
let word_len = rng.gen_range(1, 12);
let iter = repeat_with(|| rng.sample(Alphanumeric)).take(word_len);
words.extend(iter);
if i == number - 1 { // last word
let final_ = [".", "?", "!", "..."].choose(rng).cloned();
words.extend(final_);
} else {
let middle = [",", ", "].choose(rng).cloned();
words.extend(middle);
}
}
words
}
#[bench]
fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..300 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
database.ingest_update_file(update)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..3000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
database.ingest_update_file(update)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
#[ignore]
fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..30_000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
database.ingest_update_file(update)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..300 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
let view = database.ingest_update_file(update)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().unwrap().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
#[bench]
fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..3000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
let view = database.ingest_update_file(update)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().unwrap().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
#[bench]
#[ignore]
fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..30_000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
let view = database.ingest_update_file(update)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().unwrap().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
}


@ -1,196 +0,0 @@
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::Token;
use crate::{DocumentId, DocIndex, Attribute, WordArea};
use hashbrown::HashSet;
use serde::Serialize;
use serde::ser;
pub struct IndexerSerializer<'a, B> {
pub tokenizer_builder: &'a B,
pub update: &'a mut DocumentUpdate,
pub document_id: DocumentId,
pub attribute: SchemaAttr,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for token in self.tokenizer_builder.build(v) {
let Token { word, word_index, char_index } = token;
let document_id = self.document_id;
// FIXME must u32::try_from instead
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
Ok(attribute) => attribute,
Err(_) => return Ok(()),
};
// insert the exact representation
let word_lower = word.to_lowercase();
let length = word.chars().count() as u16;
if self.stop_words.contains(&word_lower) { continue }
// and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
let word_area = match WordArea::new(char_index as u32, length) {
Ok(word_area) => word_area,
Err(_) => return Ok(()),
};
let doc_index = DocIndex { document_id, attribute, word_area };
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
}
let word_area = match WordArea::new(char_index as u32, length) {
Ok(word_area) => word_area,
Err(_) => return Ok(()),
};
let doc_index = DocIndex { document_id, attribute, word_area };
self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
}
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "seq" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
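The serializer above indexes a token under two forms whenever its deunicoded form differs from its lowercased form, so a query for "etre" can still match a document containing "être". A standalone sketch of that normalization step (the `unidecode` crate is the same one used above; the helper name is illustrative):

// Illustrative only: returns the forms under which a token would be indexed,
// mirroring the logic of `serialize_str` above.
fn indexed_forms(word: &str) -> Vec<String> {
    let lower = word.to_lowercase();
    let unidecoded = unidecode::unidecode(word).to_lowercase();
    if lower != unidecoded {
        vec![unidecoded, lower]
    } else {
        vec![lower]
    }
}

// indexed_forms("Être") → ["etre", "être"]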

View File

@ -1,57 +0,0 @@
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::error::Error;
use std::fmt;
use serde::ser;
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "$ty" })
}
)*
}
}
pub mod find_id;
pub mod key_to_string;
pub mod serializer;
pub mod indexer_serializer;
pub fn calculate_hash<T: Hash>(t: &T) -> u64 {
let mut s = DefaultHasher::new();
t.hash(&mut s);
s.finish()
}
#[derive(Debug)]
pub enum SerializerError {
DocumentIdNotFound,
UnserializableType { name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::DocumentIdNotFound => {
write!(f, "serialized document does not have an id according to the schema")
}
SerializerError::UnserializableType { name } => {
write!(f, "Only struct and map types are considered valid documents and
can be serialized, not {} types directly.", name)
},
SerializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for SerializerError {}
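One subtlety in the `forward_to_unserializable_type!` macro above: a metavariable such as `$ty` is not expanded inside a string literal by `macro_rules!`, so every generated error reports the literal text "$ty" rather than the concrete type name. A hedged variant that reports the actual type name would use `stringify!`; this is a suggestion, not what the original code does:

macro_rules! forward_to_unserializable_type {
    ($($ty:ident => $se_method:ident,)*) => {
        $(
            fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
                // stringify!($ty) expands to the type name as a &'static str
                Err(SerializerError::UnserializableType { name: stringify!($ty) })
            }
        )*
    }
}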

View File

@ -1,286 +0,0 @@
use hashbrown::HashSet;
use serde::Serialize;
use serde::ser;
use crate::database::serde::indexer_serializer::IndexerSerializer;
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::schema::Schema;
use crate::DocumentId;
pub struct Serializer<'a, B> {
pub schema: &'a Schema,
pub update: &'a mut DocumentUpdate,
pub document_id: DocumentId,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::Serializer for Serializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapSerializer<'a, B>;
type SerializeStruct = StructSerializer<'a, B>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer {
schema: self.schema,
document_id: self.document_id,
update: self.update,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
current_key_name: None,
})
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
update: self.update,
document_id: self.document_id,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
pub struct MapSerializer<'a, B> {
pub schema: &'a Schema,
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
pub current_key_name: Option<String>,
}
impl<'a, B> ser::SerializeMap for MapSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V
) -> Result<(), Self::Error>
where K: Serialize, V: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
if let Some(attr) = self.schema.attribute(key) {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
self.update.insert_attribute_value(attr, value);
}
if props.is_indexed() {
let serializer = IndexerSerializer {
update: self.update,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
pub struct StructSerializer<'a, B> {
pub schema: &'a Schema,
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
if let Some(attr) = self.schema.attribute(key) {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
self.update.insert_attribute_value(attr, value);
}
if props.is_indexed() {
let serializer = IndexerSerializer {
update: self.update,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
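Both `MapSerializer` and `StructSerializer` above dispatch each document field the same way: the field name is looked up in the schema, the raw value is stored as a bincode blob when the attribute is stored, and the value is tokenized through `IndexerSerializer` when it is indexed. As the error messages state, only struct and map types are accepted as documents. A minimal document type this serializer accepts could look like the following (illustrative; it assumes the schema declares `id`, `title`, and `description`, as in the benchmark at the top of this diff):

use serde_derive::Serialize;

// Hypothetical document type; field names must match attributes declared
// in the Schema for them to be stored or indexed.
#[derive(Serialize)]
struct Document {
    id: u64,
    title: String,
    description: String,
}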

View File

@ -1,64 +0,0 @@
use std::path::PathBuf;
use std::error::Error;
use hashbrown::HashSet;
use serde::Serialize;
use crate::database::serde::serializer::Serializer;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::Schema;
use crate::DocumentId;
use super::{Update, RawUpdateBuilder};
pub struct UpdateBuilder {
schema: Schema,
raw_builder: RawUpdateBuilder,
}
impl UpdateBuilder {
pub fn new(path: PathBuf, schema: Schema) -> UpdateBuilder {
UpdateBuilder {
schema: schema,
raw_builder: RawUpdateBuilder::new(path),
}
}
pub fn update_document<T, B>(
&mut self,
document: T,
tokenizer_builder: &B,
stop_words: &HashSet<String>,
) -> Result<DocumentId, SerializerError>
where T: Serialize,
B: TokenizerBuilder,
{
let document_id = self.schema.document_id(&document)?;
let update = self.raw_builder.document_update(document_id);
let serializer = Serializer {
schema: &self.schema,
document_id: document_id,
tokenizer_builder: tokenizer_builder,
update: update,
stop_words: stop_words,
};
document.serialize(serializer)?;
Ok(document_id)
}
pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
where T: Serialize,
{
let document_id = self.schema.document_id(&document)?;
self.raw_builder.document_update(document_id).remove();
Ok(document_id)
}
pub fn build(self) -> Result<Update, Box<Error>> {
self.raw_builder.build()
}
}
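Putting the pieces together: each document passed to `UpdateBuilder` is serialized with the `Serializer` above, the raw builder turns the accumulated changes into an SST file, and the resulting `Update` is handed to the database. A hedged usage sketch, reusing names from the benchmark at the top of this diff (`dir`, `schema`, `document`, `stop_words`, and `database` are assumed to exist there):

// Sketch only; error handling shortened with `?`.
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(dir.path().join("update-000.sst"), schema);

builder.update_document(&document, &tokenizer_builder, &stop_words)?;
// builder.remove_document(&obsolete_document)?;

let update = builder.build()?;
let view = database.ingest_update_file(update)?;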

View File

@ -1,17 +0,0 @@
use std::path::{Path, PathBuf};
mod builder;
mod raw_builder;
pub use self::builder::UpdateBuilder;
pub use self::raw_builder::{RawUpdateBuilder, DocumentUpdate};
pub struct Update {
sst_file: PathBuf,
}
impl Update {
pub fn path(&self) -> &Path {
&self.sst_file
}
}

View File

@ -1,168 +0,0 @@
use std::collections::btree_map::{BTreeMap, Entry};
use std::path::PathBuf;
use std::error::Error;
use rocksdb::rocksdb_options;
use hashbrown::HashMap;
use fst::map::Map;
use sdset::Set;
use crate::database::index::{Index, Positive, PositiveBuilder, Negative};
use crate::database::{DATA_INDEX, DocumentKeyAttr};
use crate::database::schema::SchemaAttr;
use crate::data::{DocIds, DocIndexes};
use crate::{DocumentId, DocIndex};
use super::Update;
type Token = Vec<u8>; // TODO could be replaced by a SmallVec
type Value = Vec<u8>;
pub struct RawUpdateBuilder {
sst_file: PathBuf,
document_updates: BTreeMap<DocumentId, DocumentUpdate>,
}
pub struct DocumentUpdate {
cleared: bool,
words_indexes: HashMap<Token, Vec<DocIndex>>,
attributes: BTreeMap<SchemaAttr, Value>,
}
impl DocumentUpdate {
pub fn new() -> DocumentUpdate {
DocumentUpdate {
cleared: false,
words_indexes: HashMap::new(),
attributes: BTreeMap::new(),
}
}
pub fn remove(&mut self) {
self.cleared = true;
self.clear();
}
pub fn clear(&mut self) {
self.words_indexes.clear();
self.attributes.clear();
}
pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: Vec<u8>) {
self.attributes.insert(attr, value);
}
pub fn insert_doc_index(&mut self, token: Vec<u8>, doc_index: DocIndex) {
self.words_indexes.entry(token).or_insert_with(Vec::new).push(doc_index)
}
}
impl RawUpdateBuilder {
pub fn new(path: PathBuf) -> RawUpdateBuilder {
RawUpdateBuilder {
sst_file: path,
document_updates: BTreeMap::new(),
}
}
pub fn document_update(&mut self, document_id: DocumentId) -> &mut DocumentUpdate {
match self.document_updates.entry(document_id) {
Entry::Occupied(mut occupied) => {
occupied.get_mut().clear();
occupied.into_mut()
},
Entry::Vacant(vacant) => vacant.insert(DocumentUpdate::new()),
}
}
pub fn build(mut self) -> Result<Update, Box<Error>> {
let mut removed_document_ids = Vec::new();
let mut words_indexes = BTreeMap::new();
for (&id, update) in self.document_updates.iter_mut() {
if update.cleared { removed_document_ids.push(id) }
for (token, indexes) in &update.words_indexes {
words_indexes.entry(token).or_insert_with(Vec::new).extend_from_slice(indexes)
}
}
let negative = {
let removed_document_ids = Set::new_unchecked(&removed_document_ids);
let doc_ids = DocIds::new(removed_document_ids);
Negative::new(doc_ids)
};
let positive = {
let mut positive_builder = PositiveBuilder::memory();
for (key, mut indexes) in words_indexes {
indexes.sort_unstable();
let indexes = Set::new_unchecked(&indexes);
positive_builder.insert(key, indexes)?;
}
let (map, indexes) = positive_builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Positive::new(map, indexes)
};
let index = Index { negative, positive };
let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.sst_file.to_string_lossy())?;
// write the data-index
let mut bytes = Vec::new();
index.write_to_bytes(&mut bytes);
file_writer.merge(DATA_INDEX, &bytes)?;
// write all the document attribute updates
for (id, update) in self.document_updates {
let mut last_attr: Option<SchemaAttr> = None;
for (attr, value) in update.attributes {
if update.cleared {
// if there is no last attribute, remove from the first attribute
let start_attr = match last_attr {
Some(attr) => attr.next(),
None => Some(SchemaAttr::min())
};
let start = start_attr.map(|a| DocumentKeyAttr::new(id, a));
let end = attr.prev().map(|a| DocumentKeyAttr::new(id, a));
// delete_range between (last_attr + 1) and (attr - 1)
if let (Some(start), Some(end)) = (start, end) {
file_writer.delete_range(start.as_ref(), end.as_ref())?;
}
}
let key = DocumentKeyAttr::new(id, attr);
file_writer.put(key.as_ref(), &value)?;
last_attr = Some(attr);
}
if update.cleared {
// if there is no last attribute, remove from the first attribute
let start_attr = match last_attr {
Some(attr) => attr.next(),
None => Some(SchemaAttr::min())
};
let start = start_attr.map(|a| DocumentKeyAttr::new(id, a));
let end = DocumentKeyAttr::with_attribute_max(id);
// delete_range between (last_attr + 1) and attr_max
if let Some(start) = start {
file_writer.delete_range(start.as_ref(), end.as_ref())?;
}
}
}
file_writer.finish()?;
Ok(Update { sst_file: self.sst_file })
}
}
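For a cleared document, the builder above interleaves `delete_range` calls with the `put` calls so that every attribute gap between two freshly written attributes, plus the tail after the last one, is wiped from the store. The range arithmetic is small enough to illustrate in isolation; this is a simplified sketch over plain `u16` attribute indexes, whereas the real code works on `SchemaAttr` and `DocumentKeyAttr` keys:

// Given the sorted attribute indexes being rewritten for a cleared document,
// return the inclusive (start, end) ranges that must be deleted.
fn ranges_to_delete(written: &[u16], max_attr: u16) -> Vec<(u16, u16)> {
    let mut ranges = Vec::new();
    let mut last: Option<u16> = None;
    for &attr in written {
        let start = last.map_or(0, |a| a + 1);
        if attr > 0 && start <= attr - 1 {
            ranges.push((start, attr - 1));
        }
        last = Some(attr);
    }
    // everything after the last rewritten attribute is also cleared
    let start = last.map_or(0, |a| a + 1);
    ranges.push((start, max_attr));
    ranges
}

// ranges_to_delete(&[2, 5], 10) == [(0, 1), (3, 4), (6, 10)]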

View File

@ -1,174 +0,0 @@
use std::error::Error;
use std::path::Path;
use std::ops::Deref;
use std::{fmt, marker};
use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter};
use serde::de::DeserializeOwned;
use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::database::{retrieve_data_schema, retrieve_data_index};
use crate::database::deserializer::Deserializer;
use crate::database::schema::Schema;
use crate::database::index::Index;
use crate::rank::{QueryBuilder, FilterFunc};
use crate::DocumentId;
pub struct DatabaseView<D>
where D: Deref<Target=DB>
{
snapshot: Snapshot<D>,
index: Index,
schema: Schema,
}
impl<D> DatabaseView<D>
where D: Deref<Target=DB>
{
pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> {
let schema = retrieve_data_schema(&snapshot)?;
let index = retrieve_data_index(&snapshot)?;
Ok(DatabaseView { snapshot, index, schema })
}
pub fn schema(&self) -> &Schema {
&self.schema
}
pub fn index(&self) -> &Index {
&self.index
}
pub fn into_snapshot(self) -> Snapshot<D> {
self.snapshot
}
pub fn snapshot(&self) -> &Snapshot<D> {
&self.snapshot
}
pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
Ok(self.snapshot.get(key)?)
}
pub fn dump_all<P: AsRef<Path>>(&self, path: P) -> Result<(), Box<Error>> {
let path = path.as_ref().to_string_lossy();
let env_options = EnvOptions::new();
let column_family_options = ColumnFamilyOptions::new();
let mut file_writer = SstFileWriter::new(env_options, column_family_options);
file_writer.open(&path)?;
let mut iter = self.snapshot.iter();
iter.seek(SeekKey::Start);
for (key, value) in &mut iter {
file_writer.put(&key, &value)?;
}
file_writer.finish()?;
Ok(())
}
pub fn query_builder(&self) -> Result<QueryBuilder<D, FilterFunc<D>>, Box<Error>> {
QueryBuilder::new(self)
}
pub fn document_by_id<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
where T: DeserializeOwned
{
let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id);
Ok(T::deserialize(&mut deserializer)?)
}
pub fn documents_by_id<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter>
where T: DeserializeOwned,
I: IntoIterator<Item=DocumentId>,
{
DocumentIter {
database_view: self,
document_ids: ids.into_iter(),
_phantom: marker::PhantomData,
}
}
}
impl<D> fmt::Debug for DatabaseView<D>
where D: Deref<Target=DB>
{
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut options = ReadOptions::new();
let lower = DocumentKey::new(DocumentId(0));
options.set_iterate_lower_bound(lower.as_ref());
let mut iter = self.snapshot.iter_opt(options);
iter.seek(SeekKey::Start);
let iter = iter.map(|(key, _)| DocumentKeyAttr::from_bytes(&key));
if f.alternate() {
writeln!(f, "DatabaseView(")?;
} else {
write!(f, "DatabaseView(")?;
}
self.schema.fmt(f)?;
if f.alternate() {
writeln!(f, ",")?;
} else {
write!(f, ", ")?;
}
f.debug_list().entries(iter).finish()?;
write!(f, ")")
}
}
// TODO this is just an iter::Map !!!
pub struct DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>
{
database_view: &'a DatabaseView<D>,
document_ids: I,
_phantom: marker::PhantomData<T>,
}
impl<'a, D, T, I> Iterator for DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>,
T: DeserializeOwned,
I: Iterator<Item=DocumentId>,
{
type Item = Result<T, Box<Error>>;
fn size_hint(&self) -> (usize, Option<usize>) {
self.document_ids.size_hint()
}
fn next(&mut self) -> Option<Self::Item> {
match self.document_ids.next() {
Some(id) => Some(self.database_view.document_by_id(id)),
None => None
}
}
}
impl<'a, D, T, I> ExactSizeIterator for DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>,
T: DeserializeOwned,
I: ExactSizeIterator + Iterator<Item=DocumentId>,
{ }
impl<'a, D, T, I> DoubleEndedIterator for DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>,
T: DeserializeOwned,
I: DoubleEndedIterator + Iterator<Item=DocumentId>,
{
fn next_back(&mut self) -> Option<Self::Item> {
match self.document_ids.next_back() {
Some(id) => Some(self.database_view.document_by_id(id)),
None => None
}
}
}
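`DatabaseView` is the read-side handle: it wraps a RocksDB snapshot together with the schema and index extracted from it, and queries, document deserialization, and dumps all go through it. A hedged usage sketch with the API shown above (`view` is a `DatabaseView`; `MyDoc` is a hypothetical `DeserializeOwned` type):

// Illustrative only.
let builder = view.query_builder()?;
let documents = builder.query("hello world", 0..20);

for result in view.documents_by_id::<MyDoc, _>(documents.iter().map(|d| d.id)) {
    let doc: MyDoc = result?;
    // use the deserialized document...
}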

View File

@ -1,115 +0,0 @@
#![cfg_attr(feature = "nightly", feature(test))]
pub mod automaton;
pub mod database;
pub mod data;
pub mod rank;
pub mod tokenizer;
mod attribute;
mod word_area;
mod common_words;
pub use rocksdb;
pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;
pub use self::attribute::{Attribute, AttributeError};
pub use self::word_area::{WordArea, WordAreaError};
/// Represents an internally generated, unique document identifier.
///
/// It is used to tell the database which document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64);
/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: Attribute,
/// The position in bytes where the word was found,
/// along with its length.
///
/// It records the original word area in the indexed text
/// without needing to run the tokenizer again.
pub word_area: WordArea,
}
/// This structure represents a matching word, with information
/// about the location of the word in the document.
///
/// The order of the fields is important because it defines
/// the way these structures are ordered relative to one another.
///
/// The word in itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
/// The word index in the query sentence.
/// Same as the `attribute_index` but for the query words.
///
/// Used to retrieve the automaton that match this word.
pub query_index: u32,
/// The distance between this word and the query word
/// (i.e. the Levenshtein distance).
pub distance: u8,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: Attribute,
/// Whether the matched word is an exact match or a prefix.
pub is_exact: bool,
/// The position in bytes where the word was found,
/// along with its length.
///
/// It records the original word area in the indexed text
/// without needing to run the tokenizer again.
pub word_area: WordArea,
}
impl Match {
pub fn zero() -> Self {
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 0),
}
}
pub fn max() -> Self {
Match {
query_index: u32::max_value(),
distance: u8::max_value(),
attribute: Attribute::max_value(),
is_exact: true,
word_area: WordArea::max_value(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 16);
}
}

View File

@ -1,19 +0,0 @@
use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::rank::Document;
#[derive(Debug, Clone, Copy)]
pub struct DocumentId;
impl<D> Criterion<D> for DocumentId
where D: Deref<Target=DB>
{
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
lhs.id.cmp(&rhs.id)
}
}

View File

@ -1,34 +0,0 @@
use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use group_by::GroupBy;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::Match;
#[inline]
fn contains_exact(matches: &&[Match]) -> bool {
matches.iter().any(|m| m.is_exact)
}
#[inline]
fn number_exact_matches(matches: &[Match]) -> usize {
GroupBy::new(matches, match_query_index).filter(contains_exact).count()
}
#[derive(Debug, Clone, Copy)]
pub struct Exact;
impl<D> Criterion<D> for Exact
where D: Deref<Target=DB>
{
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
let lhs = number_exact_matches(&lhs.matches);
let rhs = number_exact_matches(&rhs.matches);
lhs.cmp(&rhs).reverse()
}
}

View File

@ -1,132 +0,0 @@
mod sum_of_typos;
mod number_of_words;
mod words_proximity;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod exact;
mod sort_by;
mod document_id;
use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use crate::database::DatabaseView;
use crate::rank::Document;
pub use self::{
sum_of_typos::SumOfTypos,
number_of_words::NumberOfWords,
words_proximity::WordsProximity,
sum_of_words_attribute::SumOfWordsAttribute,
sum_of_words_position::SumOfWordsPosition,
exact::Exact,
sort_by::SortBy,
document_id::DocumentId,
};
pub trait Criterion<D>
where D: Deref<Target=DB>
{
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering;
#[inline]
fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool {
self.evaluate(lhs, rhs, view) == Ordering::Equal
}
}
impl<'a, D, T: Criterion<D> + ?Sized> Criterion<D> for &'a T
where D: Deref<Target=DB>
{
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
(**self).evaluate(lhs, rhs, view)
}
fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool {
(**self).eq(lhs, rhs, view)
}
}
impl<D, T: Criterion<D> + ?Sized> Criterion<D> for Box<T>
where D: Deref<Target=DB>
{
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
(**self).evaluate(lhs, rhs, view)
}
fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool {
(**self).eq(lhs, rhs, view)
}
}
#[derive(Default)]
pub struct CriteriaBuilder<D>
where D: Deref<Target=DB>
{
inner: Vec<Box<dyn Criterion<D>>>
}
impl<D> CriteriaBuilder<D>
where D: Deref<Target=DB>
{
pub fn new() -> CriteriaBuilder<D> {
CriteriaBuilder { inner: Vec::new() }
}
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<D> {
CriteriaBuilder { inner: Vec::with_capacity(capacity) }
}
pub fn reserve(&mut self, additional: usize) {
self.inner.reserve(additional)
}
pub fn add<C>(mut self, criterion: C) -> CriteriaBuilder<D>
where C: 'static + Criterion<D>,
{
self.push(criterion);
self
}
pub fn push<C>(&mut self, criterion: C)
where C: 'static + Criterion<D>,
{
self.inner.push(Box::new(criterion));
}
pub fn build(self) -> Criteria<D> {
Criteria { inner: self.inner }
}
}
pub struct Criteria<D>
where D: Deref<Target=DB>
{
inner: Vec<Box<dyn Criterion<D>>>,
}
impl<D> Default for Criteria<D>
where D: Deref<Target=DB>
{
fn default() -> Self {
CriteriaBuilder::with_capacity(7)
.add(SumOfTypos)
.add(NumberOfWords)
.add(WordsProximity)
.add(SumOfWordsAttribute)
.add(SumOfWordsPosition)
.add(Exact)
.add(DocumentId)
.build()
}
}
impl<D> AsRef<[Box<dyn Criterion<D>>]> for Criteria<D>
where D: Deref<Target=DB>
{
fn as_ref(&self) -> &[Box<dyn Criterion<D>>] {
&self.inner
}
}
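`Criteria::default()` above wires the seven built-in criteria in a fixed order; `CriteriaBuilder` lets a caller build a custom ordering, for example to slot in the `SortBy` criterion shown further down this diff. A hedged sketch (`MyDoc` is a hypothetical `DeserializeOwned + Ord` type, `view` a `DatabaseView`):

// Sketch only; mirrors the Default impl above with one extra criterion.
let criteria = CriteriaBuilder::with_capacity(8)
    .add(SumOfTypos)
    .add(NumberOfWords)
    .add(WordsProximity)
    .add(SumOfWordsAttribute)
    .add(SumOfWordsPosition)
    .add(Exact)
    .add(SortBy::<MyDoc>::new())
    .add(DocumentId)
    .build();

let builder = QueryBuilder::with_criteria(&view, criteria)?;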

View File

@ -1,29 +0,0 @@
use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use group_by::GroupBy;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::Match;
#[inline]
fn number_of_query_words(matches: &[Match]) -> usize {
GroupBy::new(matches, match_query_index).count()
}
#[derive(Debug, Clone, Copy)]
pub struct NumberOfWords;
impl<D> Criterion<D> for NumberOfWords
where D: Deref<Target=DB>
{
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
let lhs = number_of_query_words(&lhs.matches);
let rhs = number_of_query_words(&rhs.matches);
lhs.cmp(&rhs).reverse()
}
}

View File

@ -1,82 +0,0 @@
use std::cmp::Ordering;
use std::ops::Deref;
use std::marker;
use rocksdb::DB;
use serde::de::DeserializeOwned;
use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::rank::Document;
/// A helper struct that permits sorting documents by
/// some of their stored attributes.
///
/// # Note
///
/// If a document cannot be deserialized it will be considered [`None`][].
///
/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`,
/// so you must check the [`Ord`] implementation of `Option`.
///
/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None
/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
///
/// # Example
///
/// ```no-test
/// use serde_derive::Deserialize;
/// use meilidb::rank::criterion::*;
///
/// #[derive(Deserialize, PartialOrd, Ord, PartialEq, Eq)]
/// struct TimeOnly {
/// time: String,
/// }
///
/// let builder = CriteriaBuilder::with_capacity(8)
/// .add(SumOfTypos)
/// .add(NumberOfWords)
/// .add(WordsProximity)
/// .add(SumOfWordsAttribute)
/// .add(SumOfWordsPosition)
/// .add(Exact)
/// .add(SortBy::<TimeOnly>::new())
/// .add(DocumentId);
///
/// let criterion = builder.build();
///
/// ```
pub struct SortBy<T> {
_phantom: marker::PhantomData<T>,
}
impl<T> SortBy<T> {
pub fn new() -> Self {
SortBy::default()
}
}
impl<T> Default for SortBy<T> {
fn default() -> SortBy<T> {
SortBy { _phantom: marker::PhantomData }
}
}
impl<T, D> Criterion<D> for SortBy<T>
where D: Deref<Target=DB>,
T: DeserializeOwned + Ord,
{
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
let lhs = match view.document_by_id::<T>(lhs.id) {
Ok(doc) => Some(doc),
Err(e) => { eprintln!("{}", e); None },
};
let rhs = match view.document_by_id::<T>(rhs.id) {
Ok(doc) => Some(doc),
Err(e) => { eprintln!("{}", e); None },
};
lhs.cmp(&rhs)
}
}

View File

@ -1,205 +0,0 @@
use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use group_by::GroupBy;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::Match;
#[inline]
fn sum_matches_typos(matches: &[Match]) -> isize {
let mut sum_typos = 0;
let mut number_words = 0;
// note that GroupBy will never return an empty group
// so we can safely make this assumption
for group in GroupBy::new(matches, match_query_index) {
sum_typos += unsafe { group.get_unchecked(0).distance as isize };
number_words += 1;
}
sum_typos - number_words
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfTypos;
impl<D> Criterion<D> for SumOfTypos
where D: Deref<Target=DB>
{
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
let lhs = sum_matches_typos(&lhs.matches);
let rhs = sum_matches_typos(&rhs.matches);
lhs.cmp(&rhs)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{DocumentId, Attribute, WordArea};
// typing: "Geox CEO"
//
// doc0: "Geox SpA: CEO and Executive"
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
#[test]
fn one_typo_reference() {
let doc0 = {
let matches = vec![
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 2),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(0),
matches: matches,
}
};
let doc1 = {
let matches = vec![
Match {
query_index: 0,
distance: 1,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 2),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(1),
matches: matches,
}
};
let lhs = sum_matches_typos(&doc0.matches);
let rhs = sum_matches_typos(&doc1.matches);
assert_eq!(lhs.cmp(&rhs), Ordering::Less);
}
// typing: "bouton manchette"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn no_typo() {
let doc0 = {
let matches = vec![
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 1),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(0),
matches: matches,
}
};
let doc1 = {
let matches = vec![
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(1),
matches: matches,
}
};
let lhs = sum_matches_typos(&doc0.matches);
let rhs = sum_matches_typos(&doc1.matches);
assert_eq!(lhs.cmp(&rhs), Ordering::Less);
}
// typing: "bouton manchztte"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn one_typo() {
let doc0 = {
let matches = vec![
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 1,
attribute: Attribute::new_faillible(0, 1),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(0),
matches: matches,
}
};
let doc1 = {
let matches = vec![
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: DocumentId(1),
matches: matches,
}
};
let lhs = sum_matches_typos(&doc0.matches);
let rhs = sum_matches_typos(&doc1.matches);
assert_eq!(lhs.cmp(&rhs), Ordering::Equal);
}
}
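The score above is the sum of the best (smallest) distance per query word minus the number of matched query words, so matching more words is rewarded even when it costs a typo. Worked on the tests: in `one_typo_reference`, doc0 scores 0 + 0 − 2 = −2 and doc1 scores 1 + 0 − 2 = −1, so doc0 ranks first; in `one_typo`, doc0 scores 0 + 1 − 2 = −1 and doc1 scores 0 − 1 = −1, hence the `Ordering::Equal`. A simplified re-computation of that score from (query_index, distance) pairs, keeping only the first match of each query word as the code does:

// Illustrative only; assumes the pairs are sorted by query_index,
// as Document::from_matches guarantees for real matches.
fn score(matches: &[(u32, isize)]) -> isize {
    let mut sum_typos = 0;
    let mut number_words = 0;
    let mut last_query_index = None;
    for &(query_index, distance) in matches {
        if last_query_index != Some(query_index) {
            sum_typos += distance;
            number_words += 1;
            last_query_index = Some(query_index);
        }
    }
    sum_typos - number_words
}

// score(&[(0, 0), (1, 0)]) == -2   (doc0 in `one_typo_reference`)
// score(&[(0, 1), (1, 0)]) == -1   (doc1 in `one_typo_reference`)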

View File

@ -1,33 +0,0 @@
use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use group_by::GroupBy;
use crate::database::DatabaseView;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::Match;
#[inline]
fn sum_matches_attributes(matches: &[Match]) -> usize {
// note that GroupBy will never return an empty group
// so we can safely make this assumption
GroupBy::new(matches, match_query_index).map(|group| {
unsafe { group.get_unchecked(0).attribute.attribute() as usize }
}).sum()
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute;
impl<D> Criterion<D> for SumOfWordsAttribute
where D: Deref<Target=DB>
{
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
let lhs = sum_matches_attributes(&lhs.matches);
let rhs = sum_matches_attributes(&rhs.matches);
lhs.cmp(&rhs)
}
}

View File

@ -1,33 +0,0 @@
use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use group_by::GroupBy;
use crate::database::DatabaseView;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::Match;
#[inline]
fn sum_matches_attribute_index(matches: &[Match]) -> usize {
// note that GroupBy will never return an empty group
// so we can safely make this assumption
GroupBy::new(matches, match_query_index).map(|group| {
unsafe { group.get_unchecked(0).attribute.word_index() as usize }
}).sum()
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition;
impl<D> Criterion<D> for SumOfWordsPosition
where D: Deref<Target=DB>
{
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
let lhs = sum_matches_attribute_index(&lhs.matches);
let rhs = sum_matches_attribute_index(&rhs.matches);
lhs.cmp(&rhs)
}
}

View File

@ -1,162 +0,0 @@
use std::cmp::{self, Ordering};
use std::ops::Deref;
use rocksdb::DB;
use group_by::GroupBy;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::Match;
const MAX_DISTANCE: u32 = 8;
fn index_proximity(lhs: u32, rhs: u32) -> u32 {
if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE)
} else {
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
}
}
fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 {
if lhs.attribute.attribute() != rhs.attribute.attribute() { return MAX_DISTANCE }
index_proximity(lhs.attribute.word_index(), rhs.attribute.word_index())
}
fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 {
let mut min_prox = u32::max_value();
for a in lhs {
for b in rhs {
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
}
}
min_prox
}
fn matches_proximity(matches: &[Match]) -> u32 {
let mut proximity = 0;
let mut iter = GroupBy::new(matches, match_query_index);
// iterate over groups by windows of size 2
let mut last = iter.next();
while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
proximity += min_proximity(lhs, rhs);
last = Some(rhs);
}
proximity
}
#[derive(Debug, Clone, Copy)]
pub struct WordsProximity;
impl<D> Criterion<D> for WordsProximity
where D: Deref<Target=DB>
{
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
let lhs = matches_proximity(&lhs.matches);
let rhs = matches_proximity(&rhs.matches);
lhs.cmp(&rhs)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::Attribute;
#[test]
fn three_different_attributes() {
// "soup" "of the" "the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 0 }
// { id: 2, attr: 1, attr_index: 1 }
// { id: 2, attr: 2, attr_index: 0 }
// { id: 3, attr: 3, attr_index: 1 }
let matches = &[
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
Match { query_index: 1, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(2, 0), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(3, 1), ..Match::zero() },
];
// soup -> of = 8
// + of -> the = 1
// + the -> day = 8 (not 1)
assert_eq!(matches_proximity(matches), 17);
}
#[test]
fn two_different_attributes() {
// "soup day" "soup of the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 0, attr: 1, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 1 }
// { id: 2, attr: 1, attr_index: 2 }
// { id: 3, attr: 0, attr_index: 1 }
// { id: 3, attr: 1, attr_index: 3 }
let matches = &[
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
Match { query_index: 0, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
Match { query_index: 1, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(1, 2), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(0, 1), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(1, 3), ..Match::zero() },
];
// soup -> of = 1
// + of -> the = 1
// + the -> day = 1
assert_eq!(matches_proximity(matches), 3);
}
}
#[cfg(all(feature = "nightly", test))]
mod bench {
extern crate test;
use super::*;
use std::error::Error;
use self::test::Bencher;
use rand_xorshift::XorShiftRng;
use rand::{Rng, SeedableRng};
use crate::Attribute;
#[bench]
fn evaluate_proximity(bench: &mut Bencher) -> Result<(), Box<Error>> {
let number_matches = 30_000;
let mut matches = Vec::with_capacity(number_matches);
let mut rng = XorShiftRng::seed_from_u64(42);
for _ in 0..number_matches {
let query_index = rng.gen_range(0, 4);
let attribute = rng.gen_range(0, 5);
let word_index = rng.gen_range(0, 15);
let attribute = Attribute::new_faillible(attribute, word_index);
let match_ = Match { query_index, attribute, ..Match::zero() };
matches.push(match_);
}
bench.iter(|| {
let proximity = matches_proximity(&matches);
test::black_box(move || proximity)
});
Ok(())
}
}
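The proximity cost above is asymmetric on purpose: when the second query word appears after the first at distance d the cost is min(d, 8), and when it appears before it the cost is min(d, 8) + 1, so in-order occurrences are cheaper; matches in different attributes are charged the full MAX_DISTANCE. That is why, in `three_different_attributes` above, `the -> day` costs 8 rather than 1: the two words sit in different attributes. A standalone copy of the per-pair rule, for trying out values:

// Same rule as index_proximity above, isolated for experimentation.
const MAX_DISTANCE: u32 = 8;

fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    if lhs < rhs {
        (rhs - lhs).min(MAX_DISTANCE)
    } else {
        (lhs - rhs).min(MAX_DISTANCE) + 1
    }
}

// index_proximity(0, 1) == 1   (next word, in order)
// index_proximity(1, 0) == 2   (previous word, out of order)
// index_proximity(0, 42) == 8  (capped at MAX_DISTANCE)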

View File

@ -1,33 +0,0 @@
pub mod criterion;
mod query_builder;
mod distinct_map;
use crate::{Match, DocumentId};
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
#[inline]
fn match_query_index(a: &Match, b: &Match) -> bool {
a.query_index == b.query_index
}
#[derive(Debug, Clone)]
pub struct Document {
pub id: DocumentId,
pub matches: Vec<Match>,
}
impl Document {
pub fn new(doc: DocumentId, match_: Match) -> Self {
unsafe { Self::from_sorted_matches(doc, vec![match_]) }
}
pub fn from_matches(doc: DocumentId, mut matches: Vec<Match>) -> Self {
matches.sort_unstable();
unsafe { Self::from_sorted_matches(doc, matches) }
}
pub unsafe fn from_sorted_matches(id: DocumentId, matches: Vec<Match>) -> Self {
Self { id, matches }
}
}

View File

@ -1,301 +0,0 @@
use std::{cmp, mem, vec, str, char};
use std::ops::{Deref, Range};
use std::error::Error;
use std::hash::Hash;
use std::rc::Rc;
use group_by::BinaryGroupByMut;
use hashbrown::HashMap;
use fst::Streamer;
use rocksdb::DB;
use log::info;
use crate::automaton::{self, DfaExt, AutomatonExt};
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
use crate::rank::criterion::Criteria;
use crate::database::DatabaseView;
use crate::{Match, DocumentId};
use crate::rank::Document;
fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let mut automatons = Vec::new();
let mut words = query.split_whitespace().map(str::to_lowercase).peekable();
while let Some(word) = words.next() {
let has_following_word = words.peek().is_some();
let lev = if has_following_word || has_end_whitespace {
automaton::build_dfa(&word)
} else {
automaton::build_prefix_dfa(&word)
};
automatons.push(lev);
}
automatons
}
pub type FilterFunc<D> = fn(DocumentId, &DatabaseView<D>) -> bool;
pub struct QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>
{
view: &'a DatabaseView<D>,
criteria: Criteria<D>,
filter: Option<FI>,
}
impl<'a, D> QueryBuilder<'a, D, FilterFunc<D>>
where D: Deref<Target=DB>
{
pub fn new(view: &'a DatabaseView<D>) -> Result<Self, Box<Error>> {
QueryBuilder::with_criteria(view, Criteria::default())
}
}
impl<'a, D, FI> QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>,
{
pub fn with_criteria(view: &'a DatabaseView<D>, criteria: Criteria<D>) -> Result<Self, Box<Error>> {
Ok(QueryBuilder { view, criteria, filter: None })
}
pub fn with_filter<F>(self, function: F) -> QueryBuilder<'a, D, F>
where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
{
QueryBuilder {
view: self.view,
criteria: self.criteria,
filter: Some(function)
}
}
pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'a, D, FI, F>
where F: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
K: Hash + Eq,
{
DistinctQueryBuilder {
inner: self,
function: function,
size: size
}
}
fn query_all(&self, query: &str) -> Vec<Document> {
let automatons = split_whitespace_automatons(query);
let mut stream = {
let mut op_builder = fst::map::OpBuilder::new();
for automaton in &automatons {
let stream = self.view.index().positive.map().search(automaton);
op_builder.push(stream);
}
op_builder.union()
};
let mut matches = HashMap::new();
while let Some((input, indexed_values)) = stream.next() {
for iv in indexed_values {
let automaton = &automatons[iv.index];
let distance = automaton.eval(input).to_u8();
let is_exact = distance == 0 && input.len() == automaton.query_len();
let doc_indexes = &self.view.index().positive.indexes();
let doc_indexes = &doc_indexes[iv.value as usize];
for doc_index in doc_indexes {
let match_ = Match {
query_index: iv.index as u32,
distance: distance,
attribute: doc_index.attribute,
is_exact: is_exact,
word_area: doc_index.word_area,
};
matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_);
}
}
}
info!("{} documents to classify", matches.len());
matches.into_iter().map(|(i, m)| Document::from_matches(i, m)).collect()
}
}
impl<'a, D, FI> QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>,
FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
{
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
// We give the filtering work to the query distinct builder,
// specifying a distinct rule that has no effect.
if self.filter.is_some() {
let builder = self.with_distinct(|_, _| None as Option<()>, 1);
return builder.query(query, range);
}
let mut documents = self.query_all(query);
let mut groups = vec![documents.as_mut_slice()];
let view = &self.view;
'criteria: for criterion in self.criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
for group in tmp_groups {
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < range.start {
documents_seen += group.len();
groups.push(group);
continue;
}
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
for group in BinaryGroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
documents_seen += group.len();
groups.push(group);
// we have sorted enough documents if the last sorted document is past
// the end of the requested range, so we can continue to the next criterion
if documents_seen >= range.end { continue 'criteria }
}
}
}
// `drain` removes the documents efficiently using `ptr::copy`
// TODO it could be more efficient to have a custom iterator
let offset = cmp::min(documents.len(), range.start);
documents.drain(0..offset);
documents.truncate(range.len());
documents
}
}
pub struct DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>
{
inner: QueryBuilder<'a, D, FI>,
function: FD,
size: usize,
}
impl<'a, D, FI, FD> DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>,
{
pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'a, D, F, FD>
where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
{
DistinctQueryBuilder {
inner: self.inner.with_filter(function),
function: self.function,
size: self.size
}
}
}
impl<'a, D, FI, FD, K> DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>,
FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
FD: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
K: Hash + Eq,
{
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
let mut documents = self.inner.query_all(query);
let mut groups = vec![documents.as_mut_slice()];
let mut key_cache = HashMap::new();
let view = &self.inner.view;
let mut filter_map = HashMap::new();
// these two variables keep track of the current distinct map and
// of the raw offset of the start of the group in which the
// range.start bound is located, according to the distinct function
let mut distinct_map = DistinctMap::new(self.size);
let mut distinct_raw_offset = 0;
'criteria: for criterion in self.inner.criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
let mut documents_seen = 0;
for group in tmp_groups {
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < distinct_raw_offset {
documents_seen += group.len();
groups.push(group);
continue;
}
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
for group in BinaryGroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
// we must compute the real distinct length of this sub-group
for document in group.iter() {
let filter_accepted = match &self.inner.filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id, view))
},
None => true,
};
if filter_accepted {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (self.function)(document.id, view).map(Rc::new));
match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
};
}
// the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end { break }
}
documents_seen += group.len();
groups.push(group);
// if this sub-group does not overlap with the requested range
// we must update the distinct map and its start index
if buf_distinct.len() < range.start {
buf_distinct.transfert_to_internal();
distinct_raw_offset = documents_seen;
}
// we have sorted enough documents if the last sorted document is past
// the end of the requested range, so we can continue to the next criterion
if buf_distinct.len() >= range.end { continue 'criteria }
}
}
}
let mut out_documents = Vec::with_capacity(range.len());
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
for document in documents.into_iter().skip(distinct_raw_offset) {
let filter_accepted = match &self.inner.filter {
Some(_) => filter_map.remove(&document.id).expect("BUG: filtered not found"),
None => true,
};
if filter_accepted {
let key = key_cache.remove(&document.id).expect("BUG: cached key not found");
let distinct_accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
};
if distinct_accepted && seen.len() > range.start {
out_documents.push(document);
if out_documents.len() == range.len() { break }
}
}
}
out_documents
}
}
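Both query paths above share the same bucket-sort strategy: documents stay in coarse groups, each criterion only sorts the groups that overlap the requested range, and the distinct/filter variant additionally deduplicates by a user-provided key while counting toward that range. A hedged usage sketch of the builder API shown above (`view` is a `DatabaseView`; `MyDoc` and its `group` field are hypothetical):

// Sketch only; both closures receive the DocumentId and the view.
let documents = view
    .query_builder()?
    .with_filter(|id, view| {
        // hypothetical predicate: only keep documents that deserialize to MyDoc
        view.document_by_id::<MyDoc>(id).is_ok()
    })
    .with_distinct(|id, view| {
        // hypothetical key: keep at most one result per `group` value
        view.document_by_id::<MyDoc>(id).ok().map(|d| d.group)
    }, 1)
    .query("hello world", 0..20);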

View File

@ -1,188 +0,0 @@
use std::mem;
use self::Separator::*;
pub trait TokenizerBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
}
pub struct DefaultBuilder;
impl DefaultBuilder {
pub fn new() -> DefaultBuilder {
DefaultBuilder
}
}
#[derive(Debug, PartialEq, Eq)]
pub struct Token<'a> {
pub word: &'a str,
pub word_index: usize,
pub char_index: usize,
}
impl TokenizerBuilder for DefaultBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a> {
Box::new(Tokenizer::new(text))
}
}
pub struct Tokenizer<'a> {
word_index: usize,
char_index: usize,
inner: &'a str,
}
impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer {
let mut char_advance = 0;
let mut index_advance = 0;
for (n, (i, c)) in string.char_indices().enumerate() {
char_advance = n;
index_advance = i;
if detect_separator(c).is_none() { break }
}
Tokenizer {
word_index: 0,
char_index: char_advance,
inner: &string[index_advance..],
}
}
}
#[derive(Debug, Clone, Copy)]
enum Separator {
Short,
Long,
}
impl Separator {
fn add(self, add: Separator) -> Separator {
match (self, add) {
(_, Long) => Long,
(Short, Short) => Short,
(Long, Short) => Long,
}
}
fn to_usize(self) -> usize {
match self {
Short => 1,
Long => 8,
}
}
}
fn detect_separator(c: char) -> Option<Separator> {
match c {
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
' ' | '\'' | '"' => Some(Short),
_ => None,
}
}
impl<'a> Iterator for Tokenizer<'a> {
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
let mut start_word = None;
let mut distance = None;
for (i, c) in self.inner.char_indices() {
match detect_separator(c) {
Some(sep) => {
if let Some(start_word) = start_word {
let (prefix, tail) = self.inner.split_at(i);
let (spaces, word) = prefix.split_at(start_word);
self.inner = tail;
self.char_index += spaces.chars().count();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
let token = Token {
word: word,
word_index: self.word_index,
char_index: self.char_index,
};
self.char_index += word.chars().count();
return Some(token)
}
distance.replace(distance.map_or(sep, |s| s.add(sep)));
},
None => { start_word.get_or_insert(i); },
}
}
if let Some(start_word) = start_word {
let prefix = mem::replace(&mut self.inner, "");
let (spaces, word) = prefix.split_at(start_word);
let token = Token {
word: word,
word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),
char_index: self.char_index + spaces.chars().count(),
};
return Some(token)
}
None
}
}
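Note that `word_index` is not a plain counter: a short separator (space, quote) advances it by 1, a long separator (punctuation) advances it by 8, and consecutive separators are merged keeping the strongest one. This is what makes words on opposite sides of a sentence boundary look far apart to the `WordsProximity` criterion. The tests below exercise this; a minimal check reusing `Tokenizer` as defined above:

// Illustrative: "!" is a long separator, so the second word jumps to index 8.
let mut tokenizer = Tokenizer::new("yo ! lolo");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), None);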
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn easy() {
let mut tokenizer = Tokenizer::new("salut");
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
assert_eq!(tokenizer.next(), None);
}
}

Some files were not shown because too many files have changed in this diff.