Compare commits


457 Commits
v0.1 ... v0.4.1

Author SHA1 Message Date
87e5998489 Merge pull request #194 from meilisearch/set-code-public
Set code public
2019-09-19 18:25:13 +02:00
d7d1b6ff02 chore: reformat tests 2019-09-19 18:08:25 +02:00
7073b42afa feat: get update status Enqueued / Processed / Unknown 2019-09-19 18:08:14 +02:00
120d209e66 chore: set public SchemaProps values 2019-09-19 12:43:36 +02:00
62e981c6b8 chore: set public the main duration on update status 2019-09-19 12:43:36 +02:00
941302a4be chore: export ranked map 2019-09-19 12:43:36 +02:00
20f423268e chore: re-export database::Error type 2019-09-19 12:43:36 +02:00
522013425b chore: export a getter for synonyms 2019-09-19 12:43:35 +02:00
e3c413759f chore: implement deref on CommonIndex 2019-09-19 12:43:35 +02:00
6ed97d1c19 chore: re-export UpdateType/DetailedDuration/UpdateStatus 2019-09-19 12:43:35 +02:00
53ad1fc068 chore: split tests into multiples files 2019-09-19 12:43:35 +02:00
1e2ef06c5c Merge pull request #196 from meilisearch/fix-cf-handle-creation
Create the Column Family only when it doesn't already exist
2019-09-19 12:29:50 +02:00
9db86f13f3 fix: Only create the Column Family when it doesn't already exist 2019-09-19 12:02:34 +02:00
369461e635 Merge pull request #195 from meilisearch/update-readme
Update the README
2019-09-19 12:01:09 +02:00
d2d22ac76d doc: Update the README and refer to examples instead of the main binary 2019-09-19 12:00:34 +02:00
a5a19fc9dd Merge pull request #193 from meilisearch/get-documents-id
Add a method to get an iterator over all documents ids
2019-09-18 16:09:30 +02:00
a36c991897 feat: add a method to get an iterator over all documents ids 2019-09-18 15:41:06 +02:00
4f71219e17 Merge pull request #192 from meilisearch/bump-dependencies
Bump dependencies
2019-09-18 15:10:15 +02:00
69e0bae75e chore: Bump dependencies 2019-09-18 14:42:23 +02:00
1b18679950 Merge pull request #191 from meilisearch/typed-settings
Typed settings
2019-09-18 14:04:07 +02:00
e1c119b5a8 chore: add test for custom settings 2019-09-18 12:22:26 +02:00
03709910fd feat: add typed index custom settings for common uses 2019-09-18 12:22:21 +02:00
8fdb330195 Merge pull request #190 from meilisearch/bump-dependencies-versions
Bump dependency
2019-09-18 10:29:22 +02:00
59ae6458dc chore: bump dependencies 2019-09-17 18:50:44 +02:00
c10b701b9a Merge pull request #189 from meilisearch/documents-fields-repartition
Add the documents fields repartition into stats
2019-09-17 16:23:49 +02:00
80caa8b60d feat: add the documents fields repartition into stats 2019-09-17 15:56:13 +02:00
97cf5cca2a Merge pull request #188 from meilisearch/delete-index
Delete an index
2019-09-17 14:25:38 +02:00
3e76dc718b feat: delete an index and all it's associated data 2019-09-17 13:29:56 +02:00
5a17b5a63b Merge pull request #187 from meilisearch/export-snapshots
Re-export rocksdb snapshot function
2019-09-17 12:54:14 +02:00
5bc5185ac5 feat: re-export rocksdb snapshot function 2019-09-17 11:37:17 +02:00
3712fa7c24 Merge pull request #186 from meilisearch/common-db-tree
feat: expose a common DB tree for the database
2019-09-16 19:08:52 +02:00
918cc235a4 feat: expose a common DB tree for the database 2019-09-16 16:05:05 +02:00
8d24e54fa1 Merge pull request #185 from meilisearch/serde-schema
Implement De/Serialize on schema
2019-09-16 15:18:02 +02:00
35b7b58ff7 feat: Remove the Schema to/from_toml/json/bin methods 2019-09-16 14:50:38 +02:00
ffc29a319f feat: Implement De/Serialize on schema 2019-09-16 14:50:37 +02:00
ba3ac5ea7b chore: Create an internal Schema::to_builder method 2019-09-16 14:50:37 +02:00
ee6a54fe4c feat: Replace the linked-hash-map dependency by indexmap 2019-09-16 14:50:37 +02:00
f6ff79085e Merge pull request #184 from meilisearch/unify-update-types
Unify the Update and UpdateOwned types
2019-09-16 14:00:12 +02:00
bcd38c7d5a feat: Unify the Update and UpdateOwned types 2019-09-16 12:33:08 +02:00
aaeb25828f Merge pull request #183 from meilisearch/number-of-documents
Compute the number of documents on updates
2019-09-14 16:32:18 +02:00
af26c39482 test: Improve the tests of the number of documents counting 2019-09-14 15:29:46 +02:00
2006259a23 feat: Improve the number of documents counting 2019-09-14 15:26:41 +02:00
707e2f4d77 feat: Update the number of documents in the KV 2019-09-14 15:26:39 +02:00
8d8aed36a8 feat: Count the number of deleted/inserted documents 2019-09-14 15:24:39 +02:00
2658ef0176 Merge pull request #182 from meilisearch/replace-sled-by-rocksdb
Replace sled by RocksDB
2019-09-14 11:32:26 +02:00
400d542fef feat: Update the README to reflect the kv store update 2019-09-12 16:28:23 +02:00
f46868407c feat: Make RocksDB works seemlessly like sled 2019-09-05 18:43:10 +02:00
e3fa07077c feat: Introduce the CfTree and CfIter types 2019-09-05 14:53:09 +02:00
e5763e73eb chore: Prefer using const names to avoid typos 2019-09-05 13:22:53 +02:00
fd880e0a0e Merge pull request #175 from meilisearch/moving-back-to-sled
Moving back to sled
2019-09-05 13:14:48 +02:00
e33cc89846 feat: Introduce update callbacks 2019-09-05 11:48:26 +02:00
f40b373f9f feat: Introduce the UpdateStatus type 2019-09-05 11:48:26 +02:00
cd8535d410 feat: Introduce the update_status/_blocking functions 2019-09-05 11:48:25 +02:00
f07b99fe97 fix: Make the tests work with the new update system 2019-09-05 11:48:25 +02:00
f45a00df3b fix: Cloned ArcSwaps are unsynchronized versions 2019-09-05 11:46:02 +02:00
cd864c40bc feat: Make the update update serialization be based on message pack 2019-09-05 11:46:02 +02:00
91b44a2759 chore: Change the Box<Error> to be marked dyn 2019-09-05 11:46:01 +02:00
d8cd8c5def chore: Move the updates in their own module 2019-09-05 11:46:01 +02:00
b0be06540a chore: Simplify the update application 2019-09-05 11:46:01 +02:00
4deee93a55 feat: Introduce synonyms deletion using the update system 2019-09-05 11:33:11 +02:00
451c0a6d03 feat: Introduce synonyms addition using the update system 2019-09-05 11:33:10 +02:00
0db3e6c58c feat: Introduce documents deletion using the update system 2019-09-05 11:33:10 +02:00
f83d6df4ef feat: Introduce documents addition using the update system 2019-09-05 11:33:10 +02:00
5a9e25c315 feat: Introduce the UpdatesIndex type 2019-09-05 11:14:11 +02:00
50e3c2c3de chore: Upgrade the meilidb-data dependencies 2019-09-05 10:49:46 +02:00
093ee9732f Merge pull request #180 from meilisearch/store-every-document
Change the STORED attribute property by DISPLAYED
2019-09-04 14:45:00 +02:00
333189ee51 fix: Change every stored schema property by displayed 2019-09-04 11:16:36 +02:00
50b8a66794 feat: Change the STORED attribute property by DISPLAYED 2019-09-03 11:14:20 +02:00
8be3fc1a66 Merge pull request #179 from meilisearch/deunicode-before-tokenize
Improve the tokenizer by split after deunicode
2019-09-02 17:20:30 +02:00
b5503989f9 feat: Improve the tokenizer by split after deunicode 2019-09-02 16:54:54 +02:00
5b8bc09826 Merge pull request #176 from meilisearch/no-more-hanging-threads
Replace the rayon::scope by always checking time
2019-09-01 20:02:03 +02:00
c8ee21f227 feat: Replace the rayon::scope by always checking time 2019-09-01 18:52:38 +02:00
a420fbf1e8 Merge pull request #174 from meilisearch/arc-fst-sets
Do not clone probably large fst::Sets, Arc them
2019-08-30 14:52:28 +02:00
ca34c28335 feat: Do not clone probably large fst::Sets, Arc them 2019-08-30 14:37:28 +02:00
3e1b81c4ce Merge pull request #173 from meilisearch/fix-ranked-map-set
Use the right ranked-map key name
2019-08-30 14:21:14 +02:00
9b353dfda6 chore: Use const names to avoid typos 2019-08-30 12:36:10 +02:00
d8dcc6f34b fix: Use the right ranked-map key name 2019-08-30 12:21:00 +02:00
fba1272a3e Merge pull request #172 from meilisearch/expose-internal-functions
Expose some internal functions
2019-08-29 15:26:42 +02:00
e20a038970 fix: Expose some internal functions 2019-08-29 15:11:51 +02:00
6f34dccc89 Merge pull request #171 from meilisearch/stringify-document-id
Transform identifiers fields into a string before hashing it
2019-08-29 13:42:46 +02:00
f5b0eb044a fix: Transform the identifier value into a string before hashing it 2019-08-29 11:41:20 +02:00
bae86e978e Merge pull request #170 from meilisearch/async-word-index-fetching-with-rayon-scope
Async word index fetching with rayon scope
2019-08-28 14:37:38 +02:00
8030a822ab test: Add a way to setup the fetch timeout of the query-database example 2019-08-28 13:42:20 +02:00
9c5ec110e5 feat: Introduce a way to enable or disable query timeouts 2019-08-28 13:24:34 +02:00
67302d09f3 feat: Multiword rewrite while there is time 2019-08-19 11:12:23 +02:00
7dc9ea78fa feat: Make the automaton DFA construction lazy 2019-08-19 11:12:23 +02:00
0ee56314fb feat: Try to simplify Store trait bound with a rayon scope 2019-08-19 11:10:54 +02:00
b7b60b5fe5 feat: Introduce a new thread to avoid waiting on doc indexes fetchs 2019-08-16 16:35:19 +02:00
d9c9fafd78 feat: Fetch doc indexes while there is time 2019-08-16 15:01:25 +02:00
bb0a79c577 feat: Process automatons in the order they were sort 2019-08-16 12:25:35 +02:00
81d44a0854 feat: Order automatons by importance 2019-08-16 12:19:34 +02:00
ebc95cb8f2 feat: Display the documents fields in the order they were declared 2019-08-16 11:25:42 +02:00
a488c00a2e feat: Use RustyLine in the query-database example 2019-08-16 11:25:42 +02:00
bf3c2c3725 feat: Move the multi-word rewriting algorithm into its own function 2019-08-16 11:25:42 +02:00
89df496f0c feat: Separate highlights from matches to make the code easier to follow 2019-08-16 11:25:42 +02:00
9959f2e952 feat: Move the RawDocument type to its own module 2019-08-16 11:25:42 +02:00
795557c046 feat: Remove query splitting from the automaton generation 2019-08-16 11:25:42 +02:00
225a3bf184 test: Produce tests that work with the new cumulative word index system 2019-08-16 11:25:42 +02:00
e65d7418b7 feat: Remove the query index from the Automaton type 2019-08-16 11:25:42 +02:00
f478bbf826 feat: Introduce the QueryEnhancer in the query synonym system 2019-08-16 11:25:42 +02:00
5e691c2140 feat: Introduce the QueryEnhancer type 2019-08-16 11:25:42 +02:00
e0cadaa68d Merge pull request #165 from meilisearch/reorder-schema-attributes
Reorder schema attributes
2019-07-01 16:12:33 +02:00
9175e4686b feat: Collect TmpMatches only on tests, producing data useful for tests 2019-07-01 14:55:47 +02:00
e8afca614c chore: Little clean ups of meilidb-core 2019-07-01 14:34:06 +02:00
4f4b630ae9 fix: Make the examples compile with the new Highlight type 2019-07-01 12:06:17 +02:00
6b6db2f8e6 feat: Introduce the Highlight type to simplify the data oriented design 2019-07-01 12:06:16 +02:00
b7ed22bc59 feat: Introduce on the fly attributes reordering with meilidb-core 2019-07-01 12:03:31 +02:00
97cc3c7cce Merge pull request #166 from meilisearch/split-query-words
Split query words
2019-06-28 18:30:13 +02:00
f5d52396f5 feat: Support query words splits 2019-06-28 18:04:35 +02:00
9cc154da05 chore: Rewrite tests to use iterators and be easily testable 2019-06-28 18:04:35 +02:00
5aa49d232c feat: Rewrite Automaton generation related code 2019-06-28 18:04:35 +02:00
1cb42cbb30 Merge pull request #164 from meilisearch/concat-query-words
Support query words concatenation
2019-06-28 18:03:49 +02:00
9f320590d3 feat: Support query words concatenation 2019-06-27 10:14:17 +02:00
1b0fd2e0ba Merge pull request #160 from meilisearch/synonyms
Support all types of synonyms
2019-06-26 14:59:45 +02:00
b249b2a81b feat: Support removing specific synonym alternatives 2019-06-26 10:45:51 +02:00
0a5d4eb7ed feat: Normalize synonym strings and query strings to search for synonyms 2019-06-26 10:45:51 +02:00
3dcbc737f3 feat: Make synonyms be not considered like exact matches 2019-06-26 10:45:51 +02:00
43f11e929d fix: Do not trigger a synonym when its not the last word and is a prefix 2019-06-26 10:45:51 +02:00
8f2a551cca feat: Trigger synonym replacement only when the last word is tipped 2019-06-26 10:45:50 +02:00
8f044c6853 fix: Only create non-prefix DFA when generating synonyms alternatives 2019-06-26 10:45:50 +02:00
a76c00a787 feat: Create types to edit synonyms and keep them in the database 2019-06-26 10:45:50 +02:00
0633f16b4d feat: Make multi-word support multi-word synonyms 2019-06-26 10:45:50 +02:00
59fafb8b30 feat: Support one word has multi-word alternatives 2019-06-26 10:45:50 +02:00
d2bd99cc2a fix: Append DocIndexes when building InMemorySetStore from an Iterator 2019-06-26 10:45:50 +02:00
62930ecc4e feat: Deduplicate automatons when synonyms produce duplicated ones 2019-06-26 10:45:49 +02:00
6cb57aa8a4 feat: Unique word has multi-word synonyms basically work 2019-06-26 10:45:49 +02:00
9861c3878e tests: Add more tests about synonyms 2019-06-26 10:45:49 +02:00
707d7b062b feat: Made query handle synonyms via the Store 2019-06-26 10:45:49 +02:00
18736bdcd0 feat: Introduce the synonyms concept to the Store trait 2019-06-26 10:45:49 +02:00
e8b2e86007 feat: Introduce a basic way to handle synonyms 2019-06-26 10:45:48 +02:00
ae8b4f56f2 Merge pull request #163 from meilisearch/export-compute-docid
Expose a function to compute the DocumentId from an Hashable value
2019-06-25 12:25:38 +02:00
28a0074497 feat: Expose a function to compute the DocumentId from an Hashable value 2019-06-25 11:21:12 +02:00
71c039db09 Merge pull request #162 from meilisearch/trustful-hash
Prefer using a reliable SipHash to compute document ids
2019-06-22 11:51:52 +02:00
15646c258b fix: Prefer using a reliable SipHash to compute document ids 2019-06-22 11:22:21 +02:00
25a5605b35 Merge pull request #161 from meilisearch/remove-tide
Remove tide as it break compilation on the latest nightly
2019-06-18 14:04:47 +02:00
b630e32c6a fix: Remove tide as it break compilation on the latest nightly 2019-06-18 13:40:46 +02:00
c39254bf98 Merge pull request #159 from meilisearch/create-specific-schema-crate
Move the Schema to its own workspace crate
2019-06-03 09:17:14 +02:00
994a0e78f1 feat: Move the Schema to its own workspace crate 2019-05-29 15:37:28 +02:00
ab2ca15c5c Merge pull request #158 from meilisearch/moving-back-to-rocksdb
Moving back to RocksDB
2019-05-29 14:56:55 +02:00
07f447c457 feat: Force RocksDB compaction 2019-05-28 17:38:59 +02:00
62c8f1ba04 feat: Fix the index opening when index already exists 2019-05-26 11:36:47 +02:00
e08edc2d6b feat: Introduce some stats to ease debugging 2019-05-25 12:12:24 +02:00
a147c09b06 feat: Make more functions accessible on the custom settings 2019-05-24 14:37:04 +02:00
9fca74443e feat: Wrap the database index access to improve usability 2019-05-24 14:26:05 +02:00
6f258f71d5 feat: Implement some convenient accessors for custom settings 2019-05-23 15:43:41 +02:00
ce61c16dbe feat: Disable all the default RocksDB compression features 2019-05-23 15:35:53 +02:00
4c973238a1 feat: Introduce a basic RocksDB based version 2019-05-23 14:57:29 +02:00
3a8da82792 Merge pull request #157 from meilisearch/update-readme
Fix some badly spelled sentences
2019-05-22 14:01:33 +02:00
f10da122ff doc: Fix some badly spelled sentences 2019-05-22 11:41:03 +02:00
ec20a8cacb Merge pull request #156 from meilisearch/clippy-pass
Do a little clippy pass
2019-05-22 11:33:55 +02:00
102fb506db chore: Do a little clippy pass 2019-05-22 11:00:58 +02:00
34ba520f44 Merge pull request #155 from meilisearch/update-sdset
Use safest SetBuf constructor instead of new_unchecked
2019-05-21 18:23:39 +02:00
fa099555c0 feat: Use safest SetBuf constructor instead of new_unchecked 2019-05-21 18:15:48 +02:00
8387c5b14e Merge pull request #153 from meilisearch/example-expose-system-stats
Output more informations from the examples on document injection
2019-05-21 16:50:25 +02:00
5040095228 feat: Output more informations from the examples on document injection 2019-05-21 16:37:17 +02:00
788fae59a1 Merge pull request #154 from meilisearch/reintroduce-sort-by-attr
Reintroduce the `SortByAttr` custom criterion
2019-05-21 16:32:12 +02:00
e042f44e0d feat: Reintroduce the SortByAttr custom criterion 2019-05-21 16:22:23 +02:00
b1fc3e5cec Merge pull request #152 from meilisearch/documents-deletion-updates-ranked-map
Remove the documents from the ranked map on documents deletion
2019-05-21 13:59:21 +02:00
d7b1b7a2a9 feat: Remove the documents from the ranked map on documents deletion 2019-05-21 13:33:42 +02:00
97744ad24f Merge pull request #151 from meilisearch/expose-sled-compression-factor
Expose the sled compression setting
2019-05-20 15:03:43 +02:00
2e79b2a871 feat: Expose the sled compression setting 2019-05-20 14:41:15 +02:00
349f0f7068 Merge pull request #148 from meilisearch/split-fst-docindexes
Split fst doc-indexes
2019-05-20 14:24:48 +02:00
94f9587db1 feat: Implement Debug on RawDocument for more convenience 2019-05-20 11:21:41 +02:00
6df8f62022 test: Add more test to some criteria 2019-05-20 11:21:40 +02:00
8c71473498 feat: Introduce the Criterion::name to allow better debugging 2019-05-20 11:21:40 +02:00
08d89053da feat: Introduce a little simple http server for demo 2019-05-16 17:09:41 +02:00
4b36fa0739 test: Add tests about additions and deletions of documents 2019-05-16 13:44:21 +02:00
921b063a71 feat: Make the DocumentsDeletion public interface to take serde types 2019-05-16 12:04:08 +02:00
3de633c869 feat: Reexport sled to reduce user level library incompatibilities 2019-05-16 12:04:08 +02:00
021f0545eb doc: Update the deep-dive explanation text 2019-05-16 12:04:08 +02:00
b701eb85b8 doc: Update the README features links 2019-05-16 12:04:08 +02:00
4e80378a77 chore: Rename the ebay example into kaggle 2019-05-16 12:04:07 +02:00
830d2f28b9 feat: Introduce a custom tree for user custom settings 2019-05-16 12:04:07 +02:00
c5ba34d0b0 chore: Replace crate only public interface to be completely public 2019-05-16 12:04:07 +02:00
2e31bb519a chore: Split the database structure internal types 2019-05-16 12:04:07 +02:00
169bd4cb39 feat: Store all documents words by document rather than by attribute 2019-05-15 15:42:13 +02:00
aa90f22865 feat: Remove the Index dependency of the Serializer 2019-05-15 15:42:12 +02:00
9bba90c47e fix: Fix a bug in the Database open-index method 2019-05-15 15:42:12 +02:00
2844cb5bca fix: Make the examples compile 2019-05-15 15:42:12 +02:00
dff81bb161 feat: Prefer set/del methods instead of set with an Option type 2019-05-15 15:42:12 +02:00
1f2abce7c3 feat: Introduce the DocumentsDeletion type 2019-05-15 15:42:11 +02:00
e67ada8823 feat: Introduce the DocumentsAddition type 2019-05-15 15:42:11 +02:00
42e39f6eb5 feat: Introduce a simplified version of the Store trait 2019-05-15 15:42:11 +02:00
f317a7a322 feat: implement open/create_index on the Database type 2019-05-15 15:42:11 +02:00
8434ecbb43 feat: Introduce the RankedMap real type 2019-05-15 15:42:10 +02:00
0c18026240 feat: Introduce Tree wrappers for each index component 2019-05-15 15:42:10 +02:00
6eb25687f8 feat: Handle word doc-indexes sled tree errors 2019-05-15 15:42:10 +02:00
737db5668b chore: Remove the WriteToBytes trait 2019-05-15 15:42:10 +02:00
f16e0333e4 chore: Remove the SharedData/Cursor types 2019-05-15 15:42:09 +02:00
27ffcaabe9 chore: Remove the DocIndexes type 2019-05-15 15:42:09 +02:00
db031a5b95 chore: Remove the DocIds type 2019-05-15 15:42:09 +02:00
2e9fbd07cd chore: Remove most of the warnings 2019-05-15 15:42:09 +02:00
74acf83464 chore: Remove the NewIndexEvent type 2019-05-15 15:42:08 +02:00
3dc057ca9c feat: Introduce the new Index system 2019-05-15 15:42:08 +02:00
e142339106 Merge pull request #150 from felixonmars/patch-1
chore: Fix some typos
2019-05-06 15:00:53 +02:00
39038750a8 chore: Fix some typos 2019-05-06 20:12:33 +08:00
f68733bf11 Merge pull request #149 from meilisearch/ci-only-nightly
Update ci with rust nightly only
2019-05-02 15:43:53 +02:00
85edb3e90c Update ci with rust nightly only 2019-05-02 11:43:45 +02:00
d7ce6d016b Merge pull request #147 from meilisearch/moving-to-sled
Make the repository a workspace and move to sled
2019-04-29 15:21:02 +02:00
9023a12ad4 feat: Introduce the unrankable error variant 2019-04-29 14:32:04 +02:00
0547671246 feat: Take ranked attributes into account 2019-04-29 14:32:04 +02:00
068f1bc202 feat: Index unidecoded words 2019-04-29 14:32:04 +02:00
7035f76077 squash-me: Make better measurements of the retrieving spent time 2019-04-29 14:32:04 +02:00
f0268d49fe fix: Always lowercase indexed tokens 2019-04-29 14:32:04 +02:00
7dbf5d6319 fix: Make the examples build 2019-04-29 14:32:03 +02:00
ed6b6038ee feat: Finalize index merging on document insertion 2019-04-29 14:32:03 +02:00
ad24ef8a25 feat: Index words of structs, maps and tuples 2019-04-29 14:32:03 +02:00
645bab7748 feat: Index documents using the Serializer struct 2019-04-29 14:32:03 +02:00
abd7d1de48 feat: Introduce the extract_document_id function 2019-04-29 14:32:03 +02:00
ea0ee070ef feat: Introduce the Serializer
Which will serialize documents fields as message pack in the kv-store
2019-04-29 14:32:03 +02:00
2a69170f14 feat: Introduce the DocumentsDeletion type 2019-04-29 14:32:02 +02:00
725e7b4229 chore: Move the Deserializer into the the serde module 2019-04-29 14:32:02 +02:00
187e6740bd feat: Allow users to construct query builders from database indexes 2019-04-29 14:32:02 +02:00
4b40d5b0d4 feat: Introduce the Index struct 2019-04-29 14:32:02 +02:00
ee2bad20c7 feat: Store the RankedMap into the inner sled tree 2019-04-29 14:32:02 +02:00
b7805fee93 feat: Store already opened indexes and word indexes 2019-04-29 14:32:02 +02:00
0104e93ba9 feat: Introduce index events to update the WordIndex 2019-04-29 14:32:02 +02:00
25a4961453 feat: Introduce the Indexer struct 2019-04-29 14:32:01 +02:00
7338e522bd squash-me: Add set/get/del_document_attribute to Index methods 2019-04-29 14:32:01 +02:00
58c020a2e1 feat: Store the word index into the database index 2019-04-29 14:32:01 +02:00
f7eced03fd chore: Using a fork of the fst library that support Arc<[u8]> 2019-04-29 14:32:01 +02:00
9be7c02461 chore: Update sled to 0.22.1 2019-04-29 14:32:01 +02:00
9483f2df60 feat: Introduce a custom Error type 2019-04-29 14:32:01 +02:00
f17a05c342 feat: Introduce the RankedMap type 2019-04-29 14:32:00 +02:00
e41c551757 feat: Introduce the Number type 2019-04-29 14:32:00 +02:00
95dfbd1fe0 feat: Introduce the meilidb-data schema module 2019-04-29 14:32:00 +02:00
287d5dee4d feat: Introduce the meilidb-data workspace member 2019-04-29 14:32:00 +02:00
77405cc103 chore: Remove the database module from meilidb 2019-04-29 14:32:00 +02:00
abf7191eec feat: Make the Tokenizer able to support tokenizing sequences 2019-04-29 14:32:00 +02:00
c6bb2b6f9c chore: Make the debug symbols available for release binaries 2019-04-29 14:31:59 +02:00
acede0f3e8 fix: Correctly assert the DocIndex memory size 2019-04-29 14:31:59 +02:00
e56106cbdc chore: Update the toml dependency 2019-04-29 14:31:59 +02:00
87f9528791 feat: Use the new Tokenizer 2019-04-29 14:31:59 +02:00
397522f277 fet: Move meilidb example into the meilidb workspace 2019-04-29 14:31:59 +02:00
a745819ddf feat: Simplify the Tokenizer to use the LinearStrGroupBy type 2019-04-29 14:31:37 +02:00
5d5bcf7011 feat: Remove the FilterFunc alias type 2019-04-29 14:31:37 +02:00
19e67dcf0b feat: Move query splitting into the tokenizer workspace 2019-04-29 14:31:37 +02:00
1897da5348 feat: Move tokenizer things into the meilidb-tokenizer workspace 2019-04-29 14:31:37 +02:00
d8cbb03c42 chore: Update the .gitignore file 2019-04-29 14:31:36 +02:00
bc227bef21 chore: Add a nightly feature to meilidb-core 2019-04-29 14:31:36 +02:00
3bcb1dc802 chore: Allow the activation of the meilidb-core i128 feature 2019-04-29 14:31:36 +02:00
d0786b4156 chore: Move the SortByAttr into meilidb 2019-04-29 14:31:36 +02:00
14790eeae3 chore: Move index related things to the meilidb-core workspace member 2019-04-29 14:31:35 +02:00
3056b351fa Merge pull request #143 from ndudnicz/examples-movies
doc: add a new +19k movies example dataset
2019-04-15 10:11:38 +02:00
52fca57114 doc: add a new +19k movies example dataset 2019-04-13 21:11:28 +02:00
ee7a570b2f doc: Fix a little typo 2019-03-24 16:45:33 +01:00
61dcf72e04 Merge pull request #131 from meilisearch/update-readme
Add a Features section to the readme
2019-03-24 16:44:00 +01:00
bace8ad510 doc: Add a features section to the readme 2019-03-24 16:28:19 +01:00
e0b759839d Merge pull request #129 from meilisearch/ci-badge
Add CI badge
2019-03-10 22:46:57 +01:00
05b0a3e7d2 Add CI badge 2019-03-10 21:38:04 +01:00
2518037b91 Merge pull request #128 from meilisearch/azure-pipeline
Azure pipeline
2019-03-10 17:38:47 +01:00
3e452f362c Replace TravisCI by Azure CI 2019-03-10 15:46:59 +01:00
4900544574 Merge pull request #126 from Kerollmops/searchable-attributes
Searchable attributes
2019-03-05 17:11:15 +01:00
858589dc6b feat: Limit the QueryBuilder to search only into some attributes 2019-03-05 16:34:29 +01:00
915f2e70a3 Merge pull request #125 from Kerollmops/limit-memory-usage
Limit memory usage
2019-03-05 16:17:56 +01:00
aae301878c fix: Flush the database after each WriteBatch injected 2019-03-05 14:55:57 +01:00
383a49b44f fix: Compact the whole database for each WriteBatch injected 2019-03-05 14:55:57 +01:00
a45cc4b618 fix: Reduce the size of the DocIndex type 2019-03-05 14:55:57 +01:00
aef7d7825f Merge pull request #124 from Kerollmops/version-bump
Bump version to 0.3.2
2019-02-25 14:22:02 +01:00
f28ce661af chore: Bump version to 0.3.2 2019-02-25 13:56:23 +01:00
74eb9c8d0f Merge pull request #122 from Kerollmops/query-builder-no-view-dep
Remove the DatabaseView dependencies from the QueryBuilder
2019-02-24 16:56:12 +01:00
d664221c64 feat: Remove the DatabaseView dependencies from the QueryBuilder 2019-02-24 16:25:28 +01:00
58bff3d4ac Merge pull request #123 from Kerollmops/update-deps
Update all the dependencies
2019-02-24 16:24:47 +01:00
2c206eb98c chore: Update all the dependencies 2019-02-24 16:00:03 +01:00
19724e5af9 Merge pull request #121 from Kerollmops/no-cjk-unidecode
Do not save unidecoded cjk kanjis
2019-02-23 22:34:47 +01:00
c9e0ad132c feat: Do not save unidecoded cjk kanjis 2019-02-23 19:11:54 +01:00
24f265a963 Merge pull request #120 from Kerollmops/custom-log10-function
Optimize the SumOfTypos criterion
2019-02-23 19:01:12 +01:00
f8a743ee00 feat: Optimize the SumOfTypos criterion 2019-02-23 18:36:45 +01:00
64971de7ed Merge pull request #119 from Kerollmops/dont-be-hurry
Fix the tokenizer (next time don't be so hurry to merge)
2019-02-23 17:07:42 +01:00
a960c325f3 feat: Make query strings support cjk kanjis 2019-02-23 14:57:13 +01:00
a799470997 fix: Change the tokenizer to mesure cjk chars positions 2019-02-22 23:06:42 +01:00
10414791a2 fix: Remove debug println from the tokenizer 2019-02-22 22:34:37 +01:00
743974e60d Merge pull request #118 from Kerollmops/tokenizer-support-kanjis
Make the Tokenizer support Kanjis
2019-02-22 20:16:55 +01:00
0e267cae4b feat: Make the Tokenizer support Kanjis 2019-02-22 19:37:19 +01:00
12a352ae2f Merge pull request #117 from Kerollmops/tokenizer-support-parentheses
Make the tokenizer support parentheses
2019-02-22 19:36:15 +01:00
5070b27728 feat: Make the tokenizer support parentheses
Interpreting them as hard ponctuation (like a dot).
2019-02-22 18:18:17 +01:00
7a6b734078 Merge pull request #116 from Kerollmops/raw-field-value-getter
Allow users to retrieve the raw field value of a document
2019-02-22 18:02:46 +01:00
24823da6f7 feat: Allow users to retrieve the raw field value of a document 2019-02-22 15:30:20 +01:00
8701cb3a8f Merge pull request #115 from qdequele/database-path
Add accessor for database path and index path
2019-02-22 15:11:40 +01:00
315fc1fbe3 feat: Add accessor for database and index path 2019-02-22 13:49:04 +01:00
23833bac10 Merge pull request #114 from Kerollmops/hot-fix-ranked-attribute
Do not error when an attribute is registered for ranking
2019-02-21 23:17:10 +01:00
8235b6efc9 fix: Do not error when an attribute is registered for ranking 2019-02-21 20:14:08 +01:00
7f937eea5a Merge pull request #113 from Kerollmops/hot-fix-query-builder
Remove the QueryBuilder boxed criteria default static restriction
2019-02-21 20:11:10 +01:00
a1cf634ac1 feat: Remove the QueryBuilder boxed criteria default static restriction 2019-02-21 19:26:22 +01:00
c86472e997 Merge pull request #112 from Kerollmops/bump-version
Bump version to 0.3.1
2019-02-21 15:18:37 +01:00
26cb398a6f chore: Bump version to 0.3.1 2019-02-21 14:52:40 +01:00
f6e664d298 Merge pull request #111 from qdequele/config
Add a config per index
2019-02-21 14:39:37 +01:00
9437cecf87 chore: Use Default derive on Config struct 2019-02-21 14:01:55 +01:00
13309511b3 chore: Use serde derive lowercase on RankingOrdering 2019-02-21 14:01:55 +01:00
1941cb16c0 feat: Add Config.update_with(_) method to merge 2 config 2019-02-21 14:01:55 +01:00
55823c5d5d feat: add admin key on config 2019-02-21 14:01:55 +01:00
4721da1679 feat: Add access key on config 2019-02-21 14:01:55 +01:00
482f750231 chore: Set config field pub 2019-02-21 14:01:55 +01:00
d5119db165 feat: Allow to retrieve config from Database and DatabaseView 2019-02-21 14:01:55 +01:00
37578ed74f feat: store config into database 2019-02-20 14:07:19 +01:00
f5992ce822 Merge pull request #109 from Kerollmops/implement-text-cropping
Introduce text cropping that shows the first matches
2019-02-18 19:40:30 +01:00
badb0035c5 feat: Introduce text cropping that shows the first match 2019-02-18 18:59:50 +01:00
4bc14aa261 Merge pull request #108 from Kerollmops/refactor-index
Refactor the Index and Updates
2019-02-18 18:59:20 +01:00
a0c4ec0be0 feat: Introduce the updated_documents methods 2019-02-18 18:01:40 +01:00
264fffa826 feat: Replace the elapsed dependency by std::time::Instant 2019-02-17 16:37:45 +01:00
bddb37e44f feat: Move SharedData to its own module 2019-02-17 16:37:45 +01:00
6393b0cbc0 feat: Prefer binary to exponential search 2019-02-17 16:37:45 +01:00
a8df438814 feat: Implement WriteToBytes/FromSharedDataCursor 2019-02-17 16:37:44 +01:00
8014857ebf feat: Introduce the WriteToBytes trait 2019-02-17 16:37:44 +01:00
9e7261a48f feat: Introduce the FromSharedDataCursor trait 2019-02-17 16:37:44 +01:00
c4e70d0475 feat: Introduce the SharedDataCursor type 2019-02-17 16:37:44 +01:00
cbb0aaa217 feat: Introduce the Index structure along with the Events types 2019-02-17 16:36:47 +01:00
ce50e74491 Merge pull request #107 from Kerollmops/update-dependencies
Update dependencies
2019-02-13 16:05:51 +01:00
e103e1c277 chore: Replace the crossbeam::ArcCell by arc-swap::ArcSwap 2019-02-13 15:19:02 +01:00
64929fe5dc chore: Update slice-group-by to 0.2 2019-02-13 15:06:34 +01:00
b108f1e6c9 Merge pull request #106 from Kerollmops/fix-criterion
Fix the SumOfTypos and WordsProximity criteria
2019-02-12 22:06:32 +01:00
58b417e045 feat: Replace the linear_group_by by the new linear_group method 2019-02-12 21:23:36 +01:00
2e5a616d8e fix: Compute the proximity on the words with the min distance 2019-02-12 21:22:45 +01:00
092d446a7e chore: Update the slice-group-by dependency 2019-02-12 21:22:45 +01:00
85a1f126bf fix: Make the SumOfTypos criterion use a more clever algorithm 2019-02-12 21:22:42 +01:00
cf58cf86da Merge pull request #105 from Kerollmops/custom-ranking-field-into-hashmap
Save the custom ranking field into an HashMap
2019-02-11 17:36:26 +01:00
db6210c7ee feat: Introduce the Number type 2019-02-11 16:58:44 +01:00
83cd071827 feat: Introduce the SortByAttr custom ranking helper 2019-02-11 16:55:31 +01:00
084c3a95b6 feat: Add a new ranked attribute to the schema 2019-02-11 16:55:30 +01:00
78908aa34e Merge pull request #103 from Kerollmops/ranking-typo-rules
Add a reading on the default typos and ranking rules
2019-02-11 15:05:04 +01:00
cf27706f91 doc: Add a reading on the default typos and ranking rules 2019-02-11 11:58:17 +01:00
d3f53a7fd6 Merge pull request #104 from Kerollmops/update-readme
Update the Redame wrk stats
2019-02-10 14:53:15 +01:00
508af5613f doc: Update the Redame wrk stats 2019-02-10 14:05:21 +01:00
c615c31016 Merge pull request #101 from Kerollmops/version-bump
Bump version to 0.3.0
2019-02-07 15:26:38 +01:00
908b28790b chore: Bump version to 0.3.0 2019-02-07 14:51:39 +01:00
4c0279729b Merge pull request #100 from qdequele/master
Allow users to manage multiple database indexes
2019-02-07 14:49:52 +01:00
96dfac5b33 feat: Allow users to manage multiple database indexes 2019-02-07 13:05:55 +01:00
8576218b51 Merge pull request #99 from Kerollmops/simplify-transactional-update
Remove the lifetime restriction for Database Updates
2019-02-06 18:19:45 +01:00
1c1f9201b8 feat: Remove the lifetime restriction for Database Updates 2019-02-06 18:03:41 +01:00
4398b88a3a Merge pull request #98 from Kerollmops/updates-with-transactions
Change updates to be handled using the RocksDB WriteBatch feature
2019-02-06 16:13:47 +01:00
73e79f5ca4 chore: Make travis build with Rust 1.32 2019-02-06 15:58:48 +01:00
1bfd51d6e9 feat: Change updates to be handled using the RocksDB WriteBatch feature 2019-02-06 15:58:47 +01:00
0d2daf27f2 Merge pull request #97 from Kerollmops/remove-hashbrown-stop-words
Remove the hashbrown dependency for library users
2019-02-03 17:31:08 +01:00
87f0d8cf3c feat: Remove the hashbrown dependency for library users 2019-02-03 12:22:50 +01:00
06d5a10902 Merge pull request #96 from Kerollmops/chore
Make some little changes
2019-02-03 11:55:06 +01:00
94b89c5439 chore: Make the Document from_raw method private 2019-02-03 11:24:44 +01:00
c5e951be09 chore: Move the deseserializer into the serde module 2019-02-03 11:24:44 +01:00
66ae5c8161 chore: Clarify some QueryBuilder comments 2019-02-03 11:24:44 +01:00
8438e2202f Merge pull request #95 from Kerollmops/fix-querybuilder-with-criteria
Make the QueryBuilder with_criteria use FilterFunc
2019-02-03 11:24:17 +01:00
7a6166d229 feat: Make the QueryBuilder with_criteria use FilterFunc 2019-02-03 10:55:16 +01:00
d46fa4b215 Merge pull request #94 from Kerollmops/data-oriented
Introduce Data Oriented design into the search algorithm
2019-02-02 15:40:10 +01:00
2bd5b4ab86 feat: Remove useless WordsProximity criterion benchmark 2019-02-02 15:12:54 +01:00
5efbc5ceb3 feat: Introduce the revisited SortBy criterion 2019-02-02 14:42:12 +01:00
2e905bac08 chore: Remove Attribute and WordArea structures 2019-02-02 14:40:15 +01:00
4c0ad5f964 feat: Simplify the Criterion Trait by removing the DatabaseView param 2019-02-02 14:40:15 +01:00
455cbf3bf4 feat: Make the search algorithm become fully data oriented 2019-02-02 14:40:14 +01:00
a3a28c56fa feat: Replace compressed Match fields by uncompressed ones 2019-02-02 14:40:14 +01:00
b0b3175641 Merge pull request #93 from Kerollmops/slice-group-by
Use the GroupBy/Mut Traits of the slice-group-by library
2019-01-30 17:52:27 +01:00
c2f0df3f73 feat: Use the GroupBy/Mut Traits of the slice-group-by library 2019-01-30 16:54:52 +01:00
820f1f9ac6 Merge pull request #91 from Kerollmops/warn-reused-document-id
Emit warnings when a document id is reused
2019-01-28 21:05:42 +01:00
337aee5b65 chore: Emit warnings when a document id is reused 2019-01-28 16:11:55 +01:00
810dfdf656 Merge pull request #90 from Kerollmops/version-bump
Bump version to 0.2.1
2019-01-25 17:08:53 +01:00
f016652fca chore: Bump version to 0.2.1 2019-01-25 16:41:08 +01:00
6c99ebe3fa Merge pull request #89 from Kerollmops/no-more-compaction
Remove the manual compaction triggering
2019-01-25 16:40:08 +01:00
94d357985f feat: Remove the manual compaction triggering 2019-01-25 16:05:56 +01:00
fbc698567a Merge pull request #87 from Kerollmops/measure-index-loading
Display index loading times
2019-01-24 14:07:11 +01:00
aa9db14c09 chore: Display index loading times 2019-01-23 11:19:44 +01:00
61e83a1c21 Merge pull request #86 from Kerollmops/measure-indexation
Display timings of indexation operations
2019-01-16 13:32:44 +01:00
1316be5b09 chore: Display timings of indexation operations 2019-01-16 11:45:33 +01:00
4e8b0383dd Merge pull request #85 from Kerollmops/debug-more-stats
Display more stats infos
2019-01-15 14:20:28 +01:00
4fa10753c1 chore: Display more stats infos 2019-01-14 21:18:46 +01:00
2473e289e8 Merge pull request #84 from qdequele/create-server-example
Example HTTP server example can use stopwords
2019-01-14 18:55:58 +01:00
e0e5e87ed3 feat: HTTP server example can use stopwords 2019-01-14 18:21:58 +01:00
b13e61f40a Merge pull request #83 from qdequele/create-server-example
Create an example of HTTP server managing multiple databases
2019-01-14 14:35:33 +01:00
c023cb3065 feat: Create an example for HTTP server managing multiple databases 2019-01-14 13:39:54 +01:00
0a3d069fbc Merge pull request #79 from qdequele/master
Schema can be de/serialized from a json format
2019-01-12 21:50:02 +01:00
fa062ce2cf feat: Schema can be de/serialized from a json format 2019-01-12 21:05:48 +01:00
cdc6e47bf5 Merge pull request #81 from Kerollmops/update-readme
Simplify the examples command lines
2019-01-12 13:43:42 +01:00
d5f44838be doc: Simplify the examples command lines 2019-01-12 12:56:11 +01:00
5939f6e68a Merge pull request #80 from Kerollmops/version-bump
Bump version to 0.2.0
2019-01-12 12:52:08 +01:00
97edc987f8 chore: Bump version to 0.2.0 2019-01-12 12:18:29 +01:00
e4e50cecce Merge pull request #77 from Kerollmops/update-dependencies
Update the quickcheck dev-dependency
2019-01-10 22:09:44 +01:00
77e0c19749 chore: Update the quickcheck dev-dependency 2019-01-10 21:25:32 +01:00
251bccbbc3 Merge pull request #76 from Kerollmops/update-readme
Update readme
2019-01-10 21:20:39 +01:00
f7561f8552 doc: Update examples usages 2019-01-10 21:14:01 +01:00
05fd7e87ec doc: Add some wrk stats to the Readme 2019-01-10 21:13:54 +01:00
446d6a5455 Merge pull request #75 from Kerollmops/binary-group-by-mut-query-builder
Introduce binary group by in the query builder
2019-01-10 21:10:31 +01:00
78786a0007 feat: Introduce binary group by in the query builder 2019-01-10 20:13:40 +01:00
3d820a27ee Merge pull request #74 from Kerollmops/same-document-update-shadowed
Make multiple document updates shadow themselves
2019-01-10 15:57:49 +01:00
ac347d788c feat: Make multiple document updates shadow themselves 2019-01-10 15:25:24 +01:00
5627f15d41 Merge pull request #73 from Kerollmops/module-for-attribute-wordarea
Module for attribute wordarea
2019-01-10 15:23:03 +01:00
e31afc2da2 chore: Move the WordArea type to its own module 2019-01-10 13:37:22 +01:00
77c252e12a chore: Move the Attribute type to its own module 2019-01-10 11:59:42 +01:00
30c9c053c2 Merge pull request #72 from Kerollmops/wordarea-char-index
Make WordArea be based on char index and length
2019-01-09 20:53:59 +01:00
b53ef08d05 feat: Make WordArea be based on char index and length 2019-01-09 20:14:08 +01:00
86bfb173ef Merge pull request #70 from Kerollmops/fix-assert-new-attribute
Remove assert on Attribute::new()
2019-01-09 11:09:18 +01:00
8e5f834625 chore: remove assert on Attribute::new() 2019-01-08 18:46:55 +01:00
563b021679 Merge pull request #69 from tpayet/patch-1
Update README.md
2019-01-08 18:45:10 +01:00
681f721b1d Correct README typos 2019-01-08 17:09:48 +01:00
8a7c061539 Update README.md 2019-01-08 17:09:48 +01:00
8c781a4d05 Merge pull request #67 from Kerollmops/reintroduce-stop-words
Reintroduce stop words
2019-01-07 13:29:23 +01:00
de59ea495d feat: Log some update steps 2019-01-06 22:49:12 +01:00
966eda8ae5 feat: Do the sum of typos using usizes 2019-01-06 22:49:12 +01:00
32f8908d71 feat: Reintroduce stopwords for the serializer 2019-01-06 22:49:11 +01:00
a2f5e8aa25 Merge pull request #66 from Kerollmops/revert-precompute-query-index-groups
Revert precompute query index groups
2019-01-06 22:38:44 +01:00
f00b978801 Revert "feat: Pre-compute matches query index groups"
This reverts commit 039a9a4cc7.
2019-01-06 21:54:49 +01:00
a78b5d225f Revert "feat: Allow Matches to be constructed"
This reverts commit d21406a939.
2019-01-06 21:44:53 +01:00
f32a59720d Revert "feat: Introducing the Matches as_matches method"
This reverts commit ef7ba96d4a.
2019-01-06 21:44:53 +01:00
2cc5fbde1a Revert "feat: Introduce multiple Iterator impl for Matches"
This reverts commit c594597a01.
2019-01-06 21:44:53 +01:00
34d2850d28 Revert "feat: Prefer using ranges and not using unreachable!"
This reverts commit d899b86603.
2019-01-06 21:44:51 +01:00
023f62b0ce Merge pull request #65 from Kerollmops/logging
Add a little bit of logging
2019-01-06 15:55:48 +01:00
7f35b971f0 feat: Log the total number of documents to rank 2019-01-06 15:02:53 +01:00
3418adb06a feat: Add log libraries dependencies 2019-01-06 15:02:53 +01:00
510426c05c Merge pull request #64 from Kerollmops/precompute-query-index-groups
Precompute query index groups
2019-01-06 14:59:04 +01:00
c74caa0f82 feat: Sum usizes instead of little u16/u32 2019-01-06 13:54:14 +01:00
d899b86603 feat: Prefer using ranges and not using unreachable! 2019-01-06 13:54:14 +01:00
0d07af3caf fix: Filter and count the exact matching words 2019-01-06 13:54:13 +01:00
c594597a01 feat: Introduce multiple Iterator impl for Matches 2019-01-06 13:54:13 +01:00
ef7ba96d4a feat: Introducing the Matches as_matches method 2019-01-06 13:54:13 +01:00
d21406a939 feat: Allow Matches to be constructed 2019-01-06 13:54:13 +01:00
039a9a4cc7 feat: Pre-compute matches query index groups 2019-01-06 11:11:55 +01:00
40ab9e7a55 Merge pull request #63 from Kerollmops/update-rocksdb
Update RocksDB to Titan
2019-01-06 10:37:54 +01:00
d21abb50fa chore: Update RocksDB to Titan 2019-01-05 12:47:03 +01:00
3dd5e2445a Merge pull request #62 from Kerollmops/test-document-key-attr
Add tests to DocumentKeyAttr
2019-01-02 22:20:37 +01:00
7f5e6c5b6e test: Add test to the DocumentKeyAttr slice repr 2019-01-02 21:48:58 +01:00
e6d3840f12 Merge pull request #61 from Kerollmops/update-remove-kv-attributes
UpdateBuilder handles document attributes deletion
2019-01-02 18:20:14 +01:00
c05fab783a fix: Write and Read DocumentKeyAttr in big endian 2019-01-02 17:53:53 +01:00
95dc6fe904 feat: Rework the UpdateBuilder struct 2019-01-02 17:53:52 +01:00
b2e9ae4136 Merge pull request #60 from Kerollmops/improve-perfs
Improve performances
2019-01-01 17:03:41 +01:00
b070778d44 feat: Use the jemalloc global allocator in examples 2019-01-01 16:37:15 +01:00
6731025003 chore: Update group-by 2019-01-01 16:27:39 +01:00
04544c1531 feat: Expose nightly features of some dependencies 2019-01-01 16:27:08 +01:00
9dd68b4eaa Merge pull request #58 from Kerollmops/clean-up
Clean up some database functions
2019-01-01 11:43:27 +01:00
1d67012aa5 chore: Clean up some database functions 2019-01-01 01:40:20 +01:00
e723e01ec8 Merge pull request #57 from Kerollmops/clippy-pass
Clippy pass
2018-12-31 23:46:18 +01:00
7845292ea8 chore: Clippy pass 2018-12-31 23:20:30 +01:00
521df85c0d Merge pull request #55 from Kerollmops/add-benchmarks
Add benchmarks
2018-12-31 21:48:38 +01:00
dfa19582a2 test: Add benchmarks to mesure the words proximity criterion 2018-12-31 21:18:42 +01:00
87ec95f7a0 test: Add benchmarks to mesure the database 2018-12-31 21:18:37 +01:00
76ef2cceeb Merge pull request #49 from Kerollmops/serialize-any-map
Serialize any map
2018-12-31 21:11:17 +01:00
20b5a6a06e doc: Add examples for runtime defined data and Schema 2018-12-31 20:44:33 +01:00
a842e647f7 Merge pull request #56 from Kerollmops/new-index-struct
New Index structure
2018-12-31 19:55:18 +01:00
21bb38c3b0 test: Add more tests for updates ingestion 2018-12-31 19:27:21 +01:00
64d53ee1bd chore: Rework the data module structures
being able to be constructed from SharedData
2018-12-31 19:27:21 +01:00
c022fa3fca chore: Move serde related structs to their module 2018-12-31 19:26:28 +01:00
0080bf486f feat: Introduce the new Index structure
replacing the old ugly Blob system
2018-12-31 19:26:27 +01:00
6bd779f9ae feat: Improve the deserialization time of a Blob 2018-12-31 13:15:37 +01:00
a18401f47e Merge pull request #53 from Kerollmops/query-builder-filter
Distinct/QueryBuilder filtering
2018-12-29 23:11:43 +01:00
7132c3be89 feat: Allow filtering on QueryBuilder 2018-12-29 22:30:41 +01:00
aa3d059363 feat: Allow filtering on DistinctQueryBuilder 2018-12-29 22:30:41 +01:00
e2a9dbc404 feat: Introduce filtering methods for Distinct/QueryBuilder 2018-12-29 22:30:40 +01:00
a0a11faee5 Merge pull request #54 from Kerollmops/arccell-instead-of-rwlock
Prefer using ArcCell instead of RWLock for database updates
2018-12-29 22:29:35 +01:00
36ef9581aa feat: Return the database view for each update 2018-12-29 21:07:01 +01:00
f4b04dfb72 feat: Prefer doing DatabaseView updates atomically 2018-12-29 20:52:00 +01:00
cf5d56e63a Merge pull request #52 from Kerollmops/schema-toml
Schema can be de/serialized from a toml format
2018-12-28 19:59:40 +01:00
8412c14b5b feat: Schema can be toml de/serialized 2018-12-28 19:24:50 +01:00
70772eca5c Merge pull request #51 from Kerollmops/wordarea-attribute-fallible
Make the Attribute and WordArea errors recoverable
2018-12-28 18:26:19 +01:00
b27f632e14 feat: Make the Attribute and WordArea errors recoverable 2018-12-28 16:15:22 +01:00
e3bfb866e5 Merge pull request #46 from Kerollmops/schema-considers-id
Schema considers document ids
2018-12-27 12:26:57 +01:00
fa238f21ef feat: Move Database to its own module 2018-12-27 11:21:47 +01:00
444a4c1af7 feat: Make the schema consider document ids 2018-12-27 11:21:47 +01:00
2e5c5fad33 Merge pull request #45 from Kerollmops/index-length-in-docindex
Introduce the WordArea struct
2018-12-24 17:08:20 +01:00
b32c96cdc9 feat: Introduce a WordArea struct
Useful to highlight matching areas in the original text.
2018-12-24 15:58:46 +01:00
62521262e8 Merge pull request #44 from Kerollmops/real-document-id-type
Create a real DocumentId type
2018-12-24 15:41:47 +01:00
4ebae7784c feat: Create a strong DocumentId type
Forcing it to be something internal will permit to avoid possible miss comparisons to be done with other types.
2018-12-24 12:42:24 +01:00
a756ca5e3f Merge pull request #39 from Kerollmops/readme-badges
Add badges to the README
2018-12-19 14:42:54 +01:00
aa104fa253 doc: Add some funny badges to the README 2018-12-19 12:00:29 +01:00
114 changed files with 28425 additions and 4315 deletions

.gitignore (3 changed lines)

@@ -1,6 +1,7 @@
/rocksdb
/target
/Cargo.lock
meilidb/Cargo.lock
meilidb-core/Cargo.lock
**/*.rs.bk
**/*.csv
**/*.json_lines

.travis.yml (deleted)

@ -1,22 +0,0 @@
language: rust
cache: cargo

branches:
  only:
    - master

matrix:
  fast_finish: true
  include:
    # Test crates on their minimum Rust versions.
    - rust: 1.31.0
      name: "meilidb on 1.31.0"
      script: ./ci/meilidb.sh
    # Test crates on nightly Rust.
    - rust: nightly
      name: "meilidb on nightly"
      script: ./ci/meilidb.sh

Cargo.toml

@@ -1,39 +1,11 @@
[package]
edition = "2018"
name = "meilidb"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
[workspace]
members = [
"meilidb",
"meilidb-core",
"meilidb-data",
"meilidb-schema",
"meilidb-tokenizer",
]
[dependencies]
bincode = "1.0"
byteorder = "1.2"
fst = "0.3"
hashbrown = "0.1"
lazy_static = "1.1"
levenshtein_automata = { version = "0.1", features = ["fst_automaton"] }
linked-hash-map = { version = "0.5", features = ["serde_impl"] }
sdset = "0.3"
serde = "1.0"
serde_derive = "1.0"
unidecode = "0.3"
[dependencies.rocksdb]
git = "https://github.com/pingcap/rust-rocksdb.git"
rev = "c2eb140"
[dependencies.group-by]
git = "https://github.com/Kerollmops/group-by.git"
rev = "cab857b"
[features]
default = ["simd"]
i128 = ["bincode/i128", "byteorder/i128"]
simd = ["rocksdb/sse"]
portable = ["rocksdb/portable"]
nightly = []
[dev-dependencies]
csv = "1.0"
elapsed = "0.1"
structopt = "0.2"
tempfile = "3.0"
[profile.release]
debug = true

README.md

@@ -1,47 +1,82 @@
# MeiliDB
[![Build Status](https://dev.azure.com/thomas0884/thomas/_apis/build/status/meilisearch.MeiliDB?branchName=master)](https://dev.azure.com/thomas0884/thomas/_build/latest?definitionId=1&branchName=master)
[![dependency status](https://deps.rs/repo/github/Kerollmops/MeiliDB/status.svg)](https://deps.rs/repo/github/Kerollmops/MeiliDB)
[![License](https://img.shields.io/github/license/Kerollmops/MeiliDB.svg)](https://github.com/Kerollmops/MeiliDB)
[![Rust 1.31+](https://img.shields.io/badge/rust-1.31+-lightgray.svg)](
https://www.rust-lang.org)
A _full-text search database_ using a key-value store internally.
It uses [RocksDB](https://github.com/facebook/rocksdb) like a classic database, to store documents and internal data. The key-value store power allow us to handle updates and queries with small memory and CPU overheads.
## Features
You can [read the deep dive](deep-dive.md) if you want more informations on the engine, it describes the whole process of generating updates and handling queries.
- Provides [6 default ranking criteria](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/criterion/mod.rs#L95-L101) used to [bucket sort](https://en.wikipedia.org/wiki/Bucket_sort) documents
- Accepts [custom criteria](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/criterion/mod.rs#L22-L29) and can apply them in any custom order
- Support [ranged queries](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L146), useful for paginating results
- Can [distinct](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L68) and [filter](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L57) returned documents based on context defined rules
- Can store complete documents or only [user schema specified fields](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/examples/movies/schema-movies.toml)
- The [default tokenizer](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-tokenizer/src/lib.rs#L99) can index latin and kanji based languages
- Returns [the matching text areas](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/lib.rs#L117-L120), useful to highlight matched words in results
- Accepts query time search config like the [searchable fields](https://github.com/meilisearch/MeiliDB/blob/3d85cbf0cfa3a3103cf1e151a75a443719cdd5d7/meilidb-core/src/query_builder.rs#L79)
- Supports run time indexing (incremental indexing)
We will be proud if you send pull requests to help us grow this project, you can start with [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) to start !
At the moment this is a library only, this means that binaries are not part of this repository but since I'm still nice I have made some examples for you in the `examples/` folder that works with the data located in the `misc/` folder.
In a near future MeiliDB we be a binary like any database: updated and queried using some kind of protocol. It is the final goal, [see the milestones](https://github.com/Kerollmops/MeiliDB/milestones). MeiliDB will just be a bunch of network and protocols functions wrapping the library which itself will be published to https://crates.io, following the same update cycle.
It uses [RocksDB](https://github.com/facebook/rocksdb) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads. The whole ranking system is [data oriented](https://github.com/meilisearch/MeiliDB/issues/82) and provides great performances.
You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries or you can take a look at the [typos and ranking rules](typos-ranking-rules.md) if you want to know the default rules used to sort the documents.
We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/meilisearch/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!
The project is only a library yet. It means that there is no binary provided yet. To get started, you can check the examples wich are made to work with the data located in the `misc/` folder.
MeiliDB will be a binary in a near future so you will be able to use it as a database out-of-the-box. We should be able to query it using a [to-be-defined](https://github.com/meilisearch/MeiliDB/issues/38) protocol. This is our current goal, [see the milestones](https://github.com/meilisearch/MeiliDB/milestones). In the end, the binary will be a bunch of network protocols and wrappers around the library - which will also be published on [crates.io](https://crates.io). Both the binary and the library will follow the same update cycle.
## Performances
_these informations have been made with a version dated of october 2018, we must update them_
With a database composed of _100 353_ documents with _352_ attributes each and _3_ of them indexed.
So more than _300 000_ fields indexed for _35 million_ stored we can handle more than _2.8k req/sec_ with an average response time of _9 ms_ on an Intel i7-7700 (8) @ 4.2GHz.
We made some tests on remote machines and found that we can handle with a dataset of near 280k products, on a server that cost 5$/month with 1vCPU and 1GB of ram and on the same index and with a simple query:
Requests are made using [wrk](https://github.com/wg/wrk) and scripted to simulate real users queries.
- near 190 users with an average response time of 90ms
- 150 users with an average response time of 70ms
- 100 users with an average response time of 45ms
Network is mesured, servers are located in amsterdam and tests are made between two different datacenters.
```
Running 10s test @ http://localhost:2230
2 threads and 25 connections
Thread Stats Avg Stdev Max +/- Stdev
Latency 9.52ms 7.61ms 99.25ms 84.58%
Req/Sec 1.41k 119.11 1.78k 64.50%
28080 requests in 10.01s, 7.42MB read
Requests/sec: 2806.46
Transfer/sec: 759.17KB
```
### Notes
The default Rust allocator has recently been [changed to use the system allocator](https://github.com/rust-lang/rust/pull/51241/).
We have seen much better performances when [using jemalloc as the global allocator](https://github.com/alexcrichton/jemallocator#documentation).
## Usage and examples
MeiliDB work with an index like most of the search engines.
So to test the library you can create one by indexing a simple csv file.
Currently MeiliDB do not provide an http server but you can run these two examples to try it out.
It creates an index named _movies_ and insert _19 700_ (in batches of _1000_) movies into it.
```bash
cargo run --release --example create-database -- test.mdb misc/kaggle.csv
cargo run --release --example create-database -- \
--schema examples/movies/schema-movies.toml \
--update-group-size 1000 \
movies.mdb \
examples/movies/movies.csv
```
Once the command finished indexing the database should have been saved under the `test.mdb` folder.
Now you can easily run the `query-database` example to check what is stored in it.
Once this is done, you can query this database using the second binary example.
```bash
cargo run --release --example query-database -- test.mdb
cargo run --release --example query-database -- \
movies.mdb \
--fetch-timeout-ms 50 \
-n 4 \
id title overview release_date poster
```

azure-pipelines.yml (new file, 47 lines added)

@@ -0,0 +1,47 @@
---
trigger:
  branches:
    include: [ master ]
pr: [ master ]

jobs:
- job: test
  pool:
    vmImage: 'Ubuntu 16.04'
  container: tpayet/chiquitita:latest
  steps:
  - script: |
      curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly
    displayName: 'Install rustc'
  - script: |
      $HOME/.cargo/bin/cargo check
    displayName: 'Check MeiliDB'
  - script: |
      $HOME/.cargo/bin/cargo test
    displayName: 'Test MeiliDB'

- job: build
  dependsOn:
  - test
  condition: succeeded()
  pool:
    vmImage: 'Ubuntu 16.04'
  container: tpayet/chiquitita:latest
  steps:
  - script: |
      curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly
    displayName: 'Install rustc'
  - script: |
      $HOME/.cargo/bin/cargo build --release
    displayName: 'Build MeiliDB'
  - task: CopyFiles@2
    inputs:
      contents: '$(System.DefaultWorkingDirectory)/target/release/libmeilidb.rlib'
      targetFolder: $(Build.ArtifactStagingDirectory)
    displayName: 'Copy build'
  - task: PublishBuildArtifacts@1
    inputs:
      artifactName: libmeilidb.rlib
    displayName: 'Upload artifacts'

deep-dive.md

@@ -1,28 +1,22 @@
# A deep dive in MeiliDB
On the 9 of december 2018.
MeiliDB is a full text search engine based on a final state transducer named [fst](https://github.com/BurntSushi/fst) and a key-value store named [RocksDB](https://github.com/facebook/rocksdb). The goal of a search engine is to store data and to respond to queries as accurate and fast as possible. To achieve this it must save the data as an [inverted index](https://en.wikipedia.org/wiki/Inverted_index).
On the 15 of May 2019.
MeiliDB is a full text search engine based on a final state transducer named [fst](https://github.com/BurntSushi/fst) and a key-value store named [sled](https://github.com/spacejam/sled). The goal of a search engine is to store data and to respond to queries as accurate and fast as possible. To achieve this it must save the matching words in an [inverted index](https://en.wikipedia.org/wiki/Inverted_index).
<!-- MarkdownTOC autolink="true" -->
- [Where is the data stored?](#where-is-the-data-stored)
- [What does the key-value store contain?](#what-does-the-key-value-store-contain)
- [The blob type](#the-blob-type)
- [The inverted word index](#the-inverted-word-index)
- [A final state transducer](#a-final-state-transducer)
- [Document indexes](#document-indexes)
- [Document ids](#document-ids)
- [The schema](#the-schema)
- [Document attributes](#document-attributes)
- [How is an update handled?](#how-is-an-update-handled)
- [The merge operation is CPU consuming](#the-merge-operation-is-cpu-consuming)
- [How is a request processed?](#how-is-a-request-processed)
- [Query lexemes](#query-lexemes)
- [Automatons and query index](#automatons-and-query-index)
- [Sort by criteria](#sort-by-criteria)
- [Retrieve original documents](#retrieve-original-documents)
<!-- /MarkdownTOC -->
@ -30,21 +24,17 @@ MeiliDB is a full text search engine based on a final state transducer named [fs
MeiliDB is entirely backed by a key-value store, like any good database (e.g. Postgres, MySQL). This brings great flexibility in the way documents can be stored and updates handled over time.
[RocksDB brings some](https://rocksdb.org/blog/2015/02/27/write-batch-with-index.html) of the [A.C.I.D. properties](https://en.wikipedia.org/wiki/ACID_(computer_science)) to help us be sure the saved data is consistent; for example we use SST files and the key-value store's ability to load them in one go to manage updates.
Note that SST files have the same restriction as the fst: keys must be added in order at creation.
[sled will bring some](https://github.com/spacejam/sled/tree/434533332a3f485e6d2e467023be0a0b55d3a1af#plans) of the [A.C.I.D. properties](https://en.wikipedia.org/wiki/ACID_(computer_science)) to help us be sure the saved data is consistent.
## What does the key-value store contain?
It contains the blob, the schema and the stored document attributes.
It contains the inverted word index, the schema and the document fields.
### The blob type
### The inverted word index
[The Blob type](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/mod.rs#L16-L19) is a data structure that indicates if an update is a positive or a negative one. In the case where the update is considered positive, the blob will contain [an fst map and the associated document indexes](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/positive/blob.rs#L15-L18). In the other case it will only contain [all the document ids](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/negative/blob.rs#L12-L14) that must be considered removed.
The Blob type [is stored under the "*data-index*" entry](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/update/positive/update.rs#L497-L499) and marked as [a merge operation](https://github.com/facebook/rocksdb/wiki/Merge-Operator-Implementation) in the key-value store.
[The inverted word index](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/words_index.rs) is a sled Tree dedicated to storing and giving access to all the documents that contain a specific word. The information stored under a word is simply a big ordered array of the places where the word has been found in the documents; in other words, a big list of [`DocIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/lib.rs#L35-L51).
#### A final state transducer
@ -52,89 +42,54 @@ _...also abbreviated fst_
This is the first entry point of the engine; you can read more about how it works in the beautiful blog post by @BurntSushi, [Index 1,600,000,000 Keys with Automata and Rust](https://blog.burntsushi.net/transducers/).
In short, it is a powerful way to store all the words that are present in the indexed documents. You construct it by giving it all the words you want to index, each associated with a value that, for the moment, can only be a `u64`. When you want to search in it you can provide any automaton you want; in MeiliDB [a custom levenshtein automaton](https://github.com/tantivy-search/levenshtein-automata/) is used.
Note that the number under each word is auto-incremented: each new word gets a number greater than the previous one.
Another powerful feature of `fst` is that it can nearly avoid using RAM and be streamed to disk, for example; the catch is that the keys must always be added in lexicographic order, so you must sort them beforehand. For the moment MeiliDB uses a [BTreeMap](https://github.com/Kerollmops/raptor-rs/blob/8abdb0a228e2808fe1814a6a0641a4b72d158579/src/metadata/doc_indexes.rs#L107-L112) for that.
In short, it is a powerful way to store all the words that are present in the indexed documents. You construct it by giving it all the words you want to index. When you want to search in it you can provide any automaton you want; in MeiliDB [a custom levenshtein automaton](https://github.com/tantivy-search/levenshtein-automata/) is used.
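As a minimal, self-contained sketch of that idea using the `fst` crate (the word list is made up, and a real search would drive the stream with a levenshtein automaton instead of dumping every key):
```rust
use fst::{Set, Streamer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Keys must be provided in lexicographic order, as explained above.
    let set = Set::from_iter(vec!["crab", "crate", "rust"])?;

    // Stream every key back out of the transducer.
    let mut stream = set.stream();
    while let Some(word) = stream.next() {
        println!("{}", String::from_utf8_lossy(word));
    }
    Ok(())
}
```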
#### Document indexes
As specified above, the `fst` can only store a number corresponding to a word, a `u64`, but the goal of the search engine is to retrieve a match in a document when a query is made. You want it to return some sort of position in an attribute in a document, some information about where the given word matched.
The `fst` will only return the words that match the search automaton, but the goal of the search engine is to retrieve all the matches in all the documents when a query is made. You want it to return some sort of position in an attribute in a document, some information about where the given word matched.
To make that possible, a custom data structure has been developed; the document indexes are composed of two arrays, the ranges array and all the docindexes corresponding to a given range, where each range identifies a word number. The [DocIndexes](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/data/doc_indexes.rs#L23) type is designed to be streamed when constructed, consuming a minimal amount of RAM, like the fst. Another advantage is that the slices are accessible in `O(1)` when you know the number associated with the word.
#### Document ids
This is a simple ordered list of all the document ids which must be considered deleted. It is used with [the sdset library](https://docs.rs/sdset/0.3.0/sdset/duo/struct.DifferenceByKey.html), the docindexes and the `DifferenceByKey` operation builder when merging blobs.
When a blob represents a negative update it only contains this simple slice of deleted document ids.
To make that possible we retrieve all of the `DocIndex` entries corresponding to all the matching words in the fst; we use the [`WordsIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/words_index.rs#L11-L21) Tree to get the `DocIndexes` corresponding to the words.
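Here is a rough, purely illustrative sketch of that lookup, with a plain `BTreeMap` standing in for the sled Tree and invented values in the same shape as the `DocIndex` fields:
```rust
use std::collections::BTreeMap;

// Same fields as meilidb-core's DocIndex; the values below are invented.
#[derive(Debug, Clone, Copy)]
struct DocIndex {
    document_id: u64,
    attribute: u16,
    word_index: u16,
    char_index: u16,
    char_length: u16,
}

fn main() {
    // Stand-in for the WordsIndex Tree: word -> ordered array of DocIndex.
    let mut words_index: BTreeMap<&str, Vec<DocIndex>> = BTreeMap::new();
    words_index.insert("subway", vec![
        DocIndex { document_id: 7, attribute: 1, word_index: 3, char_index: 18, char_length: 6 },
    ]);

    // A word returned by the fst gives direct access to all of its positions.
    if let Some(doc_indexes) = words_index.get("subway") {
        for di in doc_indexes {
            println!("doc {} attr {} word {}", di.document_id, di.attribute, di.word_index);
        }
    }
}
```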
### The schema
The schema is a data structure that represents which document attributes should be stored and which should be indexed. It is stored under the "_data-schema_" entry and given to MeiliDB only at creation.
The schema is a data structure that represents which document attributes should be stored and which should be indexed. It is stored under the [`MainIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/main_index.rs#L12) Tree and given to MeiliDB only at the creation of an index.
Each document attribute is associated with a unique 32 bit number named `SchemaAttr`.
Each document attribute is associated with a unique 16 bit number named [`SchemaAttr`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/schema.rs#L186).
In the future this schema type could be given along with updates and probably differ from the original; the database could then handle this new document structure and reindex it.
In the future, this schema type could be given along with updates; the database could then handle the new schema and reindex the database according to it.
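To picture it, here is a purely hypothetical stand-in; the real `SchemaAttr` and attribute properties live in meilidb-data:
```rust
// Conceptual sketch only: a schema maps attribute names to small numbers
// and records whether each attribute is stored and/or indexed.
#[derive(Debug, Clone, Copy)]
struct SchemaAttr(u16);

#[derive(Debug)]
struct AttrProps {
    stored: bool,
    indexed: bool,
}

fn main() {
    let attributes = vec![
        ("id",       SchemaAttr(0), AttrProps { stored: true, indexed: false }),
        ("title",    SchemaAttr(1), AttrProps { stored: true, indexed: true }),
        ("overview", SchemaAttr(2), AttrProps { stored: true, indexed: true }),
    ];
    for (name, attr, props) in &attributes {
        println!("{} -> {:?} ({:?})", name, attr, props);
    }
}
```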
### Document attributes
When the engine handles a query, what the requester wants back is a document; not only the [match](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/lib.rs#L51-L79) associated with it, the fields of the original document must be returned too.
When the engine handles a query, what the requester wants back is a document; not only the [`Matches`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/lib.rs#L62-L88) associated with it, the fields of the original document must be returned too.
So MeiliDB again uses the power of the underlying key-value store and saves the document attributes marked as _STORE_. The key is prefixed by "_doc_" followed by the 64 bit document id in bytes and the schema attribute number in bytes corresponding to the stored document attribute.
So MeiliDB again uses the power of the underlying key-value store and saves the document attributes marked as _STORE_ in the schema. The dedicated Tree for this information is the [`DocumentsIndex`](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/documents_index.rs#L11).
When a document field is saved in the key-value store its value is binary encoded using the [bincode](https://docs.rs/bincode/) library, so a document must be serializable with serde.
## How is an update handled?
First of all, an update in MeiliDB is nothing more than [a RocksDB SST file](https://github.com/facebook/rocksdb/wiki/Creating-and-Ingesting-SST-files). It contains the blob and all the document attributes binary encoded as described above. Note that the blob is stored under the "_data-index_" key marked as [a merge operation](https://github.com/facebook/rocksdb/wiki/Merge-Operator-Implementation).
### The merge operation is CPU consuming
When [the database ingests an update](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/mod.rs#L108-L145) it gives the SST file to the underlying RocksDB; once it has been ingested there is a "_data-index_" entry available. We can request it, but the key-value store will call a function first: a merge operation is performed.
This merge operation is done on multiple blobs, as you will have understood, and computes a [PositiveBlob](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/blob/positive/blob.rs#L15); this type contains the fst and the document indexes structures, allowing us to search for documents. These two data structures can be considered the inverted index.
The computation time of this merge matters; RocksDB doesn't keep the previous merged result, it will call our merge operation each time until it decides to do a compaction. So [we must force this compaction earlier](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/mod.rs#L129-L131) when we receive an update, to reduce this cost.
This way, when we request the "_data-index_" value it gives us the previously merged positive blob without any other merge overhead.
When a document field is saved in the key-value store its value is binary encoded using [MessagePack](https://github.com/3Hren/msgpack-rust), so a document must be serializable with serde.
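As a small sketch of that round-trip (the struct and its fields are invented; the point is only that any serde-serializable type works):
```rust
use serde::{Deserialize, Serialize};

// Any serde-serializable document can be stored; the concrete format
// (bincode before, MessagePack now) is a detail of the storage layer.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct Movie {
    id: String,
    title: String,
    overview: String,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let movie = Movie {
        id: "255".into(),
        title: "Interstellar".into(),
        overview: "A team travels through a wormhole.".into(),
    };

    // Encode and decode one stored value, shown here with bincode for brevity.
    let bytes = bincode::serialize(&movie)?;
    let decoded: Movie = bincode::deserialize(&bytes)?;
    assert_eq!(decoded, movie);
    Ok(())
}
```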
## How is a request processed?
Now that we have our "_data-index_" we are able to return results based on a query. In the MeiliDB universe a query is a string.
Now that we have our inverted index we are able to return results based on a query. In the MeiliDB universe a query is a simple string containing words.
### Query lexemes
The first step, to be able to call the underlying structures, is to split the query into words; for that we use a [custom tokenizer](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/tokenizer/mod.rs) that is not finished for the moment, [there is an open issue](https://github.com/Kerollmops/MeiliDB/issues/3). Note that a tokenizer is specialized for a human language; this is the hard part.
The first step, to be able to call the underlying structures, is to split the query into words; for that we use a [custom tokenizer](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-tokenizer/src/lib.rs#L82-L84). Note that a tokenizer is specialized for a human language; this is the hard part.
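A deliberately naive stand-in for that step, ignoring all the language-specific work the real tokenizer does:
```rust
// Split on non-alphanumeric characters and lowercase every word.
fn naive_tokenize(query: &str) -> Vec<String> {
    query
        .split(|c: char| !c.is_alphanumeric())
        .filter(|word| !word.is_empty())
        .map(|word| word.to_lowercase())
        .collect()
}

fn main() {
    let words = naive_tokenize("The new-york subway");
    assert_eq!(words, vec!["the", "new", "york", "subway"]);
}
```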
### Automatons and query index
So to query the fst we need an automaton; in MeiliDB we use a [levenshtein automaton](https://en.wikipedia.org/wiki/Levenshtein_automaton), which is constructed from a string and a maximum distance. Following [Algolia's blog post](https://blog.algolia.com/inside-the-algolia-engine-part-3-query-processing/#algolia%e2%80%99s-way-of-searching-for-alternatives) we [created the DFAs](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/automaton.rs#L62-L75) with different settings.
So to query the fst we need an automaton; in MeiliDB we use a [levenshtein automaton](https://en.wikipedia.org/wiki/Levenshtein_automaton), which is constructed from a string and a maximum distance. Following [Algolia's blog post](https://blog.algolia.com/inside-the-algolia-engine-part-3-query-processing/#algolia%e2%80%99s-way-of-searching-for-alternatives) we [created the DFAs](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/automaton.rs#L59-L78) with different settings.
Thanks to the power of the fst library [it is possible to union multiple automatons](https://docs.rs/fst/0.3.2/fst/map/struct.OpBuilder.html#method.union) on the same fst map, which lets us know [which automaton returned a word according to its index](https://github.com/Kerollmops/MeiliDB/blob/fc2cdf92596fc002ce278e3aa8718640ac44724d/src/metadata/ops.rs#L111). The `Stream` is able to return all the numbers associated with the words. We use these numbers to find the whole list of associated `DocIndexes` and do the union set operation.
Thanks to the power of the fst library [it is possible to union multiple automatons](https://docs.rs/fst/0.3.2/fst/map/struct.OpBuilder.html#method.union) on the same fst set. The `Stream` is able to return all the matching words. We use these words to find the whole list of associated `DocIndexes`.
With all this information it is possible [to reconstruct a list of all the DocIndexes associated](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L62-L99) with the queried words.
With all this information it is possible [to reconstruct a list of all the `DocIndexes` associated](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/query_builder.rs#L103-L130) with the queried words.
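Here is a small sketch of what such an automaton accepts, using the same `levenshtein_automata` crate (the query word is invented):
```rust
use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder};

fn main() {
    // A one-typo automaton, the kind of setting used for medium-length query words.
    let builder = LevenshteinAutomatonBuilder::new(1, false);
    let dfa = builder.build_dfa("subway");

    // "sybway" is one substitution away from "subway", so it is accepted.
    match dfa.eval("sybway") {
        Distance::Exact(d) => println!("accepted with {} typo(s)", d),
        Distance::AtLeast(_) => println!("rejected"),
    }
}
```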
### Sort by criteria
Now that we are able to get a big list of [DocIndexes](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/lib.rs#L21-L36), sorting them by criteria is not enough on its own; we need more information, like the levenshtein distance or the fact that a query word matches exactly the word stored in the fst. So [we stuff it a little bit](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L86-L93), and aggregate all these [Matches](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/lib.rs#L47-L74) for each document. This way it will be easy to sort a simple vector of documents using a bunch of functions.
With this big list of documents and associated matches [we are able to sort only the part of the slice that we want](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/query_builder.rs#L108-L119) using bucket sorting. [Each criterion](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/rank/criterion/mod.rs#L75-L87) is evaluated on each subslice without copying, thanks to [GroupByMut](https://github.com/Kerollmops/group-by/blob/cab857bae01463dbd0edb99b0e0d7f3624e6c6f5/src/lib.rs#L180-L185) which, I hope, [will soon be merged](https://github.com/rust-lang/rfcs/pull/2477).
Note that it is possible to customize the criteria used via the `QueryBuilder::with_criteria` constructor; this way you can implement custom ranking based on the document attributes, using the appropriate structure and the `retrieve_document` method.
### Retrieve original documents
The [DatabaseView](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/database_view.rs#L18-L24) structure that you must have created to be able to query the database has [two functions](https://github.com/Kerollmops/MeiliDB/blob/550dc1e99224e386516877450320f694947332d4/src/database/database_view.rs#L60-L76) that allow you to retrieve a full (or partial) document according to the schema you specified at creation time (i.e. the _STORED_ attributes).
As you can see, these functions force the created type `T` to implement [the serde Deserialize trait](https://docs.rs/serde/1.0.81/serde/trait.Deserialize.html); MeiliDB will use the `bincode::deserialize` function for each attribute to construct your type and return it to you.
With this big list of documents and associated matches [we are able to sort only the part of the slice that we want](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/query_builder.rs#L160-L188) using bucket sorting. [Each criterion](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-core/src/criterion/mod.rs#L95-L101) is evaluated on each subslice without copying, thanks to [GroupByMut](https://docs.rs/slice-group-by/0.2.4/slice_group_by/) which, I hope, [will soon be merged](https://github.com/rust-lang/rfcs/pull/2477).
Note that it is possible to customize the criteria used via the `QueryBuilder::with_criteria` constructor; this way you can implement custom ranking based on the document attributes, using the appropriate structure and the [`document` method](https://github.com/meilisearch/MeiliDB/blob/3db823de002243004612e36a19b4578d800dab97/meilidb-data/src/database/index.rs#L86).
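To illustrate the grouping primitive this relies on, here is a tiny sketch with the same `slice_group_by` crate (the data is invented):
```rust
use slice_group_by::GroupBy;

fn main() {
    // Matches are sorted beforehand, so equal adjacent query indices form one
    // group: the shape each criterion evaluates without copying anything.
    let query_index: &[u32] = &[0, 0, 1, 2, 2, 2];
    for group in query_index.linear_group() {
        println!("query word {} matched {} time(s)", group[0], group.len());
    }
}
```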
At this point, MeiliDB's work is over 🎉


@ -1,98 +0,0 @@
use std::collections::hash_map::DefaultHasher;
use std::path::{Path, PathBuf};
use std::hash::{Hash, Hasher};
use std::error::Error;
use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt;
use meilidb::database::schema::{Schema, SchemaBuilder, STORED, INDEXED};
use meilidb::database::update::PositiveUpdateBuilder;
use meilidb::tokenizer::DefaultBuilder;
use meilidb::database::Database;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// The csv file to index.
#[structopt(parse(from_os_str))]
pub csv_data_path: PathBuf,
}
#[derive(Debug, Serialize, Deserialize)]
struct Document<'a> {
id: &'a str,
title: &'a str,
description: &'a str,
image: &'a str,
}
fn calculate_hash<T: Hash>(t: &T) -> u64 {
let mut s = DefaultHasher::new();
t.hash(&mut s);
s.finish()
}
fn create_schema() -> Schema {
let mut schema = SchemaBuilder::new();
schema.new_attribute("id", STORED);
schema.new_attribute("title", STORED | INDEXED);
schema.new_attribute("description", STORED | INDEXED);
schema.new_attribute("image", STORED);
schema.build()
}
fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
let database = Database::create(database_path, schema.clone())?;
println!("start indexing...");
let tokenizer_builder = DefaultBuilder::new();
let update_path = tempfile::NamedTempFile::new()?;
let mut update = PositiveUpdateBuilder::new(update_path.path(), schema, tokenizer_builder);
let mut rdr = csv::Reader::from_path(csv_data_path)?;
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();
while rdr.read_record(&mut raw_record)? {
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
}
};
let document_id = calculate_hash(&document.id);
update.update(document_id, &document).unwrap();
}
let mut update = update.build()?;
update.set_move(true);
database.ingest_update_file(update)?;
Ok(database)
}
fn main() -> Result<(), Box<Error>> {
let opt = Opt::from_args();
let schema = create_schema();
let (elapsed, result) = elapsed::measure_time(|| {
index(schema, &opt.database_path, &opt.csv_data_path)
});
if let Err(e) = result {
return Err(e.into())
}
println!("database created in {} at: {:?}", elapsed, opt.database_path);
Ok(())
}


@ -0,0 +1,19 @@
# This schema has been generated ...
# The order in which the attributes are declared is important,
# it specify the attribute xxx...
identifier = "id"
[attributes.id]
displayed = true
[attributes.title]
displayed = true
indexed = true
[attributes.description]
displayed = true
indexed = true
[attributes.image]
displayed = true


@ -0,0 +1 @@
_data in movies.csv comes from https://www.themoviedb.org/_

19700
examples/movies/movies.csv Normal file

File diff suppressed because it is too large


@ -0,0 +1,21 @@
# This schema has been generated ...
# The order in which the attributes are declared is important,
# it specify the attribute xxx...
identifier = "id"
[attributes.id]
displayed = true
[attributes.title]
displayed = true
indexed = true
[attributes.overview]
displayed = true
indexed = true
[attributes.release_date]
displayed = true
[attributes.poster]
displayed = true


@ -1,68 +0,0 @@
use std::io::{self, Write};
use std::path::PathBuf;
use std::error::Error;
use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt;
use meilidb::database::Database;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// The number of returned results
#[structopt(short = "n", long = "number-results", default_value = "10")]
pub number_results: usize,
}
#[derive(Debug, Serialize, Deserialize)]
struct Document {
id: String,
title: String,
description: String,
image: String,
}
fn main() -> Result<(), Box<Error>> {
let opt = Opt::from_args();
let (elapsed, result) = elapsed::measure_time(|| Database::open(&opt.database_path));
let database = result?;
println!("database prepared for you in {}", elapsed);
let mut buffer = String::new();
let input = io::stdin();
loop {
print!("Searching for: ");
io::stdout().flush()?;
if input.read_line(&mut buffer)? == 0 { break }
let view = database.view();
let (elapsed, documents) = elapsed::measure_time(|| {
let builder = view.query_builder().unwrap();
builder.query(&buffer, 0..opt.number_results)
});
let mut full_documents = Vec::with_capacity(documents.len());
for document in documents {
match view.retrieve_document::<Document>(document.id) {
Ok(document) => full_documents.push(document),
Err(e) => eprintln!("{}", e),
}
}
println!("{:#?}", full_documents);
println!("Found {} results in {}", full_documents.len(), elapsed);
buffer.clear();
}
Ok(())
}

34
meilidb-core/Cargo.toml Normal file

@ -0,0 +1,34 @@
[package]
name = "meilidb-core"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
byteorder = "1.3.1"
deunicode = "1.0.0"
hashbrown = "0.6.0"
lazy_static = "1.2.0"
log = "0.4.6"
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
rayon = "1.2.0"
sdset = "0.3.2"
serde = { version = "1.0.88", features = ["derive"] }
slice-group-by = "0.2.6"
zerocopy = "0.2.8"
[dependencies.fst]
git = "https://github.com/Kerollmops/fst.git"
branch = "arc-byte-slice"
[dependencies.levenshtein_automata]
git = "https://github.com/Kerollmops/levenshtein-automata.git"
branch = "arc-byte-slice"
features = ["fst_automaton"]
[dev-dependencies]
assert_matches = "1.3"
[features]
i128 = ["byteorder/i128"]
nightly = ["hashbrown/nightly", "slice-group-by/nightly"]


@ -0,0 +1,44 @@
use lazy_static::lazy_static;
use levenshtein_automata::{
LevenshteinAutomatonBuilder as LevBuilder,
DFA,
};
lazy_static! {
static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
}
#[derive(Copy, Clone)]
enum PrefixSetting {
Prefix,
NoPrefix,
}
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DFA {
use self::PrefixSetting::{Prefix, NoPrefix};
match query.len() {
0 ..= 4 => match setting {
Prefix => LEVDIST0.build_prefix_dfa(query),
NoPrefix => LEVDIST0.build_dfa(query),
},
5 ..= 8 => match setting {
Prefix => LEVDIST1.build_prefix_dfa(query),
NoPrefix => LEVDIST1.build_dfa(query),
},
_ => match setting {
Prefix => LEVDIST2.build_prefix_dfa(query),
NoPrefix => LEVDIST2.build_dfa(query),
},
}
}
pub fn build_prefix_dfa(query: &str) -> DFA {
build_dfa_with_setting(query, PrefixSetting::Prefix)
}
pub fn build_dfa(query: &str) -> DFA {
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
}


@ -0,0 +1,16 @@
use std::cmp::Ordering;
use crate::criterion::Criterion;
use crate::RawDocument;
#[derive(Debug, Clone, Copy)]
pub struct DocumentId;
impl Criterion for DocumentId {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
lhs.id.cmp(&rhs.id)
}
fn name(&self) -> &'static str {
"DocumentId"
}
}


@ -0,0 +1,65 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
let mut count = 0;
let mut index = 0;
for group in query_index.linear_group() {
let len = group.len();
count += is_exact[index..index + len].contains(&true) as usize;
index += len;
}
count
}
#[derive(Debug, Clone, Copy)]
pub struct Exact;
impl Criterion for Exact {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let is_exact = lhs.is_exact();
number_exact_matches(query_index, is_exact)
};
let rhs = {
let query_index = rhs.query_index();
let is_exact = rhs.is_exact();
number_exact_matches(query_index, is_exact)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &'static str {
"Exact"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "souliereres rouge"
#[test]
fn easy_case() {
let query_index0 = &[0];
let is_exact0 = &[true];
let query_index1 = &[0];
let is_exact1 = &[false];
let doc0 = number_exact_matches(query_index0, is_exact0);
let doc1 = number_exact_matches(query_index1, is_exact1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
}


@ -0,0 +1,120 @@
mod sum_of_typos;
mod number_of_words;
mod words_proximity;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod exact;
mod document_id;
use std::cmp::Ordering;
use crate::RawDocument;
pub use self::{
sum_of_typos::SumOfTypos,
number_of_words::NumberOfWords,
words_proximity::WordsProximity,
sum_of_words_attribute::SumOfWordsAttribute,
sum_of_words_position::SumOfWordsPosition,
exact::Exact,
document_id::DocumentId,
};
pub trait Criterion: Send + Sync {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
fn name(&self) -> &'static str;
#[inline]
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
self.evaluate(lhs, rhs) == Ordering::Equal
}
}
impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
(**self).evaluate(lhs, rhs)
}
fn name(&self) -> &'static str {
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
}
impl<T: Criterion + ?Sized> Criterion for Box<T> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
(**self).evaluate(lhs, rhs)
}
fn name(&self) -> &'static str {
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
}
#[derive(Default)]
pub struct CriteriaBuilder<'a> {
inner: Vec<Box<dyn Criterion + 'a>>
}
impl<'a> CriteriaBuilder<'a>
{
pub fn new() -> CriteriaBuilder<'a> {
CriteriaBuilder { inner: Vec::new() }
}
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> {
CriteriaBuilder { inner: Vec::with_capacity(capacity) }
}
pub fn reserve(&mut self, additional: usize) {
self.inner.reserve(additional)
}
pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a>
where C: Criterion,
{
self.push(criterion);
self
}
pub fn push<C: 'a>(&mut self, criterion: C)
where C: Criterion,
{
self.inner.push(Box::new(criterion));
}
pub fn build(self) -> Criteria<'a> {
Criteria { inner: self.inner }
}
}
pub struct Criteria<'a> {
inner: Vec<Box<dyn Criterion + 'a>>,
}
impl<'a> Default for Criteria<'a> {
fn default() -> Self {
CriteriaBuilder::with_capacity(7)
.add(SumOfTypos)
.add(NumberOfWords)
.add(WordsProximity)
.add(SumOfWordsAttribute)
.add(SumOfWordsPosition)
.add(Exact)
.add(DocumentId)
.build()
}
}
impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] {
&self.inner
}
}


@ -0,0 +1,31 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn number_of_query_words(query_index: &[u32]) -> usize {
query_index.linear_group().count()
}
#[derive(Debug, Clone, Copy)]
pub struct NumberOfWords;
impl Criterion for NumberOfWords {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
number_of_query_words(query_index)
};
let rhs = {
let query_index = rhs.query_index();
number_of_query_words(query_index)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &'static str {
"NumberOfWords"
}
}


@ -0,0 +1,116 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
// This function is a wrong logarithmic 10 function.
// It is safe to panic on input number higher than 3,
// the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
match n {
0 => 0.0, // log(1)
1 => 0.30102, // log(2)
2 => 0.47712, // log(3)
3 => 0.60205, // log(4)
_ => panic!("invalid number"),
}
}
#[inline]
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
let mut number_words: usize = 0;
let mut sum_typos = 0.0;
let mut index = 0;
for group in query_index.linear_group() {
sum_typos += custom_log10(distance[index]);
number_words += 1;
index += group.len();
}
(number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfTypos;
impl Criterion for SumOfTypos {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let distance = lhs.distance();
sum_matches_typos(query_index, distance)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
sum_matches_typos(query_index, distance)
};
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &'static str {
"SumOfTypos"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "Geox CEO"
//
// doc0: "Geox SpA: CEO and Executive"
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
#[test]
fn one_typo_reference() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0, 1];
let distance1 = &[1, 0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchette"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn no_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchztte"
//
// doc0: "bouton manchette"
// doc1: "bouton"
#[test]
fn one_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 1];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
}


@ -0,0 +1,64 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
let mut sum_attributes = 0;
let mut index = 0;
for group in query_index.linear_group() {
sum_attributes += attribute[index] as usize;
index += group.len();
}
sum_attributes
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute;
impl Criterion for SumOfWordsAttribute {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let attribute = lhs.attribute();
sum_matches_attributes(query_index, attribute)
};
let rhs = {
let query_index = rhs.query_index();
let attribute = rhs.attribute();
sum_matches_attributes(query_index, attribute)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &'static str {
"SumOfWordsAttribute"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: { 0. "Soulier bleu", 1. "bla bla bla" }
// doc1: { 0. "Botte rouge", 1. "Soulier en cuir" }
#[test]
fn title_vs_description() {
let query_index0 = &[0];
let attribute0 = &[0];
let query_index1 = &[0];
let attribute1 = &[1];
let doc0 = sum_matches_attributes(query_index0, attribute0);
let doc1 = sum_matches_attributes(query_index1, attribute1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}


@ -0,0 +1,64 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
#[inline]
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
let mut sum_word_index = 0;
let mut index = 0;
for group in query_index.linear_group() {
sum_word_index += word_index[index] as usize;
index += group.len();
}
sum_word_index
}
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition;
impl Criterion for SumOfWordsPosition {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let word_index = lhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let word_index = rhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &'static str {
"SumOfWordsPosition"
}
}
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "Botte rouge et soulier noir"
#[test]
fn easy_case() {
let query_index0 = &[0];
let word_index0 = &[0];
let query_index1 = &[0];
let word_index1 = &[3];
let doc0 = sum_matches_attribute_index(query_index0, word_index0);
let doc1 = sum_matches_attribute_index(query_index1, word_index1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}


@ -0,0 +1,155 @@
use std::cmp::{self, Ordering};
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
const MAX_DISTANCE: u16 = 8;
#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
(a.clone(), b.clone())
}
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE)
} else {
cmp::min(lhs - rhs, MAX_DISTANCE) + 1
}
}
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
if lattr != rattr { return MAX_DISTANCE }
index_proximity(lwi, rwi)
}
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
let mut min_prox = u16::max_value();
for a in lattr.iter().zip(lwi) {
for b in rattr.iter().zip(rwi) {
let a = clone_tuple(a);
let b = clone_tuple(b);
min_prox = cmp::min(min_prox, attribute_proximity(a, b));
}
}
min_prox
}
fn matches_proximity(
query_index: &[u32],
distance: &[u8],
attribute: &[u16],
word_index: &[u16],
) -> u16
{
let mut query_index_groups = query_index.linear_group();
let mut proximity = 0;
let mut index = 0;
let get_attr_wi = |index: usize, group_len: usize| {
// retrieve the first distance group (with the lowest values)
let len = distance[index..index + group_len].linear_group().next().unwrap().len();
let rattr = &attribute[index..index + len];
let rwi = &word_index[index..index + len];
(rattr, rwi)
};
let mut last = query_index_groups.next().map(|group| {
let attr_wi = get_attr_wi(index, group.len());
index += group.len();
attr_wi
});
// iter by windows of size 2
while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
let attr_wi = get_attr_wi(index, rhs.len());
proximity += min_proximity(lhs, attr_wi);
last = Some(attr_wi);
index += rhs.len();
}
proximity
}
#[derive(Debug, Clone, Copy)]
pub struct WordsProximity;
impl Criterion for WordsProximity {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let distance = lhs.distance();
let attribute = lhs.attribute();
let word_index = lhs.word_index();
matches_proximity(query_index, distance, attribute, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
let attribute = rhs.attribute();
let word_index = rhs.word_index();
matches_proximity(query_index, distance, attribute, word_index)
};
lhs.cmp(&rhs)
}
fn name(&self) -> &'static str {
"WordsProximity"
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn three_different_attributes() {
// "soup" "of the" "the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 0 }
// { id: 2, attr: 1, attr_index: 1 }
// { id: 2, attr: 2, attr_index: 0 }
// { id: 3, attr: 3, attr_index: 1 }
let query_index = &[0, 1, 2, 2, 3];
let distance = &[0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 2, 3];
let word_index = &[0, 0, 1, 0, 1];
// soup -> of = 8
// + of -> the = 1
// + the -> day = 8 (not 1)
assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 17);
}
#[test]
fn two_different_attributes() {
// "soup day" "soup of the day"
//
// { id: 0, attr: 0, attr_index: 0 }
// { id: 0, attr: 1, attr_index: 0 }
// { id: 1, attr: 1, attr_index: 1 }
// { id: 2, attr: 1, attr_index: 2 }
// { id: 3, attr: 0, attr_index: 1 }
// { id: 3, attr: 1, attr_index: 3 }
let query_index = &[0, 0, 1, 2, 3, 3];
let distance = &[0, 0, 0, 0, 0, 0];
let attribute = &[0, 1, 1, 1, 0, 1];
let word_index = &[0, 0, 1, 2, 1, 3];
// soup -> of = 1
// + of -> the = 1
// + the -> day = 1
assert_eq!(matches_proximity(query_index, distance, attribute, word_index), 3);
}
}


@ -1,5 +1,4 @@
use std::hash::Hash;
use hashbrown::HashMap;
pub struct DistinctMap<K> {
@ -12,7 +11,7 @@ impl<K: Hash + Eq> DistinctMap<K> {
pub fn new(limit: usize) -> Self {
DistinctMap {
inner: HashMap::new(),
limit: limit,
limit,
len: 0,
}
}
@ -31,7 +30,7 @@ pub struct BufferedDistinctMap<'a, K> {
impl<'a, K: Hash + Eq> BufferedDistinctMap<'a, K> {
pub fn new(internal: &'a mut DistinctMap<K>) -> BufferedDistinctMap<'a, K> {
BufferedDistinctMap {
internal: internal,
internal,
inner: HashMap::new(),
len: 0,
}

144
meilidb-core/src/lib.rs Normal file

@ -0,0 +1,144 @@
#![feature(checked_duration_since)]
#[cfg(test)]
#[macro_use] extern crate assert_matches;
mod automaton;
mod distinct_map;
mod query_builder;
mod query_enhancer;
mod raw_document;
mod reordered_attrs;
mod store;
pub mod criterion;
use serde::{Serialize, Deserialize};
use zerocopy::{AsBytes, FromBytes};
use self::raw_document::raw_documents_from;
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str};
pub use self::raw_document::RawDocument;
pub use self::store::Store;
/// Represent an internally generated document unique identifier.
///
/// It is used to inform the database the document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(Serialize, Deserialize)]
#[derive(AsBytes, FromBytes)]
#[repr(C)]
pub struct DocumentId(pub u64);
/// This structure represent the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[derive(AsBytes, FromBytes)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
pub word_index: u16,
/// The position in bytes where the word was found
/// along with the length of it.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
pub char_length: u16,
}
/// This structure represent a matching word with informations
/// on the location of the word in the document.
///
/// The order of the field is important because it defines
/// the way these structures are ordered between themselves.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Highlight {
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
/// The position in bytes where the word was found.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u16,
/// The length in bytes of the found word.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_length: u16,
}
#[doc(hidden)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TmpMatch {
pub query_index: u32,
pub distance: u8,
pub attribute: u16,
pub word_index: u16,
pub is_exact: bool,
}
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Document {
pub id: DocumentId,
pub highlights: Vec<Highlight>,
#[cfg(test)]
pub matches: Vec<TmpMatch>,
}
impl Document {
#[cfg(not(test))]
fn from_raw(raw: RawDocument) -> Document {
Document { id: raw.id, highlights: raw.highlights }
}
#[cfg(test)]
fn from_raw(raw: RawDocument) -> Document {
let len = raw.query_index().len();
let mut matches = Vec::with_capacity(len);
let query_index = raw.query_index();
let distance = raw.distance();
let attribute = raw.attribute();
let word_index = raw.word_index();
let is_exact = raw.is_exact();
for i in 0..len {
let match_ = TmpMatch {
query_index: query_index[i],
distance: distance[i],
attribute: attribute[i],
word_index: word_index[i],
is_exact: is_exact[i],
};
matches.push(match_);
}
Document { id: raw.id, matches, highlights: raw.highlights }
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 16);
}
}

File diff suppressed because it is too large


@ -0,0 +1,398 @@
use std::ops::Range;
use std::cmp::Ordering::{Less, Greater, Equal};
/// Return `true` if the specified range can accept the given replacements words.
/// Returns `false` if the replacements words are already present in the original query
/// or if there is fewer replacement words than the range to replace.
//
//
// ## Ignored because already present in original
//
// new york city subway
// -------- ^^^^
// / \
// [new york city]
//
//
// ## Ignored because smaller than the original
//
// new york city subway
// -------------
// \ /
// [new york]
//
//
// ## Accepted because bigger than the original
//
// NYC subway
// ---
// / \
// / \
// / \
// / \
// / \
// [new york city]
//
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
where S: AsRef<str>,
T: AsRef<str>,
{
if words.len() <= range.len() {
// there is fewer or equal replacement words
// than there is already in the replaced range
return false
}
// retrieve the part to rewrite but with the length
// of the replacement part
let original = query.iter().skip(range.start).take(words.len());
// check if the original query doesn't already contain
// the replacement words
!original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
}
type Origin = usize;
type RealLength = usize;
struct FakeIntervalTree {
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl FakeIntervalTree {
fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
FakeIntervalTree { intervals }
}
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
let element = self.intervals.binary_search_by(|(r, _)| {
if point >= r.start {
if point < r.end { Equal } else { Less }
} else { Greater }
});
let n = match element { Ok(n) => n, Err(n) => n };
match self.intervals.get(n) {
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
_otherwise => None,
}
}
}
pub struct QueryEnhancerBuilder<'a, S> {
query: &'a [S],
origins: Vec<usize>,
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
}
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
// we initialize origins query indices based on their positions
let origins: Vec<_> = (0..query.len() + 1).collect();
let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();
QueryEnhancerBuilder { query, origins, real_to_origin }
}
/// Update the final real to origin query indices mapping.
///
/// `range` is the original words range that this `replacement` words replace
/// and `real` is the first real query index of these replacement words.
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
where T: AsRef<str>,
{
// check if the range of original words
// can be rewritten with the replacement words
if rewrite_range_with(self.query, range.clone(), replacement) {
// this range can be replaced so we need to
// modify the origins accordingly
let offset = replacement.len() - range.len();
let previous_padding = self.origins[range.end - 1];
let current_offset = (self.origins[range.end] - 1) - previous_padding;
let diff = offset.saturating_sub(current_offset);
self.origins[range.end] += diff;
for r in &mut self.origins[range.end + 1..] {
*r += diff;
}
}
// we need to store the real number and origins relations
// this way it will be possible to know by how many
// we need to pad real query indices
let real_range = real..real + replacement.len().max(range.len());
let real_length = replacement.len();
self.real_to_origin.push((real_range, (range.start, real_length)));
}
pub fn build(self) -> QueryEnhancer {
QueryEnhancer {
origins: self.origins,
real_to_origin: FakeIntervalTree::new(self.real_to_origin),
}
}
}
pub struct QueryEnhancer {
origins: Vec<usize>,
real_to_origin: FakeIntervalTree,
}
impl QueryEnhancer {
/// Returns the query indices to use to replace this real query index.
pub fn replacement(&self, real: u32) -> Range<u32> {
let real = real as usize;
// query the fake interval tree with the real query index
let (range, (origin, real_length)) =
self.real_to_origin
.query(real)
.expect("real has never been declared");
// if `real` is the end bound of the range
if (range.start + real_length - 1) == real {
let mut count = range.len();
let mut new_origin = origin;
for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
let len = slice[1] - slice[0];
count = count.saturating_sub(len);
if count == 0 { new_origin = origin + i; break }
}
let n = real - range.start;
let start = self.origins[origin];
let end = self.origins[new_origin + 1];
let remaining = (end - start) - n;
Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }
} else {
// just return the origin along with
// the real position of the word
let n = real as usize - range.start;
let origin = self.origins[origin];
Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn original_unmodified() {
let query = ["new", "york", "city", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..2); // york
assert_eq!(enhancer.replacement(2), 2..3); // city
assert_eq!(enhancer.replacement(3), 3..4); // subway
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
}
#[test]
fn simple_growing() {
let query = ["new", "york", "subway"];
// 0 1 2
let mut builder = QueryEnhancerBuilder::new(&query);
// new york = new york city
builder.declare(0..2, 3, &["new", "york", "city"]);
// ^ 3 4 5
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // new
assert_eq!(enhancer.replacement(1), 1..3); // york
assert_eq!(enhancer.replacement(2), 3..4); // subway
assert_eq!(enhancer.replacement(3), 0..1); // new
assert_eq!(enhancer.replacement(4), 1..2); // york
assert_eq!(enhancer.replacement(5), 2..3); // city
}
#[test]
fn same_place_growings() {
let query = ["NY", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NY = new york
builder.declare(0..1, 2, &["new", "york"]);
// ^ 2 3
// NY = new york city
builder.declare(0..1, 4, &["new", "york", "city"]);
// ^ 4 5 6
// NY = NYC
builder.declare(0..1, 7, &["NYC"]);
// ^ 7
// NY = new york city
builder.declare(0..1, 8, &["new", "york", "city"]);
// ^ 8 9 10
// subway = underground train
builder.declare(1..2, 11, &["underground", "train"]);
// ^ 11 12
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NY
assert_eq!(enhancer.replacement(1), 3..5); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..3); // york
assert_eq!(enhancer.replacement(4), 0..1); // new
assert_eq!(enhancer.replacement(5), 1..2); // york
assert_eq!(enhancer.replacement(6), 2..3); // city
assert_eq!(enhancer.replacement(7), 0..3); // NYC
assert_eq!(enhancer.replacement(8), 0..1); // new
assert_eq!(enhancer.replacement(9), 1..2); // york
assert_eq!(enhancer.replacement(10), 2..3); // city
assert_eq!(enhancer.replacement(11), 3..4); // underground
assert_eq!(enhancer.replacement(12), 4..5); // train
}
#[test]
fn bigger_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(0..1, 2, &["new", "york", "city"]);
// ^ 2 3 4
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..3); // NYC
assert_eq!(enhancer.replacement(1), 3..4); // subway
assert_eq!(enhancer.replacement(2), 0..1); // new
assert_eq!(enhancer.replacement(3), 1..2); // york
assert_eq!(enhancer.replacement(4), 2..3); // city
}
#[test]
fn middle_query_growing() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..6); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
}
#[test]
fn end_query_growing() {
let query = ["NYC", "subway"];
// 0 1
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(1..2, 2, &["underground", "train"]);
// ^ 2 3
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // NYC
assert_eq!(enhancer.replacement(1), 1..3); // subway
assert_eq!(enhancer.replacement(2), 1..2); // underground
assert_eq!(enhancer.replacement(3), 2..3); // train
}
#[test]
fn multiple_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
}
#[test]
fn multiple_probable_growings() {
let query = ["great", "awesome", "NYC", "subway"];
// 0 1 2 3
let mut builder = QueryEnhancerBuilder::new(&query);
// NYC = new york city
builder.declare(2..3, 4, &["new", "york", "city"]);
// ^ 4 5 6
// subway = underground train
builder.declare(3..4, 7, &["underground", "train"]);
// ^ 7 8
// great awesome = good
builder.declare(0..2, 9, &["good"]);
// ^ 9
// awesome NYC = NY
builder.declare(1..3, 10, &["NY"]);
// ^^ 10
// NYC subway = metro
builder.declare(2..4, 11, &["metro"]);
// ^^ 11
let enhancer = builder.build();
assert_eq!(enhancer.replacement(0), 0..1); // great
assert_eq!(enhancer.replacement(1), 1..2); // awesome
assert_eq!(enhancer.replacement(2), 2..5); // NYC
assert_eq!(enhancer.replacement(3), 5..7); // subway
assert_eq!(enhancer.replacement(4), 2..3); // new
assert_eq!(enhancer.replacement(5), 3..4); // york
assert_eq!(enhancer.replacement(6), 4..5); // city
assert_eq!(enhancer.replacement(7), 5..6); // underground
assert_eq!(enhancer.replacement(8), 6..7); // train
assert_eq!(enhancer.replacement(9), 0..2); // good
assert_eq!(enhancer.replacement(10), 1..5); // NY
assert_eq!(enhancer.replacement(11), 2..5); // metro
}
}


@ -0,0 +1,141 @@
use std::sync::Arc;
use std::fmt;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::{TmpMatch, DocumentId, Highlight};
#[derive(Clone)]
pub struct RawDocument {
pub id: DocumentId,
pub matches: SharedMatches,
pub highlights: Vec<Highlight>,
}
impl RawDocument {
fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
RawDocument { id, matches, highlights }
}
pub fn query_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
}
pub fn distance(&self) -> &[u8] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
}
pub fn attribute(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
}
pub fn word_index(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
}
pub fn is_exact(&self) -> &[bool] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
}
}
impl fmt::Debug for RawDocument {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("RawDocument {\r\n")?;
f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "query_index", self.query_index()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?;
f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?;
f.write_str("}")?;
Ok(())
}
}
pub fn raw_documents_from(
matches: SetBuf<(DocumentId, TmpMatch)>,
highlights: SetBuf<(DocumentId, Highlight)>,
) -> Vec<RawDocument>
{
let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
let mut matches2 = Matches::with_capacity(matches.len());
let matches = matches.linear_group_by_key(|(id, _)| *id);
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
for (mgroup, hgroup) in matches.zip(highlights) {
debug_assert_eq!(mgroup[0].0, hgroup[0].0);
let document_id = mgroup[0].0;
let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
let end = start + mgroup.len();
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
docs_ranges.push((document_id, Range { start, end }, highlights));
matches2.extend_from_slice(mgroup);
}
let matches = Arc::new(matches2);
docs_ranges.into_iter().map(|(id, range, highlights)| {
let matches = SharedMatches { range, matches: matches.clone() };
RawDocument::new(id, matches, highlights)
}).collect()
}
#[derive(Debug, Copy, Clone)]
struct Range {
start: usize,
end: usize,
}
#[derive(Clone)]
pub struct SharedMatches {
range: Range,
matches: Arc<Matches>,
}
#[derive(Clone)]
struct Matches {
query_index: Vec<u32>,
distance: Vec<u8>,
attribute: Vec<u16>,
word_index: Vec<u16>,
is_exact: Vec<bool>,
}
impl Matches {
fn with_capacity(cap: usize) -> Matches {
Matches {
query_index: Vec::with_capacity(cap),
distance: Vec::with_capacity(cap),
attribute: Vec::with_capacity(cap),
word_index: Vec::with_capacity(cap),
is_exact: Vec::with_capacity(cap),
}
}
fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
for (_, match_) in matches {
self.query_index.push(match_.query_index);
self.distance.push(match_.distance);
self.attribute.push(match_.attribute);
self.word_index.push(match_.word_index);
self.is_exact.push(match_.is_exact);
}
}
}


@ -0,0 +1,24 @@
#[derive(Default, Clone)]
pub struct ReorderedAttrs {
count: usize,
reorders: Vec<Option<u16>>,
}
impl ReorderedAttrs {
pub fn new() -> ReorderedAttrs {
ReorderedAttrs { count: 0, reorders: Vec::new() }
}
pub fn insert_attribute(&mut self, attribute: u16) {
self.reorders.resize(attribute as usize + 1, None);
self.reorders[attribute as usize] = Some(self.count as u16);
self.count += 1;
}
pub fn get(&self, attribute: u16) -> Option<u16> {
match self.reorders.get(attribute as usize) {
Some(Some(attribute)) => Some(*attribute),
_ => None,
}
}
}
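For illustration, a minimal usage sketch of ReorderedAttrs (attribute numbers are arbitrary; attributes are assumed to be registered in increasing order, since the resize call above truncates the mapping otherwise):

let mut reordered = ReorderedAttrs::new();
reordered.insert_attribute(0); // schema attribute 0 becomes displayed position 0
reordered.insert_attribute(2); // schema attribute 2 becomes displayed position 1
assert_eq!(reordered.get(0), Some(0));
assert_eq!(reordered.get(2), Some(1));
assert_eq!(reordered.get(1), None); // attribute 1 was never registered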

34
meilidb-core/src/store.rs Normal file

@ -0,0 +1,34 @@
use std::error::Error;
use fst::Set;
use sdset::SetBuf;
use crate::DocIndex;
pub trait Store {
type Error: Error;
fn words(&self) -> Result<&Set, Self::Error>;
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error>;
fn synonyms(&self) -> Result<&Set, Self::Error>;
fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error>;
}
impl<T> Store for &'_ T where T: Store {
type Error = T::Error;
fn words(&self) -> Result<&Set, Self::Error> {
(*self).words()
}
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
(*self).word_indexes(word)
}
fn synonyms(&self) -> Result<&Set, Self::Error> {
(*self).synonyms()
}
fn alternatives_to(&self, word: &[u8]) -> Result<Option<Set>, Self::Error> {
(*self).alternatives_to(word)
}
}
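To illustrate the trait surface, a minimal in-memory implementation sketch (InMemoryStore and its Infallible error type are illustrative assumptions, not part of the diff):

use std::convert::Infallible;

struct InMemoryStore {
    words: Set,
    synonyms: Set,
}

impl Store for InMemoryStore {
    type Error = Infallible;
    fn words(&self) -> Result<&Set, Self::Error> {
        Ok(&self.words)
    }
    fn word_indexes(&self, _word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
        Ok(None) // this sketch stores no postings
    }
    fn synonyms(&self) -> Result<&Set, Self::Error> {
        Ok(&self.synonyms)
    }
    fn alternatives_to(&self, _word: &[u8]) -> Result<Option<Set>, Self::Error> {
        Ok(None)
    }
}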

41
meilidb-data/Cargo.toml Normal file

@ -0,0 +1,41 @@
[package]
name = "meilidb-data"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
arc-swap = "0.4.2"
bincode = "1.1.4"
crossbeam-channel = "0.3.9"
deunicode = "1.0.0"
hashbrown = { version = "0.6.0", features = ["serde"] }
log = "0.4.6"
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
ordered-float = { version = "1.0.2", features = ["serde"] }
rocksdb = "0.12.3"
sdset = "0.3.2"
serde = { version = "1.0.99", features = ["derive"] }
serde_json = "1.0.40"
siphasher = "0.3.0"
zerocopy = "0.2.8"
[dependencies.rmp-serde]
git = "https://github.com/3Hren/msgpack-rust.git"
rev = "40b3d48"
[dependencies.rmpv]
git = "https://github.com/3Hren/msgpack-rust.git"
rev = "40b3d48"
features = ["with-serde"]
[dependencies.fst]
git = "https://github.com/Kerollmops/fst.git"
branch = "arc-byte-slice"
[dev-dependencies]
tempfile = "3.1.0"
maplit = "1.0.2"
big_s = "1.0.2"

126
meilidb-data/src/cf_tree.rs Normal file

@ -0,0 +1,126 @@
use std::sync::Arc;
use crossbeam_channel::{unbounded, Sender, Receiver};
use rocksdb::{DBVector, IteratorMode, Direction};
use crate::RocksDbResult;
#[derive(Clone)]
pub struct CfTree {
index: Arc<CfTreeInner>,
sender: Option<Sender<()>>,
}
struct CfTreeInner {
db: Arc<rocksdb::DB>,
name: String,
}
impl CfTree {
pub fn create(db: Arc<rocksdb::DB>, name: String) -> RocksDbResult<CfTree> {
let mut options = rocksdb::Options::default();
options.create_missing_column_families(true); // this doesn't work
if db.cf_handle(&name).is_none() {
let _cf = db.create_cf(&name, &options)?;
}
let index = Arc::new(CfTreeInner { db, name });
Ok(CfTree { index, sender: None })
}
pub fn create_with_subcription(
db: Arc<rocksdb::DB>,
name: String,
) -> RocksDbResult<(CfTree, Receiver<()>)>
{
let mut options = rocksdb::Options::default();
options.create_missing_column_families(true); // this doesn't work
if db.cf_handle(&name).is_none() {
let _cf = db.create_cf(&name, &options)?;
}
let index = Arc::new(CfTreeInner { db, name });
let (sender, receiver) = unbounded();
Ok((CfTree { index, sender: Some(sender) }, receiver))
}
pub fn insert<K, V>(&self, key: K, value: V) -> RocksDbResult<()>
where K: AsRef<[u8]>,
V: AsRef<[u8]>,
{
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
let result = self.index.db.put_cf(cf, key, value);
if let Some(sender) = &self.sender {
let _err = sender.send(());
}
result
}
pub fn get<K>(&self, key: K) -> RocksDbResult<Option<DBVector>>
where K: AsRef<[u8]>,
{
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
self.index.db.get_cf(cf, key)
}
pub fn remove<K>(&self, key: K) -> RocksDbResult<()>
where K: AsRef<[u8]>
{
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
self.index.db.delete_cf(cf, key)
}
/// Start and end key range is inclusive on both bounds.
pub fn range<KS, KE>(&self, start: KS, end: KE) -> RocksDbResult<CfIter>
where KS: AsRef<[u8]>,
KE: AsRef<[u8]>,
{
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
let mut iter = self.index.db.iterator_cf(cf, IteratorMode::Start)?;
iter.set_mode(IteratorMode::From(start.as_ref(), Direction::Forward));
let end_bound = Box::from(end.as_ref());
Ok(CfIter { iter, end_bound: Some(end_bound) })
}
pub fn iter(&self) -> RocksDbResult<CfIter> {
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
let iter = self.index.db.iterator_cf(cf, IteratorMode::Start)?;
Ok(CfIter { iter, end_bound: None })
}
pub fn last_key(&self) -> RocksDbResult<Option<Box<[u8]>>> {
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
let mut iter = self.index.db.iterator_cf(cf, IteratorMode::End)?;
Ok(iter.next().map(|(key, _)| key))
}
pub fn prefix_iterator<P>(&self, prefix: P) -> RocksDbResult<rocksdb::DBIterator>
where P: AsRef<[u8]>,
{
let cf = self.index.db.cf_handle(&self.index.name).unwrap();
self.index.db.prefix_iterator_cf(cf, prefix)
}
}
pub struct CfIter<'a> {
iter: rocksdb::DBIterator<'a>,
end_bound: Option<Box<[u8]>>,
}
impl Iterator for CfIter<'_> {
type Item = (Box<[u8]>, Box<[u8]>);
fn next(&mut self) -> Option<Self::Item> {
match (self.iter.next(), &self.end_bound) {
(Some((ref key, _)), Some(end_bound)) if key > end_bound => None,
(Some(entry), _) => Some(entry),
(None, _) => None,
}
}
}
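A hedged usage sketch of CfTree over a freshly opened database (the path, column family name and keys are illustrative; errors are propagated with ? for brevity):

let mut options = rocksdb::Options::default();
options.create_if_missing(true);
let db = Arc::new(rocksdb::DB::open(&options, "/tmp/cf-tree-example")?);
let tree = CfTree::create(db, "example".to_string())?;
tree.insert(b"key-1", b"value-1")?;
assert!(tree.get(b"key-1")?.is_some());
// the range is inclusive on both bounds
for (key, value) in tree.range(b"key-0", b"key-9")? {
    let _ = (key, value); // both sides come back as Box<[u8]>
}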


@ -0,0 +1,73 @@
use std::{error, fmt};
use crate::serde::SerializerError;
#[derive(Debug)]
pub enum Error {
SchemaDiffer,
SchemaMissing,
WordIndexMissing,
MissingDocumentId,
RocksDbError(rocksdb::Error),
FstError(fst::Error),
RmpDecodeError(rmp_serde::decode::Error),
RmpEncodeError(rmp_serde::encode::Error),
BincodeError(bincode::Error),
SerializerError(SerializerError),
}
impl From<rocksdb::Error> for Error {
fn from(error: rocksdb::Error) -> Error {
Error::RocksDbError(error)
}
}
impl From<fst::Error> for Error {
fn from(error: fst::Error) -> Error {
Error::FstError(error)
}
}
impl From<rmp_serde::decode::Error> for Error {
fn from(error: rmp_serde::decode::Error) -> Error {
Error::RmpDecodeError(error)
}
}
impl From<rmp_serde::encode::Error> for Error {
fn from(error: rmp_serde::encode::Error) -> Error {
Error::RmpEncodeError(error)
}
}
impl From<bincode::Error> for Error {
fn from(error: bincode::Error) -> Error {
Error::BincodeError(error)
}
}
impl From<SerializerError> for Error {
fn from(error: SerializerError) -> Error {
Error::SerializerError(error)
}
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use self::Error::*;
match self {
SchemaDiffer => write!(f, "schemas differ"),
SchemaMissing => write!(f, "this index does not have a schema"),
WordIndexMissing => write!(f, "this index does not have a word index"),
MissingDocumentId => write!(f, "document id is missing"),
RocksDbError(e) => write!(f, "RocksDB error; {}", e),
FstError(e) => write!(f, "fst error; {}", e),
RmpDecodeError(e) => write!(f, "rmp decode error; {}", e),
RmpEncodeError(e) => write!(f, "rmp encode error; {}", e),
BincodeError(e) => write!(f, "bincode error; {}", e),
SerializerError(e) => write!(f, "serializer error; {}", e),
}
}
}
impl error::Error for Error { }
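The From conversions above are what let the various storage and serialization errors bubble up with ?; a small hypothetical helper shows the intent:

fn roundtrip(value: &u64) -> Result<u64, Error> {
    let bytes = bincode::serialize(value)?; // bincode::Error converts into Error
    let back = bincode::deserialize(&bytes)?; // and so does the decode error
    Ok(back)
}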


@ -0,0 +1,77 @@
use std::ops::Deref;
use serde::de::DeserializeOwned;
use serde::Serialize;
use super::Error;
use std::marker::PhantomData;
#[derive(Clone)]
pub struct CommonIndex(pub crate::CfTree);
impl Deref for CommonIndex {
type Target = crate::CfTree;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl CommonIndex {
pub fn get<T, K>(&self, key: K) -> Result<Option<T>, Error>
where T: DeserializeOwned,
K: AsRef<[u8]>,
{
let raw = match self.0.get(key)? {
Some(raw) => raw,
None => return Ok(None),
};
let data = bincode::deserialize(&raw)?;
Ok(Some(data))
}
pub fn set<T, K>(&self, key: K, data: &T) -> Result<(), Error>
where T: Serialize,
K: AsRef<[u8]>,
{
let raw = bincode::serialize(data)?;
self.0.insert(key, &raw)?;
Ok(())
}
pub fn prefix_iterator<T, P>(&self, prefix: P) -> Result<SerializedIterator<T>, Error>
where T: DeserializeOwned,
P: AsRef<[u8]>,
{
let iter = self.0.prefix_iterator(prefix)?;
Ok(SerializedIterator { iter, _marker: PhantomData })
}
}
pub struct SerializedIterator<'a, T> {
iter: rocksdb::DBIterator<'a>,
_marker: PhantomData<T>,
}
impl<T> Iterator for SerializedIterator<'_, T>
where T: DeserializeOwned,
{
type Item = (String, T);
fn next(&mut self) -> Option<Self::Item> {
let (raw_key, raw_value) = match self.iter.next() {
Some((key, value)) => (key, value),
None => return None,
};
let value: T = match bincode::deserialize(&raw_value) {
Ok(data) => data,
Err(_) => return None,
};
let key = match std::str::from_utf8(&raw_key) {
Ok(key) => key.to_string(),
Err(_) => return None,
};
Some((key, value))
}
}
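An illustrative round trip through CommonIndex (the key and the u64 payload are arbitrary; tree is assumed to be a crate::CfTree obtained elsewhere):

let common = CommonIndex(tree);
common.set("documents-count", &42u64)?;
let count: Option<u64> = common.get("documents-count")?;
assert_eq!(count, Some(42));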


@ -0,0 +1,89 @@
use serde::de::DeserializeOwned;
use serde::{Serialize, Deserialize};
use std::collections::{HashMap, HashSet};
use std::ops::Deref;
use super::Error;
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RankingOrdering {
Asc,
Dsc
}
pub type StopWords = HashSet<String>;
pub type RankingOrder = Vec<String>;
pub type DistinctField = String;
pub type RankingRules = HashMap<String, RankingOrdering>;
const STOP_WORDS_KEY: &str = "stop-words";
const RANKING_ORDER_KEY: &str = "ranking-order";
const DISTINCT_FIELD_KEY: &str = "distinct-field";
const RANKING_RULES_KEY: &str = "ranking-rules";
#[derive(Clone)]
pub struct CustomSettingsIndex(pub(crate) crate::CfTree);
impl Deref for CustomSettingsIndex {
type Target = crate::CfTree;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl CustomSettingsIndex {
fn get<K, T>(&self, key: K) -> Result<Option<T>, Error>
where K: AsRef<[u8]>,
T: DeserializeOwned,
{
let setting = self.0.get(key)?;
let raw = match setting {
Some(raw) => raw,
None => return Ok(None)
};
Ok(Some(bincode::deserialize(&raw)?))
}
fn set<K, T>(&self, key: K, data: &T) -> Result<(), Error>
where K: AsRef<[u8]>,
T: Serialize,
{
let raw = bincode::serialize(data)?;
self.0.insert(key, &raw)?;
Ok(())
}
pub fn get_stop_words(&self) -> Result<Option<StopWords>, Error> {
self.get(STOP_WORDS_KEY)
}
pub fn get_ranking_order(&self) -> Result<Option<RankingOrder>, Error> {
self.get(RANKING_ORDER_KEY)
}
pub fn get_distinct_field(&self) -> Result<Option<DistinctField>, Error> {
self.get(DISTINCT_FIELD_KEY)
}
pub fn get_ranking_rules(&self) -> Result<Option<RankingRules>, Error> {
self.get(RANKING_RULES_KEY)
}
pub fn set_stop_words(&self, value: &StopWords) -> Result<(), Error> {
self.set(STOP_WORDS_KEY, value)
}
pub fn set_ranking_order(&self, value: &RankingOrder) -> Result<(), Error> {
self.set(RANKING_ORDER_KEY, value)
}
pub fn set_distinct_field(&self, value: &DistinctField) -> Result<(), Error> {
self.set(DISTINCT_FIELD_KEY, value)
}
pub fn set_ranking_rules(&self, value: &RankingRules) -> Result<(), Error> {
self.set(RANKING_RULES_KEY, value)
}
}
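A sketch of the typed settings round trip (the stop words are illustrative; index is assumed to be an open Index, defined later in this diff):

let custom_settings = index.custom_settings();
let mut stop_words = StopWords::new();
stop_words.insert("the".to_string());
stop_words.insert("a".to_string());
custom_settings.set_stop_words(&stop_words)?;
assert_eq!(custom_settings.get_stop_words()?, Some(stop_words));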


@ -0,0 +1,33 @@
use std::sync::Arc;
use meilidb_core::DocumentId;
use crate::database::Error;
#[derive(Clone)]
pub struct DocsWordsIndex(pub crate::CfTree);
impl DocsWordsIndex {
pub fn doc_words(&self, id: DocumentId) -> Result<Option<fst::Set>, Error> {
let key = id.0.to_be_bytes();
match self.0.get(key)? {
Some(bytes) => {
let len = bytes.len();
let value = Arc::from(bytes.as_ref());
let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?;
Ok(Some(fst::Set::from(fst)))
},
None => Ok(None)
}
}
pub fn set_doc_words(&self, id: DocumentId, words: &fst::Set) -> Result<(), Error> {
let key = id.0.to_be_bytes();
self.0.insert(key, words.as_fst().as_bytes())?;
Ok(())
}
pub fn del_doc_words(&self, id: DocumentId) -> Result<(), Error> {
let key = id.0.to_be_bytes();
self.0.remove(key)?;
Ok(())
}
}
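A sketch storing the sorted word set of one document and reading it back (DocumentId(1) and the words are arbitrary; docs_words is assumed to be a DocsWordsIndex):

let words = fst::Set::from_iter(vec!["apple", "metro"]).unwrap();
docs_words.set_doc_words(DocumentId(1), &words)?;
let stored = docs_words.doc_words(DocumentId(1))?.expect("just stored");
assert_eq!(stored.len(), 2);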


@ -0,0 +1,146 @@
use std::convert::TryInto;
use std::collections::HashMap;
use meilidb_core::DocumentId;
use meilidb_schema::{Schema, SchemaAttr};
use rocksdb::DBVector;
use crate::document_attr_key::DocumentAttrKey;
use crate::RocksDbResult;
fn document_fields_range(id: DocumentId) -> ([u8; 10], [u8; 10]) {
let start = DocumentAttrKey::new(id, SchemaAttr::min()).to_be_bytes();
let end = DocumentAttrKey::new(id, SchemaAttr::max()).to_be_bytes();
(start, end)
}
#[derive(Clone)]
pub struct DocumentsIndex(pub(crate) crate::CfTree);
impl DocumentsIndex {
pub fn document_field(&self, id: DocumentId, attr: SchemaAttr) -> RocksDbResult<Option<DBVector>> {
let key = DocumentAttrKey::new(id, attr).to_be_bytes();
self.0.get(key)
}
pub fn set_document_field(&self, id: DocumentId, attr: SchemaAttr, value: Vec<u8>) -> RocksDbResult<()> {
let key = DocumentAttrKey::new(id, attr).to_be_bytes();
self.0.insert(key, value)?;
Ok(())
}
pub fn del_document_field(&self, id: DocumentId, attr: SchemaAttr) -> RocksDbResult<()> {
let key = DocumentAttrKey::new(id, attr).to_be_bytes();
self.0.remove(key)?;
Ok(())
}
pub fn del_all_document_fields(&self, id: DocumentId) -> RocksDbResult<usize> {
let (start, end) = document_fields_range(id);
let mut count = 0;
for (key, _) in self.0.range(start, end)? {
self.0.remove(key)?;
count += 1;
}
Ok(count)
}
pub fn document_fields(&self, id: DocumentId) -> RocksDbResult<DocumentFieldsIter> {
let (start, end) = document_fields_range(id);
let iter = self.0.range(start, end)?;
Ok(DocumentFieldsIter(iter))
}
pub fn documents_ids(&self) -> RocksDbResult<DocumentsIdsIter> {
let iter = DocumentsKeysIter(self.0.iter()?);
Ok(DocumentsIdsIter { inner: iter, last: None })
}
pub fn documents_fields_repartition(&self, schema: Schema) -> RocksDbResult<HashMap<String, u64>> {
let iter = self.0.iter()?;
let mut repartition_attributes_id = HashMap::new();
for key in DocumentsKeysIter(iter) {
let counter = repartition_attributes_id.entry(key.attribute).or_insert(0);
*counter += 1u64;
}
let mut repartition_with_attribute_name = HashMap::new();
for (key, val) in repartition_attributes_id {
repartition_with_attribute_name.insert(schema.attribute_name(key).to_owned(), val);
}
Ok(repartition_with_attribute_name)
}
pub fn len(&self) -> RocksDbResult<u64> {
let mut last_document_id = None;
let mut count = 0;
for (key, _) in self.0.iter()? {
let array = key.as_ref().try_into().unwrap();
let document_id = DocumentAttrKey::from_be_bytes(array).document_id;
if Some(document_id) != last_document_id {
last_document_id = Some(document_id);
count += 1;
}
}
Ok(count)
}
}
pub struct DocumentFieldsIter<'a>(crate::CfIter<'a>);
impl Iterator for DocumentFieldsIter<'_> {
type Item = (SchemaAttr, Box<[u8]>);
fn next(&mut self) -> Option<Self::Item> {
match self.0.next() {
Some((key, value)) => {
let array = key.as_ref().try_into().unwrap();
let key = DocumentAttrKey::from_be_bytes(array);
Some((key.attribute, value))
},
None => None,
}
}
}
pub struct DocumentsKeysIter<'a>(crate::CfIter<'a>);
impl Iterator for DocumentsKeysIter<'_> {
type Item = DocumentAttrKey;
fn next(&mut self) -> Option<Self::Item> {
match self.0.next() {
Some((key, _)) => {
let array = key.as_ref().try_into().unwrap();
let key = DocumentAttrKey::from_be_bytes(array);
Some(key)
},
None => None,
}
}
}
pub struct DocumentsIdsIter<'a> {
inner: DocumentsKeysIter<'a>,
last: Option<DocumentId>,
}
impl Iterator for DocumentsIdsIter<'_> {
type Item = DocumentId;
fn next(&mut self) -> Option<Self::Item> {
for DocumentAttrKey { document_id, .. } in &mut self.inner {
if self.last != Some(document_id) {
self.last = Some(document_id);
return Some(document_id)
}
}
None
}
}
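A short usage sketch of the per-document field storage (ids, attributes and payloads are illustrative; documents is assumed to be a DocumentsIndex):

let id = DocumentId(7);
documents.set_document_field(id, SchemaAttr(0), b"Amsterdam".to_vec())?;
documents.set_document_field(id, SchemaAttr(1), b"745000".to_vec())?;
// fields of one document are contiguous thanks to the 10 byte key prefix
let attrs: Vec<_> = documents.document_fields(id)?.map(|(attr, _)| attr).collect();
assert_eq!(attrs, vec![SchemaAttr(0), SchemaAttr(1)]);
assert_eq!(documents.len()?, 1); // one distinct document id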


@ -0,0 +1,101 @@
use std::sync::Arc;
use std::convert::TryInto;
use meilidb_schema::Schema;
use crate::ranked_map::RankedMap;
use crate::database::Error;
const SCHEMA_KEY: &str = "schema";
const WORDS_KEY: &str = "words";
const SYNONYMS_KEY: &str = "synonyms";
const RANKED_MAP_KEY: &str = "ranked-map";
const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents";
#[derive(Clone)]
pub struct MainIndex(pub(crate) crate::CfTree);
impl MainIndex {
pub fn schema(&self) -> Result<Option<Schema>, Error> {
match self.0.get(SCHEMA_KEY)? {
Some(bytes) => {
let schema = bincode::deserialize_from(bytes.as_ref())?;
Ok(Some(schema))
},
None => Ok(None),
}
}
pub fn set_schema(&self, schema: &Schema) -> Result<(), Error> {
let bytes = bincode::serialize(schema)?;
self.0.insert(SCHEMA_KEY, bytes)?;
Ok(())
}
pub fn words_set(&self) -> Result<Option<fst::Set>, Error> {
match self.0.get(WORDS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let value = Arc::from(bytes.as_ref());
let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?;
Ok(Some(fst::Set::from(fst)))
},
None => Ok(None),
}
}
pub fn set_words_set(&self, value: &fst::Set) -> Result<(), Error> {
self.0.insert(WORDS_KEY, value.as_fst().as_bytes()).map(drop).map_err(Into::into)
}
pub fn synonyms_set(&self) -> Result<Option<fst::Set>, Error> {
match self.0.get(SYNONYMS_KEY)? {
Some(bytes) => {
let len = bytes.len();
let value = Arc::from(bytes.as_ref());
let fst = fst::raw::Fst::from_shared_bytes(value, 0, len)?;
Ok(Some(fst::Set::from(fst)))
},
None => Ok(None),
}
}
pub fn set_synonyms_set(&self, value: &fst::Set) -> Result<(), Error> {
self.0.insert(SYNONYMS_KEY, value.as_fst().as_bytes()).map(drop).map_err(Into::into)
}
pub fn ranked_map(&self) -> Result<Option<RankedMap>, Error> {
match self.0.get(RANKED_MAP_KEY)? {
Some(bytes) => {
let ranked_map = RankedMap::read_from_bin(bytes.as_ref())?;
Ok(Some(ranked_map))
},
None => Ok(None),
}
}
pub fn set_ranked_map(&self, value: &RankedMap) -> Result<(), Error> {
let mut bytes = Vec::new();
value.write_to_bin(&mut bytes)?;
self.0.insert(RANKED_MAP_KEY, bytes)?;
Ok(())
}
pub fn number_of_documents(&self) -> Result<u64, Error> {
match self.0.get(NUMBER_OF_DOCUMENTS_KEY)? {
Some(bytes) => {
let array = (*bytes).try_into().unwrap();
Ok(u64::from_be_bytes(array))
},
None => Ok(0),
}
}
pub fn set_number_of_documents<F>(&self, f: F) -> Result<u64, Error>
where F: FnOnce(u64) -> u64,
{
let new = self.number_of_documents().map(f)?;
self.0.insert(NUMBER_OF_DOCUMENTS_KEY, new.to_be_bytes())?;
Ok(new)
}
}
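The document counter is updated through a closure so callers can apply a delta to the stored value; a small sketch (main is assumed to be the MainIndex of an index):

let after_addition = main.set_number_of_documents(|old| old + 3)?;
assert_eq!(after_addition, main.number_of_documents()?);
let after_deletion = main.set_number_of_documents(|old| old - 1)?;
assert_eq!(after_deletion, after_addition - 1);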


@ -0,0 +1,507 @@
use std::collections::{HashMap, HashSet, BTreeMap};
use std::convert::TryInto;
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use std::thread;
use std::time::{Duration, Instant};
use arc_swap::{ArcSwap, ArcSwapOption, Guard};
use crossbeam_channel::Receiver;
use meilidb_core::criterion::Criteria;
use meilidb_core::{DocIndex, Store, DocumentId, QueryBuilder};
use meilidb_schema::Schema;
use sdset::SetBuf;
use serde::{de, Serialize, Deserialize};
use crate::CfTree;
use crate::ranked_map::RankedMap;
use crate::serde::{Deserializer, DeserializerError};
pub use self::custom_settings_index::{CustomSettingsIndex, RankingOrdering, StopWords, RankingOrder, DistinctField, RankingRules};
pub use self::common_index::CommonIndex;
pub use self::documents_index::DocumentsIdsIter;
use self::docs_words_index::DocsWordsIndex;
use self::documents_index::DocumentsIndex;
use self::main_index::MainIndex;
use self::synonyms_index::SynonymsIndex;
use self::words_index::WordsIndex;
use crate::RocksDbResult;
use crate::database::{
Error,
DocumentsAddition, DocumentsDeletion,
SynonymsAddition, SynonymsDeletion,
apply_documents_addition, apply_documents_deletion,
apply_synonyms_addition, apply_synonyms_deletion,
};
mod common_index;
mod custom_settings_index;
mod docs_words_index;
mod documents_index;
mod main_index;
mod synonyms_index;
mod words_index;
#[derive(Serialize, Deserialize)]
enum Update {
DocumentsAddition(Vec<rmpv::Value>),
DocumentsDeletion(Vec<DocumentId>),
SynonymsAddition(BTreeMap<String, Vec<String>>),
SynonymsDeletion(BTreeMap<String, Option<Vec<String>>>),
}
#[derive(Clone, Serialize, Deserialize)]
pub enum UpdateType {
DocumentsAddition { number: usize },
DocumentsDeletion { number: usize },
SynonymsAddition { number: usize },
SynonymsDeletion { number: usize },
}
#[derive(Clone, Serialize, Deserialize)]
pub struct DetailedDuration {
pub main: Duration,
}
#[derive(Clone, Serialize, Deserialize)]
pub struct UpdateResult {
pub update_id: u64,
pub update_type: UpdateType,
pub result: Result<(), String>,
pub detailed_duration: DetailedDuration,
}
#[derive(Clone, Serialize, Deserialize)]
pub enum UpdateStatus {
Enqueued,
Processed(UpdateResult),
Unknown,
}
fn spawn_update_system(index: Index, subscription: Receiver<()>) -> thread::JoinHandle<()> {
thread::spawn(move || {
let mut subscription = subscription.into_iter();
loop {
while let Some((key, _)) = index.updates_index.iter().unwrap().next() {
let update_id = key.as_ref().try_into().map(u64::from_be_bytes).unwrap();
let updates = &index.updates_index;
let results = &index.updates_results_index;
let update = updates.get(&key).unwrap().unwrap();
let (update_type, result, duration) = match rmp_serde::from_read_ref(&update).unwrap() {
Update::DocumentsAddition(documents) => {
let update_type = UpdateType::DocumentsAddition { number: documents.len() };
let ranked_map = index.cache.load().ranked_map.clone();
let start = Instant::now();
let result = apply_documents_addition(&index, ranked_map, documents);
(update_type, result, start.elapsed())
},
Update::DocumentsDeletion(documents) => {
let update_type = UpdateType::DocumentsDeletion { number: documents.len() };
let ranked_map = index.cache.load().ranked_map.clone();
let start = Instant::now();
let result = apply_documents_deletion(&index, ranked_map, documents);
(update_type, result, start.elapsed())
},
Update::SynonymsAddition(synonyms) => {
let update_type = UpdateType::SynonymsAddition { number: synonyms.len() };
let start = Instant::now();
let result = apply_synonyms_addition(&index, synonyms);
(update_type, result, start.elapsed())
},
Update::SynonymsDeletion(synonyms) => {
let update_type = UpdateType::SynonymsDeletion { number: synonyms.len() };
let start = Instant::now();
let result = apply_synonyms_deletion(&index, synonyms);
(update_type, result, start.elapsed())
},
};
let detailed_duration = DetailedDuration { main: duration };
let status = UpdateResult {
update_id,
update_type,
result: result.map_err(|e| e.to_string()),
detailed_duration,
};
if let Some(callback) = &*index.update_callback.load() {
(callback)(status.clone());
}
let value = bincode::serialize(&status).unwrap();
results.insert(&key, value).unwrap();
updates.remove(&key).unwrap();
}
// this subscription is just used to block
// the loop until a new update is inserted
subscription.next();
}
})
}
fn last_update_id(
update_index: &crate::CfTree,
update_results_index: &crate::CfTree,
) -> RocksDbResult<u64>
{
let uikey = match update_index.last_key()? {
Some(key) => Some(key.as_ref().try_into().map(u64::from_be_bytes).unwrap()),
None => None,
};
let urikey = match update_results_index.last_key()? {
Some(key) => Some(key.as_ref().try_into().map(u64::from_be_bytes).unwrap()),
None => None,
};
Ok(uikey.max(urikey).unwrap_or(0))
}
#[derive(Clone)]
pub struct IndexStats {
pub number_of_words: usize,
pub number_of_documents: u64,
pub number_attrs_in_ranked_map: usize,
pub documents_fields_repartition: HashMap<String, u64>,
}
#[derive(Clone)]
pub struct Index {
pub(crate) cache: Arc<ArcSwap<Cache>>,
// TODO this will be a snapshot in the future
main_index: MainIndex,
synonyms_index: SynonymsIndex,
words_index: WordsIndex,
docs_words_index: DocsWordsIndex,
documents_index: DocumentsIndex,
custom_settings_index: CustomSettingsIndex,
// used by the update system
updates_id: Arc<AtomicU64>,
updates_index: crate::CfTree,
updates_results_index: crate::CfTree,
update_callback: Arc<ArcSwapOption<Box<dyn Fn(UpdateResult) + Send + Sync + 'static>>>,
}
pub(crate) struct Cache {
pub words: Arc<fst::Set>,
pub synonyms: Arc<fst::Set>,
pub schema: Schema,
pub ranked_map: RankedMap,
pub number_of_documents: u64,
}
impl Index {
pub fn new(db: Arc<rocksdb::DB>, name: &str) -> Result<Index, Error> {
Index::new_raw(db, name, None)
}
pub fn with_schema(db: Arc<rocksdb::DB>, name: &str, schema: Schema) -> Result<Index, Error> {
Index::new_raw(db, name, Some(schema))
}
fn new_raw(db: Arc<rocksdb::DB>, name: &str, schema: Option<Schema>) -> Result<Index, Error> {
let main_index = CfTree::create(db.clone(), name.to_string()).map(MainIndex)?;
let synonyms_index = CfTree::create(db.clone(), format!("{}-synonyms", name)).map(SynonymsIndex)?;
let words_index = CfTree::create(db.clone(), format!("{}-words", name)).map(WordsIndex)?;
let docs_words_index = CfTree::create(db.clone(), format!("{}-docs-words", name)).map(DocsWordsIndex)?;
let documents_index = CfTree::create(db.clone(), format!("{}-documents", name)).map(DocumentsIndex)?;
let custom_settings_index = CfTree::create(db.clone(), format!("{}-custom", name)).map(CustomSettingsIndex)?;
let (updates_index, subscription) = CfTree::create_with_subcription(db.clone(), format!("{}-updates", name))?;
let updates_results_index = CfTree::create(db.clone(), format!("{}-updates-results", name))?;
let words = match main_index.words_set()? {
Some(words) => Arc::new(words),
None => Arc::new(fst::Set::default()),
};
let synonyms = match main_index.synonyms_set()? {
Some(synonyms) => Arc::new(synonyms),
None => Arc::new(fst::Set::default()),
};
let schema = match (schema, main_index.schema()?) {
(Some(ref expected), Some(ref current)) if current != expected => {
return Err(Error::SchemaDiffer)
},
(Some(expected), Some(_)) => expected,
(Some(expected), None) => {
main_index.set_schema(&expected)?;
expected
},
(None, Some(current)) => current,
(None, None) => return Err(Error::SchemaMissing),
};
let ranked_map = match main_index.ranked_map()? {
Some(map) => map,
None => RankedMap::default(),
};
let number_of_documents = documents_index.len()?;
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
let cache = Arc::new(ArcSwap::from_pointee(cache));
let last_update_id = last_update_id(&updates_index, &updates_results_index)?;
let updates_id = Arc::new(AtomicU64::new(last_update_id + 1));
let index = Index {
cache,
main_index,
synonyms_index,
words_index,
docs_words_index,
documents_index,
custom_settings_index,
updates_id,
updates_index,
updates_results_index,
update_callback: Arc::new(ArcSwapOption::empty()),
};
let _handle = spawn_update_system(index.clone(), subscription);
Ok(index)
}
pub fn set_update_callback<F>(&self, callback: F)
where F: Fn(UpdateResult) + Send + Sync + 'static
{
self.update_callback.store(Some(Arc::new(Box::new(callback))));
}
pub fn unset_update_callback(&self) {
self.update_callback.store(None);
}
pub fn stats(&self) -> RocksDbResult<IndexStats> {
let cache = self.cache.load();
let documents_fields_repartition = self.documents_index.documents_fields_repartition(cache.schema.clone())?;
Ok(IndexStats {
number_of_words: cache.words.len(),
number_of_documents: cache.number_of_documents,
number_attrs_in_ranked_map: cache.ranked_map.len(),
documents_fields_repartition,
})
}
pub fn query_builder(&self) -> QueryBuilder<RefIndex> {
let ref_index = self.as_ref();
QueryBuilder::new(ref_index)
}
pub fn query_builder_with_criteria<'c>(
&self,
criteria: Criteria<'c>,
) -> QueryBuilder<'c, RefIndex>
{
let ref_index = self.as_ref();
QueryBuilder::with_criteria(ref_index, criteria)
}
pub fn as_ref(&self) -> RefIndex {
RefIndex {
cache: self.cache.load(),
main_index: &self.main_index,
synonyms_index: &self.synonyms_index,
words_index: &self.words_index,
docs_words_index: &self.docs_words_index,
documents_index: &self.documents_index,
custom_settings_index: &self.custom_settings_index,
}
}
pub fn schema(&self) -> Schema {
self.cache.load().schema.clone()
}
pub fn ranked_map(&self) -> RankedMap {
self.cache.load().ranked_map.clone()
}
pub fn synonyms_index(&self) -> SynonymsIndex {
self.synonyms_index.clone()
}
pub fn synonyms_set(&self) -> Arc<fst::Set> {
self.cache.load().synonyms.clone()
}
pub fn custom_settings(&self) -> CustomSettingsIndex {
self.custom_settings_index.clone()
}
pub fn number_of_documents(&self) -> u64 {
self.cache.load().number_of_documents
}
pub fn documents_addition<D>(&self) -> DocumentsAddition<D> {
DocumentsAddition::new(self)
}
pub fn documents_deletion(&self) -> DocumentsDeletion {
DocumentsDeletion::new(self)
}
pub fn synonyms_addition(&self) -> SynonymsAddition {
SynonymsAddition::new(self)
}
pub fn synonyms_deletion(&self) -> SynonymsDeletion {
SynonymsDeletion::new(self)
}
pub fn update_status(
&self,
update_id: u64,
) -> Result<UpdateStatus, Error>
{
let update_id = update_id.to_be_bytes();
match self.updates_results_index.get(update_id)? {
Some(value) => {
let value = bincode::deserialize(&value)?;
Ok(UpdateStatus::Processed(value))
},
None => {
match self.updates_index.get(update_id)? {
Some(_) => Ok(UpdateStatus::Enqueued),
None => Ok(UpdateStatus::Unknown),
}
}
}
}
pub fn update_status_blocking(
&self,
update_id: u64,
) -> Result<UpdateResult, Error>
{
loop {
if let Some(value) = self.updates_results_index.get(&update_id.to_be_bytes())? {
let value = bincode::deserialize(&value)?;
return Ok(value)
}
std::thread::sleep(Duration::from_millis(300));
}
}
pub fn documents_ids(&self) -> Result<DocumentsIdsIter, Error> {
Ok(self.documents_index.documents_ids()?)
}
pub fn document<T>(
&self,
fields: Option<&HashSet<&str>>,
id: DocumentId,
) -> Result<Option<T>, DeserializerError>
where T: de::DeserializeOwned,
{
let schema = self.schema();
let fields = match fields {
Some(fields) => fields.into_iter().map(|name| schema.attribute(name)).collect(),
None => None,
};
let mut deserializer = Deserializer {
document_id: id,
index: &self,
fields: fields.as_ref(),
};
// TODO: currently we return an error if all document fields are missing,
// returning None would have been better
T::deserialize(&mut deserializer).map(Some)
}
}
impl Index {
pub(crate) fn push_documents_addition<D>(&self, addition: Vec<D>) -> Result<u64, Error>
where D: serde::Serialize
{
let mut values = Vec::with_capacity(addition.len());
for add in addition {
let vec = rmp_serde::to_vec_named(&add)?;
let add = rmp_serde::from_read(&vec[..])?;
values.push(add);
}
let addition = Update::DocumentsAddition(values);
let update = rmp_serde::to_vec_named(&addition)?;
self.raw_push_update(update)
}
pub(crate) fn push_documents_deletion(
&self,
deletion: Vec<DocumentId>,
) -> Result<u64, Error>
{
let deletion = Update::DocumentsDeletion(deletion);
let update = rmp_serde::to_vec_named(&deletion)?;
self.raw_push_update(update)
}
pub(crate) fn push_synonyms_addition(
&self,
addition: BTreeMap<String, Vec<String>>,
) -> Result<u64, Error>
{
let addition = Update::SynonymsAddition(addition);
let update = rmp_serde::to_vec_named(&addition)?;
self.raw_push_update(update)
}
pub(crate) fn push_synonyms_deletion(
&self,
deletion: BTreeMap<String, Option<Vec<String>>>,
) -> Result<u64, Error>
{
let deletion = Update::SynonymsDeletion(deletion);
let update = rmp_serde::to_vec_named(&deletion)?;
self.raw_push_update(update)
}
fn raw_push_update(&self, raw_update: Vec<u8>) -> Result<u64, Error> {
let update_id = self.updates_id.fetch_add(1, Ordering::SeqCst);
let update_id_array = update_id.to_be_bytes();
self.updates_index.insert(update_id_array, raw_update)?;
Ok(update_id)
}
}
pub struct RefIndex<'a> {
pub(crate) cache: Guard<'static, Arc<Cache>>,
pub main_index: &'a MainIndex,
pub synonyms_index: &'a SynonymsIndex,
pub words_index: &'a WordsIndex,
pub docs_words_index: &'a DocsWordsIndex,
pub documents_index: &'a DocumentsIndex,
pub custom_settings_index: &'a CustomSettingsIndex,
}
impl Store for RefIndex<'_> {
type Error = Error;
fn words(&self) -> Result<&fst::Set, Self::Error> {
Ok(&self.cache.words)
}
fn word_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
Ok(self.words_index.doc_indexes(word)?)
}
fn synonyms(&self) -> Result<&fst::Set, Self::Error> {
Ok(&self.cache.synonyms)
}
fn alternatives_to(&self, word: &[u8]) -> Result<Option<fst::Set>, Self::Error> {
Ok(self.synonyms_index.alternatives_to(word)?)
}
}
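Putting the pieces together, a hedged end-to-end sketch that pushes an update and waits for the background thread to process it (the Movie type and its fields are illustrative only):

#[derive(serde::Serialize)]
struct Movie {
    id: u64,
    title: String,
}

fn add_one_movie(index: &Index) -> Result<(), Error> {
    let mut addition = index.documents_addition();
    addition.update_document(Movie { id: 1, title: "Carol".to_string() });
    let update_id = addition.finalize()?;
    // the update system runs in a dedicated thread, block until it has run
    let result = index.update_status_blocking(update_id)?;
    assert!(result.result.is_ok());
    Ok(())
}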


@ -0,0 +1,21 @@
use crate::RocksDbResult;
#[derive(Clone)]
pub struct SynonymsIndex(pub(crate) crate::CfTree);
impl SynonymsIndex {
pub fn alternatives_to(&self, word: &[u8]) -> RocksDbResult<Option<fst::Set>> {
match self.0.get(word)? {
Some(vector) => Ok(Some(fst::Set::from_bytes(vector.to_vec()).unwrap())),
None => Ok(None),
}
}
pub fn set_alternatives_to(&self, word: &[u8], value: Vec<u8>) -> RocksDbResult<()> {
self.0.insert(word, value).map(drop)
}
pub fn del_alternatives_of(&self, word: &[u8]) -> RocksDbResult<()> {
self.0.remove(word).map(drop)
}
}


@ -0,0 +1,45 @@
use meilidb_core::DocIndex;
use sdset::{Set, SetBuf};
use zerocopy::{LayoutVerified, AsBytes};
use crate::RocksDbResult;
#[derive(Clone)]
pub struct WordsIndex(pub(crate) crate::CfTree);
impl WordsIndex {
pub fn doc_indexes(&self, word: &[u8]) -> RocksDbResult<Option<SetBuf<DocIndex>>> {
// we must force an allocation to make the memory aligned
match self.0.get(word)? {
Some(bytes) => {
let vec = match LayoutVerified::new_slice(bytes.as_ref()) {
Some(layout) => layout.into_slice().to_vec(),
None => {
let len = bytes.as_ref().len();
let count = len / std::mem::size_of::<DocIndex>();
let mut buf: Vec<DocIndex> = Vec::with_capacity(count);
unsafe {
let src = bytes.as_ref().as_ptr();
let dst = buf.as_mut_ptr() as *mut u8;
std::ptr::copy_nonoverlapping(src, dst, len);
buf.set_len(count);
}
buf
}
};
let setbuf = SetBuf::new_unchecked(vec);
Ok(Some(setbuf))
},
None => Ok(None),
}
}
pub fn set_doc_indexes(&self, word: &[u8], set: &Set<DocIndex>) -> RocksDbResult<()> {
self.0.insert(word, set.as_bytes()).map(drop)
}
pub fn del_doc_indexes(&self, word: &[u8]) -> RocksDbResult<()> {
self.0.remove(word).map(drop)
}
}
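A sketch of a postings round trip (words is assumed to be a WordsIndex and doc_indexes a sdset::Set<DocIndex> produced by the indexer):

words.set_doc_indexes(b"subway", &doc_indexes)?;
assert!(words.doc_indexes(b"subway")?.is_some());
words.del_doc_indexes(b"subway")?;
assert!(words.doc_indexes(b"subway")?.is_none());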


@ -0,0 +1,155 @@
use std::collections::hash_map::Entry;
use std::collections::{HashSet, HashMap};
use std::path::Path;
use std::sync::Arc;
use std::sync::RwLock;
use meilidb_schema::Schema;
mod error;
mod index;
mod update;
use crate::CfTree;
pub use self::error::Error;
pub use self::index::{
Index, CustomSettingsIndex, CommonIndex, RankingOrdering,
StopWords, RankingOrder, DistinctField, RankingRules,
UpdateType, DetailedDuration, UpdateResult, UpdateStatus
};
pub use self::update::DocumentsAddition;
pub use self::update::DocumentsDeletion;
pub use self::update::SynonymsAddition;
pub use self::update::SynonymsDeletion;
use self::update::apply_documents_addition;
use self::update::apply_documents_deletion;
use self::update::apply_synonyms_addition;
use self::update::apply_synonyms_deletion;
const INDEXES_KEY: &str = "indexes";
const COMMON_KEY: &str = "common-index";
fn load_indexes(tree: &rocksdb::DB) -> Result<HashSet<String>, Error> {
match tree.get(INDEXES_KEY)? {
Some(bytes) => Ok(bincode::deserialize(&bytes)?),
None => Ok(HashSet::new())
}
}
pub struct Database {
cache: RwLock<HashMap<String, Index>>,
inner: Arc<rocksdb::DB>,
common: Arc<CommonIndex>,
}
impl Database {
pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Error> {
let cache = RwLock::new(HashMap::new());
let mut options = rocksdb::Options::default();
options.create_if_missing(true);
let cfs = rocksdb::DB::list_cf(&options, &path).unwrap_or_default();
let inner = Arc::new(rocksdb::DB::open_cf(&options, path, cfs)?);
let common_tree = CfTree::create(inner.clone(), COMMON_KEY.to_owned())?;
let common = Arc::new(CommonIndex(common_tree));
let indexes = load_indexes(&inner)?;
let database = Database { cache, inner, common };
for index in indexes {
database.open_index(&index)?;
}
Ok(database)
}
pub fn indexes(&self) -> Result<HashSet<String>, Error> {
load_indexes(&self.inner)
}
fn set_indexes(&self, value: &HashSet<String>) -> Result<(), Error> {
let bytes = bincode::serialize(value)?;
self.inner.put(INDEXES_KEY, bytes)?;
Ok(())
}
pub fn open_index(&self, name: &str) -> Result<Option<Index>, Error> {
{
let cache = self.cache.read().unwrap();
if let Some(index) = cache.get(name).cloned() {
return Ok(Some(index))
}
}
let mut cache = self.cache.write().unwrap();
let index = match cache.entry(name.to_string()) {
Entry::Occupied(occupied) => {
occupied.get().clone()
},
Entry::Vacant(vacant) => {
if !self.indexes()?.contains(name) {
return Ok(None)
}
let index = Index::new(self.inner.clone(), name)?;
vacant.insert(index).clone()
},
};
Ok(Some(index))
}
pub fn create_index(&self, name: &str, schema: Schema) -> Result<Index, Error> {
let mut cache = self.cache.write().unwrap();
let index = match cache.entry(name.to_string()) {
Entry::Occupied(occupied) => {
occupied.get().clone()
},
Entry::Vacant(vacant) => {
let index = Index::with_schema(self.inner.clone(), name, schema)?;
let mut indexes = self.indexes()?;
indexes.insert(name.to_string());
self.set_indexes(&indexes)?;
vacant.insert(index).clone()
},
};
Ok(index)
}
pub fn delete_index(&self, name: &str) -> Result<(), Error> {
let mut cache = self.cache.write().unwrap();
self.inner.drop_cf(name)?;
let _ = self.inner.drop_cf(&format!("{}-synonyms", name));
let _ = self.inner.drop_cf(&format!("{}-words", name));
let _ = self.inner.drop_cf(&format!("{}-docs-words", name));
let _ = self.inner.drop_cf(&format!("{}-documents", name));
let _ = self.inner.drop_cf(&format!("{}-custom", name));
let _ = self.inner.drop_cf(&format!("{}-updates", name));
let _ = self.inner.drop_cf(&format!("{}-updates-results", name));
cache.remove(name);
if let Ok(mut index_list) = self.indexes() {
index_list.remove(name);
let _ = self.set_indexes(&index_list);
}
Ok(())
}
pub fn common_index(&self) -> Arc<CommonIndex> {
self.common.clone()
}
pub fn checkpoint_to<P>(&self, path: P) -> Result<(), Error>
where P: AsRef<Path>,
{
let checkpoint = rocksdb::checkpoint::Checkpoint::new(&self.inner)?;
Ok(checkpoint.create_checkpoint(path)?)
}
}
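A hedged sketch of opening a database and getting an index by name (the path and index name are illustrative; schema is assumed to be a meilidb_schema::Schema built elsewhere):

fn open_or_create(schema: Schema) -> Result<Index, Error> {
    let database = Database::open("/tmp/example.mdb")?;
    let index = match database.open_index("movies")? {
        Some(index) => index,
        None => database.create_index("movies", schema)?,
    };
    Ok(index)
}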


@ -0,0 +1,139 @@
use std::collections::HashSet;
use std::sync::Arc;
use fst::{SetBuilder, set::OpBuilder};
use sdset::{SetOperation, duo::Union};
use serde::Serialize;
use crate::RankedMap;
use crate::database::{Error, Index, index::Cache, apply_documents_deletion};
use crate::indexer::Indexer;
use crate::serde::{extract_document_id, Serializer, RamDocumentStore};
pub struct DocumentsAddition<'a, D> {
index: &'a Index,
documents: Vec<D>,
}
impl<'a, D> DocumentsAddition<'a, D> {
pub fn new(index: &'a Index) -> DocumentsAddition<'a, D> {
DocumentsAddition { index, documents: Vec::new() }
}
pub fn update_document(&mut self, document: D) {
self.documents.push(document);
}
pub fn finalize(self) -> Result<u64, Error>
where D: serde::Serialize
{
self.index.push_documents_addition(self.documents)
}
}
pub fn apply_documents_addition(
index: &Index,
mut ranked_map: RankedMap,
addition: Vec<rmpv::Value>,
) -> Result<(), Error>
{
let mut document_ids = HashSet::new();
let mut document_store = RamDocumentStore::new();
let mut indexer = Indexer::new();
let schema = &index.schema();
let identifier = schema.identifier_name();
for document in addition {
let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
// 1. store the document id for future deletion
document_ids.insert(document_id);
// 2. index the document fields in ram stores
let serializer = Serializer {
schema,
document_store: &mut document_store,
indexer: &mut indexer,
ranked_map: &mut ranked_map,
document_id,
};
document.serialize(serializer)?;
}
let ref_index = index.as_ref();
let docs_words = ref_index.docs_words_index;
let documents = ref_index.documents_index;
let main = ref_index.main_index;
let words = ref_index.words_index;
// 1. remove the previous documents match indexes
let documents_to_insert = document_ids.iter().cloned().collect();
apply_documents_deletion(index, ranked_map.clone(), documents_to_insert)?;
// 2. insert new document attributes in the database
for ((id, attr), value) in document_store.into_inner() {
documents.set_document_field(id, attr, value)?;
}
let indexed = indexer.build();
let mut delta_words_builder = SetBuilder::memory();
for (word, delta_set) in indexed.words_doc_indexes {
delta_words_builder.insert(&word).unwrap();
let set = match words.doc_indexes(&word)? {
Some(set) => Union::new(&set, &delta_set).into_set_buf(),
None => delta_set,
};
words.set_doc_indexes(&word, &set)?;
}
for (id, words) in indexed.docs_words {
docs_words.set_doc_words(id, &words)?;
}
let delta_words = delta_words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
let words = match main.words_set()? {
Some(words) => {
let op = OpBuilder::new()
.add(words.stream())
.add(delta_words.stream())
.r#union();
let mut words_builder = SetBuilder::memory();
words_builder.extend_stream(op).unwrap();
words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
None => delta_words,
};
main.set_words_set(&words)?;
main.set_ranked_map(&ranked_map)?;
let inserted_documents_len = document_ids.len() as u64;
let number_of_documents = main.set_number_of_documents(|old| old + inserted_documents_len)?;
// update the "consistent" view of the Index
let cache = ref_index.cache;
let words = Arc::new(words);
let synonyms = cache.synonyms.clone();
let schema = cache.schema.clone();
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
index.cache.store(Arc::new(cache));
Ok(())
}


@ -0,0 +1,150 @@
use std::collections::{HashMap, HashSet, BTreeSet};
use std::sync::Arc;
use fst::{SetBuilder, Streamer};
use meilidb_core::DocumentId;
use sdset::{SetBuf, SetOperation, duo::DifferenceByKey};
use crate::RankedMap;
use crate::serde::extract_document_id;
use crate::database::{Index, Error, index::Cache};
pub struct DocumentsDeletion<'a> {
index: &'a Index,
documents: Vec<DocumentId>,
}
impl<'a> DocumentsDeletion<'a> {
pub fn new(index: &'a Index) -> DocumentsDeletion<'a> {
DocumentsDeletion { index, documents: Vec::new() }
}
pub fn delete_document_by_id(&mut self, document_id: DocumentId) {
self.documents.push(document_id);
}
pub fn delete_document<D>(&mut self, document: D) -> Result<(), Error>
where D: serde::Serialize,
{
let schema = self.index.schema();
let identifier = schema.identifier_name();
let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
self.delete_document_by_id(document_id);
Ok(())
}
pub fn finalize(self) -> Result<u64, Error> {
self.index.push_documents_deletion(self.documents)
}
}
impl Extend<DocumentId> for DocumentsDeletion<'_> {
fn extend<T: IntoIterator<Item=DocumentId>>(&mut self, iter: T) {
self.documents.extend(iter)
}
}
pub fn apply_documents_deletion(
index: &Index,
mut ranked_map: RankedMap,
deletion: Vec<DocumentId>,
) -> Result<(), Error>
{
let ref_index = index.as_ref();
let schema = index.schema();
let docs_words = ref_index.docs_words_index;
let documents = ref_index.documents_index;
let main = ref_index.main_index;
let words = ref_index.words_index;
let idset = SetBuf::from_dirty(deletion);
// collect the ranked attributes according to the schema
let ranked_attrs: Vec<_> = schema.iter()
.filter_map(|(_, attr, prop)| {
if prop.is_ranked() { Some(attr) } else { None }
})
.collect();
let mut words_document_ids = HashMap::new();
for id in idset {
// remove all the ranked attributes from the ranked_map
for ranked_attr in &ranked_attrs {
ranked_map.remove(id, *ranked_attr);
}
if let Some(words) = docs_words.doc_words(id)? {
let mut stream = words.stream();
while let Some(word) = stream.next() {
let word = word.to_vec();
words_document_ids.entry(word).or_insert_with(Vec::new).push(id);
}
}
}
let mut deleted_documents = HashSet::new();
let mut removed_words = BTreeSet::new();
for (word, document_ids) in words_document_ids {
let document_ids = SetBuf::from_dirty(document_ids);
if let Some(doc_indexes) = words.doc_indexes(&word)? {
let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id);
let doc_indexes = op.into_set_buf();
if !doc_indexes.is_empty() {
words.set_doc_indexes(&word, &doc_indexes)?;
} else {
words.del_doc_indexes(&word)?;
removed_words.insert(word);
}
}
for id in document_ids {
if documents.del_all_document_fields(id)? != 0 {
deleted_documents.insert(id);
}
docs_words.del_doc_words(id)?;
}
}
let removed_words = fst::Set::from_iter(removed_words).unwrap();
let words = match main.words_set()? {
Some(words_set) => {
let op = fst::set::OpBuilder::new()
.add(words_set.stream())
.add(removed_words.stream())
.difference();
let mut words_builder = SetBuilder::memory();
words_builder.extend_stream(op).unwrap();
words_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
None => fst::Set::default(),
};
main.set_words_set(&words)?;
main.set_ranked_map(&ranked_map)?;
let deleted_documents_len = deleted_documents.len() as u64;
let number_of_documents = main.set_number_of_documents(|old| old - deleted_documents_len)?;
// update the "consistent" view of the Index
let cache = ref_index.cache;
let words = Arc::new(words);
let synonyms = cache.synonyms.clone();
let schema = cache.schema.clone();
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
index.cache.store(Arc::new(cache));
Ok(())
}
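A small usage sketch of the deletion builder (ids are arbitrary; the deletion is only applied once the update thread picks it up):

let mut deletion = index.documents_deletion();
deletion.delete_document_by_id(DocumentId(1));
deletion.extend(vec![DocumentId(2), DocumentId(3)]);
let update_id = deletion.finalize()?;
let _status = index.update_status(update_id)?; // Enqueued until processed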


@ -0,0 +1,9 @@
mod documents_addition;
mod documents_deletion;
mod synonyms_addition;
mod synonyms_deletion;
pub use self::documents_addition::{DocumentsAddition, apply_documents_addition};
pub use self::documents_deletion::{DocumentsDeletion, apply_documents_deletion};
pub use self::synonyms_addition::{SynonymsAddition, apply_synonyms_addition};
pub use self::synonyms_deletion::{SynonymsDeletion, apply_synonyms_deletion};


@ -0,0 +1,94 @@
use std::collections::BTreeMap;
use std::sync::Arc;
use fst::{SetBuilder, set::OpBuilder};
use meilidb_core::normalize_str;
use sdset::SetBuf;
use crate::database::{Error, Index, index::Cache};
pub struct SynonymsAddition<'a> {
index: &'a Index,
synonyms: BTreeMap<String, Vec<String>>,
}
impl<'a> SynonymsAddition<'a> {
pub fn new(index: &'a Index) -> SynonymsAddition<'a> {
SynonymsAddition { index, synonyms: BTreeMap::new() }
}
pub fn add_synonym<S, T, I>(&mut self, synonym: S, alternatives: I)
where S: AsRef<str>,
T: AsRef<str>,
I: IntoIterator<Item=T>,
{
let synonym = normalize_str(synonym.as_ref());
let alternatives = alternatives.into_iter().map(|s| s.as_ref().to_lowercase());
self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
}
pub fn finalize(self) -> Result<u64, Error> {
self.index.push_synonyms_addition(self.synonyms)
}
}
pub fn apply_synonyms_addition(
index: &Index,
addition: BTreeMap<String, Vec<String>>,
) -> Result<(), Error>
{
let ref_index = index.as_ref();
let synonyms = ref_index.synonyms_index;
let main = ref_index.main_index;
let mut synonyms_builder = SetBuilder::memory();
for (synonym, alternatives) in addition {
synonyms_builder.insert(&synonym).unwrap();
let alternatives = {
let alternatives = SetBuf::from_dirty(alternatives);
let mut alternatives_builder = SetBuilder::memory();
alternatives_builder.extend_iter(alternatives).unwrap();
alternatives_builder.into_inner().unwrap()
};
synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?;
}
let delta_synonyms = synonyms_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
let synonyms = match main.synonyms_set()? {
Some(synonyms) => {
let op = OpBuilder::new()
.add(synonyms.stream())
.add(delta_synonyms.stream())
.r#union();
let mut synonyms_builder = SetBuilder::memory();
synonyms_builder.extend_stream(op).unwrap();
synonyms_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
None => delta_synonyms,
};
main.set_synonyms_set(&synonyms)?;
// update the "consistent" view of the Index
let cache = ref_index.cache;
let words = Arc::new(main.words_set()?.unwrap_or_default());
let ranked_map = cache.ranked_map.clone();
let synonyms = Arc::new(synonyms);
let schema = cache.schema.clone();
let number_of_documents = cache.number_of_documents;
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
index.cache.store(Arc::new(cache));
Ok(())
}
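A usage sketch of the synonyms addition builder (the strings are illustrative; the synonym is normalized and the alternatives are lowercased as shown above):

let mut addition = index.synonyms_addition();
addition.add_synonym("NYC", &["new york", "new york city"]);
addition.add_synonym("subway", &["underground", "metro"]);
let update_id = addition.finalize()?;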


@ -0,0 +1,137 @@
use std::collections::BTreeMap;
use std::iter::FromIterator;
use std::sync::Arc;
use fst::{SetBuilder, set::OpBuilder};
use meilidb_core::normalize_str;
use sdset::SetBuf;
use crate::database::{Error, Index, index::Cache};
pub struct SynonymsDeletion<'a> {
index: &'a Index,
synonyms: BTreeMap<String, Option<Vec<String>>>,
}
impl<'a> SynonymsDeletion<'a> {
pub fn new(index: &'a Index) -> SynonymsDeletion<'a> {
SynonymsDeletion { index, synonyms: BTreeMap::new() }
}
pub fn delete_all_alternatives_of<S: AsRef<str>>(&mut self, synonym: S) {
let synonym = normalize_str(synonym.as_ref());
self.synonyms.insert(synonym, None);
}
pub fn delete_specific_alternatives_of<S, T, I>(&mut self, synonym: S, alternatives: I)
where S: AsRef<str>,
T: AsRef<str>,
I: Iterator<Item=T>,
{
let synonym = normalize_str(synonym.as_ref());
let value = self.synonyms.entry(synonym).or_insert(None);
let alternatives = alternatives.map(|s| s.as_ref().to_lowercase());
match value {
Some(v) => v.extend(alternatives),
None => *value = Some(Vec::from_iter(alternatives)),
}
}
pub fn finalize(self) -> Result<u64, Error> {
self.index.push_synonyms_deletion(self.synonyms)
}
}
pub fn apply_synonyms_deletion(
index: &Index,
deletion: BTreeMap<String, Option<Vec<String>>>,
) -> Result<(), Error>
{
let ref_index = index.as_ref();
let synonyms = ref_index.synonyms_index;
let main = ref_index.main_index;
let mut delete_whole_synonym_builder = SetBuilder::memory();
for (synonym, alternatives) in deletion {
match alternatives {
Some(alternatives) => {
let prev_alternatives = synonyms.alternatives_to(synonym.as_bytes())?;
let prev_alternatives = match prev_alternatives {
Some(alternatives) => alternatives,
None => continue,
};
let delta_alternatives = {
let alternatives = SetBuf::from_dirty(alternatives);
let mut builder = SetBuilder::memory();
builder.extend_iter(alternatives).unwrap();
builder.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
};
let op = OpBuilder::new()
.add(prev_alternatives.stream())
.add(delta_alternatives.stream())
.difference();
let (alternatives, empty_alternatives) = {
let mut builder = SetBuilder::memory();
let len = builder.get_ref().len();
builder.extend_stream(op).unwrap();
let is_empty = len == builder.get_ref().len();
let alternatives = builder.into_inner().unwrap();
(alternatives, is_empty)
};
if empty_alternatives {
delete_whole_synonym_builder.insert(synonym.as_bytes())?;
} else {
synonyms.set_alternatives_to(synonym.as_bytes(), alternatives)?;
}
},
None => {
delete_whole_synonym_builder.insert(&synonym).unwrap();
synonyms.del_alternatives_of(synonym.as_bytes())?;
}
}
}
let delta_synonyms = delete_whole_synonym_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap();
let synonyms = match main.synonyms_set()? {
Some(synonyms) => {
let op = OpBuilder::new()
.add(synonyms.stream())
.add(delta_synonyms.stream())
.difference();
let mut synonyms_builder = SetBuilder::memory();
synonyms_builder.extend_stream(op).unwrap();
synonyms_builder
.into_inner()
.and_then(fst::Set::from_bytes)
.unwrap()
},
None => fst::Set::default(),
};
main.set_synonyms_set(&synonyms)?;
// update the "consistent" view of the Index
let cache = ref_index.cache;
let words = Arc::new(main.words_set()?.unwrap_or_default());
let ranked_map = cache.ranked_map.clone();
let synonyms = Arc::new(synonyms);
let schema = cache.schema.clone();
let number_of_documents = cache.number_of_documents;
let cache = Cache { words, synonyms, schema, ranked_map, number_of_documents };
index.cache.store(Arc::new(cache));
Ok(())
}
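A usage sketch of the deletion builder (strings are illustrative): one call removes specific alternatives, the other wipes a synonym entry entirely.

let mut deletion = index.synonyms_deletion();
deletion.delete_specific_alternatives_of("nyc", ["new york"].iter());
deletion.delete_all_alternatives_of("subway");
let update_id = deletion.finalize()?;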


@ -0,0 +1,69 @@
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct DocumentAttrKey {
pub document_id: DocumentId,
pub attribute: SchemaAttr,
}
impl DocumentAttrKey {
pub fn new(document_id: DocumentId, attribute: SchemaAttr) -> DocumentAttrKey {
DocumentAttrKey { document_id, attribute }
}
pub fn to_be_bytes(self) -> [u8; 10] {
let mut output = [0u8; 10];
let document_id = self.document_id.0.to_be_bytes();
let attribute = self.attribute.0.to_be_bytes();
unsafe {
use std::{mem::size_of, ptr::copy_nonoverlapping};
let output = output.as_mut_ptr();
copy_nonoverlapping(document_id.as_ptr(), output, size_of::<u64>());
let output = output.add(size_of::<u64>());
copy_nonoverlapping(attribute.as_ptr(), output, size_of::<u16>());
}
output
}
pub fn from_be_bytes(bytes: [u8; 10]) -> DocumentAttrKey {
let document_id;
let attribute;
unsafe {
use std::ptr::read_unaligned;
let pointer = bytes.as_ptr() as *const _;
let document_id_bytes = read_unaligned(pointer);
document_id = u64::from_be_bytes(document_id_bytes);
let pointer = pointer.add(1) as *const _;
let attribute_bytes = read_unaligned(pointer);
attribute = u16::from_be_bytes(attribute_bytes);
}
DocumentAttrKey {
document_id: DocumentId(document_id),
attribute: SchemaAttr(attribute),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn to_from_be_bytes() {
let document_id = DocumentId(67578308);
let schema_attr = SchemaAttr(3456);
let x = DocumentAttrKey::new(document_id, schema_attr);
assert_eq!(x, DocumentAttrKey::from_be_bytes(x.to_be_bytes()));
}
}
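A worked example of the 10 byte key layout (8 big-endian document id bytes followed by 2 attribute bytes), which is what keeps all attributes of one document contiguous and ordered:

let key = DocumentAttrKey::new(DocumentId(1), SchemaAttr(2));
assert_eq!(key.to_be_bytes(), [0, 0, 0, 0, 0, 0, 0, 1, 0, 2]);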

208
meilidb-data/src/indexer.rs Normal file

@ -0,0 +1,208 @@
use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;
use deunicode::deunicode_with_tofu;
use meilidb_core::{DocumentId, DocIndex};
use meilidb_schema::SchemaAttr;
use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
use sdset::SetBuf;
type Word = Vec<u8>; // TODO make it be a SmallVec
pub struct Indexer {
word_limit: usize, // the maximum number of indexed words
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
docs_words: HashMap<DocumentId, Vec<Word>>,
}
pub struct Indexed {
pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
pub docs_words: HashMap<DocumentId, fst::Set>,
}
impl Indexer {
pub fn new() -> Indexer {
Indexer::with_word_limit(1000)
}
pub fn with_word_limit(limit: usize) -> Indexer {
Indexer {
word_limit: limit,
words_doc_indexes: BTreeMap::new(),
docs_words: HashMap::new(),
}
}
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
let lowercase_text = text.to_lowercase();
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
// TODO compute the deunicoded version after the cjk check
let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded {
Some(deunicoded)
} else {
None
};
let iter = Some(lowercase_text).into_iter().chain(next);
for text in iter {
for token in Tokenizer::new(&text) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue { break }
}
}
}
pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
where I: IntoIterator<Item=&'a str, IntoIter=IT>,
IT: Iterator<Item = &'a str> + Clone,
{
// TODO serialize this to one call to the SeqTokenizer loop
let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect();
let iter = lowercased.iter().map(|t| t.as_str());
for token in SeqTokenizer::new(iter) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue { break }
}
let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| {
if lowercase_text.contains(is_cjk) { return lowercase_text }
let deunicoded = deunicode_with_tofu(&lowercase_text, "");
if lowercase_text != deunicoded { deunicoded } else { lowercase_text }
}).collect();
let iter = deunicoded.iter().map(|t| t.as_str());
for token in SeqTokenizer::new(iter) {
let must_continue = index_token(
token,
id,
attr,
self.word_limit,
&mut self.words_doc_indexes,
&mut self.docs_words,
);
if !must_continue { break }
}
}
pub fn build(self) -> Indexed {
let words_doc_indexes = self.words_doc_indexes
.into_iter()
.map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
.collect();
let docs_words = self.docs_words
.into_iter()
.map(|(id, mut words)| {
words.sort_unstable();
words.dedup();
(id, fst::Set::from_iter(words).unwrap())
})
.collect();
Indexed { words_doc_indexes, docs_words }
}
}
fn index_token(
token: Token,
id: DocumentId,
attr: SchemaAttr,
word_limit: usize,
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool
{
if token.word_index >= word_limit { return false }
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
docs_words.entry(id).or_insert_with(Vec::new).push(word);
},
None => return false,
}
true
}
fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
let word_index = u16::try_from(token.word_index).ok()?;
let char_index = u16::try_from(token.char_index).ok()?;
let char_length = u16::try_from(token.word.chars().count()).ok()?;
let docindex = DocIndex {
document_id: id,
attribute: attr.0,
word_index,
char_index,
char_length,
};
Some(docindex)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn strange_apostrophe() {
let mut indexer = Indexer::new();
let docid = DocumentId(0);
let attr = SchemaAttr(0);
let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
indexer.index_text(docid, attr, text);
let Indexed { words_doc_indexes, .. } = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some());
}
#[test]
fn strange_apostrophe_in_sequence() {
let mut indexer = Indexer::new();
let docid = DocumentId(0);
let attr = SchemaAttr(0);
let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
indexer.index_text_seq(docid, attr, text);
let Indexed { words_doc_indexes, .. } = indexer.build();
assert!(words_doc_indexes.get(&b"l"[..]).is_some());
assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
// with the ugly apostrophe...
assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some());
}
}
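A small usage sketch for the indexer (illustrative, assuming it runs inside this crate where Indexer and Indexed are visible): index one short text, build, and inspect the two resulting structures.

use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;

fn main() {
    let mut indexer = Indexer::with_word_limit(1000);
    indexer.index_text(DocumentId(7), SchemaAttr(0), "Hello world, hello Bob");

    let Indexed { words_doc_indexes, docs_words } = indexer.build();

    // "hello" occurs at two distinct word positions in attribute 0 of document 7
    assert_eq!(words_doc_indexes[&b"hello"[..]].len(), 2);
    // the per-document fst contains the deduplicated word set
    assert!(docs_words[&DocumentId(7)].contains("hello"));
}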

meilidb-data/src/lib.rs (new file, 20 lines)

@@ -0,0 +1,20 @@
mod cf_tree;
mod database;
mod document_attr_key;
mod indexer;
mod number;
mod ranked_map;
mod serde;
pub use self::cf_tree::{CfTree, CfIter};
pub use self::database::{
Database, Index, CustomSettingsIndex, RankingOrdering,
StopWords, RankingOrder, DistinctField, RankingRules,
UpdateType, DetailedDuration, UpdateResult, UpdateStatus,
Error,
};
pub use self::number::Number;
pub use self::ranked_map::RankedMap;
pub use self::serde::{compute_document_id, extract_document_id, value_to_string};
pub type RocksDbResult<T> = Result<T, rocksdb::Error>;


@@ -0,0 +1,55 @@
use std::num::{ParseIntError, ParseFloatError};
use std::str::FromStr;
use std::fmt;
use ordered_float::OrderedFloat;
use serde::{Serialize, Deserialize};
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Number {
Unsigned(u64),
Signed(i64),
Float(OrderedFloat<f64>),
}
impl FromStr for Number {
type Err = ParseNumberError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let uint_error = match u64::from_str(s) {
Ok(unsigned) => return Ok(Number::Unsigned(unsigned)),
Err(error) => error,
};
let int_error = match i64::from_str(s) {
Ok(signed) => return Ok(Number::Signed(signed)),
Err(error) => error,
};
let float_error = match f64::from_str(s) {
Ok(float) => return Ok(Number::Float(OrderedFloat(float))),
Err(error) => error,
};
Err(ParseNumberError { uint_error, int_error, float_error })
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseNumberError {
uint_error: ParseIntError,
int_error: ParseIntError,
float_error: ParseFloatError,
}
impl fmt::Display for ParseNumberError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if self.uint_error == self.int_error {
write!(f, "can not parse number: {}, {}", self.uint_error, self.float_error)
} else {
write!(f, "can not parse number: {}, {}, {}",
self.uint_error, self.int_error, self.float_error)
}
}
}
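The parser tries the three representations in order, so a plain positive integer always becomes Number::Unsigned, a negative one Number::Signed, and anything else falls back to a float. A small illustrative sketch using the crate's public Number re-export:

use std::str::FromStr;
use meilidb_data::Number;
use ordered_float::OrderedFloat;

fn main() {
    assert_eq!(Number::from_str("42").unwrap(), Number::Unsigned(42));
    assert_eq!(Number::from_str("-7").unwrap(), Number::Signed(-7));
    assert_eq!(Number::from_str("3.14").unwrap(), Number::Float(OrderedFloat(3.14)));
    // none of the three parsers accept this, so all three errors are reported
    assert!(Number::from_str("not a number").is_err());
}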


@@ -0,0 +1,36 @@
use std::io::{Read, Write};
use hashbrown::HashMap;
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use crate::Number;
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>);
impl RankedMap {
pub fn len(&self) -> usize {
self.0.len()
}
pub fn insert(&mut self, document: DocumentId, attribute: SchemaAttr, number: Number) {
self.0.insert((document, attribute), number);
}
pub fn remove(&mut self, document: DocumentId, attribute: SchemaAttr) {
self.0.remove(&(document, attribute));
}
pub fn get(&self, document: DocumentId, attribute: SchemaAttr) -> Option<Number> {
self.0.get(&(document, attribute)).cloned()
}
pub fn read_from_bin<R: Read>(reader: R) -> bincode::Result<RankedMap> {
bincode::deserialize_from(reader).map(RankedMap)
}
pub fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
bincode::serialize_into(writer, &self.0)
}
}
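A short, illustrative round-trip sketch for the ranked map, using the crate's public re-exports:

use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use meilidb_data::{Number, RankedMap};

fn main() -> bincode::Result<()> {
    let mut map = RankedMap::default();
    map.insert(DocumentId(1), SchemaAttr(2), Number::Unsigned(42));

    // serialize to a buffer and read it back
    let mut buffer = Vec::new();
    map.write_to_bin(&mut buffer)?;
    let map2 = RankedMap::read_from_bin(buffer.as_slice())?;

    assert_eq!(map2.get(DocumentId(1), SchemaAttr(2)), Some(Number::Unsigned(42)));
    Ok(())
}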


@@ -0,0 +1,180 @@
use std::str::FromStr;
use ordered_float::OrderedFloat;
use serde::ser;
use serde::Serialize;
use super::SerializerError;
use crate::Number;
pub struct ConvertToNumber;
impl ser::Serializer for ConvertToNumber {
type Ok = Number;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, value: bool) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(u64::from(value)))
}
fn serialize_char(self, _value: char) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "char" })
}
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(i64::from(value)))
}
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(i64::from(value)))
}
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(i64::from(value)))
}
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
Ok(Number::Signed(value))
}
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(u64::from(value)))
}
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(u64::from(value)))
}
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(u64::from(value)))
}
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
Ok(Number::Unsigned(value))
}
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
Ok(Number::Float(OrderedFloat(f64::from(value))))
}
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
Ok(Number::Float(OrderedFloat(value)))
}
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
Ok(Number::from_str(value)?)
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnrankableType { type_name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnrankableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnrankableType { type_name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnrankableType { type_name: "struct variant" })
}
}
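This is the serializer used for ranked attributes: booleans, integers, floats and numeric strings are coerced into a Number, everything else fails with UnrankableType. An illustrative sketch, assuming it is compiled inside this module where ConvertToNumber, Number and OrderedFloat are in scope:

use serde::Serialize;

fn main() {
    assert_eq!(42u32.serialize(ConvertToNumber).unwrap(), Number::Unsigned(42));
    assert_eq!((-7i32).serialize(ConvertToNumber).unwrap(), Number::Signed(-7));
    assert_eq!("3.5".serialize(ConvertToNumber).unwrap(), Number::Float(OrderedFloat(3.5)));
    // sequences (like maps, structs and options) can not be used for ranking
    assert!(vec![1, 2, 3].serialize(ConvertToNumber).is_err());
}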


@@ -0,0 +1,176 @@
use serde::Serialize;
use serde::ser;
use super::SerializerError;
pub struct ConvertToString;
impl ser::Serializer for ConvertToString {
type Ok = String;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "boolean" })
}
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "struct variant" })
}
}


@@ -0,0 +1,132 @@
use std::collections::HashSet;
use std::io::Cursor;
use std::{fmt, error::Error};
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader};
use rmp_serde::decode::{Error as RmpError};
use serde::{de, forward_to_deserialize_any};
use crate::database::Index;
#[derive(Debug)]
pub enum DeserializerError {
RmpError(RmpError),
RocksDbError(rocksdb::Error),
Custom(String),
}
impl de::Error for DeserializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
DeserializerError::Custom(msg.to_string())
}
}
impl fmt::Display for DeserializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DeserializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e),
DeserializerError::RocksDbError(e) => write!(f, "RocksDB related error: {}", e),
DeserializerError::Custom(s) => f.write_str(s),
}
}
}
impl Error for DeserializerError {}
impl From<RmpError> for DeserializerError {
fn from(error: RmpError) -> DeserializerError {
DeserializerError::RmpError(error)
}
}
impl From<rocksdb::Error> for DeserializerError {
fn from(error: rocksdb::Error) -> DeserializerError {
DeserializerError::RocksDbError(error)
}
}
pub struct Deserializer<'a> {
pub document_id: DocumentId,
pub index: &'a Index,
pub fields: Option<&'a HashSet<SchemaAttr>>,
}
impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a>
{
type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
self.deserialize_map(visitor)
}
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf option unit unit_struct newtype_struct seq tuple
tuple_struct struct enum identifier ignored_any
}
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
let schema = self.index.schema();
let documents = self.index.as_ref().documents_index;
let iter = documents
.document_fields(self.document_id)?
.filter_map(|(attr, value)| {
let is_displayed = schema.props(attr).is_displayed();
if is_displayed && self.fields.map_or(true, |f| f.contains(&attr)) {
let attribute_name = schema.attribute_name(attr);
Some((attribute_name, Value::new(value)))
} else {
None
}
});
let map_deserializer = de::value::MapDeserializer::new(iter);
let result = visitor.visit_map(map_deserializer).map_err(DeserializerError::from);
result
}
}
struct Value<A>(RmpDeserializer<ReadReader<Cursor<A>>>) where A: AsRef<[u8]>;
impl<A> Value<A> where A: AsRef<[u8]>
{
fn new(value: A) -> Value<A> {
Value(RmpDeserializer::new(Cursor::new(value)))
}
}
impl<'de, A> de::IntoDeserializer<'de, RmpError> for Value<A>
where A: AsRef<[u8]>,
{
type Deserializer = Self;
fn into_deserializer(self) -> Self::Deserializer {
self
}
}
impl<'de, 'a, A> de::Deserializer<'de> for Value<A>
where A: AsRef<[u8]>,
{
type Error = RmpError;
fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
self.0.deserialize_any(visitor)
}
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf option unit unit_struct newtype_struct seq tuple
tuple_struct map struct enum identifier ignored_any
}
}


@@ -0,0 +1,273 @@
use std::hash::{Hash, Hasher};
use meilidb_core::DocumentId;
use serde::{ser, Serialize};
use serde_json::Value;
use siphasher::sip::SipHasher;
use super::{SerializerError, ConvertToString};
pub fn extract_document_id<D>(
identifier: &str,
document: &D,
) -> Result<Option<DocumentId>, SerializerError>
where D: serde::Serialize,
{
let serializer = ExtractDocumentId { identifier };
document.serialize(serializer)
}
pub fn value_to_string(value: &Value) -> Option<String> {
match value {
Value::Null => None,
Value::Bool(_) => None,
Value::Number(value) => Some(value.to_string()),
Value::String(value) => Some(value.to_string()),
Value::Array(_) => None,
Value::Object(_) => None,
}
}
pub fn compute_document_id<H: Hash>(t: H) -> DocumentId {
let mut s = SipHasher::new();
t.hash(&mut s);
let hash = s.finish();
DocumentId(hash)
}
struct ExtractDocumentId<'a> {
identifier: &'a str,
}
impl<'a> ser::Serializer for ExtractDocumentId<'a> {
type Ok = Option<DocumentId>;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ExtractDocumentIdMapSerializer<'a>;
type SerializeStruct = ExtractDocumentIdStructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _value: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "str" })
}
fn serialize_bytes(self, _value: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
let serializer = ExtractDocumentIdMapSerializer {
identifier: self.identifier,
document_id: None,
current_key_name: None,
};
Ok(serializer)
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
let serializer = ExtractDocumentIdStructSerializer {
identifier: self.identifier,
document_id: None,
};
Ok(serializer)
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "struct variant" })
}
}
pub struct ExtractDocumentIdMapSerializer<'a> {
identifier: &'a str,
document_id: Option<DocumentId>,
current_key_name: Option<String>,
}
impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
type Ok = Option<DocumentId>;
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = key.serialize(ConvertToString)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V
) -> Result<(), Self::Error>
where K: Serialize, V: Serialize,
{
let key = key.serialize(ConvertToString)?;
if self.identifier == key {
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
match value_to_string(&value).map(|s| compute_document_id(&s)) {
Some(document_id) => self.document_id = Some(document_id),
None => return Err(SerializerError::InvalidDocumentIdType),
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(self.document_id)
}
}
pub struct ExtractDocumentIdStructSerializer<'a> {
identifier: &'a str,
document_id: Option<DocumentId>,
}
impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> {
type Ok = Option<DocumentId>;
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
if self.identifier == key {
let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?;
match value_to_string(&value).map(compute_document_id) {
Some(document_id) => self.document_id = Some(document_id),
None => return Err(SerializerError::InvalidDocumentIdType),
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(self.document_id)
}
}
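The identifier field is looked up by name, stringified, then hashed with SipHash into a DocumentId; string and number identifiers work, while null, booleans, arrays and objects are rejected. A small illustrative sketch with a JSON document, using the crate's public extract_document_id re-export:

use meilidb_data::extract_document_id;
use serde_json::json;

fn main() {
    let doc = json!({ "objectId": 123, "title": "hello" });

    // "objectId" is found, converted to the string "123", then hashed into a DocumentId
    assert!(extract_document_id("objectId", &doc).unwrap().is_some());

    // an identifier field that is absent simply yields None
    assert!(extract_document_id("uuid", &doc).unwrap().is_none());
}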


@@ -0,0 +1,336 @@
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use serde::ser;
use serde::Serialize;
use crate::indexer::Indexer as RawIndexer;
use super::{SerializerError, ConvertToString};
pub struct Indexer<'a> {
pub attribute: SchemaAttr,
pub indexer: &'a mut RawIndexer,
pub document_id: DocumentId,
}
impl<'a> ser::Serializer for Indexer<'a> {
type Ok = ();
type Error = SerializerError;
type SerializeSeq = SeqIndexer<'a>;
type SerializeTuple = TupleIndexer<'a>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapIndexer<'a>;
type SerializeStruct = StructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "boolean" })
}
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
self.indexer.index_text(self.document_id, self.attribute, text);
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.indexer.index_text(self.document_id, self.attribute, &text);
Ok(())
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
Err(SerializerError::UnindexableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
let indexer = SeqIndexer {
attribute: self.attribute,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
let indexer = TupleIndexer {
attribute: self.attribute,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
let indexer = MapIndexer {
attribute: self.attribute,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "struct variant" })
}
}
pub struct SeqIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}
pub struct MapIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeMap for MapIndexer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let text = key.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}
pub struct StructSerializer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let key_text = key.to_owned();
let value_text = value.serialize(ConvertToString)?;
self.texts.push(key_text);
self.texts.push(value_text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}
pub struct TupleIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}


@@ -0,0 +1,131 @@
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "$ty" })
}
)*
}
}
mod convert_to_number;
mod convert_to_string;
mod deserializer;
mod extract_document_id;
mod indexer;
mod serializer;
pub use self::deserializer::{Deserializer, DeserializerError};
pub use self::extract_document_id::{extract_document_id, compute_document_id, value_to_string};
pub use self::convert_to_string::ConvertToString;
pub use self::convert_to_number::ConvertToNumber;
pub use self::indexer::Indexer;
pub use self::serializer::Serializer;
use std::collections::BTreeMap;
use std::{fmt, error::Error};
use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr;
use rmp_serde::encode::Error as RmpError;
use serde_json::Error as SerdeJsonError;
use serde::ser;
use crate::number::ParseNumberError;
#[derive(Debug)]
pub enum SerializerError {
DocumentIdNotFound,
InvalidDocumentIdType,
RmpError(RmpError),
RocksDbError(rocksdb::Error),
SerdeJsonError(SerdeJsonError),
ParseNumberError(ParseNumberError),
UnserializableType { type_name: &'static str },
UnindexableType { type_name: &'static str },
UnrankableType { type_name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::DocumentIdNotFound => {
write!(f, "serialized document does not have an id according to the schema")
},
SerializerError::InvalidDocumentIdType => {
write!(f, "document identifier can only be of type string or number")
},
SerializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e),
SerializerError::RocksDbError(e) => write!(f, "RocksDB related error: {}", e),
SerializerError::SerdeJsonError(e) => write!(f, "serde json error: {}", e),
SerializerError::ParseNumberError(e) => {
write!(f, "error while trying to parse a number: {}", e)
},
SerializerError::UnserializableType { type_name } => {
write!(f, "{} are not a serializable type", type_name)
},
SerializerError::UnindexableType { type_name } => {
write!(f, "{} are not an indexable type", type_name)
},
SerializerError::UnrankableType { type_name } => {
write!(f, "{} types can not be used for ranking", type_name)
},
SerializerError::Custom(s) => f.write_str(s),
}
}
}
impl Error for SerializerError {}
impl From<String> for SerializerError {
fn from(value: String) -> SerializerError {
SerializerError::Custom(value)
}
}
impl From<RmpError> for SerializerError {
fn from(error: RmpError) -> SerializerError {
SerializerError::RmpError(error)
}
}
impl From<SerdeJsonError> for SerializerError {
fn from(error: SerdeJsonError) -> SerializerError {
SerializerError::SerdeJsonError(error)
}
}
impl From<rocksdb::Error> for SerializerError {
fn from(error: rocksdb::Error) -> SerializerError {
SerializerError::RocksDbError(error)
}
}
impl From<ParseNumberError> for SerializerError {
fn from(error: ParseNumberError) -> SerializerError {
SerializerError::ParseNumberError(error)
}
}
pub struct RamDocumentStore(BTreeMap<(DocumentId, SchemaAttr), Vec<u8>>);
impl RamDocumentStore {
pub fn new() -> RamDocumentStore {
RamDocumentStore(BTreeMap::new())
}
pub fn set_document_field(&mut self, id: DocumentId, attr: SchemaAttr, value: Vec<u8>) {
self.0.insert((id, attr), value);
}
pub fn into_inner(self) -> BTreeMap<(DocumentId, SchemaAttr), Vec<u8>> {
self.0
}
}


@@ -0,0 +1,287 @@
use meilidb_core::DocumentId;
use meilidb_schema::Schema;
use serde::ser;
use crate::indexer::Indexer as RawIndexer;
use crate::ranked_map::RankedMap;
use super::{RamDocumentStore, SerializerError, ConvertToString, ConvertToNumber, Indexer};
pub struct Serializer<'a> {
pub schema: &'a Schema,
pub document_store: &'a mut RamDocumentStore,
pub indexer: &'a mut RawIndexer,
pub ranked_map: &'a mut RankedMap,
pub document_id: DocumentId,
}
impl<'a> ser::Serializer for Serializer<'a> {
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapSerializer<'a>;
type SerializeStruct = StructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer {
schema: self.schema,
document_id: self.document_id,
document_store: self.document_store,
indexer: self.indexer,
ranked_map: self.ranked_map,
current_key_name: None,
})
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
document_id: self.document_id,
document_store: self.document_store,
indexer: self.indexer,
ranked_map: self.ranked_map,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "struct variant" })
}
}
pub struct MapSerializer<'a> {
schema: &'a Schema,
document_id: DocumentId,
document_store: &'a mut RamDocumentStore,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
current_key_name: Option<String>,
}
impl<'a> ser::SerializeMap for MapSerializer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let key = key.serialize(ConvertToString)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V,
) -> Result<(), Self::Error>
where K: ser::Serialize, V: ser::Serialize,
{
let key = key.serialize(ConvertToString)?;
serialize_value(
self.schema,
self.document_id,
self.document_store,
self.indexer,
self.ranked_map,
&key,
value,
)
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
pub struct StructSerializer<'a> {
schema: &'a Schema,
document_id: DocumentId,
document_store: &'a mut RamDocumentStore,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where T: ser::Serialize,
{
serialize_value(
self.schema,
self.document_id,
self.document_store,
self.indexer,
self.ranked_map,
key,
value,
)
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
fn serialize_value<T: ?Sized>(
schema: &Schema,
document_id: DocumentId,
document_store: &mut RamDocumentStore,
indexer: &mut RawIndexer,
ranked_map: &mut RankedMap,
key: &str,
value: &T,
) -> Result<(), SerializerError>
where T: ser::Serialize,
{
if let Some(attribute) = schema.attribute(key) {
let props = schema.props(attribute);
let serialized = rmp_serde::to_vec_named(value)?;
document_store.set_document_field(document_id, attribute, serialized);
if props.is_indexed() {
let indexer = Indexer { attribute, indexer, document_id };
value.serialize(indexer)?;
}
if props.is_ranked() {
let number = value.serialize(ConvertToNumber)?;
ranked_map.insert(document_id, attribute, number);
}
}
Ok(())
}
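An illustrative end-to-end sketch of this serializer (assuming it is compiled inside the meilidb-data crate, since not all of these types are re-exported): every attribute present in the schema is written to the RamDocumentStore, indexed attributes are additionally fed to the raw Indexer, and ranked attributes are recorded in the RankedMap.

use serde::Serialize;
use serde_json::json;
use meilidb_core::DocumentId;
use meilidb_schema::{SchemaBuilder, DISPLAYED, INDEXED, RANKED};
use crate::RankedMap;
use crate::indexer::Indexer as RawIndexer;
use crate::serde::{RamDocumentStore, Serializer, SerializerError};

fn main() -> Result<(), SerializerError> {
    let mut builder = SchemaBuilder::with_identifier("id");
    builder.new_attribute("id", DISPLAYED | RANKED);
    builder.new_attribute("title", DISPLAYED | INDEXED);
    let schema = builder.build();

    let mut document_store = RamDocumentStore::new();
    let mut indexer = RawIndexer::new();
    let mut ranked_map = RankedMap::default();

    let document = json!({ "id": 1, "title": "hello world" });
    document.serialize(Serializer {
        schema: &schema,
        document_store: &mut document_store,
        indexer: &mut indexer,
        ranked_map: &mut ranked_map,
        document_id: DocumentId(0),
    })?;

    assert_eq!(document_store.into_inner().len(), 2); // both fields are stored
    assert_eq!(ranked_map.len(), 1);                  // only "id" is ranked
    Ok(())
}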


@@ -0,0 +1,15 @@
use meilidb_data::{Database};
use meilidb_data::Index;
use meilidb_schema::{SchemaBuilder, DISPLAYED, INDEXED};
pub fn simple_index() -> Index {
let tmp_dir = tempfile::tempdir().unwrap();
let database = Database::open(&tmp_dir).unwrap();
let mut builder = SchemaBuilder::with_identifier("objectId");
builder.new_attribute("objectId", DISPLAYED | INDEXED);
builder.new_attribute("title", DISPLAYED | INDEXED);
let schema = builder.build();
database.create_index("hello", schema).unwrap()
}


@@ -0,0 +1,43 @@
#[macro_use] extern crate maplit;
mod common;
use big_s::S;
use meilidb_data::RankingOrdering;
#[test]
fn stop_words() {
let index = common::simple_index();
let stop_words = hashset!{ S("le"), S("la"), S("les"), };
index.custom_settings().set_stop_words(&stop_words).unwrap();
let ret_stop_words = index.custom_settings().get_stop_words().unwrap().unwrap();
assert_eq!(ret_stop_words, stop_words);
}
#[test]
fn ranking_order() {
let index = common::simple_index();
let ranking_order = vec![S("SumOfTypos"), S("NumberOfWords"), S("WordsProximity"), S("SumOfWordsAttribute"), S("SumOfWordsPosition"), S("Exact"), S("DocumentId")];
index.custom_settings().set_ranking_order(&ranking_order).unwrap();
let ret_ranking_orderer = index.custom_settings().get_ranking_order().unwrap().unwrap();
assert_eq!(ret_ranking_orderer, ranking_order);
}
#[test]
fn distinct_field() {
let index = common::simple_index();
let distinct_field = S("title");
index.custom_settings().set_distinct_field(&distinct_field).unwrap();
let ret_distinct_field = index.custom_settings().get_distinct_field().unwrap().unwrap();
assert_eq!(ret_distinct_field, distinct_field);
}
#[test]
fn ranking_rules() {
let index = common::simple_index();
let ranking_rules = hashmap!{ S("objectId") => RankingOrdering::Asc };
index.custom_settings().set_ranking_rules(&ranking_rules).unwrap();
let ret_ranking_rules = index.custom_settings().get_ranking_rules().unwrap().unwrap();
assert_eq!(ret_ranking_rules, ranking_rules);
}


@@ -0,0 +1,67 @@
#[macro_use] extern crate maplit;
mod common;
use std::sync::atomic::{AtomicBool, Ordering::Relaxed};
use std::sync::Arc;
use big_s::S;
use serde_json::json;
#[test]
fn database_stats() {
let index = common::simple_index();
let as_been_updated = Arc::new(AtomicBool::new(false));
let as_been_updated_clone = as_been_updated.clone();
index.set_update_callback(move |_| as_been_updated_clone.store(true, Relaxed));
let doc1 = json!({ "objectId": 123, "title": "hello" });
let mut addition = index.documents_addition();
addition.update_document(&doc1);
let update_id = addition.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(as_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
let stats = index.stats().unwrap();
let repartition = hashmap!{
S("objectId") => 1u64,
S("title") => 1u64,
};
assert_eq!(stats.number_of_documents, 1);
assert_eq!(stats.documents_fields_repartition, repartition);
let doc2 = json!({ "objectId": 456, "title": "world" });
let mut addition = index.documents_addition();
addition.update_document(&doc2);
let update_id = addition.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(as_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
let stats = index.stats().unwrap();
let repartition = hashmap!{
S("objectId") => 2u64,
S("title") => 2u64,
};
assert_eq!(stats.number_of_documents, 2);
assert_eq!(stats.documents_fields_repartition, repartition);
let doc3 = json!({ "objectId": 789 });
let mut addition = index.documents_addition();
addition.update_document(&doc3);
let update_id = addition.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(as_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
let stats = index.stats().unwrap();
let repartition = hashmap!{
S("objectId") => 3u64,
S("title") => 2u64,
};
assert_eq!(stats.number_of_documents, 3);
assert_eq!(stats.documents_fields_repartition, repartition);
}


@@ -0,0 +1,99 @@
mod common;
use std::sync::atomic::{AtomicBool, Ordering::Relaxed};
use std::sync::Arc;
use serde_json::json;
#[test]
fn insert_delete_document() {
let index = common::simple_index();
let as_been_updated = Arc::new(AtomicBool::new(false));
let as_been_updated_clone = as_been_updated.clone();
index.set_update_callback(move |_| as_been_updated_clone.store(true, Relaxed));
let doc1 = json!({ "objectId": 123, "title": "hello" });
let mut addition = index.documents_addition();
addition.update_document(&doc1);
let update_id = addition.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(as_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
assert_eq!(index.number_of_documents(), 1);
let docs = index.query_builder().query("hello", 0..10).unwrap();
assert_eq!(docs.len(), 1);
assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc1));
let mut deletion = index.documents_deletion();
deletion.delete_document(&doc1).unwrap();
let update_id = deletion.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(as_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
assert_eq!(index.number_of_documents(), 0);
let docs = index.query_builder().query("hello", 0..10).unwrap();
assert_eq!(docs.len(), 0);
}
#[test]
fn replace_document() {
let index = common::simple_index();
let as_been_updated = Arc::new(AtomicBool::new(false));
let as_been_updated_clone = as_been_updated.clone();
index.set_update_callback(move |_| as_been_updated_clone.store(true, Relaxed));
let doc1 = json!({ "objectId": 123, "title": "hello" });
let doc2 = json!({ "objectId": 123, "title": "coucou" });
let mut addition = index.documents_addition();
addition.update_document(&doc1);
let update_id = addition.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(as_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
assert_eq!(index.number_of_documents(), 1);
let docs = index.query_builder().query("hello", 0..10).unwrap();
assert_eq!(docs.len(), 1);
assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc1));
let mut addition = index.documents_addition();
addition.update_document(&doc2);
let update_id = addition.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(as_been_updated.swap(false, Relaxed));
assert!(status.result.is_ok());
assert_eq!(index.number_of_documents(), 1);
let docs = index.query_builder().query("hello", 0..10).unwrap();
assert_eq!(docs.len(), 0);
let docs = index.query_builder().query("coucou", 0..10).unwrap();
assert_eq!(docs.len(), 1);
assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc2));
}
#[test]
fn documents_ids() {
let index = common::simple_index();
let doc1 = json!({ "objectId": 123, "title": "hello" });
let doc2 = json!({ "objectId": 456, "title": "world" });
let doc3 = json!({ "objectId": 789 });
let mut addition = index.documents_addition();
addition.update_document(&doc1);
addition.update_document(&doc2);
addition.update_document(&doc3);
let update_id = addition.finalize().unwrap();
let status = index.update_status_blocking(update_id).unwrap();
assert!(status.result.is_ok());
let documents_ids_count = index.documents_ids().unwrap().count();
assert_eq!(documents_ids_count, 3);
}

meilidb-schema/Cargo.toml (new file, 12 lines)

@@ -0,0 +1,12 @@
[package]
name = "meilidb-schema"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
bincode = "1.1.2"
indexmap = { version = "1.1.0", features = ["serde-1"] }
serde = { version = "1.0.91", features = ["derive"] }
serde_json = { version = "1.0.39", features = ["preserve_order"] }
toml = { version = "0.5.0", features = ["preserve_order"] }

meilidb-schema/src/lib.rs (new file, 285 lines)

@@ -0,0 +1,285 @@
use std::collections::{HashMap, BTreeMap};
use std::{fmt, u16};
use std::ops::BitOr;
use std::sync::Arc;
use serde::{Serialize, Deserialize};
use indexmap::IndexMap;
pub const DISPLAYED: SchemaProps = SchemaProps { displayed: true, indexed: false, ranked: false };
pub const INDEXED: SchemaProps = SchemaProps { displayed: false, indexed: true, ranked: false };
pub const RANKED: SchemaProps = SchemaProps { displayed: false, indexed: false, ranked: true };
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps {
#[serde(default)]
pub displayed: bool,
#[serde(default)]
pub indexed: bool,
#[serde(default)]
pub ranked: bool,
}
impl SchemaProps {
pub fn is_displayed(self) -> bool {
self.displayed
}
pub fn is_indexed(self) -> bool {
self.indexed
}
pub fn is_ranked(self) -> bool {
self.ranked
}
}
impl BitOr for SchemaProps {
type Output = Self;
fn bitor(self, other: Self) -> Self::Output {
SchemaProps {
displayed: self.displayed | other.displayed,
indexed: self.indexed | other.indexed,
ranked: self.ranked | other.ranked,
}
}
}
#[derive(Serialize, Deserialize)]
pub struct SchemaBuilder {
identifier: String,
attributes: IndexMap<String, SchemaProps>,
}
impl SchemaBuilder {
pub fn with_identifier<S: Into<String>>(name: S) -> SchemaBuilder {
SchemaBuilder {
identifier: name.into(),
attributes: IndexMap::new(),
}
}
pub fn new_attribute<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr {
let len = self.attributes.len();
if self.attributes.insert(name.into(), props).is_some() {
panic!("Field already inserted.")
}
SchemaAttr(len as u16)
}
pub fn build(self) -> Schema {
let mut attrs = HashMap::new();
let mut props = Vec::new();
for (i, (name, prop)) in self.attributes.into_iter().enumerate() {
attrs.insert(name.clone(), SchemaAttr(i as u16));
props.push((name, prop));
}
let identifier = self.identifier;
Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) }
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Schema {
inner: Arc<InnerSchema>,
}
#[derive(Debug, Clone, PartialEq, Eq)]
struct InnerSchema {
identifier: String,
attrs: HashMap<String, SchemaAttr>,
props: Vec<(String, SchemaProps)>,
}
impl Schema {
fn to_builder(&self) -> SchemaBuilder {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
SchemaBuilder { identifier, attributes }
}
fn attributes_ordered(&self) -> IndexMap<String, SchemaProps> {
let mut ordered = BTreeMap::new();
for (name, attr) in &self.inner.attrs {
let (_, props) = self.inner.props[attr.0 as usize];
ordered.insert(attr.0, (name, props));
}
let mut attributes = IndexMap::with_capacity(ordered.len());
for (_, (name, props)) in ordered {
attributes.insert(name.clone(), props);
}
attributes
}
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
let (_, props) = self.inner.props[attr.0 as usize];
props
}
pub fn identifier_name(&self) -> &str {
&self.inner.identifier
}
pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> {
self.inner.attrs.get(name.as_ref()).cloned()
}
pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
let (name, _) = &self.inner.props[attr.0 as usize];
name
}
pub fn iter<'a>(&'a self) -> impl Iterator<Item=(&str, SchemaAttr, SchemaProps)> + 'a {
self.inner.props.iter()
.map(move |(name, prop)| {
let attr = self.inner.attrs.get(name).unwrap();
(name.as_str(), *attr, *prop)
})
}
}
impl Serialize for Schema {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: serde::ser::Serializer,
{
self.to_builder().serialize(serializer)
}
}
impl<'de> Deserialize<'de> for Schema {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: serde::de::Deserializer<'de>,
{
let builder = SchemaBuilder::deserialize(deserializer)?;
Ok(builder.build())
}
}
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct SchemaAttr(pub u16);
impl SchemaAttr {
pub const fn new(value: u16) -> SchemaAttr {
SchemaAttr(value)
}
pub const fn min() -> SchemaAttr {
SchemaAttr(u16::min_value())
}
pub const fn max() -> SchemaAttr {
SchemaAttr(u16::max_value())
}
pub fn next(self) -> Option<SchemaAttr> {
self.0.checked_add(1).map(SchemaAttr)
}
pub fn prev(self) -> Option<SchemaAttr> {
self.0.checked_sub(1).map(SchemaAttr)
}
}
impl fmt::Display for SchemaAttr {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.0.fmt(f)
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
#[test]
fn serialize_deserialize() -> bincode::Result<()> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut buffer = Vec::new();
bincode::serialize_into(&mut buffer, &schema)?;
let schema2 = bincode::deserialize_from(buffer.as_slice())?;
assert_eq!(schema, schema2);
Ok(())
}
#[test]
fn serialize_deserialize_toml() -> Result<(), Box<dyn Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let buffer = toml::to_vec(&schema)?;
let schema2 = toml::from_slice(buffer.as_slice())?;
assert_eq!(schema, schema2);
let data = r#"
identifier = "id"
[attributes."alpha"]
displayed = true
[attributes."beta"]
displayed = true
indexed = true
[attributes."gamma"]
indexed = true
"#;
let schema2 = toml::from_str(data)?;
assert_eq!(schema, schema2);
Ok(())
}
#[test]
fn serialize_deserialize_json() -> Result<(), Box<dyn Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", DISPLAYED);
builder.new_attribute("beta", DISPLAYED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let buffer = serde_json::to_vec(&schema)?;
let schema2 = serde_json::from_slice(buffer.as_slice())?;
assert_eq!(schema, schema2);
let data = r#"
{
"identifier": "id",
"attributes": {
"alpha": {
"displayed": true
},
"beta": {
"displayed": true,
"indexed": true
},
"gamma": {
"indexed": true
}
}
}"#;
let schema2 = serde_json::from_str(data)?;
assert_eq!(schema, schema2);
Ok(())
}
}
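For orientation, here is a minimal usage sketch of the builder API defined above. It is not part of the committed file; the attribute names are illustrative, and DISPLAYED / INDEXED are the same SchemaProps flags used in the tests.
let mut builder = SchemaBuilder::with_identifier("id");
let title = builder.new_attribute("title", DISPLAYED | INDEXED);
builder.new_attribute("ingredients", INDEXED);
let schema = builder.build();
// attributes can be resolved by name, and names/props recovered from the attribute
assert_eq!(schema.attribute("title"), Some(title));
assert_eq!(schema.attribute_name(title), "title");
assert_eq!(schema.identifier_name(), "id");
let _title_props = schema.props(title);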


@ -0,0 +1,8 @@
[package]
name = "meilidb-tokenizer"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
slice-group-by = "0.2.4"


@ -0,0 +1,295 @@
use std::iter::Peekable;
use slice_group_by::StrGroupBy;
use self::SeparatorCategory::*;
pub fn is_cjk(c: char) -> bool {
(c >= '\u{2e80}' && c <= '\u{2eff}') ||
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
(c >= '\u{3040}' && c <= '\u{309f}') ||
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
(c >= '\u{3100}' && c <= '\u{312f}') ||
(c >= '\u{3200}' && c <= '\u{32ff}') ||
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
(c >= '\u{f900}' && c <= '\u{faff}')
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum SeparatorCategory {
Soft,
Hard,
}
impl SeparatorCategory {
fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
if let (Soft, Soft) = (self, other) { Soft } else { Hard }
}
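// a soft separator counts as a word distance of 1 and a hard one as 8,
// so words split by punctuation end up much farther apart in `word_index`
// than words split by plain spaces (see the `hard` tests below)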
fn to_usize(self) -> usize {
match self {
Soft => 1,
Hard => 8,
}
}
}
fn is_separator(c: char) -> bool {
classify_separator(c).is_some()
}
fn classify_separator(c: char) -> Option<SeparatorCategory> {
match c {
' ' | '\'' | '"' => Some(Soft),
'.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Hard),
_ => None,
}
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum CharCategory {
Separator(SeparatorCategory),
Cjk,
Other,
}
fn classify_char(c: char) -> CharCategory {
if let Some(category) = classify_separator(c) {
CharCategory::Separator(category)
} else if is_cjk(c) {
CharCategory::Cjk
} else {
CharCategory::Other
}
}
fn is_str_word(s: &str) -> bool {
!s.chars().any(is_separator)
}
fn same_group_category(a: char, b: char) -> bool {
match (classify_char(a), classify_char(b)) {
(CharCategory::Cjk, _) | (_, CharCategory::Cjk) => false,
(CharCategory::Separator(_), CharCategory::Separator(_)) => true,
(a, b) => a == b,
}
}
// fold the number of chars along with the index position
fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, usize) {
(n + 1, i + c.len_utf8())
}
pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
Tokenizer::new(query).map(|t| t.word)
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct Token<'a> {
pub word: &'a str,
pub word_index: usize,
pub char_index: usize,
}
pub struct Tokenizer<'a> {
inner: &'a str,
word_index: usize,
char_index: usize,
}
impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer {
// skip every leading separator and set `char_index`
// to the number of chars skipped
let (count, index) = string.char_indices()
.take_while(|(_, c)| is_separator(*c))
.fold((0, 0), chars_count_index);
Tokenizer {
inner: &string[index..],
word_index: 0,
char_index: count,
}
}
}
impl<'a> Iterator for Tokenizer<'a> {
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
let mut iter = self.inner.linear_group_by(same_group_category).peekable();
while let (Some(string), next_string) = (iter.next(), iter.peek()) {
let (count, index) = string.char_indices().fold((0, 0), chars_count_index);
if !is_str_word(string) {
self.word_index += string.chars()
.filter_map(classify_separator)
.fold(Soft, |a, x| a.merge(x))
.to_usize();
self.char_index += count;
self.inner = &self.inner[index..];
continue;
}
let token = Token {
word: string,
word_index: self.word_index,
char_index: self.char_index,
};
if next_string.filter(|s| is_str_word(s)).is_some() {
self.word_index += 1;
}
self.char_index += count;
self.inner = &self.inner[index..];
return Some(token);
}
self.inner = "";
None
}
}
pub struct SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
{
inner: I,
current: Option<Peekable<Tokenizer<'a>>>,
word_offset: usize,
char_offset: usize,
}
impl<'a, I> SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
{
pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
let current = iter.next().map(|s| Tokenizer::new(s).peekable());
SeqTokenizer {
inner: iter,
current: current,
word_offset: 0,
char_offset: 0,
}
}
}
impl<'a, I> Iterator for SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
{
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
match &mut self.current {
Some(current) => {
match current.next() {
Some(token) => {
// we must apply the word and char offsets
// to the token before returning it
let token = Token {
word: token.word,
word_index: token.word_index + self.word_offset,
char_index: token.char_index + self.char_offset,
};
// if this is the last iteration on this text
// we must save the offsets for next texts
if current.peek().is_none() {
let hard_space = SeparatorCategory::Hard.to_usize();
self.word_offset = token.word_index + hard_space;
self.char_offset = token.char_index + hard_space;
}
Some(token)
},
None => {
// no more words in this text, we must
// start tokenizing the next text
self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
self.next()
},
}
},
// no more texts available
None => None,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn easy() {
let mut tokenizer = Tokenizer::new("salut");
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_kanjis() {
let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
assert_eq!(tokenizer.next(), None);
}
}
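A short sketch of how the two public entry points are meant to be used; it is not part of the committed file and the input strings are arbitrary examples.
// split_query_string drops the separators and yields only the words
let words: Vec<&str> = split_query_string("hello, world!").collect();
assert_eq!(words, vec!["hello", "world"]);
// SeqTokenizer chains several texts, keeping a hard-separator gap between them
let texts = vec!["hello world", "foo bar"].into_iter();
for token in SeqTokenizer::new(texts) {
    println!("{} (word {}, char {})", token.word, token.word_index, token.char_index);
}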

meilidb/Cargo.toml (new file, 29 lines)

@ -0,0 +1,29 @@
[package]
edition = "2018"
name = "meilidb"
version = "0.3.1"
authors = ["Kerollmops <renault.cle@gmail.com>"]
[dependencies]
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
meilidb-data = { path = "../meilidb-data", version = "0.1.0" }
meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
[dev-dependencies]
csv = "1.0.7"
diskus = "0.5.0"
env_logger = "0.6.1"
indexmap = { version = "1.1.0", features = ["serde-1"] }
jemallocator = "0.3.2"
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
quickcheck = "0.9.0"
rand = "0.7.2"
rand_xorshift = "0.2.0"
rustyline = { version = "5.0.0", default-features = false }
serde = { version = "1.0.91" , features = ["derive"] }
serde_json = "1.0.39"
structopt = "0.3.2"
sysinfo = "0.9.5"
tempfile = "3.0.7"
termcolor = "1.0.4"
toml = "0.5.3"


@ -0,0 +1,215 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::{HashMap, HashSet};
use std::io::{self, BufRead, BufReader};
use std::path::{Path, PathBuf};
use std::time::Instant;
use std::error::Error;
use std::fs::{self, File};
use diskus::Walk;
use sysinfo::{SystemExt, ProcessExt};
use serde::{Serialize, Deserialize};
use structopt::StructOpt;
use meilidb_data::Database;
use meilidb_schema::Schema;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created.
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// The csv file to index.
#[structopt(parse(from_os_str))]
pub csv_data_path: PathBuf,
/// The path to the schema.
#[structopt(long = "schema", parse(from_os_str))]
pub schema_path: PathBuf,
/// The file with the synonyms.
#[structopt(long = "synonyms", parse(from_os_str))]
pub synonyms: Option<PathBuf>,
/// The path to the list of stop words (one by line).
#[structopt(long = "stop-words", parse(from_os_str))]
pub stop_words: Option<PathBuf>,
#[structopt(long = "update-group-size")]
pub update_group_size: Option<usize>,
}
#[derive(Serialize, Deserialize)]
struct Document (
HashMap<String, String>
);
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Synonym {
OneWay(SynonymOneWay),
MultiWay { synonyms: Vec<String> },
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SynonymOneWay {
pub search_terms: String,
pub synonyms: Synonyms,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Synonyms {
Multiple(Vec<String>),
Single(String),
}
fn read_synonyms(path: &Path) -> Result<Vec<Synonym>, Box<dyn Error>> {
let file = File::open(path)?;
let synonyms = serde_json::from_reader(file)?;
Ok(synonyms)
}
fn index(
schema: Schema,
database_path: &Path,
csv_data_path: &Path,
update_group_size: Option<usize>,
stop_words: &HashSet<String>,
synonyms: Vec<Synonym>,
) -> Result<Database, Box<dyn Error>>
{
let database = Database::open(database_path)?;
let mut wtr = csv::Writer::from_path("./stats.csv").unwrap();
wtr.write_record(&["NumberOfDocuments", "DiskUsed", "MemoryUsed"])?;
let mut system = sysinfo::System::new();
let index = database.create_index("test", schema.clone())?;
let mut synonyms_adder = index.synonyms_addition();
for synonym in synonyms {
match synonym {
Synonym::OneWay(SynonymOneWay { search_terms, synonyms }) => {
let alternatives = match synonyms {
Synonyms::Multiple(alternatives) => alternatives,
Synonyms::Single(alternative) => vec![alternative],
};
synonyms_adder.add_synonym(search_terms, alternatives);
},
Synonym::MultiWay { mut synonyms } => {
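// register every word of the group as a search term whose alternatives are
// the remaining words, by rotating the list one step per iteration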
for _ in 0..synonyms.len() {
if let Some((synonym, alternatives)) = synonyms.split_first() {
synonyms_adder.add_synonym(synonym, alternatives);
}
synonyms.rotate_left(1);
}
},
}
}
synonyms_adder.finalize()?;
let mut rdr = csv::Reader::from_path(csv_data_path)?;
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();
let mut i = 0;
let mut end_of_file = false;
while !end_of_file {
let mut update = index.documents_addition();
loop {
end_of_file = !rdr.read_record(&mut raw_record)?;
if end_of_file { break }
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
}
};
update.update_document(document);
print!("\rindexing document {}", i);
i += 1;
if let Some(group_size) = update_group_size {
if i % group_size == 0 { break }
}
}
println!();
println!("committing update...");
update.finalize()?;
// write stats
let directory_size = Walk::new(&[database_path.to_owned()], 4).run();
system.refresh_all();
let pid = sysinfo::get_current_pid()?;
let memory = system.get_process(pid).unwrap().memory(); // in kb
wtr.write_record(&[i.to_string(), directory_size.to_string(), memory.to_string()])?;
wtr.flush()?;
}
Ok(database)
}
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
let f = File::open(path)?;
let reader = BufReader::new(f);
let mut words = HashSet::new();
for line in reader.lines() {
let line = line?;
let word = line.trim().to_string();
words.insert(word);
}
Ok(words)
}
fn main() -> Result<(), Box<dyn Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let schema = {
let string = fs::read_to_string(&opt.schema_path)?;
toml::from_str(&string)?
};
let stop_words = match opt.stop_words {
Some(ref path) => retrieve_stop_words(path)?,
None => HashSet::new(),
};
let synonyms = match opt.synonyms {
Some(ref path) => read_synonyms(path)?,
None => Vec::new(),
};
let start = Instant::now();
let result = index(
schema,
&opt.database_path,
&opt.csv_data_path,
opt.update_group_size,
&stop_words,
synonyms,
);
if let Err(e) = result {
return Err(e.into())
}
println!("database created in {:.2?} at: {:?}", start.elapsed(), opt.database_path);
Ok(())
}


@ -0,0 +1,229 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::btree_map::{BTreeMap, Entry};
use std::collections::HashSet;
use std::error::Error;
use std::io::{self, Write};
use std::iter::FromIterator;
use std::path::PathBuf;
use std::time::{Instant, Duration};
use indexmap::IndexMap;
use rustyline::{Editor, Config};
use structopt::StructOpt;
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use meilidb_core::Highlight;
use meilidb_data::Database;
use meilidb_schema::SchemaAttr;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
#[structopt(long = "fetch-timeout-ms")]
pub fetch_timeout_ms: Option<u64>,
/// Fields that must be displayed.
pub displayed_fields: Vec<String>,
/// The number of returned results
#[structopt(short = "n", long = "number-results", default_value = "10")]
pub number_results: usize,
/// The number of characters before and after the first match
#[structopt(short = "C", long = "context", default_value = "35")]
pub char_context: usize,
}
type Document = IndexMap<String, String>;
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
let mut stdout = StandardStream::stdout(ColorChoice::Always);
let mut highlighted = false;
for range in ranges.windows(2) {
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
if highlighted {
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
}
write!(&mut stdout, "{}", &text[start..end])?;
stdout.reset()?;
highlighted = !highlighted;
}
Ok(())
}
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
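// convert a (char index, char length) pair coming from the highlights into the
// equivalent (byte index, byte length) so the UTF-8 text can be sliced safely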
let mut byte_index = 0;
let mut byte_length = 0;
for (n, (i, c)) in text.char_indices().enumerate() {
if n == index {
byte_index = i;
}
if n + 1 == index + length {
byte_length = i - byte_index + c.len_utf8();
break;
}
}
(byte_index, byte_length)
}
fn create_highlight_areas(text: &str, highlights: &[Highlight]) -> Vec<usize> {
let mut byte_indexes = BTreeMap::new();
for highlight in highlights {
let char_index = highlight.char_index as usize;
let char_length = highlight.char_length as usize;
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
match byte_indexes.entry(byte_index) {
Entry::Vacant(entry) => { entry.insert(byte_length); },
Entry::Occupied(mut entry) => {
if *entry.get() < byte_length {
entry.insert(byte_length);
}
},
}
}
let mut title_areas = Vec::new();
title_areas.push(0);
for (byte_index, length) in byte_indexes {
title_areas.push(byte_index);
title_areas.push(byte_index + length);
}
title_areas.push(text.len());
title_areas.sort_unstable();
title_areas
}
/// note: matches must have been sorted by `char_index` and `char_length` before being passed.
///
/// ```no_run
/// matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
///
/// let matches = matches.matches.iter().filter(|m| SchemaAttr::new(m.attribute) == attr).cloned();
///
/// let (text, matches) = crop_text(&text, matches, 35);
/// ```
fn crop_text(
text: &str,
highlights: impl IntoIterator<Item=Highlight>,
context: usize,
) -> (String, Vec<Highlight>)
{
let mut highlights = highlights.into_iter().peekable();
let char_index = highlights.peek().map(|m| m.char_index as usize).unwrap_or(0);
let start = char_index.saturating_sub(context);
let text = text.chars().skip(start).take(context * 2).collect();
let highlights = highlights
.take_while(|m| {
(m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
})
.map(|highlight| {
Highlight { char_index: highlight.char_index - start as u16, ..highlight }
})
.collect();
(text, highlights)
}
fn main() -> Result<(), Box<dyn Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let start = Instant::now();
let database = Database::open(&opt.database_path)?;
let index = database.open_index("test")?.unwrap();
let schema = index.schema();
println!("database prepared for you in {:.2?}", start.elapsed());
let fields = opt.displayed_fields.iter().map(String::as_str);
let fields = HashSet::from_iter(fields);
let config = Config::builder().auto_add_history(true).build();
let mut readline = Editor::<()>::with_config(config);
let _ = readline.load_history("query-history.txt");
for result in readline.iter("Searching for: ") {
match result {
Ok(query) => {
let start_total = Instant::now();
let builder = match opt.fetch_timeout_ms {
Some(timeout_ms) => {
let timeout = Duration::from_millis(timeout_ms);
index.query_builder().with_fetch_timeout(timeout)
},
None => index.query_builder(),
};
let documents = builder.query(&query, 0..opt.number_results)?;
let mut retrieve_duration = Duration::default();
let number_of_documents = documents.len();
for mut doc in documents {
doc.highlights.sort_unstable_by_key(|m| (m.char_index, m.char_length));
let start_retrieve = Instant::now();
let result = index.document::<Document>(Some(&fields), doc.id);
retrieve_duration += start_retrieve.elapsed();
match result {
Ok(Some(document)) => {
for (name, text) in document {
print!("{}: ", name);
let attr = schema.attribute(&name).unwrap();
let highlights = doc.highlights.iter()
.filter(|m| SchemaAttr::new(m.attribute) == attr)
.cloned();
let (text, highlights) = crop_text(&text, highlights, opt.char_context);
let areas = create_highlight_areas(&text, &highlights);
display_highlights(&text, &areas)?;
println!();
}
},
Ok(None) => eprintln!("missing document"),
Err(e) => eprintln!("{}", e),
}
let mut matching_attributes = HashSet::new();
for highlight in doc.highlights {
let attr = SchemaAttr::new(highlight.attribute);
let name = schema.attribute_name(attr);
matching_attributes.insert(name);
}
let matching_attributes = Vec::from_iter(matching_attributes);
println!("matching in: {:?}", matching_attributes);
println!();
}
eprintln!("document field retrieve took {:.2?}", retrieve_duration);
eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
},
Err(err) => {
println!("Error: {:?}", err);
break
}
}
}
readline.save_history("query-history.txt").unwrap();
Ok(())
}

meilidb/src/lib.rs (new file, 3 lines)

@ -0,0 +1,3 @@
mod sort_by_attr;
pub use self::sort_by_attr::SortByAttr;

meilidb/src/sort_by_attr.rs (new file, 125 lines)

@ -0,0 +1,125 @@
use std::cmp::Ordering;
use std::error::Error;
use std::fmt;
use meilidb_core::{criterion::Criterion, RawDocument};
use meilidb_data::RankedMap;
use meilidb_schema::{Schema, SchemaAttr};
/// A helper struct that allows sorting documents by
/// some of their stored attributes.
///
/// # Note
///
/// If a document cannot be deserialized, it is considered [`None`][].
///
/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`,
/// so you should check the [`Ord`] implementation of `Option`.
///
/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None
/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
///
/// # Example
///
/// ```ignore
/// use serde_derive::Deserialize;
/// use meilidb::rank::criterion::*;
///
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
///
/// let builder = CriteriaBuilder::with_capacity(8)
/// .add(SumOfTypos)
/// .add(NumberOfWords)
/// .add(WordsProximity)
/// .add(SumOfWordsAttribute)
/// .add(SumOfWordsPosition)
/// .add(Exact)
/// .add(custom_ranking)
/// .add(DocumentId);
///
/// let criterion = builder.build();
///
/// ```
pub struct SortByAttr<'a> {
ranked_map: &'a RankedMap,
attr: SchemaAttr,
reversed: bool,
}
impl<'a> SortByAttr<'a> {
pub fn lower_is_better(
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
) -> Result<SortByAttr<'a>, SortByAttrError>
{
SortByAttr::new(ranked_map, schema, attr_name, false)
}
pub fn higher_is_better(
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
) -> Result<SortByAttr<'a>, SortByAttrError>
{
SortByAttr::new(ranked_map, schema, attr_name, true)
}
fn new(
ranked_map: &'a RankedMap,
schema: &Schema,
attr_name: &str,
reversed: bool,
) -> Result<SortByAttr<'a>, SortByAttrError>
{
let attr = match schema.attribute(attr_name) {
Some(attr) => attr,
None => return Err(SortByAttrError::AttributeNotFound),
};
if !schema.props(attr).is_ranked() {
return Err(SortByAttrError::AttributeNotRegisteredForRanking);
}
Ok(SortByAttr { ranked_map, attr, reversed })
}
}
impl<'a> Criterion for SortByAttr<'a> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = self.ranked_map.get(lhs.id, self.attr);
let rhs = self.ranked_map.get(rhs.id, self.attr);
match (lhs, rhs) {
(Some(lhs), Some(rhs)) => {
let order = lhs.cmp(&rhs);
if self.reversed { order.reverse() } else { order }
},
(None, Some(_)) => Ordering::Greater,
(Some(_), None) => Ordering::Less,
(None, None) => Ordering::Equal,
}
}
fn name(&self) -> &'static str {
"SortByAttr"
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SortByAttrError {
AttributeNotFound,
AttributeNotRegisteredForRanking,
}
impl fmt::Display for SortByAttrError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use SortByAttrError::*;
match self {
AttributeNotFound => f.write_str("attribute not found in the schema"),
AttributeNotRegisteredForRanking => f.write_str("attribute not registered for ranking"),
}
}
}
impl Error for SortByAttrError { }
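A hedged sketch (not part of the file) of how the criterion is constructed and how its two error cases surface; `ranked_map` and `schema` are assumed to come from an already opened index, and "release_date" is an illustrative attribute name.
match SortByAttr::lower_is_better(&ranked_map, &schema, "release_date") {
    Ok(_criterion) => {
        // push the criterion into a query builder's criteria list
    }
    Err(SortByAttrError::AttributeNotFound) => {
        eprintln!("no attribute with that name in the schema");
    }
    Err(SortByAttrError::AttributeNotRegisteredForRanking) => {
        eprintln!("the attribute exists but is not ranked in its SchemaProps");
    }
}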


@ -95,7 +95,8 @@ or
other
ought
our
ours ourselves
ours
ourselves
out
over
own

misc/fr.stopwords.txt (new file, 163 lines)

@ -0,0 +1,163 @@
au
aux
avec
ce
ces
dans
de
des
du
elle
en
et
eux
il
je
la
le
leur
lui
ma
mais
me
même
mes
moi
mon
ne
nos
notre
nous
on
ou
par
pas
pour
qu
que
qui
sa
se
ses
son
sur
ta
te
tes
toi
ton
tu
un
une
vos
votre
vous
c
d
j
l
à
m
n
s
t
y
été
étée
étées
étés
étant
suis
es
est
sommes
êtes
sont
serai
seras
sera
serons
serez
seront
serais
serait
serions
seriez
seraient
étais
était
étions
étiez
étaient
fus
fut
fûmes
fûtes
furent
sois
soit
soyons
soyez
soient
fusse
fusses
fût
fussions
fussiez
fussent
ayant
eu
eue
eues
eus
ai
as
avons
avez
ont
aurai
auras
aura
aurons
aurez
auront
aurais
aurait
aurions
auriez
auraient
avais
avait
avions
aviez
avaient
eut
eûmes
eûtes
eurent
aie
aies
ait
ayons
ayez
aient
eusse
eusses
eût
eussions
eussiez
eussent
ceci
celà
cet
cette
ici
ils
les
leurs
quel
quels
quelle
quelles
sans
soi


@ -1,90 +0,0 @@
use fst::Automaton;
use lazy_static::lazy_static;
use levenshtein_automata::{
LevenshteinAutomatonBuilder as LevBuilder,
DFA, Distance,
};
lazy_static! {
static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
}
pub struct DfaExt {
query_len: usize,
automaton: DFA,
}
impl Automaton for DfaExt {
type State = <DFA as Automaton>::State;
fn start(&self) -> Self::State {
self.automaton.start()
}
fn is_match(&self, state: &Self::State) -> bool {
self.automaton.is_match(state)
}
fn can_match(&self, state: &Self::State) -> bool {
self.automaton.can_match(state)
}
fn will_always_match(&self, state: &Self::State) -> bool {
self.automaton.will_always_match(state)
}
fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
self.automaton.accept(state, byte)
}
}
impl AutomatonExt for DfaExt {
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
self.automaton.eval(s)
}
fn query_len(&self) -> usize {
self.query_len
}
}
enum PrefixSetting {
Prefix,
NoPrefix,
}
fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DfaExt {
use self::PrefixSetting::{Prefix, NoPrefix};
let dfa = match query.len() {
0 ..= 4 => match setting {
Prefix => LEVDIST0.build_prefix_dfa(query),
NoPrefix => LEVDIST0.build_dfa(query),
},
5 ..= 8 => match setting {
Prefix => LEVDIST1.build_prefix_dfa(query),
NoPrefix => LEVDIST1.build_dfa(query),
},
_ => match setting {
Prefix => LEVDIST2.build_prefix_dfa(query),
NoPrefix => LEVDIST2.build_dfa(query),
},
};
DfaExt { query_len: query.len(), automaton: dfa }
}
pub fn build_prefix_dfa(query: &str) -> DfaExt {
build_dfa_with_setting(query, PrefixSetting::Prefix)
}
pub fn build_dfa(query: &str) -> DfaExt {
build_dfa_with_setting(query, PrefixSetting::NoPrefix)
}
pub trait AutomatonExt: Automaton {
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
fn query_len(&self) -> usize;
}


@ -1,26 +0,0 @@
use std::io::{self, BufReader, BufRead};
use std::collections::HashSet;
use std::path::Path;
use std::fs::File;
#[derive(Debug)]
pub struct CommonWords(HashSet<String>);
impl CommonWords {
pub fn from_file<P>(path: P) -> io::Result<Self>
where P: AsRef<Path>
{
let file = File::open(path)?;
let file = BufReader::new(file);
let mut set = HashSet::new();
for line in file.lines().filter_map(|l| l.ok()) {
let word = line.trim().to_owned();
set.insert(word);
}
Ok(CommonWords(set))
}
pub fn contains(&self, word: &str) -> bool {
self.0.contains(word)
}
}


@ -1,59 +0,0 @@
use std::slice::from_raw_parts;
use std::error::Error;
use std::path::Path;
use std::sync::Arc;
use std::{io, mem};
use sdset::Set;
use fst::raw::MmapReadOnly;
use serde::ser::{Serialize, Serializer};
use crate::DocumentId;
use crate::data::Data;
#[derive(Default, Clone)]
pub struct DocIds {
data: Data,
}
impl DocIds {
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let mmap = MmapReadOnly::open_path(path)?;
let data = Data::Mmap(mmap);
Ok(DocIds { data })
}
pub fn from_bytes(vec: Vec<u8>) -> Result<Self, Box<Error>> {
// FIXME check if modulo DocumentId
let len = vec.len();
let data = Data::Shared {
bytes: Arc::new(vec),
offset: 0,
len: len
};
Ok(DocIds { data })
}
pub fn from_document_ids(vec: Vec<DocumentId>) -> Self {
DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
}
pub fn contains(&self, doc: DocumentId) -> bool {
// FIXME prefer using the sdset::exponential_search function
self.doc_ids().binary_search(&doc).is_ok()
}
pub fn doc_ids(&self) -> &Set<DocumentId> {
let slice = &self.data;
let ptr = slice.as_ptr() as *const DocumentId;
let len = slice.len() / mem::size_of::<DocumentId>();
let slice = unsafe { from_raw_parts(ptr, len) };
Set::new_unchecked(slice)
}
}
impl Serialize for DocIds {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.data.as_ref().serialize(serializer)
}
}


@ -1,204 +0,0 @@
use std::slice::from_raw_parts;
use std::io::{self, Write};
use std::mem::size_of;
use std::ops::Index;
use std::path::Path;
use std::sync::Arc;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::raw::MmapReadOnly;
use sdset::Set;
use crate::DocIndex;
use crate::data::Data;
#[derive(Debug)]
#[repr(C)]
struct Range {
start: u64,
end: u64,
}
#[derive(Clone, Default)]
pub struct DocIndexes {
ranges: Data,
indexes: Data,
}
impl DocIndexes {
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let mmap = MmapReadOnly::open_path(path)?;
DocIndexes::from_data(Data::Mmap(mmap))
}
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
let len = vec.len();
DocIndexes::from_shared_bytes(Arc::new(vec), 0, len)
}
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> {
let data = Data::Shared { bytes, offset, len };
DocIndexes::from_data(data)
}
fn from_data(data: Data) -> io::Result<Self> {
let ranges_len_offset = data.len() - size_of::<u64>();
let ranges_len = (&data[ranges_len_offset..]).read_u64::<LittleEndian>()?;
let ranges_len = ranges_len as usize;
let ranges_offset = ranges_len_offset - ranges_len;
let ranges = data.range(ranges_offset, ranges_len);
let indexes = data.range(0, ranges_offset);
Ok(DocIndexes { ranges, indexes })
}
pub fn to_vec(&self) -> Vec<u8> {
let capacity = self.indexes.len() + self.ranges.len() + size_of::<u64>();
let mut bytes = Vec::with_capacity(capacity);
bytes.extend_from_slice(&self.indexes);
bytes.extend_from_slice(&self.ranges);
bytes.write_u64::<LittleEndian>(self.ranges.len() as u64).unwrap();
bytes
}
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
self.ranges().get(index as usize).map(|Range { start, end }| {
let start = *start as usize;
let end = *end as usize;
let slice = &self.indexes()[start..end];
Set::new_unchecked(slice)
})
}
fn ranges(&self) -> &[Range] {
let slice = &self.ranges;
let ptr = slice.as_ptr() as *const Range;
let len = slice.len() / size_of::<Range>();
unsafe { from_raw_parts(ptr, len) }
}
fn indexes(&self) -> &[DocIndex] {
let slice = &self.indexes;
let ptr = slice.as_ptr() as *const DocIndex;
let len = slice.len() / size_of::<DocIndex>();
unsafe { from_raw_parts(ptr, len) }
}
}
impl Index<usize> for DocIndexes {
type Output = [DocIndex];
fn index(&self, index: usize) -> &Self::Output {
match self.get(index) {
Some(indexes) => indexes,
None => panic!("index {} out of range for a maximum of {} ranges", index, self.ranges().len()),
}
}
}
pub struct DocIndexesBuilder<W> {
ranges: Vec<Range>,
wtr: W,
}
impl DocIndexesBuilder<Vec<u8>> {
pub fn memory() -> Self {
DocIndexesBuilder::new(Vec::new())
}
}
impl<W: Write> DocIndexesBuilder<W> {
pub fn new(wtr: W) -> Self {
DocIndexesBuilder {
ranges: Vec::new(),
wtr: wtr,
}
}
pub fn insert(&mut self, indexes: &Set<DocIndex>) -> io::Result<()> {
let len = indexes.len() as u64;
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
let range = Range { start, end: start + len };
self.ranges.push(range);
// write the values
let indexes = unsafe { into_u8_slice(indexes) };
self.wtr.write_all(indexes)
}
pub fn finish(self) -> io::Result<()> {
self.into_inner().map(drop)
}
pub fn into_inner(mut self) -> io::Result<W> {
// write the ranges
let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
self.wtr.write_all(ranges)?;
// write the length of the ranges
let len = ranges.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
Ok(self.wtr)
}
}
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
let ptr = slice.as_ptr() as *const u8;
let len = slice.len() * size_of::<T>();
from_raw_parts(ptr, len)
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
#[test]
fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?)?;
builder.insert(Set::new(&[a, b, c])?)?;
builder.insert(Set::new(&[a, c])?)?;
let bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(bytes)?;
assert_eq!(docs.get(0), Some(Set::new(&[a])?));
assert_eq!(docs.get(1), Some(Set::new(&[a, b, c])?));
assert_eq!(docs.get(2), Some(Set::new(&[a, c])?));
assert_eq!(docs.get(3), None);
Ok(())
}
#[test]
fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?)?;
builder.insert(Set::new(&[a, b, c])?)?;
builder.insert(Set::new(&[a, c])?)?;
let builder_bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
let bytes = docs.to_vec();
assert_eq!(builder_bytes, bytes);
Ok(())
}
}


@ -1,65 +0,0 @@
mod doc_ids;
mod doc_indexes;
use std::ops::Deref;
use std::sync::Arc;
use fst::raw::MmapReadOnly;
pub use self::doc_ids::DocIds;
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
#[derive(Clone)]
enum Data {
Shared {
bytes: Arc<Vec<u8>>,
offset: usize,
len: usize,
},
Mmap(MmapReadOnly),
}
impl Data {
pub fn range(&self, off: usize, l: usize) -> Data {
match self {
Data::Shared { bytes, offset, len } => {
assert!(off + l <= *len);
Data::Shared {
bytes: bytes.clone(),
offset: offset + off,
len: l,
}
},
Data::Mmap(mmap) => Data::Mmap(mmap.range(off, l)),
}
}
}
impl Default for Data {
fn default() -> Data {
Data::Shared {
bytes: Arc::default(),
offset: 0,
len: 0,
}
}
}
impl Deref for Data {
type Target = [u8];
fn deref(&self) -> &Self::Target {
self.as_ref()
}
}
impl AsRef<[u8]> for Data {
fn as_ref(&self) -> &[u8] {
match self {
Data::Shared { bytes, offset, len } => {
&bytes[*offset..offset + len]
},
Data::Mmap(m) => m.as_slice(),
}
}
}


@ -1,110 +0,0 @@
mod ops;
pub mod positive;
pub mod negative;
pub use self::positive::{PositiveBlob, PositiveBlobBuilder};
pub use self::negative::NegativeBlob;
pub use self::ops::OpBuilder;
use std::fmt;
use serde_derive::{Serialize, Deserialize};
use serde::ser::{Serialize, Serializer, SerializeTuple};
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
#[derive(Debug)]
pub enum Blob {
Positive(PositiveBlob),
Negative(NegativeBlob),
}
impl Blob {
pub fn is_negative(&self) -> bool {
self.sign() == Sign::Negative
}
pub fn is_positive(&self) -> bool {
self.sign() == Sign::Positive
}
pub fn sign(&self) -> Sign {
match self {
Blob::Positive(_) => Sign::Positive,
Blob::Negative(_) => Sign::Negative,
}
}
}
impl Serialize for Blob {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
match self {
Blob::Positive(blob) => {
let mut tuple = serializer.serialize_tuple(2)?;
tuple.serialize_element(&Sign::Positive)?;
tuple.serialize_element(&blob)?;
tuple.end()
},
Blob::Negative(blob) => {
let mut tuple = serializer.serialize_tuple(2)?;
tuple.serialize_element(&Sign::Negative)?;
tuple.serialize_element(&blob)?;
tuple.end()
},
}
}
}
impl<'de> Deserialize<'de> for Blob {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Blob, D::Error> {
struct TupleVisitor;
impl<'de> Visitor<'de> for TupleVisitor {
type Value = Blob;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a Blob struct")
}
#[inline]
fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
let sign = match seq.next_element()? {
Some(value) => value,
None => return Err(de::Error::invalid_length(0, &self)),
};
match sign {
Sign::Positive => {
let blob = match seq.next_element()? {
Some(value) => value,
None => return Err(de::Error::invalid_length(1, &self)),
};
Ok(Blob::Positive(blob))
},
Sign::Negative => {
let blob = match seq.next_element()? {
Some(value) => value,
None => return Err(de::Error::invalid_length(1, &self)),
};
Ok(Blob::Negative(blob))
},
}
}
}
deserializer.deserialize_tuple(2, TupleVisitor)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Sign {
Positive,
Negative,
}
impl Sign {
pub fn invert(self) -> Sign {
match self {
Sign::Positive => Sign::Negative,
Sign::Negative => Sign::Positive,
}
}
}


@ -1,67 +0,0 @@
use std::error::Error;
use std::path::Path;
use std::fmt;
use sdset::Set;
use serde::de::{self, Deserialize, Deserializer};
use serde::ser::{Serialize, Serializer};
use crate::data::DocIds;
use crate::DocumentId;
#[derive(Default)]
pub struct NegativeBlob {
doc_ids: DocIds,
}
impl NegativeBlob {
pub unsafe fn from_path<P>(doc_ids: P) -> Result<Self, Box<Error>>
where P: AsRef<Path>,
{
let doc_ids = DocIds::from_path(doc_ids)?;
Ok(NegativeBlob { doc_ids })
}
pub fn from_bytes(doc_ids: Vec<u8>) -> Result<Self, Box<Error>> {
let doc_ids = DocIds::from_bytes(doc_ids)?;
Ok(NegativeBlob { doc_ids })
}
pub fn from_raw(doc_ids: DocIds) -> Self {
NegativeBlob { doc_ids }
}
pub fn as_ids(&self) -> &DocIds {
&self.doc_ids
}
pub fn into_doc_ids(self) -> DocIds {
self.doc_ids
}
}
impl AsRef<Set<DocumentId>> for NegativeBlob {
fn as_ref(&self) -> &Set<DocumentId> {
self.as_ids().doc_ids()
}
}
impl fmt::Debug for NegativeBlob {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "NegativeBlob(")?;
f.debug_list().entries(self.as_ref().as_slice()).finish()?;
write!(f, ")")
}
}
impl Serialize for NegativeBlob {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.doc_ids.serialize(serializer)
}
}
impl<'de> Deserialize<'de> for NegativeBlob {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<NegativeBlob, D::Error> {
let bytes = Vec::deserialize(deserializer)?;
NegativeBlob::from_bytes(bytes).map_err(de::Error::custom)
}
}


@ -1,5 +0,0 @@
mod blob;
mod ops;
pub use self::blob::NegativeBlob;
pub use self::ops::OpBuilder;


@ -1,73 +0,0 @@
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::Set;
use crate::database::blob::NegativeBlob;
use crate::data::DocIds;
use crate::DocumentId;
pub struct OpBuilder<'a> {
inner: SdOpBuilder<'a, DocumentId>,
}
/// Do a set operation on multiple negative blobs.
impl<'a> OpBuilder<'a> {
pub fn new() -> Self {
Self { inner: SdOpBuilder::new() }
}
pub fn with_capacity(cap: usize) -> Self {
Self { inner: SdOpBuilder::with_capacity(cap) }
}
pub fn add(mut self, blob: &'a NegativeBlob) -> Self {
self.push(blob);
self
}
pub fn push(&mut self, blob: &'a NegativeBlob) {
let set = Set::new_unchecked(blob.as_ref());
self.inner.push(set);
}
pub fn union(self) -> Union<'a> {
Union::new(self.inner.union())
}
pub fn intersection(self) -> Intersection<'a> {
Intersection::new(self.inner.intersection())
}
pub fn difference(self) -> Difference<'a> {
Difference::new(self.inner.difference())
}
pub fn symmetric_difference(self) -> SymmetricDifference<'a> {
SymmetricDifference::new(self.inner.symmetric_difference())
}
}
macro_rules! logical_operation {
(struct $name:ident, $operation:ident) => {
pub struct $name<'a> {
op: sdset::multi::$name<'a, DocumentId>,
}
impl<'a> $name<'a> {
fn new(op: sdset::multi::$name<'a, DocumentId>) -> Self {
$name { op }
}
pub fn into_negative_blob(self) -> NegativeBlob {
let document_ids = sdset::SetOperation::into_set_buf(self.op);
let doc_ids = DocIds::from_document_ids(document_ids.into_vec());
NegativeBlob::from_raw(doc_ids)
}
}
}}
logical_operation!(struct Union, union);
logical_operation!(struct Intersection, intersection);
logical_operation!(struct Difference, difference);
logical_operation!(struct SymmetricDifference, symmetric_difference);


@ -1,109 +0,0 @@
use std::error::Error;
use fst::{IntoStreamer, Streamer};
use sdset::duo::DifferenceByKey;
use sdset::{Set, SetOperation};
use group_by::GroupBy;
use crate::database::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob};
use crate::database::blob::{positive, negative};
fn blob_same_sign(a: &Blob, b: &Blob) -> bool {
a.sign() == b.sign()
}
fn unwrap_positive(blob: &Blob) -> &PositiveBlob {
match blob {
Blob::Positive(blob) => blob,
Blob::Negative(_) => panic!("called `unwrap_positive()` on a `Negative` value"),
}
}
fn unwrap_negative(blob: &Blob) -> &NegativeBlob {
match blob {
Blob::Negative(blob) => blob,
Blob::Positive(_) => panic!("called `unwrap_negative()` on a `Positive` value"),
}
}
pub struct OpBuilder {
blobs: Vec<Blob>,
}
impl OpBuilder {
pub fn new() -> OpBuilder {
OpBuilder { blobs: Vec::new() }
}
pub fn with_capacity(cap: usize) -> OpBuilder {
OpBuilder { blobs: Vec::with_capacity(cap) }
}
pub fn push(&mut self, blob: Blob) {
if self.blobs.is_empty() && blob.is_negative() { return }
self.blobs.push(blob);
}
pub fn merge(self) -> Result<PositiveBlob, Box<Error>> {
let groups = GroupBy::new(&self.blobs, blob_same_sign);
let mut aggregated = Vec::new();
for blobs in groups {
match blobs[0].sign() {
Sign::Positive => {
let mut op_builder = positive::OpBuilder::with_capacity(blobs.len());
for blob in blobs {
op_builder.push(unwrap_positive(blob));
}
let mut stream = op_builder.union().into_stream();
let mut builder = PositiveBlobBuilder::memory();
while let Some((input, doc_indexes)) = stream.next() {
// FIXME empty doc_indexes must be handled by OpBuilder
if !doc_indexes.is_empty() {
builder.insert(input, doc_indexes).unwrap();
}
}
let (map, doc_indexes) = builder.into_inner().unwrap();
let blob = PositiveBlob::from_bytes(map, doc_indexes).unwrap();
aggregated.push(Blob::Positive(blob));
},
Sign::Negative => {
let mut op_builder = negative::OpBuilder::with_capacity(blobs.len());
for blob in blobs {
op_builder.push(unwrap_negative(blob));
}
let blob = op_builder.union().into_negative_blob();
aggregated.push(Blob::Negative(blob));
},
}
}
let mut buffer = Vec::new();
aggregated.chunks(2).try_fold(PositiveBlob::default(), |base, slice| {
let negative = NegativeBlob::default();
let (positive, negative) = match slice {
[a, b] => (unwrap_positive(a), unwrap_negative(b)),
[a] => (unwrap_positive(a), &negative),
_ => unreachable!(),
};
let mut builder = PositiveBlobBuilder::memory();
let op_builder = positive::OpBuilder::new().add(&base).add(&positive);
let mut stream = op_builder.union().into_stream();
while let Some((input, doc_indexes)) = stream.next() {
let op = DifferenceByKey::new(doc_indexes, negative.as_ref(), |x| x.document_id, |x| *x);
buffer.clear();
op.extend_vec(&mut buffer);
if !buffer.is_empty() {
builder.insert(input, Set::new_unchecked(&buffer))?;
}
}
let (map, doc_indexes) = builder.into_inner()?;
PositiveBlob::from_bytes(map, doc_indexes)
})
}
}


@ -1,254 +0,0 @@
use std::fmt;
use std::io::Write;
use std::path::Path;
use std::error::Error;
use fst::{map, Map, Streamer, IntoStreamer};
use sdset::Set;
use crate::DocIndex;
use crate::data::{DocIndexes, DocIndexesBuilder};
use serde::ser::{Serialize, Serializer, SerializeTuple};
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
#[derive(Default)]
pub struct PositiveBlob {
map: Map,
indexes: DocIndexes,
}
impl PositiveBlob {
pub unsafe fn from_paths<P, Q>(map: P, indexes: Q) -> Result<Self, Box<Error>>
where P: AsRef<Path>,
Q: AsRef<Path>,
{
let map = Map::from_path(map)?;
let indexes = DocIndexes::from_path(indexes)?;
Ok(PositiveBlob { map, indexes })
}
pub fn from_bytes(map: Vec<u8>, indexes: Vec<u8>) -> Result<Self, Box<Error>> {
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Ok(PositiveBlob { map, indexes })
}
pub fn from_raw(map: Map, indexes: DocIndexes) -> Self {
PositiveBlob { map, indexes }
}
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
self.map.get(key).map(|index| &self.indexes[index as usize])
}
pub fn as_map(&self) -> &Map {
&self.map
}
pub fn as_indexes(&self) -> &DocIndexes {
&self.indexes
}
pub fn explode(self) -> (Map, DocIndexes) {
(self.map, self.indexes)
}
}
impl fmt::Debug for PositiveBlob {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "PositiveBlob([")?;
let mut stream = self.into_stream();
let mut first = true;
while let Some((k, v)) = stream.next() {
if !first {
write!(f, ", ")?;
}
first = false;
write!(f, "({}, {:?})", String::from_utf8_lossy(k), v)?;
}
write!(f, "])")
}
}
impl<'m, 'a> IntoStreamer<'a> for &'m PositiveBlob {
type Item = (&'a [u8], &'a [DocIndex]);
/// The type of the stream to be constructed.
type Into = PositiveBlobStream<'m>;
/// Construct a stream from `Self`.
fn into_stream(self) -> Self::Into {
PositiveBlobStream {
map_stream: self.map.into_stream(),
doc_indexes: &self.indexes,
}
}
}
pub struct PositiveBlobStream<'m> {
map_stream: map::Stream<'m>,
doc_indexes: &'m DocIndexes,
}
impl<'m, 'a> Streamer<'a> for PositiveBlobStream<'m> {
type Item = (&'a [u8], &'a [DocIndex]);
fn next(&'a mut self) -> Option<Self::Item> {
match self.map_stream.next() {
Some((input, index)) => {
let doc_indexes = &self.doc_indexes[index as usize];
Some((input, doc_indexes))
},
None => None,
}
}
}
impl Serialize for PositiveBlob {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
let mut tuple = serializer.serialize_tuple(2)?;
tuple.serialize_element(&self.map.as_fst().to_vec())?;
tuple.serialize_element(&self.indexes.to_vec())?;
tuple.end()
}
}
impl<'de> Deserialize<'de> for PositiveBlob {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<PositiveBlob, D::Error> {
struct TupleVisitor;
impl<'de> Visitor<'de> for TupleVisitor {
type Value = PositiveBlob;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a PositiveBlob struct")
}
#[inline]
fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
let map = match seq.next_element()? {
Some(bytes) => match Map::from_bytes(bytes) {
Ok(value) => value,
Err(err) => return Err(de::Error::custom(err)),
},
None => return Err(de::Error::invalid_length(0, &self)),
};
let indexes = match seq.next_element()? {
Some(bytes) => match DocIndexes::from_bytes(bytes) {
Ok(value) => value,
Err(err) => return Err(de::Error::custom(err)),
},
None => return Err(de::Error::invalid_length(1, &self)),
};
Ok(PositiveBlob { map, indexes })
}
}
deserializer.deserialize_tuple(2, TupleVisitor)
}
}
pub struct PositiveBlobBuilder<W, X> {
map: fst::MapBuilder<W>,
indexes: DocIndexesBuilder<X>,
value: u64,
}
impl PositiveBlobBuilder<Vec<u8>, Vec<u8>> {
pub fn memory() -> Self {
PositiveBlobBuilder {
map: fst::MapBuilder::memory(),
indexes: DocIndexesBuilder::memory(),
value: 0,
}
}
}
impl<W: Write, X: Write> PositiveBlobBuilder<W, X> {
pub fn new(map: W, indexes: X) -> Result<Self, Box<Error>> {
Ok(PositiveBlobBuilder {
map: fst::MapBuilder::new(map)?,
indexes: DocIndexesBuilder::new(indexes),
value: 0,
})
}
/// If a key is inserted that is less than or equal to any previous key added,
/// then an error is returned. Similarly, if there was a problem writing
/// to the underlying writer, an error is returned.
// FIXME what if one write doesn't work but the other do ?
pub fn insert<K>(&mut self, key: K, doc_indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
where K: AsRef<[u8]>,
{
self.map.insert(key, self.value)?;
self.indexes.insert(doc_indexes)?;
self.value += 1;
Ok(())
}
pub fn finish(self) -> Result<(), Box<Error>> {
self.into_inner().map(drop)
}
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
let map = self.map.into_inner()?;
let indexes = self.indexes.into_inner()?;
Ok((map, indexes))
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
#[test]
fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let mut builder = PositiveBlobBuilder::memory();
builder.insert("aaa", Set::new(&[a])?)?;
builder.insert("aab", Set::new(&[a, b, c])?)?;
builder.insert("aac", Set::new(&[a, c])?)?;
let (map_bytes, indexes_bytes) = builder.into_inner()?;
let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?;
assert_eq!(positive_blob.get("aaa"), Some(&[a][..]));
assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..]));
assert_eq!(positive_blob.get("aac"), Some(&[a, c][..]));
assert_eq!(positive_blob.get("aad"), None);
Ok(())
}
#[test]
fn serde_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let mut builder = PositiveBlobBuilder::memory();
builder.insert("aaa", Set::new(&[a])?)?;
builder.insert("aab", Set::new(&[a, b, c])?)?;
builder.insert("aac", Set::new(&[a, c])?)?;
let (map_bytes, indexes_bytes) = builder.into_inner()?;
let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?;
let bytes = bincode::serialize(&positive_blob)?;
let positive_blob: PositiveBlob = bincode::deserialize(&bytes)?;
assert_eq!(positive_blob.get("aaa"), Some(&[a][..]));
assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..]));
assert_eq!(positive_blob.get("aac"), Some(&[a, c][..]));
assert_eq!(positive_blob.get("aad"), None);
Ok(())
}
}


@ -1,5 +0,0 @@
mod blob;
mod ops;
pub use self::blob::{PositiveBlob, PositiveBlobBuilder};
pub use self::ops::OpBuilder;


@ -1,128 +0,0 @@
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::{SetOperation, Set};
use crate::database::blob::PositiveBlob;
use crate::data::DocIndexes;
use crate::DocIndex;
pub struct OpBuilder<'m> {
// the operation on the maps is always a union.
map_op: fst::map::OpBuilder<'m>,
indexes: Vec<&'m DocIndexes>,
}
/// Do a set operation on multiple positive blobs.
impl<'m> OpBuilder<'m> {
pub fn new() -> Self {
Self {
map_op: fst::map::OpBuilder::new(),
indexes: Vec::new(),
}
}
pub fn with_capacity(cap: usize) -> Self {
Self {
map_op: fst::map::OpBuilder::new(), // TODO patch fst to add with_capacity
indexes: Vec::with_capacity(cap),
}
}
pub fn add(mut self, blob: &'m PositiveBlob) -> Self {
self.push(blob);
self
}
pub fn push(&mut self, blob: &'m PositiveBlob) {
self.map_op.push(blob.as_map());
self.indexes.push(blob.as_indexes());
}
pub fn union(self) -> Union<'m> {
Union::new(self.map_op.union(), self.indexes)
}
pub fn intersection(self) -> Intersection<'m> {
Intersection::new(self.map_op.union(), self.indexes)
}
pub fn difference(self) -> Difference<'m> {
Difference::new(self.map_op.union(), self.indexes)
}
pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
SymmetricDifference::new(self.map_op.union(), self.indexes)
}
}
macro_rules! logical_operation {
(struct $name:ident, $operation:ident) => {
pub struct $name<'m> {
stream: fst::map::Union<'m>,
indexes: Vec<&'m DocIndexes>,
outs: Vec<DocIndex>,
}
impl<'m> $name<'m> {
fn new(stream: fst::map::Union<'m>, indexes: Vec<&'m DocIndexes>) -> Self {
$name {
stream: stream,
indexes: indexes,
outs: Vec::new(),
}
}
}
impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
type Item = (&'a [u8], &'a Set<DocIndex>);
fn next(&'a mut self) -> Option<Self::Item> {
// loop {
// let (input, ivalues) = match self.stream.next() {
// Some(value) => value,
// None => return None,
// };
// self.outs.clear();
// let mut builder = SdOpBuilder::with_capacity(ivalues.len());
// for ivalue in ivalues {
// let indexes = self.indexes[ivalue.index];
// let indexes = indexes.get(ivalue.value).expect("BUG: could not find document indexes");
// let set = Set::new_unchecked(indexes);
// builder.push(set);
// }
// builder.$operation().extend_vec(&mut self.outs);
// if self.outs.is_empty() { continue }
// return Some((input, &self.outs))
// }
// FIXME make the above code compile
match self.stream.next() {
Some((input, ivalues)) => {
self.outs.clear();
let mut builder = SdOpBuilder::with_capacity(ivalues.len());
for ivalue in ivalues {
let doc_indexes = &self.indexes[ivalue.index][ivalue.value as usize];
let set = Set::new_unchecked(doc_indexes);
builder.push(set);
}
builder.$operation().extend_vec(&mut self.outs);
if self.outs.is_empty() { return None }
return Some((input, Set::new_unchecked(&self.outs)))
},
None => None
}
}
}
}}
logical_operation!(struct Union, union);
logical_operation!(struct Intersection, intersection);
logical_operation!(struct Difference, difference);
logical_operation!(struct SymmetricDifference, symmetric_difference);


@ -1,175 +0,0 @@
use std::error::Error;
use std::path::Path;
use std::ops::Deref;
use std::{fmt, marker};
use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter};
use serde::de::DeserializeOwned;
use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::database::{retrieve_data_schema, retrieve_data_index};
use crate::database::blob::positive::PositiveBlob;
use crate::database::deserializer::Deserializer;
use crate::database::schema::Schema;
use crate::rank::QueryBuilder;
use crate::DocumentId;
pub struct DatabaseView<D>
where D: Deref<Target=DB>
{
snapshot: Snapshot<D>,
blob: PositiveBlob,
schema: Schema,
}
impl<D> DatabaseView<D>
where D: Deref<Target=DB>
{
pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> {
let schema = retrieve_data_schema(&snapshot)?;
let blob = retrieve_data_index(&snapshot)?;
Ok(DatabaseView { snapshot, blob, schema })
}
pub fn schema(&self) -> &Schema {
&self.schema
}
pub fn blob(&self) -> &PositiveBlob {
&self.blob
}
pub fn into_snapshot(self) -> Snapshot<D> {
self.snapshot
}
pub fn snapshot(&self) -> &Snapshot<D> {
&self.snapshot
}
pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
Ok(self.snapshot.get(key)?)
}
pub fn dump_all<P: AsRef<Path>>(&self, path: P) -> Result<(), Box<Error>> {
let path = path.as_ref().to_string_lossy();
let env_options = EnvOptions::new();
let column_family_options = ColumnFamilyOptions::new();
let mut file_writer = SstFileWriter::new(env_options, column_family_options);
file_writer.open(&path)?;
let mut iter = self.snapshot.iter();
iter.seek(SeekKey::Start);
for (key, value) in &mut iter {
file_writer.put(&key, &value)?;
}
file_writer.finish()?;
Ok(())
}
pub fn query_builder(&self) -> Result<QueryBuilder<D>, Box<Error>> {
QueryBuilder::new(self)
}
// TODO create an enum error type
pub fn retrieve_document<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
where T: DeserializeOwned
{
let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id);
Ok(T::deserialize(&mut deserializer)?)
}
pub fn retrieve_documents<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter>
where T: DeserializeOwned,
I: IntoIterator<Item=DocumentId>,
{
DocumentIter {
database_view: self,
document_ids: ids.into_iter(),
_phantom: marker::PhantomData,
}
}
}
impl<D> fmt::Debug for DatabaseView<D>
where D: Deref<Target=DB>
{
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut options = ReadOptions::new();
let lower = DocumentKey::new(0);
options.set_iterate_lower_bound(lower.as_ref());
let mut iter = self.snapshot.iter_opt(options);
iter.seek(SeekKey::Start);
let iter = iter.map(|(key, _)| DocumentKeyAttr::from_bytes(&key));
if f.alternate() {
writeln!(f, "DatabaseView(")?;
} else {
write!(f, "DatabaseView(")?;
}
self.schema.fmt(f)?;
if f.alternate() {
writeln!(f, ",")?;
} else {
write!(f, ", ")?;
}
f.debug_list().entries(iter).finish()?;
write!(f, ")")
}
}
// TODO this is just an iter::Map !!!
pub struct DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>
{
database_view: &'a DatabaseView<D>,
document_ids: I,
_phantom: marker::PhantomData<T>,
}
impl<'a, D, T, I> Iterator for DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>,
T: DeserializeOwned,
I: Iterator<Item=DocumentId>,
{
type Item = Result<T, Box<Error>>;
fn size_hint(&self) -> (usize, Option<usize>) {
self.document_ids.size_hint()
}
fn next(&mut self) -> Option<Self::Item> {
match self.document_ids.next() {
Some(id) => Some(self.database_view.retrieve_document(id)),
None => None
}
}
}
impl<'a, D, T, I> ExactSizeIterator for DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>,
T: DeserializeOwned,
I: ExactSizeIterator + Iterator<Item=DocumentId>,
{ }
impl<'a, D, T, I> DoubleEndedIterator for DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>,
T: DeserializeOwned,
I: DoubleEndedIterator + Iterator<Item=DocumentId>,
{
fn next_back(&mut self) -> Option<Self::Item> {
match self.document_ids.next_back() {
Some(id) => Some(self.database_view.retrieve_document(id)),
None => None
}
}
}
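In practice a `DatabaseView` is obtained from `Database::view()` (defined later in this diff), and documents are read back through `retrieve_document` and `retrieve_documents`. A hedged sketch follows, assuming an already populated database and a `SimpleDoc` type deriving `Deserialize` like the one used in the `ingest_update_file` test further down; the `print_documents` helper is only for illustration.

fn print_documents(database: &Database) -> Result<(), Box<Error>> {
    let view = database.view();
    // a single document by id
    let doc: SimpleDoc = view.retrieve_document(0)?;
    println!("{:?}", doc);
    // several documents through the DocumentIter wrapper
    for doc in view.retrieve_documents::<SimpleDoc, _>(vec![0, 1]) {
        println!("{:?}", doc?);
    }
    Ok(())
}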


@@ -1,186 +0,0 @@
use std::error::Error;
use std::ops::Deref;
use std::fmt;
use rocksdb::rocksdb::{DB, Snapshot, SeekKey};
use rocksdb::rocksdb_options::ReadOptions;
use serde::forward_to_deserialize_any;
use serde::de::value::MapDeserializer;
use serde::de::{self, Visitor, IntoDeserializer};
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::database::schema::Schema;
use crate::DocumentId;
pub struct Deserializer<'a, D>
where D: Deref<Target=DB>
{
snapshot: &'a Snapshot<D>,
schema: &'a Schema,
document_id: DocumentId,
}
impl<'a, D> Deserializer<'a, D>
where D: Deref<Target=DB>
{
pub fn new(snapshot: &'a Snapshot<D>, schema: &'a Schema, doc: DocumentId) -> Self {
Deserializer { snapshot, schema, document_id: doc }
}
}
impl<'de, 'a, 'b, D> de::Deserializer<'de> for &'b mut Deserializer<'a, D>
where D: Deref<Target=DB>
{
type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.deserialize_map(visitor)
}
forward_to_deserialize_any! {
bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq
bytes byte_buf unit_struct tuple_struct
identifier tuple ignored_any option newtype_struct enum struct
}
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
let mut options = ReadOptions::new();
let lower = DocumentKey::new(self.document_id);
let upper = lower.with_attribute_max();
options.set_iterate_lower_bound(lower.as_ref());
options.set_iterate_upper_bound(upper.as_ref());
let mut iter = self.snapshot.iter_opt(options);
iter.seek(SeekKey::Start);
if iter.kv().is_none() {
// FIXME return an error
}
let iter = iter.map(|(key, value)| {
// retrieve the schema attribute name
// from the schema attribute number
let document_key_attr = DocumentKeyAttr::from_bytes(&key);
let schema_attr = document_key_attr.attribute();
let attribute_name = self.schema.attribute_name(schema_attr);
(attribute_name, Value(value))
});
let map_deserializer = MapDeserializer::new(iter);
visitor.visit_map(map_deserializer)
}
}
struct Value(Vec<u8>);
impl<'de> IntoDeserializer<'de, DeserializerError> for Value {
type Deserializer = Self;
fn into_deserializer(self) -> Self::Deserializer {
self
}
}
macro_rules! forward_to_bincode_values {
($($ty:ident => $de_method:ident,)*) => {
$(
fn $de_method<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
match bincode::deserialize::<$ty>(&self.0) {
Ok(val) => val.into_deserializer().$de_method(visitor),
Err(e) => Err(de::Error::custom(e)),
}
}
)*
}
}
impl<'de, 'a> de::Deserializer<'de> for Value {
type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.0.into_deserializer().deserialize_any(visitor)
}
fn deserialize_str<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.deserialize_string(visitor)
}
fn deserialize_string<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
match bincode::deserialize::<String>(&self.0) {
Ok(val) => val.into_deserializer().deserialize_string(visitor),
Err(e) => Err(de::Error::custom(e)),
}
}
fn deserialize_bytes<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.deserialize_byte_buf(visitor)
}
fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
match bincode::deserialize::<Vec<u8>>(&self.0) {
Ok(val) => val.into_deserializer().deserialize_byte_buf(visitor),
Err(e) => Err(de::Error::custom(e)),
}
}
forward_to_bincode_values! {
char => deserialize_char,
bool => deserialize_bool,
u8 => deserialize_u8,
u16 => deserialize_u16,
u32 => deserialize_u32,
u64 => deserialize_u64,
i8 => deserialize_i8,
i16 => deserialize_i16,
i32 => deserialize_i32,
i64 => deserialize_i64,
f32 => deserialize_f32,
f64 => deserialize_f64,
}
forward_to_deserialize_any! {
unit seq map
unit_struct tuple_struct
identifier tuple ignored_any option newtype_struct enum struct
}
}
#[derive(Debug)]
pub enum DeserializerError {
Custom(String),
}
impl de::Error for DeserializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
DeserializerError::Custom(msg.to_string())
}
}
impl fmt::Display for DeserializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DeserializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for DeserializerError {}
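The `Value` wrapper above only re-decodes raw bincode payloads: every stored attribute value is a bincode-encoded primitive that is deserialized again on demand. A small sketch of the round trip it relies on (plain bincode calls, outside the deserializer; the helper name is only for illustration):

fn bincode_roundtrip() -> bincode::Result<()> {
    // what gets written for a stored string attribute...
    let stored: Vec<u8> = bincode::serialize(&String::from("I am a title"))?;
    // ...and what the Value deserializer reads back from it
    let title: String = bincode::deserialize(&stored)?;
    assert_eq!(title, "I am a title");
    Ok(())
}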


@@ -1,118 +0,0 @@
use std::io::{Cursor, Read, Write};
use std::mem::size_of;
use std::fmt;
use byteorder::{NativeEndian, WriteBytesExt, ReadBytesExt};
use crate::database::schema::SchemaAttr;
use crate::DocumentId;
const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u32>();
#[derive(Copy, Clone)]
pub struct DocumentKey([u8; DOC_KEY_LEN]);
impl DocumentKey {
pub fn new(id: DocumentId) -> DocumentKey {
let mut buffer = [0; DOC_KEY_LEN];
let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(b"doc-").unwrap();
wtr.write_u64::<NativeEndian>(id).unwrap();
DocumentKey(buffer)
}
pub fn from_bytes(mut bytes: &[u8]) -> DocumentKey {
assert!(bytes.len() >= DOC_KEY_LEN);
assert_eq!(&bytes[..4], b"doc-");
let mut buffer = [0; DOC_KEY_LEN];
bytes.read_exact(&mut buffer).unwrap();
DocumentKey(buffer)
}
pub fn with_attribute(&self, attr: SchemaAttr) -> DocumentKeyAttr {
DocumentKeyAttr::new(self.document_id(), attr)
}
pub fn with_attribute_max(&self) -> DocumentKeyAttr {
DocumentKeyAttr::new(self.document_id(), SchemaAttr::max())
}
pub fn document_id(&self) -> DocumentId {
(&self.0[4..]).read_u64::<NativeEndian>().unwrap()
}
}
impl AsRef<[u8]> for DocumentKey {
fn as_ref(&self) -> &[u8] {
&self.0
}
}
impl fmt::Debug for DocumentKey {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("DocumentKey")
.field("document_id", &self.document_id())
.finish()
}
}
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct DocumentKeyAttr([u8; DOC_KEY_ATTR_LEN]);
impl DocumentKeyAttr {
pub fn new(id: DocumentId, attr: SchemaAttr) -> DocumentKeyAttr {
let mut buffer = [0; DOC_KEY_ATTR_LEN];
let DocumentKey(raw_key) = DocumentKey::new(id);
let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(&raw_key).unwrap();
wtr.write_all(b"-").unwrap();
wtr.write_u32::<NativeEndian>(attr.as_u32()).unwrap();
DocumentKeyAttr(buffer)
}
pub fn from_bytes(mut bytes: &[u8]) -> DocumentKeyAttr {
assert!(bytes.len() >= DOC_KEY_ATTR_LEN);
assert_eq!(&bytes[..4], b"doc-");
let mut buffer = [0; DOC_KEY_ATTR_LEN];
bytes.read_exact(&mut buffer).unwrap();
DocumentKeyAttr(buffer)
}
pub fn document_id(&self) -> DocumentId {
(&self.0[4..]).read_u64::<NativeEndian>().unwrap()
}
pub fn attribute(&self) -> SchemaAttr {
let offset = 4 + size_of::<u64>() + 1;
let value = (&self.0[offset..]).read_u32::<NativeEndian>().unwrap();
SchemaAttr::new(value)
}
pub fn into_document_key(self) -> DocumentKey {
DocumentKey::new(self.document_id())
}
}
impl AsRef<[u8]> for DocumentKeyAttr {
fn as_ref(&self) -> &[u8] {
&self.0
}
}
impl fmt::Debug for DocumentKeyAttr {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("DocumentKeyAttr")
.field("document_id", &self.document_id())
.field("attribute", &self.attribute().as_u32())
.finish()
}
}
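Both key types encode a fixed byte layout: `"doc-"` followed by a native-endian `u64` document id, and for `DocumentKeyAttr` an extra `"-"` plus a native-endian `u32` attribute number. A short sketch using only the constructors defined above (`key_layout_example` is just an illustrative name):

fn key_layout_example() {
    let key = DocumentKey::new(42);
    let bytes: &[u8] = key.as_ref();
    assert_eq!(bytes.len(), 4 + 8); // "doc-" + native-endian u64 id
    assert_eq!(key.document_id(), 42);

    let key_attr = key.with_attribute(SchemaAttr::new(1));
    let bytes: &[u8] = key_attr.as_ref();
    assert_eq!(bytes.len(), 4 + 8 + 1 + 4); // + "-" + native-endian u32 attribute
    assert_eq!(key_attr.document_id(), 42);
    assert_eq!(key_attr.attribute().as_u32(), 1);
}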


@@ -1,259 +0,0 @@
use std::sync::{Arc, Mutex, RwLock, RwLockReadGuard};
use std::error::Error;
use std::path::Path;
use std::ops::Deref;
use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{Writable, Snapshot};
use rocksdb::{DB, DBVector, MergeOperands};
pub use self::document_key::{DocumentKey, DocumentKeyAttr};
pub use self::database_view::{DatabaseView, DocumentIter};
use self::blob::positive::PositiveBlob;
use self::update::Update;
use self::schema::Schema;
use self::blob::Blob;
pub mod blob;
pub mod schema;
pub mod update;
mod document_key;
mod database_view;
mod deserializer;
const DATA_INDEX: &[u8] = b"data-index";
const DATA_SCHEMA: &[u8] = b"data-schema";
pub fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
where D: Deref<Target=DB>
{
match snapshot.get(DATA_SCHEMA)? {
Some(vector) => Ok(Schema::read_from(&*vector)?),
None => Err(String::from("BUG: no schema found in the database").into()),
}
}
pub fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<PositiveBlob, Box<Error>>
where D: Deref<Target=DB>
{
match snapshot.get(DATA_INDEX)? {
Some(vector) => Ok(bincode::deserialize(&*vector)?),
None => Ok(PositiveBlob::default()),
}
}
pub struct Database {
// The DB is under a Mutex to synchronize update ingestions and to separate
// DB update locking from DatabaseView acquisition; in other words:
// "block readers for the minimum possible amount of time"
db: Mutex<Arc<DB>>,
// This view is updated each time the DB ingests an update
view: RwLock<DatabaseView<Arc<DB>>>,
}
impl Database {
pub fn create<P: AsRef<Path>>(path: P, schema: Schema) -> Result<Database, Box<Error>> {
let path = path.as_ref();
if path.exists() {
return Err(format!("File already exists at path: {}, cannot create database.",
path.display()).into())
}
let path = path.to_string_lossy();
let mut opts = DBOptions::new();
opts.create_if_missing(true);
// opts.error_if_exists(true); // FIXME pull request that
let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
let mut schema_bytes = Vec::new();
schema.write_to(&mut schema_bytes)?;
db.put(DATA_SCHEMA, &schema_bytes)?;
let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone());
let view = RwLock::new(DatabaseView::new(snapshot)?);
Ok(Database { db: Mutex::new(db), view })
}
pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
let path = path.as_ref().to_string_lossy();
let mut opts = DBOptions::new();
opts.create_if_missing(false);
let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
// FIXME create a generic function to do that !
let _schema = match db.get(DATA_SCHEMA)? {
Some(value) => Schema::read_from(&*value)?,
None => return Err(String::from("Database does not contain a schema").into()),
};
let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone());
let view = RwLock::new(DatabaseView::new(snapshot)?);
Ok(Database { db: Mutex::new(db), view })
}
pub fn ingest_update_file(&self, update: Update) -> Result<(), Box<Error>> {
let snapshot = {
// We must hold the mutex here to ensure that update ingestions and compactions
// are done atomically and in the right order.
// This way an update ingestion blocks other update ingestions without blocking
// view creations while the "data-index" compaction is running.
let db = match self.db.lock() {
Ok(db) => db,
Err(e) => return Err(e.to_string().into()),
};
let move_update = update.can_be_moved();
let path = update.into_path_buf();
let path = path.to_string_lossy();
let mut options = IngestExternalFileOptions::new();
options.move_files(move_update);
let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
db.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;
// Compact now so the merge operator is triggered only once,
// while ingesting the update, instead of on every search
db.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
Snapshot::new(db.clone())
};
// Here we will block the view creation for the minimum amount of time:
// updating the DatabaseView itself with the new database snapshot
let view = DatabaseView::new(snapshot)?;
match self.view.write() {
Ok(mut lock) => *lock = view,
Err(e) => return Err(e.to_string().into()),
}
Ok(())
}
pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
self.view().get(key)
}
pub fn flush(&self) -> Result<(), Box<Error>> {
match self.db.lock() {
Ok(db) => Ok(db.flush(true)?),
Err(e) => Err(e.to_string().into()),
}
}
pub fn view(&self) -> RwLockReadGuard<DatabaseView<Arc<DB>>> {
self.view.read().unwrap()
}
}
fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
if key != DATA_INDEX {
panic!("The merge operator only supports \"data-index\" merging")
}
let capacity = {
let remaining = operands.size_hint().0;
let already_exist = usize::from(existing_value.is_some());
remaining + already_exist
};
let mut op = blob::OpBuilder::with_capacity(capacity);
if let Some(existing_value) = existing_value {
let blob = bincode::deserialize(existing_value).expect("BUG: could not deserialize data-index");
op.push(Blob::Positive(blob));
}
for bytes in operands {
let blob = bincode::deserialize(bytes).expect("BUG: could not deserialize blob");
op.push(blob);
}
let blob = op.merge().expect("BUG: could not merge blobs");
bincode::serialize(&blob).expect("BUG: could not serialize merged blob")
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
use serde_derive::{Serialize, Deserialize};
use tempfile::tempdir;
use crate::tokenizer::DefaultBuilder;
use crate::database::update::PositiveUpdateBuilder;
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
#[test]
fn ingest_update_file() -> Result<(), Box<Error>> {
let dir = tempdir()?;
let rocksdb_path = dir.path().join("rocksdb.rdb");
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct SimpleDoc {
title: String,
description: String,
timestamp: u64,
}
let schema = {
let mut builder = SchemaBuilder::new();
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED);
builder.build()
};
let database = Database::create(&rocksdb_path, schema.clone())?;
let tokenizer_builder = DefaultBuilder::new();
let update_path = dir.path().join("update.sst");
let doc0 = SimpleDoc {
title: String::from("I am a title"),
description: String::from("I am a description"),
timestamp: 1234567,
};
let doc1 = SimpleDoc {
title: String::from("I am the second title"),
description: String::from("I am the second description"),
timestamp: 7654321,
};
let mut update = {
let mut builder = PositiveUpdateBuilder::new(update_path, schema, tokenizer_builder);
builder.update(0, &doc0).unwrap();
builder.update(1, &doc1).unwrap();
builder.build()?
};
update.set_move(true);
database.ingest_update_file(update)?;
let view = database.view();
let de_doc0: SimpleDoc = view.retrieve_document(0)?;
let de_doc1: SimpleDoc = view.retrieve_document(1)?;
assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1);
Ok(dir.close()?)
}
}


@@ -1,172 +0,0 @@
use std::collections::{HashMap, BTreeMap};
use std::io::{Read, Write};
use std::{fmt, u32};
use std::path::Path;
use std::ops::BitOr;
use std::sync::Arc;
use std::fs::File;
use serde_derive::{Serialize, Deserialize};
use linked_hash_map::LinkedHashMap;
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false };
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true };
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps {
stored: bool,
indexed: bool,
}
impl SchemaProps {
pub fn is_stored(&self) -> bool {
self.stored
}
pub fn is_indexed(&self) -> bool {
self.indexed
}
}
impl BitOr for SchemaProps {
type Output = Self;
fn bitor(self, other: Self) -> Self::Output {
SchemaProps {
stored: self.stored | other.stored,
indexed: self.indexed | other.indexed,
}
}
}
pub struct SchemaBuilder {
attrs: LinkedHashMap<String, SchemaProps>,
}
impl SchemaBuilder {
pub fn new() -> SchemaBuilder {
SchemaBuilder { attrs: LinkedHashMap::new() }
}
pub fn new_attribute<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr {
let len = self.attrs.len();
if self.attrs.insert(name.into(), props).is_some() {
panic!("Field already inserted.")
}
SchemaAttr(len as u32)
}
pub fn build(self) -> Schema {
let mut attrs = HashMap::new();
let mut props = Vec::new();
for (i, (name, prop)) in self.attrs.into_iter().enumerate() {
attrs.insert(name.clone(), SchemaAttr(i as u32));
props.push((name, prop));
}
Schema { inner: Arc::new(InnerSchema { attrs, props }) }
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Schema {
inner: Arc<InnerSchema>,
}
#[derive(Debug, Clone, PartialEq, Eq)]
struct InnerSchema {
attrs: HashMap<String, SchemaAttr>,
props: Vec<(String, SchemaProps)>,
}
impl Schema {
pub fn open<P: AsRef<Path>>(path: P) -> bincode::Result<Schema> {
let file = File::open(path)?;
Schema::read_from(file)
}
pub fn read_from<R: Read>(reader: R) -> bincode::Result<Schema> {
let attrs = bincode::deserialize_from(reader)?;
let builder = SchemaBuilder { attrs };
Ok(builder.build())
}
pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> {
let mut ordered = BTreeMap::new();
for (name, field) in &self.inner.attrs {
let index = field.as_u32();
let (_, props) = self.inner.props[index as usize];
ordered.insert(index, (name, props));
}
let mut attrs = LinkedHashMap::with_capacity(ordered.len());
for (_, (name, props)) in ordered {
attrs.insert(name, props);
}
bincode::serialize_into(writer, &attrs)
}
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
let index = attr.as_u32();
let (_, props) = self.inner.props[index as usize];
props
}
pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> {
self.inner.attrs.get(name.as_ref()).cloned()
}
pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
let index = attr.as_u32();
let (name, _) = &self.inner.props[index as usize];
name
}
}
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
pub struct SchemaAttr(u32);
impl SchemaAttr {
pub fn new(value: u32) -> SchemaAttr {
SchemaAttr(value)
}
pub fn max() -> SchemaAttr {
SchemaAttr(u32::MAX)
}
pub fn as_u32(&self) -> u32 {
self.0
}
}
impl fmt::Display for SchemaAttr {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.0.fmt(f)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn serialize_deserialize() -> bincode::Result<()> {
let mut builder = SchemaBuilder::new();
builder.new_attribute("alphabet", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut buffer = Vec::new();
schema.write_to(&mut buffer)?;
let schema2 = Schema::read_from(buffer.as_slice())?;
assert_eq!(schema, schema2);
Ok(())
}
}
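Putting the schema API together: attributes are declared in order, each declaration returns an increasing `SchemaAttr`, and properties combine with `|`. A brief sketch mirroring the test above (`schema_example` is only an illustrative name):

fn schema_example() {
    let mut builder = SchemaBuilder::new();
    let title = builder.new_attribute("title", STORED | INDEXED);
    let timestamp = builder.new_attribute("timestamp", STORED);
    let schema = builder.build();

    // attribute numbers follow declaration order
    assert_eq!(schema.attribute("title"), Some(title));
    assert_eq!(schema.attribute_name(timestamp), "timestamp");
    assert!(schema.props(title).is_indexed());
    assert!(!schema.props(timestamp).is_indexed());
}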


@@ -1,35 +0,0 @@
use std::path::PathBuf;
use std::error::Error;
mod negative;
mod positive;
pub use self::positive::{PositiveUpdateBuilder, NewState};
pub use self::negative::NegativeUpdateBuilder;
pub struct Update {
path: PathBuf,
can_be_moved: bool,
}
impl Update {
pub fn open<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
Ok(Update { path: path.into(), can_be_moved: false })
}
pub fn open_and_move<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
Ok(Update { path: path.into(), can_be_moved: true })
}
pub fn set_move(&mut self, can_be_moved: bool) {
self.can_be_moved = can_be_moved
}
pub fn can_be_moved(&self) -> bool {
self.can_be_moved
}
pub fn into_path_buf(self) -> PathBuf {
self.path
}
}


@@ -1,4 +0,0 @@
mod update;
mod unordered_builder;
pub use self::update::NegativeUpdateBuilder;


@@ -1,37 +0,0 @@
use std::collections::BTreeSet;
use std::io;
use byteorder::{NativeEndian, WriteBytesExt};
use crate::DocumentId;
pub struct UnorderedNegativeBlobBuilder<W> {
doc_ids: BTreeSet<DocumentId>, // TODO: prefer a linked-list
wrt: W,
}
impl UnorderedNegativeBlobBuilder<Vec<u8>> {
pub fn memory() -> Self {
UnorderedNegativeBlobBuilder::new(Vec::new())
}
}
impl<W: io::Write> UnorderedNegativeBlobBuilder<W> {
pub fn new(wrt: W) -> Self {
Self {
doc_ids: BTreeSet::new(),
wrt: wrt,
}
}
pub fn insert(&mut self, doc: DocumentId) -> bool {
self.doc_ids.insert(doc)
}
pub fn into_inner(mut self) -> io::Result<W> {
for id in self.doc_ids {
self.wrt.write_u64::<NativeEndian>(id)?;
}
Ok(self.wrt)
}
}
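The builder above simply accumulates document ids in a `BTreeSet`, so they are written out sorted and deduplicated as native-endian `u64`s. A small sketch (`negative_blob_bytes` is only an illustrative name):

fn negative_blob_bytes() -> std::io::Result<()> {
    let mut builder = UnorderedNegativeBlobBuilder::memory();
    builder.insert(2);
    builder.insert(1);
    builder.insert(2); // duplicates are collapsed by the BTreeSet
    let bytes = builder.into_inner()?;
    // two ids remain, each written as a native-endian u64
    assert_eq!(bytes.len(), 2 * std::mem::size_of::<u64>());
    Ok(())
}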


@@ -1,60 +0,0 @@
use std::path::PathBuf;
use std::error::Error;
use ::rocksdb::rocksdb_options;
use crate::database::update::negative::unordered_builder::UnorderedNegativeBlobBuilder;
use crate::database::blob::{Blob, NegativeBlob};
use crate::database::update::Update;
use crate::database::DocumentKey;
use crate::database::DATA_INDEX;
use crate::DocumentId;
pub struct NegativeUpdateBuilder {
path: PathBuf,
doc_ids: UnorderedNegativeBlobBuilder<Vec<u8>>,
}
impl NegativeUpdateBuilder {
pub fn new<P: Into<PathBuf>>(path: P) -> NegativeUpdateBuilder {
NegativeUpdateBuilder {
path: path.into(),
doc_ids: UnorderedNegativeBlobBuilder::memory(),
}
}
pub fn remove(&mut self, id: DocumentId) -> bool {
self.doc_ids.insert(id)
}
pub fn build(self) -> Result<Update, Box<Error>> {
let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.path.to_string_lossy())?;
let bytes = self.doc_ids.into_inner()?;
let negative_blob = NegativeBlob::from_bytes(bytes)?;
let blob = Blob::Negative(negative_blob);
// write the data-index aka negative blob
let bytes = bincode::serialize(&blob)?;
file_writer.merge(DATA_INDEX, &bytes)?;
// FIXME remove this ugly thing !
// let Blob::Negative(negative_blob) = blob;
let negative_blob = match blob {
Blob::Negative(blob) => blob,
Blob::Positive(_) => unreachable!(),
};
for &document_id in negative_blob.as_ref().as_slice() {
let start = DocumentKey::new(document_id);
let end = start.with_attribute_max();
file_writer.delete_range(start.as_ref(), end.as_ref())?;
}
file_writer.finish()?;
Update::open(self.path)
}
}
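Deleting documents mirrors the insertion path: collect the ids to remove, build an SST update file, then ingest it through the database. A hedged sketch, assuming an existing `Database`; the `remove.sst` file name and the `remove_documents` helper are only for illustration.

fn remove_documents(database: &Database, dir: &std::path::Path) -> Result<(), Box<Error>> {
    let update_path = dir.join("remove.sst");
    let mut builder = NegativeUpdateBuilder::new(update_path);
    builder.remove(0);
    builder.remove(1);
    let update = builder.build()?;
    // ingestion triggers the merge operator, which subtracts these ids from the index
    database.ingest_update_file(update)?;
    Ok(())
}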


@@ -1,4 +0,0 @@
mod update;
mod unordered_builder;
pub use self::update::{PositiveUpdateBuilder, NewState};


@@ -1,49 +0,0 @@
#![allow(unused)]
use std::collections::BTreeMap;
use std::error::Error;
use std::io::Write;
use sdset::Set;
use crate::database::blob::positive::PositiveBlobBuilder;
use crate::DocIndex;
pub struct UnorderedPositiveBlobBuilder<W, X> {
builder: PositiveBlobBuilder<W, X>,
map: BTreeMap<Vec<u8>, Vec<DocIndex>>,
}
impl UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>> {
pub fn memory() -> Self {
Self {
builder: PositiveBlobBuilder::memory(),
map: BTreeMap::new(),
}
}
}
impl<W: Write, X: Write> UnorderedPositiveBlobBuilder<W, X> {
pub fn new(map_wtr: W, doc_wtr: X) -> Result<Self, Box<Error>> {
Ok(UnorderedPositiveBlobBuilder {
builder: PositiveBlobBuilder::new(map_wtr, doc_wtr)?,
map: BTreeMap::new(),
})
}
pub fn insert<K: Into<Vec<u8>>>(&mut self, input: K, doc_index: DocIndex) {
self.map.entry(input.into()).or_insert_with(Vec::new).push(doc_index);
}
pub fn finish(self) -> Result<(), Box<Error>> {
self.into_inner().map(drop)
}
pub fn into_inner(mut self) -> Result<(W, X), Box<Error>> {
for (key, mut doc_indexes) in self.map {
doc_indexes.sort_unstable();
self.builder.insert(&key, Set::new_unchecked(&doc_indexes))?;
}
self.builder.into_inner()
}
}
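Its positive counterpart buffers word-to-`DocIndex` pairs in a `BTreeMap`, sorts each per-word list, then hands everything to the ordered `PositiveBlobBuilder`. A hedged sketch, assuming the same imports as the file above; the sample word and indexes are only for illustration.

fn positive_blob_sketch() -> Result<(), Box<Error>> {
    let mut builder = UnorderedPositiveBlobBuilder::memory();
    // the word "title" seen at two positions of attribute 0 in document 1
    builder.insert("title", DocIndex { document_id: 1, attribute: 0, attribute_index: 0 });
    builder.insert("title", DocIndex { document_id: 1, attribute: 0, attribute_index: 1 });
    // per-word lists are sorted here and pushed into the ordered builder
    let (_words_map, _doc_indexes) = builder.into_inner()?;
    // these two byte buffers are what PositiveBlob::from_bytes consumes (see the positive update below)
    Ok(())
}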


@@ -1,514 +0,0 @@
use std::collections::BTreeMap;
use std::path::PathBuf;
use std::error::Error;
use std::fmt;
use ::rocksdb::rocksdb_options;
use serde::ser::{self, Serialize};
use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
use crate::database::blob::positive::PositiveBlob;
use crate::database::schema::{Schema, SchemaAttr};
use crate::tokenizer::TokenizerBuilder;
use crate::database::DocumentKeyAttr;
use crate::database::update::Update;
use crate::{DocumentId, DocIndex};
use crate::database::DATA_INDEX;
use crate::database::blob::Blob;
pub enum NewState {
Updated { value: Vec<u8> },
Removed,
}
pub struct PositiveUpdateBuilder<B> {
path: PathBuf,
schema: Schema,
tokenizer_builder: B,
builder: UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: BTreeMap<DocumentKeyAttr, NewState>,
}
impl<B> PositiveUpdateBuilder<B> {
pub fn new<P: Into<PathBuf>>(path: P, schema: Schema, tokenizer_builder: B) -> PositiveUpdateBuilder<B> {
PositiveUpdateBuilder {
path: path.into(),
schema: schema,
tokenizer_builder: tokenizer_builder,
builder: UnorderedPositiveBlobBuilder::memory(),
new_states: BTreeMap::new(),
}
}
pub fn update<T: Serialize>(&mut self, id: DocumentId, document: &T) -> Result<(), Box<Error>>
where B: TokenizerBuilder
{
let serializer = Serializer {
schema: &self.schema,
document_id: id,
tokenizer_builder: &self.tokenizer_builder,
builder: &mut self.builder,
new_states: &mut self.new_states
};
Ok(ser::Serialize::serialize(document, serializer)?)
}
// TODO value must be a field that can be indexed
pub fn update_field(&mut self, id: DocumentId, attr: SchemaAttr, value: String) {
let value = bincode::serialize(&value).unwrap();
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Updated { value });
}
pub fn remove_field(&mut self, id: DocumentId, attr: SchemaAttr) {
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Removed);
}
}
#[derive(Debug)]
pub enum SerializerError {
SchemaDontMatch { attribute: String },
UnserializableType { name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::SchemaDontMatch { attribute } => {
write!(f, "serialized document try to specify the \
{:?} attribute that is not known by the schema", attribute)
},
SerializerError::UnserializableType { name } => {
write!(f, "Only struct and map types are considered valid documents and
can be serialized, not {} types directly.", name)
},
SerializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for SerializerError {}
struct Serializer<'a, B> {
schema: &'a Schema,
tokenizer_builder: &'a B,
document_id: DocumentId,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
}
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
// stringify!($ty) yields the type name; a plain "$ty" literal would not be substituted
Err(SerializerError::UnserializableType { name: stringify!($ty) })
}
)*
}
}
impl<'a, B> ser::Serializer for Serializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = StructSerializer<'a, B>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
// Ok(MapSerializer {
// schema: self.schema,
// document_id: self.document_id,
// new_states: self.new_states,
// })
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
builder: self.builder,
new_states: self.new_states,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
struct StructSerializer<'a, B> {
schema: &'a Schema,
tokenizer_builder: &'a B,
document_id: DocumentId,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
}
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
match self.schema.attribute(key) {
Some(attr) => {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
let key = DocumentKeyAttr::new(self.document_id, attr);
self.new_states.insert(key, NewState::Updated { value });
}
if props.is_indexed() {
let serializer = IndexerSerializer {
builder: self.builder,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
};
value.serialize(serializer)?;
}
Ok(())
},
None => Err(SerializerError::SchemaDontMatch { attribute: key.to_owned() }),
}
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
struct IndexerSerializer<'a, B> {
tokenizer_builder: &'a B,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
document_id: DocumentId,
attribute: SchemaAttr,
}
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for (index, word) in self.tokenizer_builder.build(v) {
let doc_index = DocIndex {
document_id: self.document_id,
attribute: self.attribute.as_u32() as u8,
attribute_index: index as u32,
};
// index the lowercased representation of the word
let word_lower = word.to_lowercase();
// and also its unidecoded lowercased version when it differs
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
self.builder.insert(word_unidecoded, doc_index);
}
self.builder.insert(word_lower, doc_index);
}
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "seq" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
impl<B> PositiveUpdateBuilder<B> {
pub fn build(self) -> Result<Update, Box<Error>> {
let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.path.to_string_lossy())?;
let (blob_fst_map, blob_doc_idx) = self.builder.into_inner()?;
let positive_blob = PositiveBlob::from_bytes(blob_fst_map, blob_doc_idx)?;
let blob = Blob::Positive(positive_blob);
// write the data-index aka positive blob
let bytes = bincode::serialize(&blob)?;
file_writer.merge(DATA_INDEX, &bytes)?;
// write all the documents fields updates
for (key, state) in self.new_states {
match state {
NewState::Updated { value } => {
file_writer.put(key.as_ref(), &value)?
},
NewState::Removed => file_writer.delete(key.as_ref())?,
}
}
file_writer.finish()?;
Update::open(self.path)
}
}


@@ -1,99 +0,0 @@
pub mod automaton;
pub mod database;
pub mod data;
pub mod rank;
pub mod tokenizer;
pub mod vec_read_only;
mod common_words;
pub use rocksdb;
pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;
pub type DocumentId = u64;
/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute identifier in the document
/// where the word was found.
///
/// This is a `u8`, therefore a document
/// cannot have more than `2^8` attributes.
pub attribute: u8,
/// The index where the word was found in the attribute.
///
/// Only the first 1000 words are indexed.
pub attribute_index: u32,
}
/// This structure represents a matching word with information
/// on the location of the word in the document.
///
/// The order of the fields is important because it defines
/// the way these structures are ordered between themselves.
///
/// The word itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct Match {
/// The word index in the query sentence.
/// Same as the `attribute_index` but for the query words.
///
/// Used to retrieve the automaton that matches this word.
pub query_index: u32,
/// The distance between the word and the query word
/// (i.e. the Levenshtein distance).
pub distance: u8,
/// The attribute in which the word is located
/// (i.e. Title is 0, Description is 1).
///
/// This is a `u8`, therefore a document
/// cannot have more than `2^8` attributes.
pub attribute: u8,
/// Where this word is located in the attribute string
/// (i.e. at the start or the end of the attribute).
///
/// The index in the attribute is limited to a maximum of `2^32`
/// because we index only the first 1000 words
/// of an attribute.
pub attribute_index: u32,
/// Whether the matching word is an exact match or a prefix.
pub is_exact: bool,
}
impl Match {
pub fn zero() -> Self {
Match {
query_index: 0,
distance: 0,
attribute: 0,
attribute_index: 0,
is_exact: false,
}
}
pub fn max() -> Self {
Match {
query_index: u32::max_value(),
distance: u8::max_value(),
attribute: u8::max_value(),
attribute_index: u32::max_value(),
is_exact: true,
}
}
}
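Because `Ord` is derived, `Match` values compare field by field in declaration order: `query_index` first, then `distance`, then `attribute`, and so on, which is exactly why the field order matters. A small illustration (`match_ordering_example` is only an illustrative name):

fn match_ordering_example() {
    // with equal query_index, the lower Levenshtein distance sorts first
    let close = Match { distance: 0, ..Match::zero() };
    let far = Match { distance: 2, ..Match::zero() };
    assert!(close < far);

    // but query_index is compared before distance, so it dominates
    let later_word = Match { query_index: 1, distance: 0, ..Match::zero() };
    assert!(far < later_word);
}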

Some files were not shown because too many files have changed in this diff.