Compare commits


134 Commits
v0.1 ... v0.3.0

SHA1 Message Date
c615c31016 Merge pull request #101 from Kerollmops/version-bump
Bump version to 0.3.0
2019-02-07 15:26:38 +01:00
908b28790b chore: Bump version to 0.3.0 2019-02-07 14:51:39 +01:00
4c0279729b Merge pull request #100 from qdequele/master
Allow users to manage multiple database indexes
2019-02-07 14:49:52 +01:00
96dfac5b33 feat: Allow users to manage multiple database indexes 2019-02-07 13:05:55 +01:00
8576218b51 Merge pull request #99 from Kerollmops/simplify-transactional-update
Remove the lifetime restriction for Database Updates
2019-02-06 18:19:45 +01:00
1c1f9201b8 feat: Remove the lifetime restriction for Database Updates 2019-02-06 18:03:41 +01:00
4398b88a3a Merge pull request #98 from Kerollmops/updates-with-transactions
Change updates to be handled using the RocksDB WriteBatch feature
2019-02-06 16:13:47 +01:00
73e79f5ca4 chore: Make travis build with Rust 1.32 2019-02-06 15:58:48 +01:00
1bfd51d6e9 feat: Change updates to be handled using the RocksDB WriteBatch feature 2019-02-06 15:58:47 +01:00
0d2daf27f2 Merge pull request #97 from Kerollmops/remove-hashbrown-stop-words
Remove the hashbrown dependency for library users
2019-02-03 17:31:08 +01:00
87f0d8cf3c feat: Remove the hashbrown dependency for library users 2019-02-03 12:22:50 +01:00
06d5a10902 Merge pull request #96 from Kerollmops/chore
Make some little changes
2019-02-03 11:55:06 +01:00
94b89c5439 chore: Make the Document from_raw method private 2019-02-03 11:24:44 +01:00
c5e951be09 chore: Move the deseserializer into the serde module 2019-02-03 11:24:44 +01:00
66ae5c8161 chore: Clarify some QueryBuilder comments 2019-02-03 11:24:44 +01:00
8438e2202f Merge pull request #95 from Kerollmops/fix-querybuilder-with-criteria
Make the QueryBuilder with_criteria use FilterFunc
2019-02-03 11:24:17 +01:00
7a6166d229 feat: Make the QueryBuilder with_criteria use FilterFunc 2019-02-03 10:55:16 +01:00
d46fa4b215 Merge pull request #94 from Kerollmops/data-oriented
Introduce Data Oriented design into the search algorithm
2019-02-02 15:40:10 +01:00
2bd5b4ab86 feat: Remove useless WordsProximity criterion benchmark 2019-02-02 15:12:54 +01:00
5efbc5ceb3 feat: Introduce the revisited SortBy criterion 2019-02-02 14:42:12 +01:00
2e905bac08 chore: Remove Attribute and WordArea structures 2019-02-02 14:40:15 +01:00
4c0ad5f964 feat: Simplify the Criterion Trait by removing the DatabaseView param 2019-02-02 14:40:15 +01:00
455cbf3bf4 feat: Make the search algorithm become fully data oriented 2019-02-02 14:40:14 +01:00
a3a28c56fa feat: Replace compressed Match fields by uncompressed ones 2019-02-02 14:40:14 +01:00
b0b3175641 Merge pull request #93 from Kerollmops/slice-group-by
Use the GroupBy/Mut Traits of the slice-group-by library
2019-01-30 17:52:27 +01:00
c2f0df3f73 feat: Use the GroupBy/Mut Traits of the slice-group-by library 2019-01-30 16:54:52 +01:00
820f1f9ac6 Merge pull request #91 from Kerollmops/warn-reused-document-id
Emit warnings when a document id is reused
2019-01-28 21:05:42 +01:00
337aee5b65 chore: Emit warnings when a document id is reused 2019-01-28 16:11:55 +01:00
810dfdf656 Merge pull request #90 from Kerollmops/version-bump
Bump version to 0.2.1
2019-01-25 17:08:53 +01:00
f016652fca chore: Bump version to 0.2.1 2019-01-25 16:41:08 +01:00
6c99ebe3fa Merge pull request #89 from Kerollmops/no-more-compaction
Remove the manual compaction triggering
2019-01-25 16:40:08 +01:00
94d357985f feat: Remove the manual compaction triggering 2019-01-25 16:05:56 +01:00
fbc698567a Merge pull request #87 from Kerollmops/measure-index-loading
Display index loading times
2019-01-24 14:07:11 +01:00
aa9db14c09 chore: Display index loading times 2019-01-23 11:19:44 +01:00
61e83a1c21 Merge pull request #86 from Kerollmops/measure-indexation
Display timings of indexation operations
2019-01-16 13:32:44 +01:00
1316be5b09 chore: Display timings of indexation operations 2019-01-16 11:45:33 +01:00
4e8b0383dd Merge pull request #85 from Kerollmops/debug-more-stats
Display more stats infos
2019-01-15 14:20:28 +01:00
4fa10753c1 chore: Display more stats infos 2019-01-14 21:18:46 +01:00
2473e289e8 Merge pull request #84 from qdequele/create-server-example
Example HTTP server example can use stopwords
2019-01-14 18:55:58 +01:00
e0e5e87ed3 feat: HTTP server example can use stopwords 2019-01-14 18:21:58 +01:00
b13e61f40a Merge pull request #83 from qdequele/create-server-example
Create an example of HTTP server managing multiple databases
2019-01-14 14:35:33 +01:00
c023cb3065 feat: Create an example for HTTP server managing multiple databases 2019-01-14 13:39:54 +01:00
0a3d069fbc Merge pull request #79 from qdequele/master
Schema can be de/serialized from a json format
2019-01-12 21:50:02 +01:00
fa062ce2cf feat: Schema can be de/serialized from a json format 2019-01-12 21:05:48 +01:00
cdc6e47bf5 Merge pull request #81 from Kerollmops/update-readme
Simplify the examples command lines
2019-01-12 13:43:42 +01:00
d5f44838be doc: Simplify the examples command lines 2019-01-12 12:56:11 +01:00
5939f6e68a Merge pull request #80 from Kerollmops/version-bump
Bump version to 0.2.0
2019-01-12 12:52:08 +01:00
97edc987f8 chore: Bump version to 0.2.0 2019-01-12 12:18:29 +01:00
e4e50cecce Merge pull request #77 from Kerollmops/update-dependencies
Update the quickcheck dev-dependency
2019-01-10 22:09:44 +01:00
77e0c19749 chore: Update the quickcheck dev-dependency 2019-01-10 21:25:32 +01:00
251bccbbc3 Merge pull request #76 from Kerollmops/update-readme
Update readme
2019-01-10 21:20:39 +01:00
f7561f8552 doc: Update examples usages 2019-01-10 21:14:01 +01:00
05fd7e87ec doc: Add some wrk stats to the Readme 2019-01-10 21:13:54 +01:00
446d6a5455 Merge pull request #75 from Kerollmops/binary-group-by-mut-query-builder
Introduce binary group by in the query builder
2019-01-10 21:10:31 +01:00
78786a0007 feat: Introduce binary group by in the query builder 2019-01-10 20:13:40 +01:00
3d820a27ee Merge pull request #74 from Kerollmops/same-document-update-shadowed
Make multiple document updates shadow themselves
2019-01-10 15:57:49 +01:00
ac347d788c feat: Make multiple document updates shadow themselves 2019-01-10 15:25:24 +01:00
5627f15d41 Merge pull request #73 from Kerollmops/module-for-attribute-wordarea
Module for attribute wordarea
2019-01-10 15:23:03 +01:00
e31afc2da2 chore: Move the WordArea type to its own module 2019-01-10 13:37:22 +01:00
77c252e12a chore: Move the Attribute type to its own module 2019-01-10 11:59:42 +01:00
30c9c053c2 Merge pull request #72 from Kerollmops/wordarea-char-index
Make WordArea be based on char index and length
2019-01-09 20:53:59 +01:00
b53ef08d05 feat: Make WordArea be based on char index and length 2019-01-09 20:14:08 +01:00
86bfb173ef Merge pull request #70 from Kerollmops/fix-assert-new-attribute
Remove assert on Attribute::new()
2019-01-09 11:09:18 +01:00
8e5f834625 chore: remove assert on Attribute::new() 2019-01-08 18:46:55 +01:00
563b021679 Merge pull request #69 from tpayet/patch-1
Update README.md
2019-01-08 18:45:10 +01:00
681f721b1d Correct README typos 2019-01-08 17:09:48 +01:00
8a7c061539 Update README.md 2019-01-08 17:09:48 +01:00
8c781a4d05 Merge pull request #67 from Kerollmops/reintroduce-stop-words
Reintroduce stop words
2019-01-07 13:29:23 +01:00
de59ea495d feat: Log some update steps 2019-01-06 22:49:12 +01:00
966eda8ae5 feat: Do the sum of typos using usizes 2019-01-06 22:49:12 +01:00
32f8908d71 feat: Reintroduce stopwords for the serializer 2019-01-06 22:49:11 +01:00
a2f5e8aa25 Merge pull request #66 from Kerollmops/revert-precompute-query-index-groups
Revert precompute query index groups
2019-01-06 22:38:44 +01:00
f00b978801 Revert "feat: Pre-compute matches query index groups"
This reverts commit 039a9a4cc7.
2019-01-06 21:54:49 +01:00
a78b5d225f Revert "feat: Allow Matches to be constructed"
This reverts commit d21406a939.
2019-01-06 21:44:53 +01:00
f32a59720d Revert "feat: Introducing the Matches as_matches method"
This reverts commit ef7ba96d4a.
2019-01-06 21:44:53 +01:00
2cc5fbde1a Revert "feat: Introduce multiple Iterator impl for Matches"
This reverts commit c594597a01.
2019-01-06 21:44:53 +01:00
34d2850d28 Revert "feat: Prefer using ranges and not using unreachable!"
This reverts commit d899b86603.
2019-01-06 21:44:51 +01:00
023f62b0ce Merge pull request #65 from Kerollmops/logging
Add a little bit of logging
2019-01-06 15:55:48 +01:00
7f35b971f0 feat: Log the total number of documents to rank 2019-01-06 15:02:53 +01:00
3418adb06a feat: Add log libraries dependencies 2019-01-06 15:02:53 +01:00
510426c05c Merge pull request #64 from Kerollmops/precompute-query-index-groups
Precompute query index groups
2019-01-06 14:59:04 +01:00
c74caa0f82 feat: Sum usizes instead of little u16/u32 2019-01-06 13:54:14 +01:00
d899b86603 feat: Prefer using ranges and not using unreachable! 2019-01-06 13:54:14 +01:00
0d07af3caf fix: Filter and count the exact matching words 2019-01-06 13:54:13 +01:00
c594597a01 feat: Introduce multiple Iterator impl for Matches 2019-01-06 13:54:13 +01:00
ef7ba96d4a feat: Introducing the Matches as_matches method 2019-01-06 13:54:13 +01:00
d21406a939 feat: Allow Matches to be constructed 2019-01-06 13:54:13 +01:00
039a9a4cc7 feat: Pre-compute matches query index groups 2019-01-06 11:11:55 +01:00
40ab9e7a55 Merge pull request #63 from Kerollmops/update-rocksdb
Update RocksDB to Titan
2019-01-06 10:37:54 +01:00
d21abb50fa chore: Update RocksDB to Titan 2019-01-05 12:47:03 +01:00
3dd5e2445a Merge pull request #62 from Kerollmops/test-document-key-attr
Add tests to DocumentKeyAttr
2019-01-02 22:20:37 +01:00
7f5e6c5b6e test: Add test to the DocumentKeyAttr slice repr 2019-01-02 21:48:58 +01:00
e6d3840f12 Merge pull request #61 from Kerollmops/update-remove-kv-attributes
UpdateBuilder handles document attributes deletion
2019-01-02 18:20:14 +01:00
c05fab783a fix: Write and Read DocumentKeyAttr in big endian 2019-01-02 17:53:53 +01:00
95dc6fe904 feat: Rework the UpdateBuilder struct 2019-01-02 17:53:52 +01:00
b2e9ae4136 Merge pull request #60 from Kerollmops/improve-perfs
Improve performances
2019-01-01 17:03:41 +01:00
b070778d44 feat: Use the jemalloc global allocator in examples 2019-01-01 16:37:15 +01:00
6731025003 chore: Update group-by 2019-01-01 16:27:39 +01:00
04544c1531 feat: Expose nightly features of some dependencies 2019-01-01 16:27:08 +01:00
9dd68b4eaa Merge pull request #58 from Kerollmops/clean-up
Clean up some database functions
2019-01-01 11:43:27 +01:00
1d67012aa5 chore: Clean up some database functions 2019-01-01 01:40:20 +01:00
e723e01ec8 Merge pull request #57 from Kerollmops/clippy-pass
Clippy pass
2018-12-31 23:46:18 +01:00
7845292ea8 chore: Clippy pass 2018-12-31 23:20:30 +01:00
521df85c0d Merge pull request #55 from Kerollmops/add-benchmarks
Add benchmarks
2018-12-31 21:48:38 +01:00
dfa19582a2 test: Add benchmarks to mesure the words proximity criterion 2018-12-31 21:18:42 +01:00
87ec95f7a0 test: Add benchmarks to mesure the database 2018-12-31 21:18:37 +01:00
76ef2cceeb Merge pull request #49 from Kerollmops/serialize-any-map
Serialize any map
2018-12-31 21:11:17 +01:00
20b5a6a06e doc: Add examples for runtime defined data and Schema 2018-12-31 20:44:33 +01:00
a842e647f7 Merge pull request #56 from Kerollmops/new-index-struct
New Index structure
2018-12-31 19:55:18 +01:00
21bb38c3b0 test: Add more tests for updates ingestion 2018-12-31 19:27:21 +01:00
64d53ee1bd chore: Rework the data module structures
being able to be constructed from SharedData
2018-12-31 19:27:21 +01:00
c022fa3fca chore: Move serde related structs to their module 2018-12-31 19:26:28 +01:00
0080bf486f feat: Introduce the new Index structure
replacing the old ugly Blob system
2018-12-31 19:26:27 +01:00
6bd779f9ae feat: Improve the deserialization time of a Blob 2018-12-31 13:15:37 +01:00
a18401f47e Merge pull request #53 from Kerollmops/query-builder-filter
Distinct/QueryBuilder filtering
2018-12-29 23:11:43 +01:00
7132c3be89 feat: Allow filtering on QueryBuilder 2018-12-29 22:30:41 +01:00
aa3d059363 feat: Allow filtering on DistinctQueryBuilder 2018-12-29 22:30:41 +01:00
e2a9dbc404 feat: Introduce filtering methods for Distinct/QueryBuilder 2018-12-29 22:30:40 +01:00
a0a11faee5 Merge pull request #54 from Kerollmops/arccell-instead-of-rwlock
Prefer using ArcCell instead of RWLock for database updates
2018-12-29 22:29:35 +01:00
36ef9581aa feat: Return the database view for each update 2018-12-29 21:07:01 +01:00
f4b04dfb72 feat: Prefer doing DatabaseView updates atomically 2018-12-29 20:52:00 +01:00
cf5d56e63a Merge pull request #52 from Kerollmops/schema-toml
Schema can be de/serialized from a toml format
2018-12-28 19:59:40 +01:00
8412c14b5b feat: Schema can be toml de/serialized 2018-12-28 19:24:50 +01:00
70772eca5c Merge pull request #51 from Kerollmops/wordarea-attribute-fallible
Make the Attribute and WordArea errors recoverable
2018-12-28 18:26:19 +01:00
b27f632e14 feat: Make the Attribute and WordArea errors recoverable 2018-12-28 16:15:22 +01:00
e3bfb866e5 Merge pull request #46 from Kerollmops/schema-considers-id
Schema considers document ids
2018-12-27 12:26:57 +01:00
fa238f21ef feat: Move Database to its own module 2018-12-27 11:21:47 +01:00
444a4c1af7 feat: Make the schema consider document ids 2018-12-27 11:21:47 +01:00
2e5c5fad33 Merge pull request #45 from Kerollmops/index-length-in-docindex
Introduce the WordArea struct
2018-12-24 17:08:20 +01:00
b32c96cdc9 feat: Introduce a WordArea struct
Useful to highlight matching areas in the original text.
2018-12-24 15:58:46 +01:00
62521262e8 Merge pull request #44 from Kerollmops/real-document-id-type
Create a real DocumentId type
2018-12-24 15:41:47 +01:00
4ebae7784c feat: Create a strong DocumentId type
Forcing it to be something internal will permit to avoid possible miss comparisons to be done with other types.
2018-12-24 12:42:24 +01:00
a756ca5e3f Merge pull request #39 from Kerollmops/readme-badges
Add badges to the README
2018-12-19 14:42:54 +01:00
aa104fa253 doc: Add some funny badges to the README 2018-12-19 12:00:29 +01:00
55 changed files with 3599 additions and 2341 deletions


@@ -11,8 +11,8 @@ matrix:
   include:
     # Test crates on their minimum Rust versions.
-    - rust: 1.31.0
-      name: "meilidb on 1.31.0"
+    - rust: 1.32.0
+      name: "meilidb on 1.32.0"
       script: ./ci/meilidb.sh
     # Test crates on nightly Rust.


@@ -1,39 +1,55 @@
 [package]
 edition = "2018"
 name = "meilidb"
-version = "0.1.0"
+version = "0.3.0"
 authors = ["Kerollmops <renault.cle@gmail.com>"]

 [dependencies]
 bincode = "1.0"
 byteorder = "1.2"
+crossbeam = "0.6"
+elapsed = "0.1"
 fst = "0.3"
-hashbrown = "0.1"
+hashbrown = { version = "0.1", features = ["serde"] }
 lazy_static = "1.1"
 levenshtein_automata = { version = "0.1", features = ["fst_automaton"] }
 linked-hash-map = { version = "0.5", features = ["serde_impl"] }
+log = "0.4"
 sdset = "0.3"
 serde = "1.0"
 serde_derive = "1.0"
+serde_json = { version = "1.0", features = ["preserve_order"] }
+slice-group-by = "0.2"
 unidecode = "0.3"
+rayon = "1.0"
+lockfree = "0.5.1"

+[dependencies.toml]
+git = "https://github.com/Kerollmops/toml-rs.git"
+features = ["preserve_order"]
+rev = "0372ba6"

 [dependencies.rocksdb]
 git = "https://github.com/pingcap/rust-rocksdb.git"
-rev = "c2eb140"
+rev = "306e201"

-[dependencies.group-by]
-git = "https://github.com/Kerollmops/group-by.git"
-rev = "cab857b"

 [features]
 default = ["simd"]
 i128 = ["bincode/i128", "byteorder/i128"]
-simd = ["rocksdb/sse"]
 portable = ["rocksdb/portable"]
-nightly = []
+simd = ["rocksdb/sse"]
+nightly = ["hashbrown/nightly", "slice-group-by/nightly"]

 [dev-dependencies]
 csv = "1.0"
-elapsed = "0.1"
+env_logger = "0.6"
+jemallocator = "0.1"
+quickcheck = "0.8"
+rand = "0.6"
+rand_xorshift = "0.1"
 structopt = "0.2"
 tempfile = "3.0"
+termcolor = "1.0"

+[profile.release]
+debug = true


@@ -1,47 +1,60 @@
 # MeiliDB

+[![Build Status](https://travis-ci.org/Kerollmops/MeiliDB.svg?branch=master)](https://travis-ci.org/Kerollmops/MeiliDB)
+[![dependency status](https://deps.rs/repo/github/Kerollmops/MeiliDB/status.svg)](https://deps.rs/repo/github/Kerollmops/MeiliDB)
+[![License](https://img.shields.io/github/license/Kerollmops/MeiliDB.svg)](https://github.com/Kerollmops/MeiliDB)
+[![Rust 1.31+](https://img.shields.io/badge/rust-1.31+-lightgray.svg)](
+https://www.rust-lang.org)

 A _full-text search database_ using a key-value store internally.

-It uses [RocksDB](https://github.com/facebook/rocksdb) like a classic database, to store documents and internal data. The key-value store power allow us to handle updates and queries with small memory and CPU overheads.
+It uses [RocksDB](https://github.com/facebook/rocksdb) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads.

-You can [read the deep dive](deep-dive.md) if you want more informations on the engine, it describes the whole process of generating updates and handling queries.
+You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries.

-We will be proud if you send pull requests to help us grow this project, you can start with [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) to start !
+We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!

-At the moment this is a library only, this means that binaries are not part of this repository but since I'm still nice I have made some examples for you in the `examples/` folder that works with the data located in the `misc/` folder.
+The project is only a library yet. It means that there is no binary provided yet. To get started, you can check the examples wich are made to work with the data located in the `misc/` folder.

-In a near future MeiliDB we be a binary like any database: updated and queried using some kind of protocol. It is the final goal, [see the milestones](https://github.com/Kerollmops/MeiliDB/milestones). MeiliDB will just be a bunch of network and protocols functions wrapping the library which itself will be published to https://crates.io, following the same update cycle.
+MeiliDB will be a binary in a near future so you will be able to use it as a database out-of-the-box. We should be able to query it using a [to-be-defined](https://github.com/Kerollmops/MeiliDB/issues/38) protocol. This is our current goal, [see the milestones](https://github.com/Kerollmops/MeiliDB/milestones). In the end, the binary will be a bunch of network protocols and wrappers around the library - which will also be published on [crates.io](https://crates.io). Both the binary and the library will follow the same update cycle.

 ## Performances

-_these informations have been made with a version dated of october 2018, we must update them_
-
-We made some tests on remote machines and found that we can handle with a dataset of near 280k products, on a server that cost 5$/month with 1vCPU and 1GB of ram and on the same index and with a simple query:
-
-- near 190 users with an average response time of 90ms
-- 150 users with an average response time of 70ms
-- 100 users with an average response time of 45ms
-
-Network is mesured, servers are located in amsterdam and tests are made between two different datacenters.
+With a database composed of _100 353_ documents with _352_ attributes each and _90_ of them indexed.
+So nearly _9 million_ fields indexed for _35 million_ stored we can handle more than _1.2k req/sec_ on an Intel i7-7700 (8) @ 4.2GHz.
+
+Requests are made using [wrk](https://github.com/wg/wrk) and scripted to generate real users queries.
+
+```
+Running 10s test @ http://localhost:2230
+  2 threads and 12 connections
+  Thread Stats   Avg      Stdev     Max   +/- Stdev
+    Latency    18.86ms   49.39ms 614.89ms   95.23%
+    Req/Sec   620.41     59.53   790.00     65.00%
+  12359 requests in 10.00s, 3.26MB read
+Requests/sec:   1235.54
+Transfer/sec:    334.22KB
+```
+
+### Notes
+
+The default Rust allocator has recently been [changed to use the system allocator](https://github.com/rust-lang/rust/pull/51241/).
+We have seen much better performances when [using jemalloc as the global allocator](https://github.com/alexcrichton/jemallocator#documentation).

 ## Usage and examples

-MeiliDB work with an index like most of the search engines.
+MeiliDB runs with an index like most search engines.
 So to test the library you can create one by indexing a simple csv file.

 ```bash
-cargo run --release --example create-database -- test.mdb misc/kaggle.csv
+cargo run --release --example create-database -- test.mdb misc/kaggle.csv --schema schema-example.toml
 ```

-Once the command finished indexing the database should have been saved under the `test.mdb` folder.
-Now you can easily run the `query-database` example to check what is stored in it.
+Once the command is executed, the index should be in the `test.mdb` folder. You are now able to run the `query-database` example and play with MeiliDB.

 ```bash
-cargo run --release --example query-database -- test.mdb
+cargo run --release --example query-database -- test.mdb -n 10 id title
 ```
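As a companion to the usage section above, here is a minimal library-level sketch of the same query flow, based on the `query-database` example shown further down in this compare; the index path and query string are only illustrations.

```rust
use std::error::Error;
use std::path::Path;

use meilidb::database::Database;

fn main() -> Result<(), Box<Error>> {
    // Open the index previously created by the create-database example.
    let database = Database::open(Path::new("test.mdb"))?;

    // Grab a view on the "default" index and build a query,
    // mirroring what the query-database example does.
    let view = database.view("default")?;
    let builder = view.query_builder().unwrap();
    let documents = builder.query("kaggle", 0..10);

    println!("found {} documents", documents.len());
    Ok(())
}
```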


@ -1,64 +1,74 @@
use std::collections::hash_map::DefaultHasher; #[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::{HashMap, HashSet};
use std::io::{self, BufRead, BufReader};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::hash::{Hash, Hasher};
use std::error::Error; use std::error::Error;
use std::borrow::Cow;
use std::fs::File;
use serde_derive::{Serialize, Deserialize}; use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt; use structopt::StructOpt;
use meilidb::database::schema::{Schema, SchemaBuilder, STORED, INDEXED}; use meilidb::database::{Database, Schema};
use meilidb::database::update::PositiveUpdateBuilder;
use meilidb::tokenizer::DefaultBuilder; use meilidb::tokenizer::DefaultBuilder;
use meilidb::database::Database;
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]
pub struct Opt { pub struct Opt {
/// The destination where the database must be created /// The destination where the database must be created.
#[structopt(parse(from_os_str))] #[structopt(parse(from_os_str))]
pub database_path: PathBuf, pub database_path: PathBuf,
/// The csv file to index. /// The csv file to index.
#[structopt(parse(from_os_str))] #[structopt(parse(from_os_str))]
pub csv_data_path: PathBuf, pub csv_data_path: PathBuf,
/// The path to the schema.
#[structopt(long = "schema", parse(from_os_str))]
pub schema_path: PathBuf,
/// The path to the list of stop words (one by line).
#[structopt(long = "stop-words", parse(from_os_str))]
pub stop_words_path: Option<PathBuf>,
#[structopt(long = "update-group-size")]
pub update_group_size: Option<usize>,
} }
#[derive(Debug, Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
struct Document<'a> { struct Document<'a> (
id: &'a str, #[serde(borrow)]
title: &'a str, HashMap<Cow<'a, str>, Cow<'a, str>>
description: &'a str, );
image: &'a str,
}
fn calculate_hash<T: Hash>(t: &T) -> u64 { fn index(
let mut s = DefaultHasher::new(); schema: Schema,
t.hash(&mut s); database_path: &Path,
s.finish() csv_data_path: &Path,
} update_group_size: Option<usize>,
stop_words: &HashSet<String>,
) -> Result<Database, Box<Error>>
{
let database = Database::create(database_path)?;
fn create_schema() -> Schema { database.create_index("default", &schema)?;
let mut schema = SchemaBuilder::new();
schema.new_attribute("id", STORED);
schema.new_attribute("title", STORED | INDEXED);
schema.new_attribute("description", STORED | INDEXED);
schema.new_attribute("image", STORED);
schema.build()
}
fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
let database = Database::create(database_path, schema.clone())?;
println!("start indexing...");
let tokenizer_builder = DefaultBuilder::new();
let update_path = tempfile::NamedTempFile::new()?;
let mut update = PositiveUpdateBuilder::new(update_path.path(), schema, tokenizer_builder);
let mut rdr = csv::Reader::from_path(csv_data_path)?; let mut rdr = csv::Reader::from_path(csv_data_path)?;
let mut raw_record = csv::StringRecord::new(); let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone(); let headers = rdr.headers()?.clone();
while rdr.read_record(&mut raw_record)? { let mut i = 0;
let mut end_of_file = false;
while !end_of_file {
let tokenizer_builder = DefaultBuilder::new();
let mut update = database.start_update("default")?;
loop {
end_of_file = !rdr.read_record(&mut raw_record)?;
if end_of_file { break }
let document: Document = match raw_record.deserialize(Some(&headers)) { let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document, Ok(document) => document,
Err(e) => { Err(e) => {
@ -67,25 +77,55 @@ fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<D
} }
}; };
let document_id = calculate_hash(&document.id); update.update_document(&document, &tokenizer_builder, &stop_words)?;
update.update(document_id, &document).unwrap();
print!("\rindexing document {}", i);
i += 1;
if let Some(group_size) = update_group_size {
if i % group_size == 0 { break }
}
} }
let mut update = update.build()?; println!();
update.set_move(true); println!("committing update...");
database.ingest_update_file(update)?; database.commit_update(update)?;
}
Ok(database) Ok(database)
} }
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
let f = File::open(path)?;
let reader = BufReader::new(f);
let mut words = HashSet::new();
for line in reader.lines() {
let line = line?;
let word = line.trim().to_string();
words.insert(word);
}
Ok(words)
}
fn main() -> Result<(), Box<Error>> { fn main() -> Result<(), Box<Error>> {
let _ = env_logger::init();
let opt = Opt::from_args(); let opt = Opt::from_args();
let schema = create_schema(); let schema = {
let file = File::open(&opt.schema_path)?;
Schema::from_toml(file)?
};
let stop_words = match opt.stop_words_path {
Some(ref path) => retrieve_stop_words(path)?,
None => HashSet::new(),
};
let (elapsed, result) = elapsed::measure_time(|| { let (elapsed, result) = elapsed::measure_time(|| {
index(schema, &opt.database_path, &opt.csv_data_path) index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
}); });
if let Err(e) = result { if let Err(e) = result {
@ -93,6 +133,5 @@ fn main() -> Result<(), Box<Error>> {
} }
println!("database created in {} at: {:?}", elapsed, opt.database_path); println!("database created in {} at: {:?}", elapsed, opt.database_path);
Ok(()) Ok(())
} }


@ -1,11 +1,19 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::btree_map::{BTreeMap, Entry};
use std::iter::FromIterator;
use std::io::{self, Write}; use std::io::{self, Write};
use std::path::PathBuf; use std::path::PathBuf;
use std::error::Error; use std::error::Error;
use serde_derive::{Serialize, Deserialize}; use hashbrown::{HashMap, HashSet};
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use structopt::StructOpt; use structopt::StructOpt;
use meilidb::database::schema::SchemaAttr;
use meilidb::database::Database; use meilidb::database::Database;
use meilidb::Match;
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]
pub struct Opt { pub struct Opt {
@ -13,20 +21,85 @@ pub struct Opt {
#[structopt(parse(from_os_str))] #[structopt(parse(from_os_str))]
pub database_path: PathBuf, pub database_path: PathBuf,
/// Fields that must be displayed.
pub displayed_fields: Vec<String>,
/// The number of returned results /// The number of returned results
#[structopt(short = "n", long = "number-results", default_value = "10")] #[structopt(short = "n", long = "number-results", default_value = "10")]
pub number_results: usize, pub number_results: usize,
} }
#[derive(Debug, Serialize, Deserialize)] type Document = HashMap<String, String>;
struct Document {
id: String, fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
title: String, let mut stdout = StandardStream::stdout(ColorChoice::Always);
description: String, let mut highlighted = false;
image: String,
for range in ranges.windows(2) {
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
if highlighted {
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
}
write!(&mut stdout, "{}", &text[start..end])?;
stdout.reset()?;
highlighted = !highlighted;
}
Ok(())
}
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
let mut byte_index = 0;
let mut byte_length = 0;
for (n, (i, c)) in text.char_indices().enumerate() {
if n == index {
byte_index = i;
}
if n + 1 == index + length {
byte_length = i - byte_index + c.len_utf8();
break;
}
}
(byte_index, byte_length)
}
fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec<usize> {
let mut byte_indexes = BTreeMap::new();
for match_ in matches {
let match_attribute = match_.attribute;
if SchemaAttr::new(match_attribute) == attribute {
let char_index = match_.char_index as usize;
let char_length = match_.char_length as usize;
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
match byte_indexes.entry(byte_index) {
Entry::Vacant(entry) => { entry.insert(byte_length); },
Entry::Occupied(mut entry) => {
if *entry.get() < byte_length {
entry.insert(byte_length);
}
},
}
}
}
let mut title_areas = Vec::new();
title_areas.push(0);
for (byte_index, length) in byte_indexes {
title_areas.push(byte_index);
title_areas.push(byte_index + length);
}
title_areas.push(text.len());
title_areas.sort_unstable();
title_areas
} }
fn main() -> Result<(), Box<Error>> { fn main() -> Result<(), Box<Error>> {
let _ = env_logger::init();
let opt = Opt::from_args(); let opt = Opt::from_args();
let (elapsed, result) = elapsed::measure_time(|| Database::open(&opt.database_path)); let (elapsed, result) = elapsed::measure_time(|| Database::open(&opt.database_path));
@ -41,26 +114,53 @@ fn main() -> Result<(), Box<Error>> {
io::stdout().flush()?; io::stdout().flush()?;
if input.read_line(&mut buffer)? == 0 { break } if input.read_line(&mut buffer)? == 0 { break }
let query = buffer.trim_end_matches('\n');
let view = database.view(); let view = database.view("default")?;
let schema = view.schema();
let (elapsed, documents) = elapsed::measure_time(|| { let (elapsed, documents) = elapsed::measure_time(|| {
let builder = view.query_builder().unwrap(); let builder = view.query_builder().unwrap();
builder.query(&buffer, 0..opt.number_results) builder.query(query, 0..opt.number_results)
}); });
let mut full_documents = Vec::with_capacity(documents.len()); let number_of_documents = documents.len();
for doc in documents {
match view.document_by_id::<Document>(doc.id) {
Ok(document) => {
for name in &opt.displayed_fields {
let attr = match schema.attribute(name) {
Some(attr) => attr,
None => continue,
};
let text = match document.get(name) {
Some(text) => text,
None => continue,
};
for document in documents { print!("{}: ", name);
match view.retrieve_document::<Document>(document.id) { let areas = create_highlight_areas(&text, &doc.matches, attr);
Ok(document) => full_documents.push(document), display_highlights(&text, &areas)?;
println!();
}
},
Err(e) => eprintln!("{}", e), Err(e) => eprintln!("{}", e),
} }
let mut matching_attributes = HashSet::new();
for _match in doc.matches {
let attr = SchemaAttr::new(_match.attribute);
let name = schema.attribute_name(attr);
matching_attributes.insert(name);
} }
println!("{:#?}", full_documents); let matching_attributes = Vec::from_iter(matching_attributes);
println!("Found {} results in {}", full_documents.len(), elapsed); println!("matching in: {:?}", matching_attributes);
println!();
}
eprintln!("===== Found {} results in {} =====", number_of_documents, elapsed);
buffer.clear(); buffer.clear();
} }
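The `char_to_byte_range` helper introduced in this example can be exercised on its own; the following snippet copies it out of the diff and checks one multi-byte case (the input string is only an illustration).

```rust
// Converts a (char index, char length) match, as stored by the engine,
// into a (byte index, byte length) range usable to slice the original text.
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
    let mut byte_index = 0;
    let mut byte_length = 0;

    for (n, (i, c)) in text.char_indices().enumerate() {
        if n == index {
            byte_index = i;
        }
        if n + 1 == index + length {
            byte_length = i - byte_index + c.len_utf8();
            break;
        }
    }

    (byte_index, byte_length)
}

fn main() {
    // "é" is two bytes long, so char index 1 with length 2 covers "él",
    // which starts at byte 1 and spans 3 bytes.
    let (start, len) = char_to_byte_range(1, 2, "héllo");
    assert_eq!((start, len), (1, 3));
    println!("bytes {}..{} -> {:?}", start, start + len, &"héllo"[start..start + len]);
}
```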


@ -0,0 +1,19 @@
# This schema has been generated ...
# The order in which the attributes are declared is important,
# it specify the attribute xxx...
identifier = "id"
[attributes.id]
stored = true
[attributes.title]
stored = true
indexed = true
[attributes.description]
stored = true
indexed = true
[attributes.image]
stored = true
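For context, here is a minimal sketch of how such a schema file is consumed at this point in the history, assuming the `Schema::from_toml` and `Database::create_index` calls visible in the `create-database` example above; the file and database paths are illustrative.

```rust
use std::error::Error;
use std::fs::File;
use std::path::Path;

use meilidb::database::{Database, Schema};

fn main() -> Result<(), Box<Error>> {
    // Load the schema described by the TOML file above.
    let file = File::open("schema-example.toml")?;
    let schema = Schema::from_toml(file)?;

    // Create a database and a "default" index using this schema,
    // as the create-database example does.
    let database = Database::create(Path::new("test.mdb"))?;
    database.create_index("default", &schema)?;

    Ok(())
}
```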


@@ -95,7 +95,8 @@ or
 other
 ought
 our
-ours ourselves
+ours
+ourselves
 out
 over
 own

misc/fr.stopwords.txt (new file, 163 lines)

@ -0,0 +1,163 @@
au
aux
avec
ce
ces
dans
de
des
du
elle
en
et
eux
il
je
la
le
leur
lui
ma
mais
me
même
mes
moi
mon
ne
nos
notre
nous
on
ou
par
pas
pour
qu
que
qui
sa
se
ses
son
sur
ta
te
tes
toi
ton
tu
un
une
vos
votre
vous
c
d
j
l
à
m
n
s
t
y
été
étée
étées
étés
étant
suis
es
est
sommes
êtes
sont
serai
seras
sera
serons
serez
seront
serais
serait
serions
seriez
seraient
étais
était
étions
étiez
étaient
fus
fut
fûmes
fûtes
furent
sois
soit
soyons
soyez
soient
fusse
fusses
fût
fussions
fussiez
fussent
ayant
eu
eue
eues
eus
ai
as
avons
avez
ont
aurai
auras
aura
aurons
aurez
auront
aurais
aurait
aurions
auriez
auraient
avais
avait
avions
aviez
avaient
eut
eûmes
eûtes
eurent
aie
aies
ait
ayons
ayez
aient
eusse
eusses
eût
eussions
eussiez
eussent
ceci
celà
cet
cette
ici
ils
les
leurs
quel
quels
quelle
quelles
sans
soi


@@ -50,6 +50,7 @@ impl AutomatonExt for DfaExt {
     }
 }

+#[derive(Copy, Clone)]
 enum PrefixSetting {
     Prefix,
     NoPrefix,


@ -1,59 +1,54 @@
use std::io::{self, Cursor, BufRead};
use std::slice::from_raw_parts; use std::slice::from_raw_parts;
use std::error::Error; use std::mem::size_of;
use std::path::Path;
use std::sync::Arc;
use std::{io, mem};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use sdset::Set; use sdset::Set;
use fst::raw::MmapReadOnly;
use serde::ser::{Serialize, Serializer};
use crate::DocumentId; use crate::DocumentId;
use crate::data::Data; use crate::data::SharedData;
use super::into_u8_slice;
#[derive(Default, Clone)] #[derive(Default, Clone)]
pub struct DocIds { pub struct DocIds(SharedData);
data: Data,
}
impl DocIds { impl DocIds {
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> { pub fn new(ids: &Set<DocumentId>) -> DocIds {
let mmap = MmapReadOnly::open_path(path)?; let bytes = unsafe { into_u8_slice(ids.as_slice()) };
let data = Data::Mmap(mmap); let data = SharedData::from_bytes(bytes.to_vec());
Ok(DocIds { data }) DocIds(data)
} }
pub fn from_bytes(vec: Vec<u8>) -> Result<Self, Box<Error>> { pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIds> {
// FIXME check if modulo DocumentId let len = cursor.read_u64::<LittleEndian>()? as usize;
let len = vec.len(); let offset = cursor.position() as usize;
let data = Data::Shared { let doc_ids = cursor.get_ref().range(offset, len);
bytes: Arc::new(vec), cursor.consume(len);
offset: 0,
len: len Ok(DocIds(doc_ids))
};
Ok(DocIds { data })
} }
pub fn from_document_ids(vec: Vec<DocumentId>) -> Self { pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap() let len = self.0.len() as u64;
bytes.write_u64::<LittleEndian>(len).unwrap();
bytes.extend_from_slice(&self.0);
} }
pub fn contains(&self, doc: DocumentId) -> bool { pub fn is_empty(&self) -> bool {
// FIXME prefer using the sdset::exponential_search function self.0.is_empty()
self.doc_ids().binary_search(&doc).is_ok()
} }
pub fn doc_ids(&self) -> &Set<DocumentId> { pub fn as_bytes(&self) -> &[u8] {
let slice = &self.data; &self.0
}
}
impl AsRef<Set<DocumentId>> for DocIds {
fn as_ref(&self) -> &Set<DocumentId> {
let slice = &self.0;
let ptr = slice.as_ptr() as *const DocumentId; let ptr = slice.as_ptr() as *const DocumentId;
let len = slice.len() / mem::size_of::<DocumentId>(); let len = slice.len() / size_of::<DocumentId>();
let slice = unsafe { from_raw_parts(ptr, len) }; let slice = unsafe { from_raw_parts(ptr, len) };
Set::new_unchecked(slice) Set::new_unchecked(slice)
} }
} }
impl Serialize for DocIds {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.data.as_ref().serialize(serializer)
}
}


@ -1,16 +1,15 @@
use std::io::{self, Write, Cursor, BufRead};
use std::slice::from_raw_parts; use std::slice::from_raw_parts;
use std::io::{self, Write};
use std::mem::size_of; use std::mem::size_of;
use std::ops::Index; use std::ops::Index;
use std::path::Path;
use std::sync::Arc; use std::sync::Arc;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::raw::MmapReadOnly;
use sdset::Set; use sdset::Set;
use crate::DocIndex; use crate::DocIndex;
use crate::data::Data; use crate::data::SharedData;
use super::into_u8_slice;
#[derive(Debug)] #[derive(Debug)]
#[repr(C)] #[repr(C)]
@ -21,52 +20,45 @@ struct Range {
#[derive(Clone, Default)] #[derive(Clone, Default)]
pub struct DocIndexes { pub struct DocIndexes {
ranges: Data, ranges: SharedData,
indexes: Data, indexes: SharedData,
} }
impl DocIndexes { impl DocIndexes {
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> { pub fn from_bytes(bytes: Vec<u8>) -> io::Result<DocIndexes> {
let mmap = MmapReadOnly::open_path(path)?; let bytes = Arc::new(bytes);
DocIndexes::from_data(Data::Mmap(mmap)) let len = bytes.len();
let data = SharedData::new(bytes, 0, len);
let mut cursor = Cursor::new(data);
DocIndexes::from_cursor(&mut cursor)
} }
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> { pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIndexes> {
let len = vec.len(); let len = cursor.read_u64::<LittleEndian>()? as usize;
DocIndexes::from_shared_bytes(Arc::new(vec), 0, len) let offset = cursor.position() as usize;
} let ranges = cursor.get_ref().range(offset, len);
cursor.consume(len);
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> { let len = cursor.read_u64::<LittleEndian>()? as usize;
let data = Data::Shared { bytes, offset, len }; let offset = cursor.position() as usize;
DocIndexes::from_data(data) let indexes = cursor.get_ref().range(offset, len);
} cursor.consume(len);
fn from_data(data: Data) -> io::Result<Self> {
let ranges_len_offset = data.len() - size_of::<u64>();
let ranges_len = (&data[ranges_len_offset..]).read_u64::<LittleEndian>()?;
let ranges_len = ranges_len as usize;
let ranges_offset = ranges_len_offset - ranges_len;
let ranges = data.range(ranges_offset, ranges_len);
let indexes = data.range(0, ranges_offset);
Ok(DocIndexes { ranges, indexes }) Ok(DocIndexes { ranges, indexes })
} }
pub fn to_vec(&self) -> Vec<u8> { pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let capacity = self.indexes.len() + self.ranges.len() + size_of::<u64>(); let ranges_len = self.ranges.len() as u64;
let mut bytes = Vec::with_capacity(capacity); let _ = bytes.write_u64::<LittleEndian>(ranges_len);
bytes.extend_from_slice(&self.indexes);
bytes.extend_from_slice(&self.ranges); bytes.extend_from_slice(&self.ranges);
bytes.write_u64::<LittleEndian>(self.ranges.len() as u64).unwrap();
bytes let indexes_len = self.indexes.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
bytes.extend_from_slice(&self.indexes);
} }
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> { pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
self.ranges().get(index as usize).map(|Range { start, end }| { self.ranges().get(index).map(|Range { start, end }| {
let start = *start as usize; let start = *start as usize;
let end = *end as usize; let end = *end as usize;
let slice = &self.indexes()[start..end]; let slice = &self.indexes()[start..end];
@ -102,12 +94,17 @@ impl Index<usize> for DocIndexes {
pub struct DocIndexesBuilder<W> { pub struct DocIndexesBuilder<W> {
ranges: Vec<Range>, ranges: Vec<Range>,
indexes: Vec<DocIndex>,
wtr: W, wtr: W,
} }
impl DocIndexesBuilder<Vec<u8>> { impl DocIndexesBuilder<Vec<u8>> {
pub fn memory() -> Self { pub fn memory() -> Self {
DocIndexesBuilder::new(Vec::new()) DocIndexesBuilder {
ranges: Vec::new(),
indexes: Vec::new(),
wtr: Vec::new(),
}
} }
} }
@ -115,19 +112,18 @@ impl<W: Write> DocIndexesBuilder<W> {
pub fn new(wtr: W) -> Self { pub fn new(wtr: W) -> Self {
DocIndexesBuilder { DocIndexesBuilder {
ranges: Vec::new(), ranges: Vec::new(),
indexes: Vec::new(),
wtr: wtr, wtr: wtr,
} }
} }
pub fn insert(&mut self, indexes: &Set<DocIndex>) -> io::Result<()> { pub fn insert(&mut self, indexes: &Set<DocIndex>) {
let len = indexes.len() as u64; let len = indexes.len() as u64;
let start = self.ranges.last().map(|r| r.end).unwrap_or(0); let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
let range = Range { start, end: start + len }; let range = Range { start, end: start + len };
self.ranges.push(range); self.ranges.push(range);
// write the values self.indexes.extend_from_slice(indexes);
let indexes = unsafe { into_u8_slice(indexes) };
self.wtr.write_all(indexes)
} }
pub fn finish(self) -> io::Result<()> { pub fn finish(self) -> io::Result<()> {
@ -135,40 +131,55 @@ impl<W: Write> DocIndexesBuilder<W> {
} }
pub fn into_inner(mut self) -> io::Result<W> { pub fn into_inner(mut self) -> io::Result<W> {
// write the ranges let ranges = unsafe { into_u8_slice(&self.ranges) };
let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
self.wtr.write_all(ranges)?;
// write the length of the ranges
let len = ranges.len() as u64; let len = ranges.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?; self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(ranges)?;
let indexes = unsafe { into_u8_slice(&self.indexes) };
let len = indexes.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(indexes)?;
Ok(self.wtr) Ok(self.wtr)
} }
} }
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
let ptr = slice.as_ptr() as *const u8;
let len = slice.len() * size_of::<T>();
from_raw_parts(ptr, len)
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*;
use std::error::Error; use std::error::Error;
use crate::DocumentId;
use super::*;
#[test] #[test]
fn builder_serialize_deserialize() -> Result<(), Box<Error>> { fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 }; let a = DocIndex {
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 }; document_id: DocumentId(0),
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 }; attribute: 3,
word_index: 11,
char_index: 30,
char_length: 4,
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: 4,
word_index: 21,
char_index: 35,
char_length: 6,
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: 8,
word_index: 2,
char_index: 89,
char_length: 6,
};
let mut builder = DocIndexesBuilder::memory(); let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?)?; builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?)?; builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?)?; builder.insert(Set::new(&[a, c])?);
let bytes = builder.into_inner()?; let bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(bytes)?; let docs = DocIndexes::from_bytes(bytes)?;
@ -183,19 +194,39 @@ mod tests {
#[test] #[test]
fn serialize_deserialize() -> Result<(), Box<Error>> { fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 }; let a = DocIndex {
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 }; document_id: DocumentId(0),
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 }; attribute: 3,
word_index: 11,
char_index: 30,
char_length: 4,
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: 4,
word_index: 21,
char_index: 35,
char_length: 6,
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: 8,
word_index: 2,
char_index: 89,
char_length: 6,
};
let mut builder = DocIndexesBuilder::memory(); let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?)?; builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?)?; builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?)?; builder.insert(Set::new(&[a, c])?);
let builder_bytes = builder.into_inner()?; let builder_bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(builder_bytes.clone())?; let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
let bytes = docs.to_vec();
let mut bytes = Vec::new();
docs.write_to_bytes(&mut bytes);
assert_eq!(builder_bytes, bytes); assert_eq!(builder_bytes, bytes);


@ -1,51 +1,43 @@
mod doc_ids; mod doc_ids;
mod doc_indexes; mod doc_indexes;
use std::slice::from_raw_parts;
use std::mem::size_of;
use std::ops::Deref; use std::ops::Deref;
use std::sync::Arc; use std::sync::Arc;
use fst::raw::MmapReadOnly;
pub use self::doc_ids::DocIds; pub use self::doc_ids::DocIds;
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder}; pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
#[derive(Clone)] #[derive(Default, Clone)]
enum Data { pub struct SharedData {
Shared { pub bytes: Arc<Vec<u8>>,
bytes: Arc<Vec<u8>>, pub offset: usize,
offset: usize, pub len: usize,
len: usize,
},
Mmap(MmapReadOnly),
} }
impl Data { impl SharedData {
pub fn range(&self, off: usize, l: usize) -> Data { pub fn from_bytes(vec: Vec<u8>) -> SharedData {
match self { let len = vec.len();
Data::Shared { bytes, offset, len } => { let bytes = Arc::new(vec);
assert!(off + l <= *len); SharedData::new(bytes, 0, len)
Data::Shared {
bytes: bytes.clone(),
offset: offset + off,
len: l,
} }
},
Data::Mmap(mmap) => Data::Mmap(mmap.range(off, l)), pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
SharedData { bytes, offset, len }
}
pub fn range(&self, offset: usize, len: usize) -> SharedData {
assert!(offset + len <= self.len);
SharedData {
bytes: self.bytes.clone(),
offset: self.offset + offset,
len: len,
} }
} }
} }
impl Default for Data { impl Deref for SharedData {
fn default() -> Data {
Data::Shared {
bytes: Arc::default(),
offset: 0,
len: 0,
}
}
}
impl Deref for Data {
type Target = [u8]; type Target = [u8];
fn deref(&self) -> &Self::Target { fn deref(&self) -> &Self::Target {
@ -53,13 +45,14 @@ impl Deref for Data {
} }
} }
impl AsRef<[u8]> for Data { impl AsRef<[u8]> for SharedData {
fn as_ref(&self) -> &[u8] { fn as_ref(&self) -> &[u8] {
match self { &self.bytes[self.offset..self.offset + self.len]
Data::Shared { bytes, offset, len } => {
&bytes[*offset..offset + len]
},
Data::Mmap(m) => m.as_slice(),
}
} }
} }
unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
let ptr = slice.as_ptr() as *const u8;
let len = slice.len() * size_of::<T>();
from_raw_parts(ptr, len)
}
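This hunk replaces the old `Data` enum with the new `SharedData` struct. Below is a small sketch of how the new type behaves, written as an in-crate test since `SharedData` lives in the crate's `data` module; the byte values are illustrative.

```rust
#[cfg(test)]
mod shared_data_sketch {
    use crate::data::SharedData;

    #[test]
    fn range_shares_the_same_buffer() {
        // SharedData wraps an Arc<Vec<u8>> together with an (offset, len) window.
        let data = SharedData::from_bytes(vec![1, 2, 3, 4, 5]);

        // `range` clones the Arc and only narrows the window; no bytes are copied.
        let tail = data.range(2, 3);
        assert_eq!(tail.as_ref(), &[3u8, 4, 5][..]);

        // Deref<Target = [u8]> exposes the same window as a plain byte slice.
        assert_eq!(&*tail, &[3u8, 4, 5][..]);
    }
}
```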


@ -1,110 +0,0 @@
mod ops;
pub mod positive;
pub mod negative;
pub use self::positive::{PositiveBlob, PositiveBlobBuilder};
pub use self::negative::NegativeBlob;
pub use self::ops::OpBuilder;
use std::fmt;
use serde_derive::{Serialize, Deserialize};
use serde::ser::{Serialize, Serializer, SerializeTuple};
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
#[derive(Debug)]
pub enum Blob {
Positive(PositiveBlob),
Negative(NegativeBlob),
}
impl Blob {
pub fn is_negative(&self) -> bool {
self.sign() == Sign::Negative
}
pub fn is_positive(&self) -> bool {
self.sign() == Sign::Positive
}
pub fn sign(&self) -> Sign {
match self {
Blob::Positive(_) => Sign::Positive,
Blob::Negative(_) => Sign::Negative,
}
}
}
impl Serialize for Blob {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
match self {
Blob::Positive(blob) => {
let mut tuple = serializer.serialize_tuple(2)?;
tuple.serialize_element(&Sign::Positive)?;
tuple.serialize_element(&blob)?;
tuple.end()
},
Blob::Negative(blob) => {
let mut tuple = serializer.serialize_tuple(2)?;
tuple.serialize_element(&Sign::Negative)?;
tuple.serialize_element(&blob)?;
tuple.end()
},
}
}
}
impl<'de> Deserialize<'de> for Blob {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Blob, D::Error> {
struct TupleVisitor;
impl<'de> Visitor<'de> for TupleVisitor {
type Value = Blob;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a Blob struct")
}
#[inline]
fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
let sign = match seq.next_element()? {
Some(value) => value,
None => return Err(de::Error::invalid_length(0, &self)),
};
match sign {
Sign::Positive => {
let blob = match seq.next_element()? {
Some(value) => value,
None => return Err(de::Error::invalid_length(1, &self)),
};
Ok(Blob::Positive(blob))
},
Sign::Negative => {
let blob = match seq.next_element()? {
Some(value) => value,
None => return Err(de::Error::invalid_length(1, &self)),
};
Ok(Blob::Negative(blob))
},
}
}
}
deserializer.deserialize_tuple(2, TupleVisitor)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Sign {
Positive,
Negative,
}
impl Sign {
pub fn invert(self) -> Sign {
match self {
Sign::Positive => Sign::Negative,
Sign::Negative => Sign::Positive,
}
}
}


@ -1,67 +0,0 @@
use std::error::Error;
use std::path::Path;
use std::fmt;
use sdset::Set;
use serde::de::{self, Deserialize, Deserializer};
use serde::ser::{Serialize, Serializer};
use crate::data::DocIds;
use crate::DocumentId;
#[derive(Default)]
pub struct NegativeBlob {
doc_ids: DocIds,
}
impl NegativeBlob {
pub unsafe fn from_path<P>(doc_ids: P) -> Result<Self, Box<Error>>
where P: AsRef<Path>,
{
let doc_ids = DocIds::from_path(doc_ids)?;
Ok(NegativeBlob { doc_ids })
}
pub fn from_bytes(doc_ids: Vec<u8>) -> Result<Self, Box<Error>> {
let doc_ids = DocIds::from_bytes(doc_ids)?;
Ok(NegativeBlob { doc_ids })
}
pub fn from_raw(doc_ids: DocIds) -> Self {
NegativeBlob { doc_ids }
}
pub fn as_ids(&self) -> &DocIds {
&self.doc_ids
}
pub fn into_doc_ids(self) -> DocIds {
self.doc_ids
}
}
impl AsRef<Set<DocumentId>> for NegativeBlob {
fn as_ref(&self) -> &Set<DocumentId> {
self.as_ids().doc_ids()
}
}
impl fmt::Debug for NegativeBlob {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "NegativeBlob(")?;
f.debug_list().entries(self.as_ref().as_slice()).finish()?;
write!(f, ")")
}
}
impl Serialize for NegativeBlob {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.doc_ids.serialize(serializer)
}
}
impl<'de> Deserialize<'de> for NegativeBlob {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<NegativeBlob, D::Error> {
let bytes = Vec::deserialize(deserializer)?;
NegativeBlob::from_bytes(bytes).map_err(de::Error::custom)
}
}


@ -1,5 +0,0 @@
mod blob;
mod ops;
pub use self::blob::NegativeBlob;
pub use self::ops::OpBuilder;


@ -1,73 +0,0 @@
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::Set;
use crate::database::blob::NegativeBlob;
use crate::data::DocIds;
use crate::DocumentId;
pub struct OpBuilder<'a> {
inner: SdOpBuilder<'a, DocumentId>,
}
/// Do a set operation on multiple negative blobs.
impl<'a> OpBuilder<'a> {
pub fn new() -> Self {
Self { inner: SdOpBuilder::new() }
}
pub fn with_capacity(cap: usize) -> Self {
Self { inner: SdOpBuilder::with_capacity(cap) }
}
pub fn add(mut self, blob: &'a NegativeBlob) -> Self {
self.push(blob);
self
}
pub fn push(&mut self, blob: &'a NegativeBlob) {
let set = Set::new_unchecked(blob.as_ref());
self.inner.push(set);
}
pub fn union(self) -> Union<'a> {
Union::new(self.inner.union())
}
pub fn intersection(self) -> Intersection<'a> {
Intersection::new(self.inner.intersection())
}
pub fn difference(self) -> Difference<'a> {
Difference::new(self.inner.difference())
}
pub fn symmetric_difference(self) -> SymmetricDifference<'a> {
SymmetricDifference::new(self.inner.symmetric_difference())
}
}
macro_rules! logical_operation {
(struct $name:ident, $operation:ident) => {
pub struct $name<'a> {
op: sdset::multi::$name<'a, DocumentId>,
}
impl<'a> $name<'a> {
fn new(op: sdset::multi::$name<'a, DocumentId>) -> Self {
$name { op }
}
pub fn into_negative_blob(self) -> NegativeBlob {
let document_ids = sdset::SetOperation::into_set_buf(self.op);
let doc_ids = DocIds::from_document_ids(document_ids.into_vec());
NegativeBlob::from_raw(doc_ids)
}
}
}}
logical_operation!(struct Union, union);
logical_operation!(struct Intersection, intersection);
logical_operation!(struct Difference, difference);
logical_operation!(struct SymmetricDifference, symmetric_difference);


@ -1,109 +0,0 @@
use std::error::Error;
use fst::{IntoStreamer, Streamer};
use sdset::duo::DifferenceByKey;
use sdset::{Set, SetOperation};
use group_by::GroupBy;
use crate::database::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob};
use crate::database::blob::{positive, negative};
fn blob_same_sign(a: &Blob, b: &Blob) -> bool {
a.sign() == b.sign()
}
fn unwrap_positive(blob: &Blob) -> &PositiveBlob {
match blob {
Blob::Positive(blob) => blob,
Blob::Negative(_) => panic!("called `unwrap_positive()` on a `Negative` value"),
}
}
fn unwrap_negative(blob: &Blob) -> &NegativeBlob {
match blob {
Blob::Negative(blob) => blob,
Blob::Positive(_) => panic!("called `unwrap_negative()` on a `Positive` value"),
}
}
pub struct OpBuilder {
blobs: Vec<Blob>,
}
impl OpBuilder {
pub fn new() -> OpBuilder {
OpBuilder { blobs: Vec::new() }
}
pub fn with_capacity(cap: usize) -> OpBuilder {
OpBuilder { blobs: Vec::with_capacity(cap) }
}
pub fn push(&mut self, blob: Blob) {
if self.blobs.is_empty() && blob.is_negative() { return }
self.blobs.push(blob);
}
pub fn merge(self) -> Result<PositiveBlob, Box<Error>> {
let groups = GroupBy::new(&self.blobs, blob_same_sign);
let mut aggregated = Vec::new();
for blobs in groups {
match blobs[0].sign() {
Sign::Positive => {
let mut op_builder = positive::OpBuilder::with_capacity(blobs.len());
for blob in blobs {
op_builder.push(unwrap_positive(blob));
}
let mut stream = op_builder.union().into_stream();
let mut builder = PositiveBlobBuilder::memory();
while let Some((input, doc_indexes)) = stream.next() {
// FIXME empty doc_indexes must be handled by OpBuilder
if !doc_indexes.is_empty() {
builder.insert(input, doc_indexes).unwrap();
}
}
let (map, doc_indexes) = builder.into_inner().unwrap();
let blob = PositiveBlob::from_bytes(map, doc_indexes).unwrap();
aggregated.push(Blob::Positive(blob));
},
Sign::Negative => {
let mut op_builder = negative::OpBuilder::with_capacity(blobs.len());
for blob in blobs {
op_builder.push(unwrap_negative(blob));
}
let blob = op_builder.union().into_negative_blob();
aggregated.push(Blob::Negative(blob));
},
}
}
let mut buffer = Vec::new();
aggregated.chunks(2).try_fold(PositiveBlob::default(), |base, slice| {
let negative = NegativeBlob::default();
let (positive, negative) = match slice {
[a, b] => (unwrap_positive(a), unwrap_negative(b)),
[a] => (unwrap_positive(a), &negative),
_ => unreachable!(),
};
let mut builder = PositiveBlobBuilder::memory();
let op_builder = positive::OpBuilder::new().add(&base).add(&positive);
let mut stream = op_builder.union().into_stream();
while let Some((input, doc_indexes)) = stream.next() {
let op = DifferenceByKey::new(doc_indexes, negative.as_ref(), |x| x.document_id, |x| *x);
buffer.clear();
op.extend_vec(&mut buffer);
if !buffer.is_empty() {
builder.insert(input, Set::new_unchecked(&buffer))?;
}
}
let (map, doc_indexes) = builder.into_inner()?;
PositiveBlob::from_bytes(map, doc_indexes)
})
}
}


@ -1,254 +0,0 @@
use std::fmt;
use std::io::Write;
use std::path::Path;
use std::error::Error;
use fst::{map, Map, Streamer, IntoStreamer};
use sdset::Set;
use crate::DocIndex;
use crate::data::{DocIndexes, DocIndexesBuilder};
use serde::ser::{Serialize, Serializer, SerializeTuple};
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
#[derive(Default)]
pub struct PositiveBlob {
map: Map,
indexes: DocIndexes,
}
impl PositiveBlob {
pub unsafe fn from_paths<P, Q>(map: P, indexes: Q) -> Result<Self, Box<Error>>
where P: AsRef<Path>,
Q: AsRef<Path>,
{
let map = Map::from_path(map)?;
let indexes = DocIndexes::from_path(indexes)?;
Ok(PositiveBlob { map, indexes })
}
pub fn from_bytes(map: Vec<u8>, indexes: Vec<u8>) -> Result<Self, Box<Error>> {
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Ok(PositiveBlob { map, indexes })
}
pub fn from_raw(map: Map, indexes: DocIndexes) -> Self {
PositiveBlob { map, indexes }
}
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
self.map.get(key).map(|index| &self.indexes[index as usize])
}
pub fn as_map(&self) -> &Map {
&self.map
}
pub fn as_indexes(&self) -> &DocIndexes {
&self.indexes
}
pub fn explode(self) -> (Map, DocIndexes) {
(self.map, self.indexes)
}
}
impl fmt::Debug for PositiveBlob {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "PositiveBlob([")?;
let mut stream = self.into_stream();
let mut first = true;
while let Some((k, v)) = stream.next() {
if !first {
write!(f, ", ")?;
}
first = false;
write!(f, "({}, {:?})", String::from_utf8_lossy(k), v)?;
}
write!(f, "])")
}
}
impl<'m, 'a> IntoStreamer<'a> for &'m PositiveBlob {
type Item = (&'a [u8], &'a [DocIndex]);
/// The type of the stream to be constructed.
type Into = PositiveBlobStream<'m>;
/// Construct a stream from `Self`.
fn into_stream(self) -> Self::Into {
PositiveBlobStream {
map_stream: self.map.into_stream(),
doc_indexes: &self.indexes,
}
}
}
pub struct PositiveBlobStream<'m> {
map_stream: map::Stream<'m>,
doc_indexes: &'m DocIndexes,
}
impl<'m, 'a> Streamer<'a> for PositiveBlobStream<'m> {
type Item = (&'a [u8], &'a [DocIndex]);
fn next(&'a mut self) -> Option<Self::Item> {
match self.map_stream.next() {
Some((input, index)) => {
let doc_indexes = &self.doc_indexes[index as usize];
Some((input, doc_indexes))
},
None => None,
}
}
}
impl Serialize for PositiveBlob {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
let mut tuple = serializer.serialize_tuple(2)?;
tuple.serialize_element(&self.map.as_fst().to_vec())?;
tuple.serialize_element(&self.indexes.to_vec())?;
tuple.end()
}
}
impl<'de> Deserialize<'de> for PositiveBlob {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<PositiveBlob, D::Error> {
struct TupleVisitor;
impl<'de> Visitor<'de> for TupleVisitor {
type Value = PositiveBlob;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a PositiveBlob struct")
}
#[inline]
fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
let map = match seq.next_element()? {
Some(bytes) => match Map::from_bytes(bytes) {
Ok(value) => value,
Err(err) => return Err(de::Error::custom(err)),
},
None => return Err(de::Error::invalid_length(0, &self)),
};
let indexes = match seq.next_element()? {
Some(bytes) => match DocIndexes::from_bytes(bytes) {
Ok(value) => value,
Err(err) => return Err(de::Error::custom(err)),
},
None => return Err(de::Error::invalid_length(1, &self)),
};
Ok(PositiveBlob { map, indexes })
}
}
deserializer.deserialize_tuple(2, TupleVisitor)
}
}
pub struct PositiveBlobBuilder<W, X> {
map: fst::MapBuilder<W>,
indexes: DocIndexesBuilder<X>,
value: u64,
}
impl PositiveBlobBuilder<Vec<u8>, Vec<u8>> {
pub fn memory() -> Self {
PositiveBlobBuilder {
map: fst::MapBuilder::memory(),
indexes: DocIndexesBuilder::memory(),
value: 0,
}
}
}
impl<W: Write, X: Write> PositiveBlobBuilder<W, X> {
pub fn new(map: W, indexes: X) -> Result<Self, Box<Error>> {
Ok(PositiveBlobBuilder {
map: fst::MapBuilder::new(map)?,
indexes: DocIndexesBuilder::new(indexes),
value: 0,
})
}
/// If a key is inserted that is less than or equal to any previous key added,
/// then an error is returned. Similarly, if there was a problem writing
/// to the underlying writer, an error is returned.
// FIXME what if one write doesn't work but the other does?
pub fn insert<K>(&mut self, key: K, doc_indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
where K: AsRef<[u8]>,
{
self.map.insert(key, self.value)?;
self.indexes.insert(doc_indexes)?;
self.value += 1;
Ok(())
}
pub fn finish(self) -> Result<(), Box<Error>> {
self.into_inner().map(drop)
}
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
let map = self.map.into_inner()?;
let indexes = self.indexes.into_inner()?;
Ok((map, indexes))
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
#[test]
fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let mut builder = PositiveBlobBuilder::memory();
builder.insert("aaa", Set::new(&[a])?)?;
builder.insert("aab", Set::new(&[a, b, c])?)?;
builder.insert("aac", Set::new(&[a, c])?)?;
let (map_bytes, indexes_bytes) = builder.into_inner()?;
let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?;
assert_eq!(positive_blob.get("aaa"), Some(&[a][..]));
assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..]));
assert_eq!(positive_blob.get("aac"), Some(&[a, c][..]));
assert_eq!(positive_blob.get("aad"), None);
Ok(())
}
#[test]
fn serde_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let mut builder = PositiveBlobBuilder::memory();
builder.insert("aaa", Set::new(&[a])?)?;
builder.insert("aab", Set::new(&[a, b, c])?)?;
builder.insert("aac", Set::new(&[a, c])?)?;
let (map_bytes, indexes_bytes) = builder.into_inner()?;
let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?;
let bytes = bincode::serialize(&positive_blob)?;
let positive_blob: PositiveBlob = bincode::deserialize(&bytes)?;
assert_eq!(positive_blob.get("aaa"), Some(&[a][..]));
assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..]));
assert_eq!(positive_blob.get("aac"), Some(&[a, c][..]));
assert_eq!(positive_blob.get("aad"), None);
Ok(())
}
}


@ -1,5 +0,0 @@
mod blob;
mod ops;
pub use self::blob::{PositiveBlob, PositiveBlobBuilder};
pub use self::ops::OpBuilder;


@ -1,128 +0,0 @@
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::{SetOperation, Set};
use crate::database::blob::PositiveBlob;
use crate::data::DocIndexes;
use crate::DocIndex;
pub struct OpBuilder<'m> {
// the operation on the maps is always a union.
map_op: fst::map::OpBuilder<'m>,
indexes: Vec<&'m DocIndexes>,
}
/// Do a set operation on multiple positive blobs.
impl<'m> OpBuilder<'m> {
pub fn new() -> Self {
Self {
map_op: fst::map::OpBuilder::new(),
indexes: Vec::new(),
}
}
pub fn with_capacity(cap: usize) -> Self {
Self {
map_op: fst::map::OpBuilder::new(), // TODO patch fst to add with_capacity
indexes: Vec::with_capacity(cap),
}
}
pub fn add(mut self, blob: &'m PositiveBlob) -> Self {
self.push(blob);
self
}
pub fn push(&mut self, blob: &'m PositiveBlob) {
self.map_op.push(blob.as_map());
self.indexes.push(blob.as_indexes());
}
pub fn union(self) -> Union<'m> {
Union::new(self.map_op.union(), self.indexes)
}
pub fn intersection(self) -> Intersection<'m> {
Intersection::new(self.map_op.union(), self.indexes)
}
pub fn difference(self) -> Difference<'m> {
Difference::new(self.map_op.union(), self.indexes)
}
pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
SymmetricDifference::new(self.map_op.union(), self.indexes)
}
}
macro_rules! logical_operation {
(struct $name:ident, $operation:ident) => {
pub struct $name<'m> {
stream: fst::map::Union<'m>,
indexes: Vec<&'m DocIndexes>,
outs: Vec<DocIndex>,
}
impl<'m> $name<'m> {
fn new(stream: fst::map::Union<'m>, indexes: Vec<&'m DocIndexes>) -> Self {
$name {
stream: stream,
indexes: indexes,
outs: Vec::new(),
}
}
}
impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
type Item = (&'a [u8], &'a Set<DocIndex>);
fn next(&'a mut self) -> Option<Self::Item> {
// loop {
// let (input, ivalues) = match self.stream.next() {
// Some(value) => value,
// None => return None,
// };
// self.outs.clear();
// let mut builder = SdOpBuilder::with_capacity(ivalues.len());
// for ivalue in ivalues {
// let indexes = self.indexes[ivalue.index];
// let indexes = indexes.get(ivalue.value).expect("BUG: could not find document indexes");
// let set = Set::new_unchecked(indexes);
// builder.push(set);
// }
// builder.$operation().extend_vec(&mut self.outs);
// if self.outs.is_empty() { continue }
// return Some((input, &self.outs))
// }
// FIXME make the above code compile
match self.stream.next() {
Some((input, ivalues)) => {
self.outs.clear();
let mut builder = SdOpBuilder::with_capacity(ivalues.len());
for ivalue in ivalues {
let doc_indexes = &self.indexes[ivalue.index][ivalue.value as usize];
let set = Set::new_unchecked(doc_indexes);
builder.push(set);
}
builder.$operation().extend_vec(&mut self.outs);
if self.outs.is_empty() { return None }
return Some((input, Set::new_unchecked(&self.outs)))
},
None => None
}
}
}
}}
logical_operation!(struct Union, union);
logical_operation!(struct Intersection, intersection);
logical_operation!(struct Difference, difference);
logical_operation!(struct SymmetricDifference, symmetric_difference);
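Each macro-generated stream walks the union of the underlying fst maps and combines the per-key `DocIndex` sets with the corresponding sdset operation. A hedged sketch of consuming the `Union` stream over two positive blobs; the `OpBuilder` module path is an assumption:

```rust
use fst::Streamer;

use crate::database::blob::PositiveBlob;
use crate::database::blob::positive::OpBuilder; // module path assumed

// Stream the union of two positive blobs: every word present in either
// fst map is yielded once, with its DocIndex sets merged by sdset.
fn print_union(lhs: &PositiveBlob, rhs: &PositiveBlob) {
    let mut stream = OpBuilder::new().add(lhs).add(rhs).union();
    while let Some((word, doc_indexes)) = stream.next() {
        println!("{}: {} doc indexes", String::from_utf8_lossy(word), doc_indexes.len());
    }
}
```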


@ -2,13 +2,13 @@ use std::io::{Cursor, Read, Write};
 use std::mem::size_of;
 use std::fmt;
-use byteorder::{NativeEndian, WriteBytesExt, ReadBytesExt};
+use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};
 use crate::database::schema::SchemaAttr;
 use crate::DocumentId;
 const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
-const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u32>();
+const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>();
 #[derive(Copy, Clone)]
 pub struct DocumentKey([u8; DOC_KEY_LEN]);
@ -19,7 +19,7 @@ impl DocumentKey {
 let mut wtr = Cursor::new(&mut buffer[..]);
 wtr.write_all(b"doc-").unwrap();
-wtr.write_u64::<NativeEndian>(id).unwrap();
+wtr.write_u64::<BigEndian>(id.0).unwrap();
 DocumentKey(buffer)
 }
@ -38,12 +38,17 @@ impl DocumentKey {
 DocumentKeyAttr::new(self.document_id(), attr)
 }
+pub fn with_attribute_min(&self) -> DocumentKeyAttr {
+DocumentKeyAttr::new(self.document_id(), SchemaAttr::min())
+}
 pub fn with_attribute_max(&self) -> DocumentKeyAttr {
 DocumentKeyAttr::new(self.document_id(), SchemaAttr::max())
 }
 pub fn document_id(&self) -> DocumentId {
-(&self.0[4..]).read_u64::<NativeEndian>().unwrap()
+let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
+DocumentId(id)
 }
 }
@ -72,11 +77,19 @@ impl DocumentKeyAttr {
 let mut wtr = Cursor::new(&mut buffer[..]);
 wtr.write_all(&raw_key).unwrap();
 wtr.write_all(b"-").unwrap();
-wtr.write_u32::<NativeEndian>(attr.as_u32()).unwrap();
+wtr.write_u16::<BigEndian>(attr.0).unwrap();
 DocumentKeyAttr(buffer)
 }
+pub fn with_attribute_min(id: DocumentId) -> DocumentKeyAttr {
+DocumentKeyAttr::new(id, SchemaAttr::min())
+}
+pub fn with_attribute_max(id: DocumentId) -> DocumentKeyAttr {
+DocumentKeyAttr::new(id, SchemaAttr::max())
+}
 pub fn from_bytes(mut bytes: &[u8]) -> DocumentKeyAttr {
 assert!(bytes.len() >= DOC_KEY_ATTR_LEN);
 assert_eq!(&bytes[..4], b"doc-");
@ -88,12 +101,13 @@ impl DocumentKeyAttr {
 }
 pub fn document_id(&self) -> DocumentId {
-(&self.0[4..]).read_u64::<NativeEndian>().unwrap()
+let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
+DocumentId(id)
 }
 pub fn attribute(&self) -> SchemaAttr {
 let offset = 4 + size_of::<u64>() + 1;
-let value = (&self.0[offset..]).read_u32::<NativeEndian>().unwrap();
+let value = (&self.0[offset..]).read_u16::<BigEndian>().unwrap();
 SchemaAttr::new(value)
 }
@ -112,7 +126,24 @@ impl fmt::Debug for DocumentKeyAttr {
 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 f.debug_struct("DocumentKeyAttr")
 .field("document_id", &self.document_id())
-.field("attribute", &self.attribute().as_u32())
+.field("attribute", &self.attribute().0)
 .finish()
 }
 }
+#[cfg(test)]
+mod tests {
+use super::*;
+#[test]
+fn keep_as_ref_order() {
+for (a, b) in (0..).zip(1..).take(u16::max_value() as usize - 1) {
+let id = DocumentId(0);
+let a = DocumentKeyAttr::new(id, SchemaAttr(a));
+let b = DocumentKeyAttr::new(id, SchemaAttr(b));
+assert!(a < b);
+assert!(a.as_ref() < b.as_ref());
+}
+}
+}
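The `NativeEndian` to `BigEndian` switch in this file is what makes the new `keep_as_ref_order` test hold: RocksDB compares keys as raw bytes, and only big-endian integer encodings sort byte-wise in the same order as the numbers they encode. A standalone illustration using only the standard library:

```rust
fn main() {
    // Big-endian bytes compare like the integers themselves:
    // 1   -> [0, 0, 0, 0, 0, 0, 0, 1]
    // 256 -> [0, 0, 0, 0, 0, 0, 1, 0]
    assert!(1u64.to_be_bytes() < 256u64.to_be_bytes());

    // Little-endian (the native order on x86) does not:
    // 1   -> [1, 0, 0, 0, 0, 0, 0, 0]
    // 256 -> [0, 1, 0, 0, 0, 0, 0, 0]
    assert!(1u64.to_le_bytes() > 256u64.to_le_bytes());
}
```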

src/database/index/mod.rs (new file, 82 lines)

@ -0,0 +1,82 @@
mod negative;
mod positive;
pub(crate) use self::negative::Negative;
pub(crate) use self::positive::{Positive, PositiveBuilder};
use std::error::Error;
use std::io::Cursor;
use std::sync::Arc;
use fst::{IntoStreamer, Streamer};
use sdset::duo::DifferenceByKey;
use sdset::{Set, SetOperation};
use fst::Map;
use crate::data::{SharedData, DocIndexes};
#[derive(Default)]
pub struct Index {
pub(crate) negative: Negative,
pub(crate) positive: Positive,
}
impl Index {
pub fn from_bytes(bytes: Vec<u8>) -> Result<Index, Box<Error>> {
let len = bytes.len();
Index::from_shared_bytes(Arc::new(bytes), 0, len)
}
pub fn from_shared_bytes(
bytes: Arc<Vec<u8>>,
offset: usize,
len: usize,
) -> Result<Index, Box<Error>>
{
let data = SharedData::new(bytes, offset, len);
let mut cursor = Cursor::new(data);
let negative = Negative::from_cursor(&mut cursor)?;
let positive = Positive::from_cursor(&mut cursor)?;
Ok(Index { negative, positive })
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
self.negative.write_to_bytes(bytes);
self.positive.write_to_bytes(bytes);
}
pub fn merge(&self, other: &Index) -> Result<Index, Box<Error>> {
if other.negative.is_empty() {
let negative = Negative::default();
let positive = self.positive.union(&other.positive)?;
return Ok(Index { negative, positive })
}
let mut buffer = Vec::new();
let mut builder = PositiveBuilder::memory();
let mut stream = self.positive.into_stream();
while let Some((key, indexes)) = stream.next() {
let op = DifferenceByKey::new(indexes, &other.negative, |x| x.document_id, |x| *x);
buffer.clear();
op.extend_vec(&mut buffer);
if !buffer.is_empty() {
let indexes = Set::new_unchecked(&buffer);
builder.insert(key, indexes)?;
}
}
let positive = {
let (map, indexes) = builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Positive::new(map, indexes)
};
let negative = Negative::default();
let positive = positive.union(&other.positive)?;
Ok(Index { negative, positive })
}
}
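`Index::merge` first removes the other index's deleted document ids from the current positive half (via `DifferenceByKey` on `document_id`), then unions what remains with the other positive half. This is essentially what the `data-index` merge operator shown later in this diff does for each RocksDB operand. A sketch of folding serialized segments; the import path relies on the `pub use self::index::Index` re-export that appears further down, and the byte buffers are assumed to come from earlier `write_to_bytes` calls:

```rust
use std::error::Error;

use crate::database::Index; // re-export path assumed

fn fold_indexes(segments: &[Vec<u8>]) -> Result<Index, Box<Error>> {
    let mut merged = Index::default();
    for bytes in segments {
        let operand = Index::from_bytes(bytes.clone())?;
        merged = merged.merge(&operand)?;
    }
    // The result round-trips through the same byte format.
    let mut buffer = Vec::new();
    merged.write_to_bytes(&mut buffer);
    Index::from_bytes(buffer)
}
```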


@ -0,0 +1,43 @@
use std::error::Error;
use std::io::Cursor;
use std::ops::Deref;
use sdset::Set;
use byteorder::{LittleEndian, WriteBytesExt};
use crate::data::SharedData;
use crate::data::DocIds;
use crate::DocumentId;
#[derive(Default)]
pub struct Negative(DocIds);
impl Negative {
pub fn new(doc_ids: DocIds) -> Negative {
Negative(doc_ids)
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Negative, Box<Error>> {
let doc_ids = DocIds::from_cursor(cursor)?;
Ok(Negative(doc_ids))
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let slice = self.0.as_bytes();
let len = slice.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(len);
bytes.extend_from_slice(slice);
}
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
}
impl Deref for Negative {
type Target = Set<DocumentId>;
fn deref(&self) -> &Self::Target {
self.0.as_ref()
}
}
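`Negative` is just the sorted set of deleted `DocumentId`s; `write_to_bytes` frames the raw `DocIds` bytes with a little-endian `u64` length so that `from_cursor` can locate the payload inside a larger shared buffer. A small sketch of reading that frame back (real decoding goes through `DocIds::from_cursor`; this only splits the header from the payload):

```rust
use std::io::Cursor;
use byteorder::{LittleEndian, ReadBytesExt};

// Split a buffer written by `Negative::write_to_bytes` into its
// length header and the raw DocIds payload.
fn negative_payload(bytes: &[u8]) -> &[u8] {
    let mut cursor = Cursor::new(bytes);
    let len = cursor.read_u64::<LittleEndian>().unwrap() as usize;
    let start = cursor.position() as usize;
    &bytes[start..start + len]
}
```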


@ -0,0 +1,166 @@
use std::io::{Write, BufRead, Cursor};
use std::error::Error;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::{map, Map, Streamer, IntoStreamer};
use sdset::{Set, SetOperation};
use sdset::duo::Union;
use fst::raw::Fst;
use crate::data::{DocIndexes, DocIndexesBuilder};
use crate::data::SharedData;
use crate::DocIndex;
#[derive(Default)]
pub struct Positive {
map: Map,
indexes: DocIndexes,
}
impl Positive {
pub fn new(map: Map, indexes: DocIndexes) -> Positive {
Positive { map, indexes }
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Positive, Box<Error>> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let data = cursor.get_ref().range(offset, len);
let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
let map = Map::from(fst);
cursor.consume(len);
let indexes = DocIndexes::from_cursor(cursor)?;
Ok(Positive { map, indexes})
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let slice = self.map.as_fst().as_bytes();
let len = slice.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(len);
bytes.extend_from_slice(slice);
self.indexes.write_to_bytes(bytes);
}
pub fn map(&self) -> &Map {
&self.map
}
pub fn indexes(&self) -> &DocIndexes {
&self.indexes
}
pub fn union(&self, other: &Positive) -> Result<Positive, Box<Error>> {
let mut builder = PositiveBuilder::memory();
let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
let mut buffer = Vec::new();
while let Some((key, ivalues)) = stream.next() {
buffer.clear();
match ivalues {
[a, b] => {
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
let a = Set::new_unchecked(indexes);
let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
let indexes = indexes.get(b.value as usize).ok_or(format!("index not found"))?;
let b = Set::new_unchecked(indexes);
let op = Union::new(a, b);
op.extend_vec(&mut buffer);
},
[a] => {
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
buffer.extend_from_slice(indexes)
},
_ => continue,
}
if !buffer.is_empty() {
let indexes = Set::new_unchecked(&buffer);
builder.insert(key, indexes)?;
}
}
let (map, indexes) = builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Ok(Positive { map, indexes })
}
}
impl<'m, 'a> IntoStreamer<'a> for &'m Positive {
type Item = (&'a [u8], &'a Set<DocIndex>);
/// The type of the stream to be constructed.
type Into = Stream<'m>;
/// Construct a stream from `Self`.
fn into_stream(self) -> Self::Into {
Stream {
map_stream: self.map.into_stream(),
indexes: &self.indexes,
}
}
}
pub struct Stream<'m> {
map_stream: map::Stream<'m>,
indexes: &'m DocIndexes,
}
impl<'m, 'a> Streamer<'a> for Stream<'m> {
type Item = (&'a [u8], &'a Set<DocIndex>);
fn next(&'a mut self) -> Option<Self::Item> {
match self.map_stream.next() {
Some((input, index)) => {
let indexes = &self.indexes[index as usize];
let indexes = Set::new_unchecked(indexes);
Some((input, indexes))
},
None => None,
}
}
}
pub struct PositiveBuilder<W, X> {
map: fst::MapBuilder<W>,
indexes: DocIndexesBuilder<X>,
value: u64,
}
impl PositiveBuilder<Vec<u8>, Vec<u8>> {
pub fn memory() -> Self {
PositiveBuilder {
map: fst::MapBuilder::memory(),
indexes: DocIndexesBuilder::memory(),
value: 0,
}
}
}
impl<W: Write, X: Write> PositiveBuilder<W, X> {
/// If a key is inserted that is less than or equal to any previous key added,
/// then an error is returned. Similarly, if there was a problem writing
/// to the underlying writer, an error is returned.
// FIXME what if one write doesn't work but the other does?
pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
where K: AsRef<[u8]>,
{
self.map.insert(key, self.value)?;
self.indexes.insert(indexes);
self.value += 1;
Ok(())
}
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
let map = self.map.into_inner()?;
let indexes = self.indexes.into_inner()?;
Ok((map, indexes))
}
}
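`Positive::union` streams the union of both fst maps: a key present in both gets its two `DocIndex` sets merged, a key present in only one is copied through, and everything is rebuilt into a fresh map/indexes pair with `PositiveBuilder`. A hedged sketch of building and unioning two `Positive` values; the `DocIndex` sets are taken as parameters so nothing about their construction is assumed, and the crate-internal import paths are assumptions:

```rust
use std::error::Error;

use fst::Map;
use sdset::Set;

use crate::data::DocIndexes;
use crate::database::index::{Positive, PositiveBuilder}; // pub(crate) paths assumed
use crate::DocIndex;

// Build a single-entry Positive from a word and its sorted DocIndex set.
fn single_word(word: &str, indexes: &Set<DocIndex>) -> Result<Positive, Box<Error>> {
    let mut builder = PositiveBuilder::memory();
    builder.insert(word, indexes)?;
    let (map, indexes) = builder.into_inner()?;
    Ok(Positive::new(Map::from_bytes(map)?, DocIndexes::from_bytes(indexes)?))
}

// Keys found in both maps get their DocIndex sets merged,
// keys found in only one side are copied through unchanged.
fn union_example(a: &Positive, b: &Positive) -> Result<Positive, Box<Error>> {
    a.union(b)
}
```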


@ -1,66 +1,128 @@
use std::sync::{Arc, Mutex, RwLock, RwLockReadGuard}; use std::sync::Arc;
use std::error::Error; use std::error::Error;
use std::path::Path; use std::ffi::OsStr;
use std::ops::Deref; use std::fs;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::ops::{Deref, DerefMut};
use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions}; use crossbeam::atomic::ArcCell;
use log::{info, error, warn};
use rocksdb::rocksdb::{Writable, Snapshot}; use rocksdb::rocksdb::{Writable, Snapshot};
use rocksdb::{DB, DBVector, MergeOperands}; use rocksdb::rocksdb_options::{DBOptions, ColumnFamilyOptions};
use rocksdb::{DB, MergeOperands};
use lockfree::map::Map;
pub use self::document_key::{DocumentKey, DocumentKeyAttr}; pub use self::document_key::{DocumentKey, DocumentKeyAttr};
pub use self::database_view::{DatabaseView, DocumentIter}; pub use self::view::{DatabaseView, DocumentIter};
use self::blob::positive::PositiveBlob; pub use self::update::Update;
use self::update::Update; pub use self::serde::SerializerError;
use self::schema::Schema; pub use self::schema::Schema;
use self::blob::Blob; pub use self::index::Index;
pub mod blob;
pub mod schema;
pub mod update;
mod document_key;
mod database_view;
mod deserializer;
const DATA_INDEX: &[u8] = b"data-index"; const DATA_INDEX: &[u8] = b"data-index";
const DATA_SCHEMA: &[u8] = b"data-schema"; const DATA_SCHEMA: &[u8] = b"data-schema";
pub fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>> pub mod schema;
pub(crate) mod index;
mod document_key;
mod serde;
mod update;
mod view;
fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
where D: Deref<Target=DB> where D: Deref<Target=DB>
{ {
match snapshot.get(DATA_SCHEMA)? { match snapshot.get(DATA_SCHEMA)? {
Some(vector) => Ok(Schema::read_from(&*vector)?), Some(vector) => Ok(Schema::read_from_bin(&*vector)?),
None => Err(String::from("BUG: no schema found in the database").into()), None => Err(String::from("BUG: no schema found in the database").into()),
} }
} }
pub fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<PositiveBlob, Box<Error>> fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<Index, Box<Error>>
where D: Deref<Target=DB> where D: Deref<Target=DB>
{ {
match snapshot.get(DATA_INDEX)? { let (elapsed, vector) = elapsed::measure_time(|| snapshot.get(DATA_INDEX));
Some(vector) => Ok(bincode::deserialize(&*vector)?), info!("loading index from kv-store took {}", elapsed);
None => Ok(PositiveBlob::default()),
let index = match vector? {
Some(vector) => {
let bytes = vector.as_ref().to_vec();
info!("index size if {} MiB", bytes.len() / 1024 / 1024);
let (elapsed, index) = elapsed::measure_time(|| Index::from_bytes(bytes));
info!("loading index from bytes took {}", elapsed);
index?
},
None => Index::default(),
};
Ok(index)
}
fn merge_indexes(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
assert_eq!(key, DATA_INDEX, "The merge operator only supports \"data-index\" merging");
let mut index: Option<Index> = None;
for bytes in existing.into_iter().chain(operands) {
let operand = Index::from_bytes(bytes.to_vec()).unwrap();
let merged = match index {
Some(ref index) => index.merge(&operand).unwrap(),
None => operand,
};
index.replace(merged);
}
let index = index.unwrap_or_default();
let mut bytes = Vec::new();
index.write_to_bytes(&mut bytes);
bytes
}
pub struct IndexUpdate {
index: String,
update: Update,
}
impl Deref for IndexUpdate {
type Target = Update;
fn deref(&self) -> &Update {
&self.update
} }
} }
pub struct Database { impl DerefMut for IndexUpdate {
// DB is under a Mutex to sync update ingestions and separate DB update locking fn deref_mut(&mut self) -> &mut Update {
// and DatabaseView acquiring locking in other words: &mut self.update
// "Block readers the minimum possible amount of time" }
db: Mutex<Arc<DB>>,
// This view is updated each time the DB ingests an update
view: RwLock<DatabaseView<Arc<DB>>>,
} }
impl Database { struct DatabaseIndex {
pub fn create<P: AsRef<Path>>(path: P, schema: Schema) -> Result<Database, Box<Error>> { db: Arc<DB>,
// This view is updated each time the DB ingests an update
view: ArcCell<DatabaseView<Arc<DB>>>,
// This path is the path to the mdb folder stored on disk
path: PathBuf,
// must_die is false by default and is set to true when the user requests the index deletion.
// When it is true, the mdb folder saved on disk is erased when this DatabaseIndex is dropped.
must_die: AtomicBool,
}
impl DatabaseIndex {
fn create<P: AsRef<Path>>(path: P, schema: &Schema) -> Result<DatabaseIndex, Box<Error>> {
let path = path.as_ref(); let path = path.as_ref();
if path.exists() { if path.exists() {
return Err(format!("File already exists at path: {}, cannot create database.", return Err(format!("File already exists at path: {}, cannot create database.",
path.display()).into()) path.display()).into())
} }
let path = path.to_string_lossy(); let path_lossy = path.to_string_lossy();
let mut opts = DBOptions::new(); let mut opts = DBOptions::new();
opts.create_if_missing(true); opts.create_if_missing(true);
// opts.error_if_exists(true); // FIXME pull request that // opts.error_if_exists(true); // FIXME pull request that
@ -68,21 +130,27 @@ impl Database {
let mut cf_opts = ColumnFamilyOptions::new(); let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data-index merge operator", merge_indexes); cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?; let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;
let mut schema_bytes = Vec::new(); let mut schema_bytes = Vec::new();
schema.write_to(&mut schema_bytes)?; schema.write_to_bin(&mut schema_bytes)?;
db.put(DATA_SCHEMA, &schema_bytes)?; db.put(DATA_SCHEMA, &schema_bytes)?;
let db = Arc::new(db); let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone()); let snapshot = Snapshot::new(db.clone());
let view = RwLock::new(DatabaseView::new(snapshot)?); let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
Ok(Database { db: Mutex::new(db), view })
Ok(DatabaseIndex {
db: db,
view: view,
path: path.to_path_buf(),
must_die: AtomicBool::new(false)
})
} }
pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> { fn open<P: AsRef<Path>>(path: P) -> Result<DatabaseIndex, Box<Error>> {
let path = path.as_ref().to_string_lossy(); let path_lossy = path.as_ref().to_string_lossy();
let mut opts = DBOptions::new(); let mut opts = DBOptions::new();
opts.create_if_missing(false); opts.create_if_missing(false);
@ -90,170 +158,639 @@ impl Database {
let mut cf_opts = ColumnFamilyOptions::new(); let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data-index merge operator", merge_indexes); cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?; let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;
// FIXME create a generic function to do that ! // FIXME create a generic function to do that !
let _schema = match db.get(DATA_SCHEMA)? { let _schema = match db.get(DATA_SCHEMA)? {
Some(value) => Schema::read_from(&*value)?, Some(value) => Schema::read_from_bin(&*value)?,
None => return Err(String::from("Database does not contain a schema").into()), None => return Err(String::from("Database does not contain a schema").into()),
}; };
let db = Arc::new(db); let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone()); let snapshot = Snapshot::new(db.clone());
let view = RwLock::new(DatabaseView::new(snapshot)?); let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
Ok(Database { db: Mutex::new(db), view }) Ok(DatabaseIndex {
db: db,
view: view,
path: path.as_ref().to_path_buf(),
must_die: AtomicBool::new(false)
})
} }
pub fn ingest_update_file(&self, update: Update) -> Result<(), Box<Error>> { fn must_die(&self) {
let snapshot = { self.must_die.store(true, Ordering::Relaxed)
// We must have a mutex here to ensure that update ingestions and compactions }
// are done atomatically and in the right order.
// This way update ingestions will block other update ingestions without blocking view fn start_update(&self) -> Result<Update, Box<Error>> {
// creations while doing the "data-index" compaction let schema = match self.db.get(DATA_SCHEMA)? {
let db = match self.db.lock() { Some(value) => Schema::read_from_bin(&*value)?,
None => panic!("Database does not contain a schema"),
};
Ok(Update::new(schema))
}
fn commit_update(&self, update: Update) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
let batch = update.build()?;
self.db.write(batch)?;
let snapshot = Snapshot::new(self.db.clone());
let view = Arc::new(DatabaseView::new(snapshot)?);
self.view.set(view.clone());
Ok(view)
}
fn view(&self) -> Arc<DatabaseView<Arc<DB>>> {
self.view.get()
}
}
impl Drop for DatabaseIndex {
fn drop(&mut self) {
if self.must_die.load(Ordering::Relaxed) {
if let Err(err) = fs::remove_dir_all(&self.path) {
error!("Impossible to remove mdb when Database id dropped; {}", err);
}
}
}
}
pub struct Database {
indexes: Map<String, Arc<DatabaseIndex>>,
path: PathBuf,
}
impl Database {
pub fn create<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
Ok(Database {
indexes: Map::new(),
path: path.as_ref().to_path_buf(),
})
}
pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
let entries = fs::read_dir(&path)?;
let indexes = Map::new();
for entry in entries {
let path = match entry {
Ok(p) => p.path(),
Err(err) => {
warn!("Impossible to retrieve the path from an entry; {}", err);
continue
}
};
let name = match path.file_stem().and_then(OsStr::to_str) {
Some(name) => name.to_owned(),
None => continue
};
let db = match DatabaseIndex::open(path.clone()) {
Ok(db) => db, Ok(db) => db,
Err(e) => return Err(e.to_string().into()), Err(err) => {
}; warn!("Impossible to open the database; {}", err);
continue
let move_update = update.can_be_moved();
let path = update.into_path_buf();
let path = path.to_string_lossy();
let mut options = IngestExternalFileOptions::new();
options.move_files(move_update);
let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
db.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;
// Compacting to trigger the merge operator only one time
// while ingesting the update and not each time searching
db.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
Snapshot::new(db.clone())
};
// Here we will block the view creation for the minimum amount of time:
// updating the DatabaseView itself with the new database snapshot
let view = DatabaseView::new(snapshot)?;
match self.view.write() {
Ok(mut lock) => *lock = view,
Err(e) => return Err(e.to_string().into()),
} }
};
info!("Load database {}", name);
indexes.insert(name, Arc::new(db));
}
Ok(Database {
indexes: indexes,
path: path.as_ref().to_path_buf(),
})
}
pub fn create_index(&self, name: &str, schema: &Schema) -> Result<(), Box<Error>> {
let index_path = self.path.join(name);
if index_path.exists() {
return Err("Index already exists".into());
}
let index = DatabaseIndex::create(index_path, schema)?;
self.indexes.insert(name.to_owned(), Arc::new(index));
Ok(()) Ok(())
} }
pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> { pub fn delete_index(&self, name: &str) -> Result<(), Box<Error>> {
self.view().get(key) let index_guard = self.indexes.remove(name).ok_or("Index not found")?;
index_guard.val().must_die();
Ok(())
} }
pub fn flush(&self) -> Result<(), Box<Error>> { pub fn list_indexes(&self) -> Vec<String> {
match self.db.lock() { self.indexes.iter().map(|g| g.key().clone()).collect()
Ok(db) => Ok(db.flush(true)?),
Err(e) => Err(e.to_string().into()),
}
} }
pub fn view(&self) -> RwLockReadGuard<DatabaseView<Arc<DB>>> { pub fn start_update(&self, index: &str) -> Result<IndexUpdate, Box<Error>> {
self.view.read().unwrap() let index_guard = self.indexes.get(index).ok_or("Index not found")?;
} let update = index_guard.val().start_update()?;
}
fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> { Ok(IndexUpdate { index: index.to_owned(), update })
if key != DATA_INDEX {
panic!("The merge operator only supports \"data-index\" merging")
} }
let capacity = { pub fn commit_update(&self, update: IndexUpdate)-> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
let remaining = operands.size_hint().0; let index_guard = self.indexes.get(&update.index).ok_or("Index not found")?;
let already_exist = usize::from(existing_value.is_some());
remaining + already_exist
};
let mut op = blob::OpBuilder::with_capacity(capacity); index_guard.val().commit_update(update.update)
if let Some(existing_value) = existing_value {
let blob = bincode::deserialize(existing_value).expect("BUG: could not deserialize data-index");
op.push(Blob::Positive(blob));
} }
for bytes in operands { pub fn view(&self, index: &str) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
let blob = bincode::deserialize(bytes).expect("BUG: could not deserialize blob"); let index_guard = self.indexes.get(index).ok_or("Index not found")?;
op.push(blob);
Ok(index_guard.val().view())
} }
let blob = op.merge().expect("BUG: could not merge blobs");
bincode::serialize(&blob).expect("BUG: could not serialize merged blob")
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use std::collections::HashSet;
use std::error::Error; use std::error::Error;
use serde_derive::{Serialize, Deserialize}; use serde_derive::{Serialize, Deserialize};
use tempfile::tempdir;
use crate::tokenizer::DefaultBuilder;
use crate::database::update::PositiveUpdateBuilder;
use crate::database::schema::{SchemaBuilder, STORED, INDEXED}; use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
use crate::tokenizer::DefaultBuilder;
use super::*;
#[test] #[test]
fn ingest_update_file() -> Result<(), Box<Error>> { fn ingest_one_easy_update() -> Result<(), Box<Error>> {
let dir = tempdir()?; let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let rocksdb_path = dir.path().join("rocksdb.rdb"); let meilidb_path = dir.path().join("meilidb.mdb");
let meilidb_index_name = "default";
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct SimpleDoc { struct SimpleDoc {
id: u64,
title: String, title: String,
description: String, description: String,
timestamp: u64, timestamp: u64,
} }
let schema = { let schema = {
let mut builder = SchemaBuilder::new(); let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("id", STORED);
builder.new_attribute("title", STORED | INDEXED); builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED); builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED); builder.new_attribute("timestamp", STORED);
builder.build() builder.build()
}; };
let database = Database::create(&rocksdb_path, schema.clone())?; let database = Database::create(&meilidb_path)?;
let tokenizer_builder = DefaultBuilder::new();
let update_path = dir.path().join("update.sst"); database.create_index(meilidb_index_name, &schema)?;
let doc0 = SimpleDoc { let doc0 = SimpleDoc {
id: 0,
title: String::from("I am a title"), title: String::from("I am a title"),
description: String::from("I am a description"), description: String::from("I am a description"),
timestamp: 1234567, timestamp: 1234567,
}; };
let doc1 = SimpleDoc { let doc1 = SimpleDoc {
id: 1,
title: String::from("I am the second title"), title: String::from("I am the second title"),
description: String::from("I am the second description"), description: String::from("I am the second description"),
timestamp: 7654321, timestamp: 7654321,
}; };
let mut update = { let tokenizer_builder = DefaultBuilder::new();
let mut builder = PositiveUpdateBuilder::new(update_path, schema, tokenizer_builder); let mut builder = database.start_update(meilidb_index_name)?;
builder.update(0, &doc0).unwrap(); let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
builder.update(1, &doc1).unwrap(); let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
builder.build()? let view = database.commit_update(builder)?;
};
update.set_move(true); let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
database.ingest_update_file(update)?; let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
let view = database.view();
let de_doc0: SimpleDoc = view.retrieve_document(0)?;
let de_doc1: SimpleDoc = view.retrieve_document(1)?;
assert_eq!(doc0, de_doc0); assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1); assert_eq!(doc1, de_doc1);
Ok(dir.close()?) Ok(dir.close()?)
} }
#[test]
fn ingest_two_easy_updates() -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let meilidb_path = dir.path().join("meilidb.mdb");
let meilidb_index_name = "default";
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct SimpleDoc {
id: u64,
title: String,
description: String,
timestamp: u64,
}
let schema = {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("id", STORED);
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED);
builder.build()
};
let database = Database::create(&meilidb_path)?;
database.create_index(meilidb_index_name, &schema)?;
let doc0 = SimpleDoc {
id: 0,
title: String::from("I am a title"),
description: String::from("I am a description"),
timestamp: 1234567,
};
let doc1 = SimpleDoc {
id: 1,
title: String::from("I am the second title"),
description: String::from("I am the second description"),
timestamp: 7654321,
};
let doc2 = SimpleDoc {
id: 2,
title: String::from("I am the third title"),
description: String::from("I am the third description"),
timestamp: 7654321,
};
let doc3 = SimpleDoc {
id: 3,
title: String::from("I am the fourth title"),
description: String::from("I am the fourth description"),
timestamp: 7654321,
};
let tokenizer_builder = DefaultBuilder::new();
let mut builder = database.start_update(meilidb_index_name)?;
let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
database.commit_update(builder)?;
let mut builder = database.start_update(meilidb_index_name)?;
let docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
let docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
let view = database.commit_update(builder)?;
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1);
let de_doc2: SimpleDoc = view.document_by_id(docid2)?;
let de_doc3: SimpleDoc = view.document_by_id(docid3)?;
assert_eq!(doc2, de_doc2);
assert_eq!(doc3, de_doc3);
Ok(dir.close()?)
}
}
#[cfg(all(feature = "nightly", test))]
mod bench {
extern crate test;
use std::collections::HashSet;
use std::error::Error;
use std::iter::repeat_with;
use self::test::Bencher;
use rand::distributions::Alphanumeric;
use rand_xorshift::XorShiftRng;
use rand::{Rng, SeedableRng};
use serde_derive::Serialize;
use rand::seq::SliceRandom;
use crate::tokenizer::DefaultBuilder;
use crate::database::schema::*;
use super::*;
fn random_sentences<R: Rng>(number: usize, rng: &mut R) -> String {
let mut words = String::new();
for i in 0..number {
let word_len = rng.gen_range(1, 12);
let iter = repeat_with(|| rng.sample(Alphanumeric)).take(word_len);
words.extend(iter);
if i == number - 1 { // last word
let final_ = [".", "?", "!", "..."].choose(rng).cloned();
words.extend(final_);
} else {
let middle = [",", ", "].choose(rng).cloned();
words.extend(middle);
}
}
words
}
#[bench]
fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..300 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
database.commit_update(builder)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..3000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
database.commit_update(builder)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
#[ignore]
fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..30_000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
database.commit_update(builder)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..300 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let view = database.commit_update(builder)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().unwrap().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
#[bench]
fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..3000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let view = database.commit_update(builder)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().unwrap().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
#[bench]
#[ignore]
fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..30_000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let view = database.commit_update(builder)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().unwrap().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
} }
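Because each `DatabaseIndex` keeps its latest `DatabaseView` in an `ArcCell`, readers take a cheap `Arc` snapshot while `commit_update` swaps in the next one, so searching never blocks an ongoing update. A small usage sketch against the new multi-index API shown above; the index name and query string are placeholders:

```rust
use std::error::Error;

use crate::database::Database;

fn query_latest_snapshot(database: &Database) -> Result<(), Box<Error>> {
    // `view` clones the Arc stored in the index's ArcCell: it is the snapshot
    // taken by the last committed update and stays valid even if another
    // update is committed while we hold it.
    let view = database.view("default")?;
    let documents = view.query_builder().unwrap().query("second title", 0..20);
    drop(documents);
    Ok(())
}
```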


@ -1,29 +1,36 @@
use std::collections::{HashMap, BTreeMap}; use std::collections::{HashMap, BTreeMap};
use std::io::{Read, Write}; use std::io::{Read, Write};
use std::{fmt, u32}; use std::error::Error;
use std::path::Path; use std::{fmt, u16};
use std::ops::BitOr; use std::ops::BitOr;
use std::sync::Arc; use std::sync::Arc;
use std::fs::File;
use serde_derive::{Serialize, Deserialize}; use serde_derive::{Serialize, Deserialize};
use linked_hash_map::LinkedHashMap; use linked_hash_map::LinkedHashMap;
use serde::Serialize;
use crate::database::serde::find_id::FindDocumentIdSerializer;
use crate::database::serde::SerializerError;
use crate::DocumentId;
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false }; pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false };
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true }; pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true };
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps { pub struct SchemaProps {
#[serde(default)]
stored: bool, stored: bool,
#[serde(default)]
indexed: bool, indexed: bool,
} }
impl SchemaProps { impl SchemaProps {
pub fn is_stored(&self) -> bool { pub fn is_stored(self) -> bool {
self.stored self.stored
} }
pub fn is_indexed(&self) -> bool { pub fn is_indexed(self) -> bool {
self.indexed self.indexed
} }
} }
@ -39,33 +46,39 @@ impl BitOr for SchemaProps {
} }
} }
#[derive(Serialize, Deserialize)]
pub struct SchemaBuilder { pub struct SchemaBuilder {
attrs: LinkedHashMap<String, SchemaProps>, identifier: String,
attributes: LinkedHashMap<String, SchemaProps>,
} }
impl SchemaBuilder { impl SchemaBuilder {
pub fn new() -> SchemaBuilder { pub fn with_identifier<S: Into<String>>(name: S) -> SchemaBuilder {
SchemaBuilder { attrs: LinkedHashMap::new() } SchemaBuilder {
identifier: name.into(),
attributes: LinkedHashMap::new(),
}
} }
pub fn new_attribute<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr { pub fn new_attribute<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr {
let len = self.attrs.len(); let len = self.attributes.len();
if self.attrs.insert(name.into(), props).is_some() { if self.attributes.insert(name.into(), props).is_some() {
panic!("Field already inserted.") panic!("Field already inserted.")
} }
SchemaAttr(len as u32) SchemaAttr(len as u16)
} }
pub fn build(self) -> Schema { pub fn build(self) -> Schema {
let mut attrs = HashMap::new(); let mut attrs = HashMap::new();
let mut props = Vec::new(); let mut props = Vec::new();
for (i, (name, prop)) in self.attrs.into_iter().enumerate() { for (i, (name, prop)) in self.attributes.into_iter().enumerate() {
attrs.insert(name.clone(), SchemaAttr(i as u32)); attrs.insert(name.clone(), SchemaAttr(i as u16));
props.push((name, prop)); props.push((name, prop));
} }
Schema { inner: Arc::new(InnerSchema { attrs, props }) } let identifier = self.identifier;
Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) }
} }
} }
@ -76,69 +89,124 @@ pub struct Schema {
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
struct InnerSchema { struct InnerSchema {
identifier: String,
attrs: HashMap<String, SchemaAttr>, attrs: HashMap<String, SchemaAttr>,
props: Vec<(String, SchemaProps)>, props: Vec<(String, SchemaProps)>,
} }
impl Schema { impl Schema {
pub fn open<P: AsRef<Path>>(path: P) -> bincode::Result<Schema> { pub fn from_toml<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
let file = File::open(path)?; let mut buffer = Vec::new();
Schema::read_from(file) reader.read_to_end(&mut buffer)?;
} let builder: SchemaBuilder = toml::from_slice(&buffer)?;
pub fn read_from<R: Read>(reader: R) -> bincode::Result<Schema> {
let attrs = bincode::deserialize_from(reader)?;
let builder = SchemaBuilder { attrs };
Ok(builder.build()) Ok(builder.build())
} }
pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> { pub fn to_toml<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
let string = toml::to_string_pretty(&builder)?;
writer.write_all(string.as_bytes())?;
Ok(())
}
pub fn from_json<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
let mut buffer = Vec::new();
reader.read_to_end(&mut buffer)?;
let builder: SchemaBuilder = serde_json::from_slice(&buffer)?;
Ok(builder.build())
}
pub fn to_json<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
let string = serde_json::to_string_pretty(&builder)?;
writer.write_all(string.as_bytes())?;
Ok(())
}
pub(crate) fn read_from_bin<R: Read>(reader: R) -> bincode::Result<Schema> {
let builder: SchemaBuilder = bincode::deserialize_from(reader)?;
Ok(builder.build())
}
pub(crate) fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
bincode::serialize_into(writer, &builder)
}
fn attributes_ordered(&self) -> LinkedHashMap<String, SchemaProps> {
let mut ordered = BTreeMap::new(); let mut ordered = BTreeMap::new();
for (name, field) in &self.inner.attrs { for (name, attr) in &self.inner.attrs {
let index = field.as_u32(); let (_, props) = self.inner.props[attr.0 as usize];
let (_, props) = self.inner.props[index as usize]; ordered.insert(attr.0, (name, props));
ordered.insert(index, (name, props));
} }
let mut attrs = LinkedHashMap::with_capacity(ordered.len()); let mut attributes = LinkedHashMap::with_capacity(ordered.len());
for (_, (name, props)) in ordered { for (_, (name, props)) in ordered {
attrs.insert(name, props); attributes.insert(name.clone(), props);
} }
bincode::serialize_into(writer, &attrs) attributes
}
pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
where T: Serialize,
{
let id_attribute_name = &self.inner.identifier;
let serializer = FindDocumentIdSerializer { id_attribute_name };
document.serialize(serializer)
} }
pub fn props(&self, attr: SchemaAttr) -> SchemaProps { pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
let index = attr.as_u32(); let (_, props) = self.inner.props[attr.0 as usize];
let (_, props) = self.inner.props[index as usize];
props props
} }
pub fn identifier_name(&self) -> &str {
&self.inner.identifier
}
pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> { pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> {
self.inner.attrs.get(name.as_ref()).cloned() self.inner.attrs.get(name.as_ref()).cloned()
} }
pub fn attribute_name(&self, attr: SchemaAttr) -> &str { pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
let index = attr.as_u32(); let (name, _) = &self.inner.props[attr.0 as usize];
let (name, _) = &self.inner.props[index as usize];
name name
} }
} }
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)] #[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
pub struct SchemaAttr(u32); pub struct SchemaAttr(pub(crate) u16);
impl SchemaAttr { impl SchemaAttr {
pub fn new(value: u32) -> SchemaAttr { pub fn new(value: u16) -> SchemaAttr {
SchemaAttr(value) SchemaAttr(value)
} }
pub fn max() -> SchemaAttr { pub fn min() -> SchemaAttr {
SchemaAttr(u32::MAX) SchemaAttr(0)
} }
pub fn as_u32(&self) -> u32 { pub fn next(self) -> Option<SchemaAttr> {
self.0 self.0.checked_add(1).map(SchemaAttr)
}
pub fn prev(self) -> Option<SchemaAttr> {
self.0.checked_sub(1).map(SchemaAttr)
}
pub fn max() -> SchemaAttr {
SchemaAttr(u16::MAX)
} }
} }
@ -151,22 +219,92 @@ impl fmt::Display for SchemaAttr {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use std::error::Error;
#[test] #[test]
fn serialize_deserialize() -> bincode::Result<()> { fn serialize_deserialize() -> bincode::Result<()> {
let mut builder = SchemaBuilder::new(); let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alphabet", STORED); builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED); builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("gamma", INDEXED); builder.new_attribute("gamma", INDEXED);
let schema = builder.build(); let schema = builder.build();
let mut buffer = Vec::new(); let mut buffer = Vec::new();
schema.write_to(&mut buffer)?; schema.write_to_bin(&mut buffer)?;
let schema2 = Schema::read_from(buffer.as_slice())?; let schema2 = Schema::read_from_bin(buffer.as_slice())?;
assert_eq!(schema, schema2); assert_eq!(schema, schema2);
Ok(()) Ok(())
} }
#[test]
fn serialize_deserialize_toml() -> Result<(), Box<Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut buffer = Vec::new();
schema.to_toml(&mut buffer)?;
let schema2 = Schema::from_toml(buffer.as_slice())?;
assert_eq!(schema, schema2);
let data = r#"
identifier = "id"
[attributes."alpha"]
stored = true
[attributes."beta"]
stored = true
indexed = true
[attributes."gamma"]
indexed = true
"#;
let schema2 = Schema::from_toml(data.as_bytes())?;
assert_eq!(schema, schema2);
Ok(())
}
#[test]
fn serialize_deserialize_json() -> Result<(), Box<Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut buffer = Vec::new();
schema.to_json(&mut buffer)?;
let schema2 = Schema::from_json(buffer.as_slice())?;
assert_eq!(schema, schema2);
let data = r#"
{
"identifier": "id",
"attributes": {
"alpha": {
"stored": true
},
"beta": {
"stored": true,
"indexed": true
},
"gamma": {
"indexed": true
}
}
}"#;
let schema2 = Schema::from_json(data.as_bytes())?;
assert_eq!(schema, schema2);
Ok(())
}
} }
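A usage sketch built only from the API exercised by these tests (the attribute names are illustrative): build a schema, resolve an attribute, then inspect its properties:

let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("year", STORED);
let schema = builder.build();

// attribute() maps the name to its SchemaAttr number, props() returns its flags
let attr = schema.attribute("title").expect("unknown attribute");
let props = schema.props(attr);
assert!(props.is_stored() && props.is_indexed());
assert_eq!(schema.attribute_name(attr), "title");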


@ -0,0 +1,243 @@
use serde::Serialize;
use serde::ser;
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::serde::{SerializerError, calculate_hash};
use crate::DocumentId;
pub struct FindDocumentIdSerializer<'a> {
pub id_attribute_name: &'a str,
}
impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
type Ok = DocumentId;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = FindDocumentIdMapSerializer<'a>;
type SerializeStruct = FindDocumentIdStructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(FindDocumentIdMapSerializer {
id_attribute_name: self.id_attribute_name,
document_id: None,
current_key_name: None,
})
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(FindDocumentIdStructSerializer {
id_attribute_name: self.id_attribute_name,
document_id: None,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
pub struct FindDocumentIdMapSerializer<'a> {
id_attribute_name: &'a str,
document_id: Option<DocumentId>,
current_key_name: Option<String>,
}
impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> {
type Ok = DocumentId;
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V
) -> Result<(), Self::Error>
where K: Serialize, V: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
if self.id_attribute_name == key {
// TODO is it possible to have multiple ids?
let id = bincode::serialize(value).unwrap();
let hash = calculate_hash(&id);
self.document_id = Some(DocumentId(hash));
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
match self.document_id {
Some(document_id) => Ok(document_id),
None => Err(SerializerError::DocumentIdNotFound)
}
}
}
pub struct FindDocumentIdStructSerializer<'a> {
id_attribute_name: &'a str,
document_id: Option<DocumentId>,
}
impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
type Ok = DocumentId;
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
if self.id_attribute_name == key {
// TODO can it be possible to have multiple ids?
let id = bincode::serialize(value).unwrap();
let hash = calculate_hash(&id);
self.document_id = Some(DocumentId(hash));
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
match self.document_id {
Some(document_id) => Ok(document_id),
None => Err(SerializerError::DocumentIdNotFound)
}
}
}
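To illustrate what this serializer is for (the Movie struct and its values are hypothetical): Schema::document_id drives it over a document, finds the identifier field declared in the schema, and hashes its bincode representation into a DocumentId:

use serde::Serialize;

#[derive(Serialize)]
struct Movie { id: u64, title: String }

let movie = Movie { id: 42, title: "Afterglow".to_string() };
// assumes the schema was built with SchemaBuilder::with_identifier("id")
let document_id = schema.document_id(&movie).expect("no identifier field");
// internally: DocumentId(calculate_hash(&bincode::serialize(&42u64).unwrap()))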


@ -0,0 +1,191 @@
use std::collections::HashSet;
use serde::Serialize;
use serde::ser;
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::Token;
use crate::{DocumentId, DocIndex};
pub struct IndexerSerializer<'a, 'b, B> {
pub tokenizer_builder: &'a B,
pub update: &'a mut DocumentUpdate<'b>,
pub document_id: DocumentId,
pub attribute: SchemaAttr,
pub stop_words: &'a HashSet<String>,
}
impl<'a, 'b, B> ser::Serializer for IndexerSerializer<'a, 'b, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for token in self.tokenizer_builder.build(v) {
let Token { word, word_index, char_index } = token;
let document_id = self.document_id;
// FIXME: should use u32::try_from instead of the `as` casts below
let attribute = self.attribute.0;
let word_index = word_index as u32;
// insert the exact representation
let word_lower = word.to_lowercase();
let length = word.chars().count() as u16;
if self.stop_words.contains(&word_lower) { continue }
// and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
let char_index = char_index as u32;
let char_length = length;
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index)?;
}
let char_index = char_index as u32;
let char_length = length;
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
self.update.insert_doc_index(word_lower.into_bytes(), doc_index)?;
}
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "seq" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
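A rough illustration of the double insertion performed by serialize_str above (the word_index and char_index values depend on the TokenizerBuilder in use, so the entries are shown only as comments):

// indexing the value "Héllo" for some attribute would insert two entries:
//   "héllo" (lowercased exact form)  -> DocIndex { document_id, attribute, word_index, char_index, char_length: 5 }
//   "hello" (unidecoded lowercased)  -> the same DocIndex, added only because it differs from "héllo"
// stop words are skipped entirely, before either insertion happens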


@ -0,0 +1,146 @@
use serde::Serialize;
use serde::ser;
use crate::database::serde::SerializerError;
pub struct KeyToStringSerializer;
impl ser::Serializer for KeyToStringSerializer {
type Ok = String;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
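In short, this serializer turns map keys into owned Strings and rejects everything else; a small sketch of the expected behaviour:

use serde::Serialize;

// a &str key is simply copied into a String
let key = "title".serialize(KeyToStringSerializer).unwrap();
assert_eq!(key, "title");

// non-string keys (numbers, options, sequences, ...) produce an UnserializableType error
assert!(42u32.serialize(KeyToStringSerializer).is_err());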

src/database/serde/mod.rs Normal file

@ -0,0 +1,64 @@
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::error::Error;
use std::fmt;
use serde::ser;
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "$ty" })
}
)*
}
}
pub mod find_id;
pub mod key_to_string;
pub mod serializer;
pub mod indexer_serializer;
pub mod deserializer;
pub fn calculate_hash<T: Hash>(t: &T) -> u64 {
let mut s = DefaultHasher::new();
t.hash(&mut s);
s.finish()
}
#[derive(Debug)]
pub enum SerializerError {
DocumentIdNotFound,
UnserializableType { name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::DocumentIdNotFound => {
write!(f, "serialized document does not have an id according to the schema")
}
SerializerError::UnserializableType { name } => {
write!(f, "Only struct and map types are considered valid documents and
can be serialized, not {} types directly.", name)
},
SerializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for SerializerError {}
impl From<String> for SerializerError {
fn from(value: String) -> SerializerError {
SerializerError::Custom(value)
}
}
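For context, calculate_hash is what produces the u64 wrapped by DocumentId; a small check (the exact value is unspecified because DefaultHasher gives no cross-release guarantees):

let raw = bincode::serialize(&42u64).unwrap();
let a = calculate_hash(&raw);
let b = calculate_hash(&raw);
// DefaultHasher::new() uses fixed keys, so equal inputs hash equally within a program
assert_eq!(a, b);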


@ -0,0 +1,287 @@
use std::collections::HashSet;
use serde::Serialize;
use serde::ser;
use crate::database::serde::indexer_serializer::IndexerSerializer;
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::schema::Schema;
use crate::DocumentId;
pub struct Serializer<'a, 'b, B> {
pub schema: &'a Schema,
pub update: &'a mut DocumentUpdate<'b>,
pub document_id: DocumentId,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, 'b, B> ser::Serializer for Serializer<'a, 'b, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapSerializer<'a, 'b, B>;
type SerializeStruct = StructSerializer<'a, 'b, B>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer {
schema: self.schema,
document_id: self.document_id,
update: self.update,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
current_key_name: None,
})
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
update: self.update,
document_id: self.document_id,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
pub struct MapSerializer<'a, 'b, B> {
pub schema: &'a Schema,
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate<'b>,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
pub current_key_name: Option<String>,
}
impl<'a, 'b, B> ser::SerializeMap for MapSerializer<'a, 'b, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V,
) -> Result<(), Self::Error>
where K: Serialize, V: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
if let Some(attr) = self.schema.attribute(key) {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
self.update.insert_attribute_value(attr, &value)?;
}
if props.is_indexed() {
let serializer = IndexerSerializer {
update: self.update,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
pub struct StructSerializer<'a, 'b, B> {
pub schema: &'a Schema,
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate<'b>,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, 'b, B> ser::SerializeStruct for StructSerializer<'a, 'b, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
if let Some(attr) = self.schema.attribute(key) {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
self.update.insert_attribute_value(attr, &value)?;
}
if props.is_indexed() {
let serializer = IndexerSerializer {
update: self.update,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}

src/database/update.rs Normal file

@ -0,0 +1,198 @@
use std::collections::{HashSet, BTreeMap};
use std::error::Error;
use rocksdb::rocksdb::{Writable, WriteBatch};
use hashbrown::hash_map::HashMap;
use serde::Serialize;
use fst::map::Map;
use sdset::Set;
use crate::database::index::{Positive, PositiveBuilder, Negative};
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::database::serde::serializer::Serializer;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::tokenizer::TokenizerBuilder;
use crate::data::{DocIds, DocIndexes};
use crate::database::schema::Schema;
use crate::database::index::Index;
use crate::{DocumentId, DocIndex};
use crate::database::DATA_INDEX;
pub type Token = Vec<u8>; // TODO could be replaced by a SmallVec
pub struct Update {
schema: Schema,
raw_builder: RawUpdateBuilder,
}
impl Update {
pub(crate) fn new(schema: Schema) -> Update {
Update { schema, raw_builder: RawUpdateBuilder::new() }
}
pub fn update_document<T, B>(
&mut self,
document: T,
tokenizer_builder: &B,
stop_words: &HashSet<String>,
) -> Result<DocumentId, SerializerError>
where T: Serialize,
B: TokenizerBuilder,
{
let document_id = self.schema.document_id(&document)?;
let serializer = Serializer {
schema: &self.schema,
document_id: document_id,
tokenizer_builder: tokenizer_builder,
update: &mut self.raw_builder.document_update(document_id)?,
stop_words: stop_words,
};
document.serialize(serializer)?;
Ok(document_id)
}
pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
where T: Serialize,
{
let document_id = self.schema.document_id(&document)?;
self.raw_builder.document_update(document_id)?.remove()?;
Ok(document_id)
}
pub(crate) fn build(self) -> Result<WriteBatch, Box<Error>> {
self.raw_builder.build()
}
}
#[derive(Copy, Clone, PartialEq, Eq)]
enum UpdateType {
Updated,
Deleted,
}
use UpdateType::{Updated, Deleted};
pub struct RawUpdateBuilder {
documents_update: HashMap<DocumentId, UpdateType>,
indexed_words: BTreeMap<Token, Vec<DocIndex>>,
batch: WriteBatch,
}
impl RawUpdateBuilder {
pub fn new() -> RawUpdateBuilder {
RawUpdateBuilder {
documents_update: HashMap::new(),
indexed_words: BTreeMap::new(),
batch: WriteBatch::new(),
}
}
pub fn document_update(&mut self, document_id: DocumentId) -> Result<DocumentUpdate, SerializerError> {
use serde::ser::Error;
match self.documents_update.get(&document_id) {
Some(Deleted) | None => Ok(DocumentUpdate { document_id, inner: self }),
Some(Updated) => Err(SerializerError::custom(
"This document has already been removed and cannot be updated in the same update"
)),
}
}
pub fn build(self) -> Result<WriteBatch, Box<Error>> {
let negative = {
let mut removed_document_ids = Vec::new();
for (id, update_type) in self.documents_update {
if update_type == Deleted {
removed_document_ids.push(id);
}
}
removed_document_ids.sort_unstable();
let removed_document_ids = Set::new_unchecked(&removed_document_ids);
let doc_ids = DocIds::new(removed_document_ids);
Negative::new(doc_ids)
};
let positive = {
let mut positive_builder = PositiveBuilder::memory();
for (key, mut indexes) in self.indexed_words {
indexes.sort_unstable();
let indexes = Set::new_unchecked(&indexes);
positive_builder.insert(key, indexes)?;
}
let (map, indexes) = positive_builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Positive::new(map, indexes)
};
let index = Index { negative, positive };
// write the data-index
let mut bytes = Vec::new();
index.write_to_bytes(&mut bytes);
self.batch.merge(DATA_INDEX, &bytes)?;
Ok(self.batch)
}
}
pub struct DocumentUpdate<'a> {
document_id: DocumentId,
inner: &'a mut RawUpdateBuilder,
}
impl<'a> DocumentUpdate<'a> {
pub fn remove(&mut self) -> Result<(), SerializerError> {
use serde::ser::Error;
if let Updated = self.inner.documents_update.entry(self.document_id).or_insert(Deleted) {
return Err(SerializerError::custom(
"This document has already been updated and cannot be removed in the same update"
));
}
let start = DocumentKey::new(self.document_id).with_attribute_min();
let end = DocumentKey::new(self.document_id).with_attribute_max(); // FIXME max + 1
self.inner.batch.delete_range(start.as_ref(), end.as_ref())?;
Ok(())
}
pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: &[u8]) -> Result<(), SerializerError> {
use serde::ser::Error;
if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
return Err(SerializerError::custom(
"This document has already been deleted and cannot be updated in the same update"
));
}
let key = DocumentKeyAttr::new(self.document_id, attr);
self.inner.batch.put(key.as_ref(), &value)?;
Ok(())
}
pub fn insert_doc_index(&mut self, token: Token, doc_index: DocIndex) -> Result<(), SerializerError> {
use serde::ser::Error;
if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
return Err(SerializerError::custom(
"This document has already been deleted and cannot be updated in the same update"
));
}
self.inner.indexed_words.entry(token).or_insert_with(Vec::new).push(doc_index);
Ok(())
}
}
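Putting the update pieces together, a sketch of the intended flow; Update::new and build are crate-private, so the Update value would normally come from the database API, and the schema, tokenizer_builder and stop_words bindings are placeholders:

// `update` is an Update handle obtained from the database for a given index
let id = update.update_document(&movie, &tokenizer_builder, &stop_words)?;

// removing another document in the same batch is fine; mixing an update and a
// removal of the same document id is rejected, as enforced above
update.remove_document(&old_movie)?;

// inside the database, build() folds everything into one RocksDB WriteBatch:
// a Negative part (removed ids), a Positive part (the fst map plus doc indexes),
// and a merge on the DATA_INDEX key
let batch = update.build()?;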


@ -1,35 +0,0 @@
use std::path::PathBuf;
use std::error::Error;
mod negative;
mod positive;
pub use self::positive::{PositiveUpdateBuilder, NewState};
pub use self::negative::NegativeUpdateBuilder;
pub struct Update {
path: PathBuf,
can_be_moved: bool,
}
impl Update {
pub fn open<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
Ok(Update { path: path.into(), can_be_moved: false })
}
pub fn open_and_move<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
Ok(Update { path: path.into(), can_be_moved: true })
}
pub fn set_move(&mut self, can_be_moved: bool) {
self.can_be_moved = can_be_moved
}
pub fn can_be_moved(&self) -> bool {
self.can_be_moved
}
pub fn into_path_buf(self) -> PathBuf {
self.path
}
}


@ -1,4 +0,0 @@
mod update;
mod unordered_builder;
pub use self::update::NegativeUpdateBuilder;


@ -1,37 +0,0 @@
use std::collections::BTreeSet;
use std::io;
use byteorder::{NativeEndian, WriteBytesExt};
use crate::DocumentId;
pub struct UnorderedNegativeBlobBuilder<W> {
doc_ids: BTreeSet<DocumentId>, // TODO: prefer a linked-list
wrt: W,
}
impl UnorderedNegativeBlobBuilder<Vec<u8>> {
pub fn memory() -> Self {
UnorderedNegativeBlobBuilder::new(Vec::new())
}
}
impl<W: io::Write> UnorderedNegativeBlobBuilder<W> {
pub fn new(wrt: W) -> Self {
Self {
doc_ids: BTreeSet::new(),
wrt: wrt,
}
}
pub fn insert(&mut self, doc: DocumentId) -> bool {
self.doc_ids.insert(doc)
}
pub fn into_inner(mut self) -> io::Result<W> {
for id in self.doc_ids {
self.wrt.write_u64::<NativeEndian>(id)?;
}
Ok(self.wrt)
}
}


@ -1,60 +0,0 @@
use std::path::PathBuf;
use std::error::Error;
use ::rocksdb::rocksdb_options;
use crate::database::update::negative::unordered_builder::UnorderedNegativeBlobBuilder;
use crate::database::blob::{Blob, NegativeBlob};
use crate::database::update::Update;
use crate::database::DocumentKey;
use crate::database::DATA_INDEX;
use crate::DocumentId;
pub struct NegativeUpdateBuilder {
path: PathBuf,
doc_ids: UnorderedNegativeBlobBuilder<Vec<u8>>,
}
impl NegativeUpdateBuilder {
pub fn new<P: Into<PathBuf>>(path: P) -> NegativeUpdateBuilder {
NegativeUpdateBuilder {
path: path.into(),
doc_ids: UnorderedNegativeBlobBuilder::memory(),
}
}
pub fn remove(&mut self, id: DocumentId) -> bool {
self.doc_ids.insert(id)
}
pub fn build(self) -> Result<Update, Box<Error>> {
let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.path.to_string_lossy())?;
let bytes = self.doc_ids.into_inner()?;
let negative_blob = NegativeBlob::from_bytes(bytes)?;
let blob = Blob::Negative(negative_blob);
// write the data-index aka negative blob
let bytes = bincode::serialize(&blob)?;
file_writer.merge(DATA_INDEX, &bytes)?;
// FIXME remove this ugly thing !
// let Blob::Negative(negative_blob) = blob;
let negative_blob = match blob {
Blob::Negative(blob) => blob,
Blob::Positive(_) => unreachable!(),
};
for &document_id in negative_blob.as_ref().as_slice() {
let start = DocumentKey::new(document_id);
let end = start.with_attribute_max();
file_writer.delete_range(start.as_ref(), end.as_ref())?;
}
file_writer.finish()?;
Update::open(self.path)
}
}


@ -1,4 +0,0 @@
mod update;
mod unordered_builder;
pub use self::update::{PositiveUpdateBuilder, NewState};


@ -1,49 +0,0 @@
#![allow(unused)]
use std::collections::BTreeMap;
use std::error::Error;
use std::io::Write;
use sdset::Set;
use crate::database::blob::positive::PositiveBlobBuilder;
use crate::DocIndex;
pub struct UnorderedPositiveBlobBuilder<W, X> {
builder: PositiveBlobBuilder<W, X>,
map: BTreeMap<Vec<u8>, Vec<DocIndex>>,
}
impl UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>> {
pub fn memory() -> Self {
Self {
builder: PositiveBlobBuilder::memory(),
map: BTreeMap::new(),
}
}
}
impl<W: Write, X: Write> UnorderedPositiveBlobBuilder<W, X> {
pub fn new(map_wtr: W, doc_wtr: X) -> Result<Self, Box<Error>> {
Ok(UnorderedPositiveBlobBuilder {
builder: PositiveBlobBuilder::new(map_wtr, doc_wtr)?,
map: BTreeMap::new(),
})
}
pub fn insert<K: Into<Vec<u8>>>(&mut self, input: K, doc_index: DocIndex) {
self.map.entry(input.into()).or_insert_with(Vec::new).push(doc_index);
}
pub fn finish(self) -> Result<(), Box<Error>> {
self.into_inner().map(drop)
}
pub fn into_inner(mut self) -> Result<(W, X), Box<Error>> {
for (key, mut doc_indexes) in self.map {
doc_indexes.sort_unstable();
self.builder.insert(&key, Set::new_unchecked(&doc_indexes))?;
}
self.builder.into_inner()
}
}


@ -1,514 +0,0 @@
use std::collections::BTreeMap;
use std::path::PathBuf;
use std::error::Error;
use std::fmt;
use ::rocksdb::rocksdb_options;
use serde::ser::{self, Serialize};
use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
use crate::database::blob::positive::PositiveBlob;
use crate::database::schema::{Schema, SchemaAttr};
use crate::tokenizer::TokenizerBuilder;
use crate::database::DocumentKeyAttr;
use crate::database::update::Update;
use crate::{DocumentId, DocIndex};
use crate::database::DATA_INDEX;
use crate::database::blob::Blob;
pub enum NewState {
Updated { value: Vec<u8> },
Removed,
}
pub struct PositiveUpdateBuilder<B> {
path: PathBuf,
schema: Schema,
tokenizer_builder: B,
builder: UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: BTreeMap<DocumentKeyAttr, NewState>,
}
impl<B> PositiveUpdateBuilder<B> {
pub fn new<P: Into<PathBuf>>(path: P, schema: Schema, tokenizer_builder: B) -> PositiveUpdateBuilder<B> {
PositiveUpdateBuilder {
path: path.into(),
schema: schema,
tokenizer_builder: tokenizer_builder,
builder: UnorderedPositiveBlobBuilder::memory(),
new_states: BTreeMap::new(),
}
}
pub fn update<T: Serialize>(&mut self, id: DocumentId, document: &T) -> Result<(), Box<Error>>
where B: TokenizerBuilder
{
let serializer = Serializer {
schema: &self.schema,
document_id: id,
tokenizer_builder: &self.tokenizer_builder,
builder: &mut self.builder,
new_states: &mut self.new_states
};
Ok(ser::Serialize::serialize(document, serializer)?)
}
// TODO value must be a field that can be indexed
pub fn update_field(&mut self, id: DocumentId, attr: SchemaAttr, value: String) {
let value = bincode::serialize(&value).unwrap();
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Updated { value });
}
pub fn remove_field(&mut self, id: DocumentId, attr: SchemaAttr) {
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Removed);
}
}
#[derive(Debug)]
pub enum SerializerError {
SchemaDontMatch { attribute: String },
UnserializableType { name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::SchemaDontMatch { attribute } => {
write!(f, "serialized document try to specify the \
{:?} attribute that is not known by the schema", attribute)
},
SerializerError::UnserializableType { name } => {
write!(f, "Only struct and map types are considered valid documents and
can be serialized, not {} types directly.", name)
},
SerializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for SerializerError {}
struct Serializer<'a, B> {
schema: &'a Schema,
tokenizer_builder: &'a B,
document_id: DocumentId,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
}
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "$ty" })
}
)*
}
}
impl<'a, B> ser::Serializer for Serializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = StructSerializer<'a, B>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
// Ok(MapSerializer {
// schema: self.schema,
// document_id: self.document_id,
// new_states: self.new_states,
// })
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
builder: self.builder,
new_states: self.new_states,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
struct StructSerializer<'a, B> {
schema: &'a Schema,
tokenizer_builder: &'a B,
document_id: DocumentId,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
}
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
match self.schema.attribute(key) {
Some(attr) => {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
let key = DocumentKeyAttr::new(self.document_id, attr);
self.new_states.insert(key, NewState::Updated { value });
}
if props.is_indexed() {
let serializer = IndexerSerializer {
builder: self.builder,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
};
value.serialize(serializer)?;
}
Ok(())
},
None => Err(SerializerError::SchemaDontMatch { attribute: key.to_owned() }),
}
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
struct IndexerSerializer<'a, B> {
tokenizer_builder: &'a B,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
document_id: DocumentId,
attribute: SchemaAttr,
}
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for (index, word) in self.tokenizer_builder.build(v) {
let doc_index = DocIndex {
document_id: self.document_id,
attribute: self.attribute.as_u32() as u8,
attribute_index: index as u32,
};
// insert the exact representation
let word_lower = word.to_lowercase();
// and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
self.builder.insert(word_unidecoded, doc_index);
}
self.builder.insert(word_lower, doc_index);
}
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "seq" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
impl<B> PositiveUpdateBuilder<B> {
pub fn build(self) -> Result<Update, Box<Error>> {
let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.path.to_string_lossy())?;
let (blob_fst_map, blob_doc_idx) = self.builder.into_inner()?;
let positive_blob = PositiveBlob::from_bytes(blob_fst_map, blob_doc_idx)?;
let blob = Blob::Positive(positive_blob);
// write the data-index aka positive blob
let bytes = bincode::serialize(&blob)?;
file_writer.merge(DATA_INDEX, &bytes)?;
// write all the documents fields updates
for (key, state) in self.new_states {
match state {
NewState::Updated { value } => {
file_writer.put(key.as_ref(), &value)?
},
NewState::Removed => file_writer.delete(key.as_ref())?,
}
}
file_writer.finish()?;
Update::open(self.path)
}
}


@ -9,17 +9,17 @@ use serde::de::DeserializeOwned;
use crate::database::{DocumentKey, DocumentKeyAttr}; use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::database::{retrieve_data_schema, retrieve_data_index}; use crate::database::{retrieve_data_schema, retrieve_data_index};
use crate::database::blob::positive::PositiveBlob; use crate::database::serde::deserializer::Deserializer;
use crate::database::deserializer::Deserializer;
use crate::database::schema::Schema; use crate::database::schema::Schema;
use crate::rank::QueryBuilder; use crate::database::index::Index;
use crate::rank::{QueryBuilder, FilterFunc};
use crate::DocumentId; use crate::DocumentId;
pub struct DatabaseView<D> pub struct DatabaseView<D>
where D: Deref<Target=DB> where D: Deref<Target=DB>
{ {
snapshot: Snapshot<D>, snapshot: Snapshot<D>,
blob: PositiveBlob, index: Index,
schema: Schema, schema: Schema,
} }
@ -28,16 +28,16 @@ where D: Deref<Target=DB>
{ {
pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> { pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> {
let schema = retrieve_data_schema(&snapshot)?; let schema = retrieve_data_schema(&snapshot)?;
let blob = retrieve_data_index(&snapshot)?; let index = retrieve_data_index(&snapshot)?;
Ok(DatabaseView { snapshot, blob, schema }) Ok(DatabaseView { snapshot, index, schema })
} }
pub fn schema(&self) -> &Schema { pub fn schema(&self) -> &Schema {
&self.schema &self.schema
} }
pub fn blob(&self) -> &PositiveBlob { pub fn index(&self) -> &Index {
&self.blob &self.index
} }
pub fn into_snapshot(self) -> Snapshot<D> { pub fn into_snapshot(self) -> Snapshot<D> {
@ -71,19 +71,18 @@ where D: Deref<Target=DB>
Ok(()) Ok(())
} }
pub fn query_builder(&self) -> Result<QueryBuilder<D>, Box<Error>> { pub fn query_builder(&self) -> Result<QueryBuilder<D, FilterFunc<D>>, Box<Error>> {
QueryBuilder::new(self) QueryBuilder::new(self)
} }
// TODO create an enum error type pub fn document_by_id<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
pub fn retrieve_document<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
where T: DeserializeOwned where T: DeserializeOwned
{ {
let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id); let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id);
Ok(T::deserialize(&mut deserializer)?) Ok(T::deserialize(&mut deserializer)?)
} }
pub fn retrieve_documents<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter> pub fn documents_by_id<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter>
where T: DeserializeOwned, where T: DeserializeOwned,
I: IntoIterator<Item=DocumentId>, I: IntoIterator<Item=DocumentId>,
{ {
@ -100,7 +99,7 @@ where D: Deref<Target=DB>
{ {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut options = ReadOptions::new(); let mut options = ReadOptions::new();
let lower = DocumentKey::new(0); let lower = DocumentKey::new(DocumentId(0));
options.set_iterate_lower_bound(lower.as_ref()); options.set_iterate_lower_bound(lower.as_ref());
let mut iter = self.snapshot.iter_opt(options); let mut iter = self.snapshot.iter_opt(options);
@ -149,7 +148,7 @@ where D: Deref<Target=DB>,
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
match self.document_ids.next() { match self.document_ids.next() {
Some(id) => Some(self.database_view.retrieve_document(id)), Some(id) => Some(self.database_view.document_by_id(id)),
None => None None => None
} }
} }
@ -168,7 +167,7 @@ where D: Deref<Target=DB>,
{ {
fn next_back(&mut self) -> Option<Self::Item> { fn next_back(&mut self) -> Option<Self::Item> {
match self.document_ids.next_back() { match self.document_ids.next_back() {
Some(id) => Some(self.database_view.retrieve_document(id)), Some(id) => Some(self.database_view.document_by_id(id)),
None => None None => None
} }
} }
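A usage sketch for the renamed accessors (Movie, the snapshot and the id values are placeholders; error handling is elided with ?):

use serde::Deserialize;

#[derive(Deserialize)]
struct Movie { id: u64, title: String }

let view = DatabaseView::new(snapshot)?;

// fetch a single document by its DocumentId
let movie: Movie = view.document_by_id(some_id)?;

// or fetch several lazily; the iterator yields one Result per id
for doc in view.documents_by_id::<Movie, _>(vec![id_a, id_b]) {
    let movie = doc?;
    println!("{}", movie.title);
}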


@ -1,9 +1,10 @@
#![cfg_attr(feature = "nightly", feature(test))]
pub mod automaton; pub mod automaton;
pub mod database; pub mod database;
pub mod data; pub mod data;
pub mod rank; pub mod rank;
pub mod tokenizer; pub mod tokenizer;
pub mod vec_read_only;
mod common_words; mod common_words;
pub use rocksdb; pub use rocksdb;
@ -11,30 +12,36 @@ pub use rocksdb;
pub use self::tokenizer::Tokenizer; pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords; pub use self::common_words::CommonWords;
pub type DocumentId = u64; /// Represents an internally generated, unique document identifier.
///
/// It is used to tell the database which document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64);
/// This structure represents the position of a word /// This structure represents the position of a word
/// in a document and its attributes. /// in a document and its attributes.
/// ///
/// This is stored in the map, generated at index time, /// This is stored in the map, generated at index time,
/// extracted and interpreted at search time. /// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)] #[repr(C)]
pub struct DocIndex { pub struct DocIndex {
/// The document identifier where the word was found. /// The document identifier where the word was found.
pub document_id: DocumentId, pub document_id: DocumentId,
/// The attribute identifier in the document /// The attribute in the document where the word was found
/// where the word was found. /// along with the index in it.
/// pub attribute: u16,
/// This is an `u8` therefore a document pub word_index: u32,
/// can not have more than `2^8` attributes.
pub attribute: u8,
/// The index where the word was found in the attribute. /// The position in bytes where the word was found
/// along with the length of it.
/// ///
/// Only the first 1000 words are indexed. /// It informs on the original word area in the text indexed
pub attribute_index: u32, /// without needing to run the tokenizer again.
pub char_index: u32,
pub char_length: u16,
} }
/// This structure represents a matching word with information /// This structure represents a matching word with information
@ -45,7 +52,7 @@ pub struct DocIndex {
/// ///
/// The word in itself is not important. /// The word in itself is not important.
// TODO do data oriented programming ? very arrays ? // TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match { pub struct Match {
/// The word index in the query sentence. /// The word index in the query sentence.
/// Same as the `attribute_index` but for the query words. /// Same as the `attribute_index` but for the query words.
@ -57,23 +64,21 @@ pub struct Match {
/// (i.e. the Levenshtein distance). /// (i.e. the Levenshtein distance).
pub distance: u8, pub distance: u8,
/// The attribute in which the word is located /// The attribute in the document where the word was found
/// (i.e. Title is 0, Description is 1). /// along with the index in it.
/// pub attribute: u16,
/// This is an `u8` therefore a document pub word_index: u32,
/// can not have more than `2^8` attributes.
pub attribute: u8,
/// Where does this word is located in the attribute string
/// (i.e. at the start or the end of the attribute).
///
/// The index in the attribute is limited to a maximum of `2^32`
/// this is because we index only the first 1000 words
/// in an attribute.
pub attribute_index: u32,
/// Whether the word that match is an exact match or a prefix. /// Whether the word that match is an exact match or a prefix.
pub is_exact: bool, pub is_exact: bool,
/// The position in bytes where the word was found
/// along with the length of it.
///
/// It informs on the original word area in the text indexed
/// without needing to run the tokenizer again.
pub char_index: u32,
pub char_length: u16,
} }
impl Match { impl Match {
@ -82,8 +87,10 @@ impl Match {
query_index: 0, query_index: 0,
distance: 0, distance: 0,
attribute: 0, attribute: 0,
attribute_index: 0, word_index: 0,
is_exact: false, is_exact: false,
char_index: 0,
char_length: 0,
} }
} }
@ -91,9 +98,22 @@ impl Match {
Match { Match {
query_index: u32::max_value(), query_index: u32::max_value(),
distance: u8::max_value(), distance: u8::max_value(),
attribute: u8::max_value(), attribute: u16::max_value(),
attribute_index: u32::max_value(), word_index: u32::max_value(),
is_exact: true, is_exact: true,
char_index: u32::max_value(),
char_length: u16::max_value(),
} }
} }
} }
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 24);
}
}
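The 24 bytes asserted above follow directly from the #[repr(C)] layout on a 64-bit target; a back-of-the-envelope breakdown:

// document_id: DocumentId(u64) -> offset  0, 8 bytes
// attribute:   u16             -> offset  8, 2 bytes
// (padding)                    -> offsets 10..12, 2 bytes, to align the next u32
// word_index:  u32             -> offset 12, 4 bytes
// char_index:  u32             -> offset 16, 4 bytes
// char_length: u16             -> offset 20, 2 bytes
// (tail padding)               -> offsets 22..24, 2 bytes, to keep 8-byte alignment
// total: 20 bytes of fields + 4 bytes of padding = 24 bytes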


@ -1,19 +1,13 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB;
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::database::DatabaseView; use crate::rank::RawDocument;
use crate::rank::Document;
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct DocumentId; pub struct DocumentId;
impl<D> Criterion<D> for DocumentId impl Criterion for DocumentId {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering {
lhs.id.cmp(&rhs.id) lhs.id.cmp(&rhs.id)
} }
} }


@ -1,33 +1,40 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB; use slice_group_by::GroupBy;
use group_by::GroupBy;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::database::DatabaseView; use crate::rank::RawDocument;
use crate::Match;
#[inline] #[inline]
fn contains_exact(matches: &[Match]) -> bool { fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
matches.iter().any(|m| m.is_exact) let mut count = 0;
} let mut index = 0;
#[inline] for group in query_index.linear_group_by(PartialEq::eq) {
fn number_exact_matches(matches: &[Match]) -> usize { let len = group.len();
GroupBy::new(matches, match_query_index).map(contains_exact).count() count += is_exact[index..index + len].contains(&true) as usize;
index += len;
}
count
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct Exact; pub struct Exact;
impl<D> Criterion<D> for Exact impl Criterion for Exact {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ let lhs = {
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering { let query_index = lhs.query_index();
let lhs = number_exact_matches(&lhs.matches); let is_exact = lhs.is_exact();
let rhs = number_exact_matches(&rhs.matches); number_exact_matches(query_index, is_exact)
};
let rhs = {
let query_index = rhs.query_index();
let is_exact = rhs.is_exact();
number_exact_matches(query_index, is_exact)
};
lhs.cmp(&rhs).reverse() lhs.cmp(&rhs).reverse()
} }
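For readers who do not know the `slice-group-by` crate, here is a dependency-free sketch of the counting the reworked `Exact` criterion performs: matches arrive as parallel `query_index`/`is_exact` columns sorted by query index, and a query word counts once if any of its grouped matches is exact.

```rust
// Standard-library sketch of the grouped counting done by the Exact criterion.
fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
    let mut count = 0;
    let mut start = 0;
    while start < query_index.len() {
        // find the end of the run of identical query indexes
        let mut end = start + 1;
        while end < query_index.len() && query_index[end] == query_index[start] {
            end += 1;
        }
        // a query word is "exact" if any of its matches is exact
        count += is_exact[start..end].contains(&true) as usize;
        start = end;
    }
    count
}

fn main() {
    // two query words; only the second one has an exact match
    assert_eq!(number_exact_matches(&[0, 0, 1], &[false, false, true]), 1);
}
```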

View File

@ -8,12 +8,7 @@ mod sort_by;
mod document_id; mod document_id;
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref; use crate::rank::RawDocument;
use rocksdb::DB;
use crate::database::DatabaseView;
use crate::rank::Document;
pub use self::{ pub use self::{
sum_of_typos::SumOfTypos, sum_of_typos::SumOfTypos,
@ -26,56 +21,47 @@ pub use self::{
document_id::DocumentId, document_id::DocumentId,
}; };
pub trait Criterion<D> pub trait Criterion: Send + Sync {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
{
#[inline]
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering;
#[inline] #[inline]
fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool { fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
self.evaluate(lhs, rhs, view) == Ordering::Equal self.evaluate(lhs, rhs) == Ordering::Equal
} }
} }
impl<'a, D, T: Criterion<D> + ?Sized> Criterion<D> for &'a T impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ (**self).evaluate(lhs, rhs)
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
(**self).evaluate(lhs, rhs, view)
} }
fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool { fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs, view) (**self).eq(lhs, rhs)
} }
} }
impl<D, T: Criterion<D> + ?Sized> Criterion<D> for Box<T> impl<T: Criterion + ?Sized> Criterion for Box<T> {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ (**self).evaluate(lhs, rhs)
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
(**self).evaluate(lhs, rhs, view)
} }
fn eq(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> bool { fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs, view) (**self).eq(lhs, rhs)
} }
} }
pub struct CriteriaBuilder<D> #[derive(Default)]
where D: Deref<Target=DB> pub struct CriteriaBuilder {
{ inner: Vec<Box<dyn Criterion>>
inner: Vec<Box<dyn Criterion<D>>>
} }
impl<D> CriteriaBuilder<D> impl CriteriaBuilder
where D: Deref<Target=DB>
{ {
pub fn new() -> CriteriaBuilder<D> { pub fn new() -> CriteriaBuilder {
CriteriaBuilder { inner: Vec::new() } CriteriaBuilder { inner: Vec::new() }
} }
pub fn with_capacity(capacity: usize) -> CriteriaBuilder<D> { pub fn with_capacity(capacity: usize) -> CriteriaBuilder {
CriteriaBuilder { inner: Vec::with_capacity(capacity) } CriteriaBuilder { inner: Vec::with_capacity(capacity) }
} }
@ -83,33 +69,29 @@ where D: Deref<Target=DB>
self.inner.reserve(additional) self.inner.reserve(additional)
} }
pub fn add<C>(mut self, criterion: C) -> CriteriaBuilder<D> pub fn add<C>(mut self, criterion: C) -> CriteriaBuilder
where C: 'static + Criterion<D>, where C: 'static + Criterion,
{ {
self.push(criterion); self.push(criterion);
self self
} }
pub fn push<C>(&mut self, criterion: C) pub fn push<C>(&mut self, criterion: C)
where C: 'static + Criterion<D>, where C: 'static + Criterion,
{ {
self.inner.push(Box::new(criterion)); self.inner.push(Box::new(criterion));
} }
pub fn build(self) -> Criteria<D> { pub fn build(self) -> Criteria {
Criteria { inner: self.inner } Criteria { inner: self.inner }
} }
} }
pub struct Criteria<D> pub struct Criteria {
where D: Deref<Target=DB> inner: Vec<Box<dyn Criterion>>,
{
inner: Vec<Box<dyn Criterion<D>>>,
} }
impl<D> Default for Criteria<D> impl Default for Criteria {
where D: Deref<Target=DB>
{
fn default() -> Self { fn default() -> Self {
CriteriaBuilder::with_capacity(7) CriteriaBuilder::with_capacity(7)
.add(SumOfTypos) .add(SumOfTypos)
@ -123,10 +105,8 @@ where D: Deref<Target=DB>
} }
} }
impl<D> AsRef<[Box<dyn Criterion<D>>]> for Criteria<D> impl AsRef<[Box<dyn Criterion>]> for Criteria {
where D: Deref<Target=DB> fn as_ref(&self) -> &[Box<dyn Criterion>] {
{
fn as_ref(&self) -> &[Box<dyn Criterion<D>>] {
&self.inner &self.inner
} }
} }
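With the trait now object safe and free of the `D: Deref<Target=DB>` parameter, a user-defined criterion boils down to a single `evaluate` method. A sketch, assuming the public paths follow the `meilidb::rank::criterion::*` form used in the `SortBy` documentation example; the `NumberOfMatches` criterion itself is illustrative:

```rust
use std::cmp::Ordering;

use meilidb::rank::criterion::{Criterion, CriteriaBuilder, DocumentId};
use meilidb::rank::RawDocument;

/// Illustrative criterion: rank documents with more matches first.
#[derive(Debug, Clone, Copy)]
struct NumberOfMatches;

impl Criterion for NumberOfMatches {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        // all the column slices have the same length,
        // so the length of `query_index()` is the number of matches
        let lhs = lhs.query_index().len();
        let rhs = rhs.query_index().len();
        lhs.cmp(&rhs).reverse()
    }
}

fn main() {
    // criteria are plain boxed trait objects now, no database view involved
    let criteria = CriteriaBuilder::new()
        .add(NumberOfMatches)
        .add(DocumentId)
        .build();
    assert_eq!(criteria.as_ref().len(), 2);
}
```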

View File

@ -1,28 +1,28 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB; use slice_group_by::GroupBy;
use group_by::GroupBy;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::database::DatabaseView; use crate::rank::RawDocument;
use crate::Match;
#[inline] #[inline]
fn number_of_query_words(matches: &[Match]) -> usize { fn number_of_query_words(query_index: &[u32]) -> usize {
GroupBy::new(matches, match_query_index).count() query_index.linear_group_by(PartialEq::eq).count()
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct NumberOfWords; pub struct NumberOfWords;
impl<D> Criterion<D> for NumberOfWords impl Criterion for NumberOfWords {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ let lhs = {
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering { let query_index = lhs.query_index();
let lhs = number_of_query_words(&lhs.matches); number_of_query_words(query_index)
let rhs = number_of_query_words(&rhs.matches); };
let rhs = {
let query_index = rhs.query_index();
number_of_query_words(query_index)
};
lhs.cmp(&rhs).reverse() lhs.cmp(&rhs).reverse()
} }

View File

@ -7,7 +7,7 @@ use serde::de::DeserializeOwned;
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::database::DatabaseView; use crate::database::DatabaseView;
use crate::rank::Document; use crate::rank::RawDocument;
/// A helper struct that permits sorting documents by
/// some of their stored attributes.
@ -24,7 +24,7 @@ use crate::rank::Document;
/// ///
/// # Example /// # Example
/// ///
/// ```no-test /// ```ignore
/// use serde_derive::Deserialize; /// use serde_derive::Deserialize;
/// use meilidb::rank::criterion::*; /// use meilidb::rank::criterion::*;
/// ///
@ -40,34 +40,40 @@ use crate::rank::Document;
/// .add(SumOfWordsAttribute) /// .add(SumOfWordsAttribute)
/// .add(SumOfWordsPosition) /// .add(SumOfWordsPosition)
/// .add(Exact) /// .add(Exact)
/// .add(SortBy::<TimeOnly>::new()) /// .add(SortBy::<TimeOnly>::new(&view))
/// .add(DocumentId); /// .add(DocumentId);
/// ///
/// let criterion = builder.build(); /// let criterion = builder.build();
/// ///
/// ``` /// ```
#[derive(Default)] pub struct SortBy<'a, T, D>
pub struct SortBy<T> { where D: Deref<Target=DB> + Send + Sync,
T: Send + Sync
{
view: &'a DatabaseView<D>,
_phantom: marker::PhantomData<T>, _phantom: marker::PhantomData<T>,
} }
impl<T> SortBy<T> { impl<'a, T, D> SortBy<'a, T, D>
pub fn new() -> Self { where D: Deref<Target=DB> + Send + Sync,
SortBy { _phantom: marker::PhantomData } T: Send + Sync
{
pub fn new(view: &'a DatabaseView<D>) -> Self {
SortBy { view, _phantom: marker::PhantomData }
} }
} }
impl<T, D> Criterion<D> for SortBy<T> impl<'a, T, D> Criterion for SortBy<'a, T, D>
where D: Deref<Target=DB>, where D: Deref<Target=DB> + Send + Sync,
T: DeserializeOwned + Ord, T: DeserializeOwned + Ord + Send + Sync,
{ {
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering { fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = match view.retrieve_document::<T>(lhs.id) { let lhs = match self.view.document_by_id::<T>(lhs.id) {
Ok(doc) => Some(doc), Ok(doc) => Some(doc),
Err(e) => { eprintln!("{}", e); None }, Err(e) => { eprintln!("{}", e); None },
}; };
let rhs = match view.retrieve_document::<T>(rhs.id) { let rhs = match self.view.document_by_id::<T>(rhs.id) {
Ok(doc) => Some(doc), Ok(doc) => Some(doc),
Err(e) => { eprintln!("{}", e); None }, Err(e) => { eprintln!("{}", e); None },
}; };

View File

@ -1,25 +1,20 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB; use slice_group_by::GroupBy;
use group_by::GroupBy;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::database::DatabaseView; use crate::rank::RawDocument;
use crate::Match;
#[inline] #[inline]
fn sum_matches_typos(matches: &[Match]) -> i8 { fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> isize {
let mut sum_typos = 0; let mut sum_typos = 0;
let mut number_words = 0; let mut number_words = 0;
let mut index = 0;
// note that GroupBy will never return an empty group for group in query_index.linear_group_by(PartialEq::eq) {
// so we can do this assumption safely sum_typos += distance[index] as isize;
for group in GroupBy::new(matches, match_query_index) {
sum_typos += unsafe { group.get_unchecked(0).distance } as i8;
number_words += 1; number_words += 1;
index += group.len();
} }
sum_typos - number_words sum_typos - number_words
@ -28,18 +23,24 @@ fn sum_matches_typos(matches: &[Match]) -> i8 {
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct SumOfTypos; pub struct SumOfTypos;
impl<D> Criterion<D> for SumOfTypos impl Criterion for SumOfTypos {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ let lhs = {
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering { let query_index = lhs.query_index();
let lhs = sum_matches_typos(&lhs.matches); let distance = lhs.distance();
let rhs = sum_matches_typos(&rhs.matches); sum_matches_typos(query_index, distance)
};
let rhs = {
let query_index = rhs.query_index();
let distance = rhs.distance();
sum_matches_typos(query_index, distance)
};
lhs.cmp(&rhs) lhs.cmp(&rhs)
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@ -50,30 +51,14 @@ mod tests {
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation" // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
#[test] #[test]
fn one_typo_reference() { fn one_typo_reference() {
let doc0 = { let query_index0 = &[0, 1];
let matches = vec![ let distance0 = &[0, 0];
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false },
];
Document {
id: 0,
matches: matches,
}
};
let doc1 = { let query_index1 = &[0, 1];
let matches = vec![ let distance1 = &[1, 0];
Match { query_index: 0, distance: 1, attribute: 0, attribute_index: 0, is_exact: false },
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false },
];
Document {
id: 1,
matches: matches,
}
};
let lhs = sum_matches_typos(&doc0.matches); let lhs = sum_matches_typos(query_index0, distance0);
let rhs = sum_matches_typos(&doc1.matches); let rhs = sum_matches_typos(query_index1, distance1);
assert_eq!(lhs.cmp(&rhs), Ordering::Less); assert_eq!(lhs.cmp(&rhs), Ordering::Less);
} }
@ -83,29 +68,14 @@ mod tests {
// doc1: "bouton" // doc1: "bouton"
#[test] #[test]
fn no_typo() { fn no_typo() {
let doc0 = { let query_index0 = &[0, 1];
let matches = vec![ let distance0 = &[0, 0];
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 1, is_exact: false },
];
Document {
id: 0,
matches: matches,
}
};
let doc1 = { let query_index1 = &[0];
let matches = vec![ let distance1 = &[0];
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
];
Document {
id: 1,
matches: matches,
}
};
let lhs = sum_matches_typos(&doc0.matches); let lhs = sum_matches_typos(query_index0, distance0);
let rhs = sum_matches_typos(&doc1.matches); let rhs = sum_matches_typos(query_index1, distance1);
assert_eq!(lhs.cmp(&rhs), Ordering::Less); assert_eq!(lhs.cmp(&rhs), Ordering::Less);
} }
@ -115,29 +85,14 @@ mod tests {
// doc1: "bouton" // doc1: "bouton"
#[test] #[test]
fn one_typo() { fn one_typo() {
let doc0 = { let query_index0 = &[0, 1];
let matches = vec![ let distance0 = &[0, 1];
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
Match { query_index: 1, distance: 1, attribute: 0, attribute_index: 1, is_exact: false },
];
Document {
id: 0,
matches: matches,
}
};
let doc1 = { let query_index1 = &[0];
let matches = vec![ let distance1 = &[0];
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
];
Document {
id: 1,
matches: matches,
}
};
let lhs = sum_matches_typos(&doc0.matches); let lhs = sum_matches_typos(query_index0, distance0);
let rhs = sum_matches_typos(&doc1.matches); let rhs = sum_matches_typos(query_index1, distance1);
assert_eq!(lhs.cmp(&rhs), Ordering::Equal); assert_eq!(lhs.cmp(&rhs), Ordering::Equal);
} }
} }

View File

@ -1,32 +1,39 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB; use slice_group_by::GroupBy;
use group_by::GroupBy;
use crate::database::DatabaseView;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::Match; use crate::rank::RawDocument;
#[inline] #[inline]
fn sum_matches_attributes(matches: &[Match]) -> u8 { fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
// note that GroupBy will never return an empty group let mut sum_attributes = 0;
// so we can do this assumption safely let mut index = 0;
GroupBy::new(matches, match_query_index).map(|group| unsafe {
group.get_unchecked(0).attribute for group in query_index.linear_group_by(PartialEq::eq) {
}).sum() sum_attributes += attribute[index] as usize;
index += group.len();
}
sum_attributes
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute; pub struct SumOfWordsAttribute;
impl<D> Criterion<D> for SumOfWordsAttribute impl Criterion for SumOfWordsAttribute {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ let lhs = {
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering { let query_index = lhs.query_index();
let lhs = sum_matches_attributes(&lhs.matches); let attribute = lhs.attribute();
let rhs = sum_matches_attributes(&rhs.matches); sum_matches_attributes(query_index, attribute)
};
let rhs = {
let query_index = rhs.query_index();
let attribute = rhs.attribute();
sum_matches_attributes(query_index, attribute)
};
lhs.cmp(&rhs) lhs.cmp(&rhs)
} }

View File

@ -1,32 +1,39 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::ops::Deref;
use rocksdb::DB; use slice_group_by::GroupBy;
use group_by::GroupBy;
use crate::database::DatabaseView;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::Match; use crate::rank::RawDocument;
#[inline] #[inline]
fn sum_matches_attribute_index(matches: &[Match]) -> u32 { fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize {
// note that GroupBy will never return an empty group let mut sum_word_index = 0;
// so we can do this assumption safely let mut index = 0;
GroupBy::new(matches, match_query_index).map(|group| unsafe {
group.get_unchecked(0).attribute_index for group in query_index.linear_group_by(PartialEq::eq) {
}).sum() sum_word_index += word_index[index] as usize;
index += group.len();
}
sum_word_index
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition; pub struct SumOfWordsPosition;
impl<D> Criterion<D> for SumOfWordsPosition impl Criterion for SumOfWordsPosition {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ let lhs = {
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering { let query_index = lhs.query_index();
let lhs = sum_matches_attribute_index(&lhs.matches); let word_index = lhs.word_index();
let rhs = sum_matches_attribute_index(&rhs.matches); sum_matches_attribute_index(query_index, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let word_index = rhs.word_index();
sum_matches_attribute_index(query_index, word_index)
};
lhs.cmp(&rhs) lhs.cmp(&rhs)
} }

View File

@ -1,16 +1,17 @@
use std::cmp::{self, Ordering}; use std::cmp::{self, Ordering};
use std::ops::Deref;
use rocksdb::DB; use slice_group_by::GroupBy;
use group_by::GroupBy;
use crate::rank::{match_query_index, Document};
use crate::rank::criterion::Criterion; use crate::rank::criterion::Criterion;
use crate::database::DatabaseView; use crate::rank::RawDocument;
use crate::Match;
const MAX_DISTANCE: u32 = 8; const MAX_DISTANCE: u32 = 8;
#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
(a.clone(), b.clone())
}
fn index_proximity(lhs: u32, rhs: u32) -> u32 { fn index_proximity(lhs: u32, rhs: u32) -> u32 {
if lhs < rhs { if lhs < rhs {
cmp::min(rhs - lhs, MAX_DISTANCE) cmp::min(rhs - lhs, MAX_DISTANCE)
@ -19,30 +20,48 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
} }
} }
fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 { fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 {
if lhs.attribute != rhs.attribute { return MAX_DISTANCE } if lattr != rattr { return MAX_DISTANCE }
index_proximity(lhs.attribute_index, rhs.attribute_index) index_proximity(lwi, rwi)
} }
fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 { fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 {
let mut min_prox = u32::max_value(); let mut min_prox = u32::max_value();
for a in lhs { for a in lattr.iter().zip(lwi) {
for b in rhs { for b in rattr.iter().zip(rwi) {
let a = clone_tuple(a);
let b = clone_tuple(b);
min_prox = cmp::min(min_prox, attribute_proximity(a, b)); min_prox = cmp::min(min_prox, attribute_proximity(a, b));
} }
} }
min_prox min_prox
} }
fn matches_proximity(matches: &[Match]) -> u32 { fn matches_proximity(query_index: &[u32], attribute: &[u16], word_index: &[u32]) -> u32 {
let mut proximity = 0; let mut proximity = 0;
let mut iter = GroupBy::new(matches, match_query_index);
// iterate over groups by windows of size 2 let mut index = 0;
let mut last = iter.next(); let mut iter = query_index.linear_group_by(PartialEq::eq);
let mut last = iter.next().map(|group| {
let len = group.len();
let rattr = &attribute[index..index + len];
let rwi = &word_index[index..index + len];
index += len;
(rattr, rwi)
});
while let (Some(lhs), Some(rhs)) = (last, iter.next()) { while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
let len = rhs.len();
let rattr = &attribute[index..index + len];
let rwi = &word_index[index..index + len];
let rhs = (rattr, rwi);
proximity += min_proximity(lhs, rhs); proximity += min_proximity(lhs, rhs);
last = Some(rhs); last = Some(rhs);
index += len;
} }
proximity proximity
@ -51,18 +70,26 @@ fn matches_proximity(matches: &[Match]) -> u32 {
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct WordsProximity; pub struct WordsProximity;
impl<D> Criterion<D> for WordsProximity impl Criterion for WordsProximity {
where D: Deref<Target=DB> fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
{ let lhs = {
fn evaluate(&self, lhs: &Document, rhs: &Document, _: &DatabaseView<D>) -> Ordering { let query_index = lhs.query_index();
let lhs = matches_proximity(&lhs.matches); let attribute = lhs.attribute();
let rhs = matches_proximity(&rhs.matches); let word_index = lhs.word_index();
matches_proximity(query_index, attribute, word_index)
};
let rhs = {
let query_index = rhs.query_index();
let attribute = rhs.attribute();
let word_index = rhs.word_index();
matches_proximity(query_index, attribute, word_index)
};
lhs.cmp(&rhs) lhs.cmp(&rhs)
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@ -78,18 +105,14 @@ mod tests {
// { id: 2, attr: 2, attr_index: 0 } // { id: 2, attr: 2, attr_index: 0 }
// { id: 3, attr: 3, attr_index: 1 } // { id: 3, attr: 3, attr_index: 1 }
let matches = &[ let query_index = &[0, 1, 2, 2, 3];
Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() }, let attribute = &[0, 1, 1, 2, 3];
Match { query_index: 1, attribute: 1, attribute_index: 0, ..Match::zero() }, let word_index = &[0, 0, 1, 0, 1];
Match { query_index: 2, attribute: 1, attribute_index: 1, ..Match::zero() },
Match { query_index: 2, attribute: 2, attribute_index: 0, ..Match::zero() },
Match { query_index: 3, attribute: 3, attribute_index: 1, ..Match::zero() },
];
// soup -> of = 8 // soup -> of = 8
// + of -> the = 1 // + of -> the = 1
// + the -> day = 8 (not 1) // + the -> day = 8 (not 1)
assert_eq!(matches_proximity(matches), 17); assert_eq!(matches_proximity(query_index, attribute, word_index), 17);
} }
#[test] #[test]
@ -104,18 +127,13 @@ mod tests {
// { id: 3, attr: 0, attr_index: 1 } // { id: 3, attr: 0, attr_index: 1 }
// { id: 3, attr: 1, attr_index: 3 } // { id: 3, attr: 1, attr_index: 3 }
let matches = &[ let query_index = &[0, 0, 1, 2, 3, 3];
Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() }, let attribute = &[0, 1, 1, 1, 0, 1];
Match { query_index: 0, attribute: 1, attribute_index: 0, ..Match::zero() }, let word_index = &[0, 0, 1, 2, 1, 3];
Match { query_index: 1, attribute: 1, attribute_index: 1, ..Match::zero() },
Match { query_index: 2, attribute: 1, attribute_index: 2, ..Match::zero() },
Match { query_index: 3, attribute: 0, attribute_index: 1, ..Match::zero() },
Match { query_index: 3, attribute: 1, attribute_index: 3, ..Match::zero() },
];
// soup -> of = 1 // soup -> of = 1
// + of -> the = 1 // + of -> the = 1
// + the -> day = 1 // + the -> day = 1
assert_eq!(matches_proximity(matches), 3); assert_eq!(matches_proximity(query_index, attribute, word_index), 3);
} }
} }

View File

@ -2,32 +2,182 @@ pub mod criterion;
mod query_builder; mod query_builder;
mod distinct_map; mod distinct_map;
use std::sync::Arc;
use slice_group_by::GroupBy;
use rayon::slice::ParallelSliceMut;
use crate::{Match, DocumentId}; use crate::{Match, DocumentId};
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder}; pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
#[inline] #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
fn match_query_index(a: &Match, b: &Match) -> bool {
a.query_index == b.query_index
}
#[derive(Debug, Clone)]
pub struct Document { pub struct Document {
pub id: DocumentId, pub id: DocumentId,
pub matches: Vec<Match>, pub matches: Vec<Match>,
} }
impl Document { impl Document {
pub fn new(doc: DocumentId, match_: Match) -> Self { fn from_raw(raw: &RawDocument) -> Document {
unsafe { Self::from_sorted_matches(doc, vec![match_]) } let len = raw.matches.range.len();
let mut matches = Vec::with_capacity(len);
let query_index = raw.query_index();
let distance = raw.distance();
let attribute = raw.attribute();
let word_index = raw.word_index();
let is_exact = raw.is_exact();
let char_index = raw.char_index();
let char_length = raw.char_length();
for i in 0..len {
let match_ = Match {
query_index: query_index[i],
distance: distance[i],
attribute: attribute[i],
word_index: word_index[i],
is_exact: is_exact[i],
char_index: char_index[i],
char_length: char_length[i],
};
matches.push(match_);
} }
pub fn from_matches(doc: DocumentId, mut matches: Vec<Match>) -> Self { Document { id: raw.id, matches }
matches.sort_unstable(); }
unsafe { Self::from_sorted_matches(doc, matches) } }
}
#[derive(Clone)]
pub unsafe fn from_sorted_matches(id: DocumentId, matches: Vec<Match>) -> Self { pub struct RawDocument {
Self { id, matches } pub id: DocumentId,
pub matches: SharedMatches,
}
impl RawDocument {
fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument {
RawDocument { id, matches: SharedMatches { range, matches } }
}
pub fn query_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.query_index.get_unchecked(r.start..r.end) }
}
pub fn distance(&self) -> &[u8] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
}
pub fn attribute(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
}
pub fn word_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.word_index.get_unchecked(r.start..r.end) }
}
pub fn is_exact(&self) -> &[bool] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
}
pub fn char_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.char_index.get_unchecked(r.start..r.end) }
}
pub fn char_length(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.char_length.get_unchecked(r.start..r.end) }
}
}
pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec<RawDocument> {
let mut docs_ranges = Vec::<(DocumentId, Range)>::new();
let mut matches2 = Matches::with_capacity(matches.len());
matches.par_sort_unstable();
for group in matches.linear_group_by(|(a, _), (b, _)| a == b) {
let id = group[0].0;
let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0);
let end = start + group.len();
docs_ranges.push((id, Range { start, end }));
matches2.extend_from_slice(group);
}
let matches = Arc::new(matches2);
docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect()
}
#[derive(Debug, Copy, Clone)]
struct Range {
start: usize,
end: usize,
}
impl Range {
fn len(self) -> usize {
self.end - self.start
}
}
#[derive(Clone)]
pub struct SharedMatches {
range: Range,
matches: Arc<Matches>,
}
#[derive(Clone)]
struct Matches {
query_index: Vec<u32>,
distance: Vec<u8>,
attribute: Vec<u16>,
word_index: Vec<u32>,
is_exact: Vec<bool>,
char_index: Vec<u32>,
char_length: Vec<u16>,
}
impl Matches {
fn with_capacity(cap: usize) -> Matches {
Matches {
query_index: Vec::with_capacity(cap),
distance: Vec::with_capacity(cap),
attribute: Vec::with_capacity(cap),
word_index: Vec::with_capacity(cap),
is_exact: Vec::with_capacity(cap),
char_index: Vec::with_capacity(cap),
char_length: Vec::with_capacity(cap),
}
}
fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) {
for (_, match_) in matches {
self.query_index.push(match_.query_index);
self.distance.push(match_.distance);
self.attribute.push(match_.attribute);
self.word_index.push(match_.word_index);
self.is_exact.push(match_.is_exact);
self.char_index.push(match_.char_index);
self.char_length.push(match_.char_length);
}
} }
} }
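The core of the data-oriented rewrite is `raw_documents_from_matches`: all matches are flattened into shared column vectors and each `RawDocument` only keeps a range into them. Below is a condensed, dependency-free sketch of that idea; the names and the reduced field set are illustrative, not the crate's exact API.

```rust
use std::ops::Range;
use std::sync::Arc;

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct DocumentId(u64);

// struct of arrays: one vector per match field, shared by every document
#[derive(Default)]
struct Columns {
    query_index: Vec<u32>,
    distance: Vec<u8>,
}

struct RawDoc {
    id: DocumentId,
    range: Range<usize>,
    columns: Arc<Columns>,
}

fn raw_docs(mut matches: Vec<(DocumentId, u32, u8)>) -> Vec<RawDoc> {
    // sorting groups the matches of a same document together
    matches.sort_unstable();

    let mut columns = Columns::default();
    let mut ranges: Vec<(DocumentId, Range<usize>)> = Vec::new();

    for (i, &(id, query_index, distance)) in matches.iter().enumerate() {
        let new_doc = ranges.last().map_or(true, |(last, _)| *last != id);
        if new_doc { ranges.push((id, i..i)) }

        columns.query_index.push(query_index);
        columns.distance.push(distance);
        ranges.last_mut().unwrap().1.end = i + 1;
    }

    let columns = Arc::new(columns);
    ranges.into_iter()
        .map(|(id, range)| RawDoc { id, range, columns: columns.clone() })
        .collect()
}

fn main() {
    let docs = raw_docs(vec![
        (DocumentId(1), 0, 0),
        (DocumentId(0), 1, 2),
        (DocumentId(0), 0, 0),
    ]);
    assert_eq!(docs.len(), 2);
    assert_eq!(docs[0].id, DocumentId(0));
    assert_eq!(docs[0].range, 0..2);
    assert_eq!(docs[1].range, 2..3);
    assert_eq!(docs[1].columns.distance.len(), 3);
}
```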

View File

@ -4,17 +4,20 @@ use std::error::Error;
use std::hash::Hash; use std::hash::Hash;
use std::rc::Rc; use std::rc::Rc;
use group_by::GroupByMut; use rayon::slice::ParallelSliceMut;
use slice_group_by::GroupByMut;
use elapsed::measure_time;
use hashbrown::HashMap; use hashbrown::HashMap;
use fst::Streamer; use fst::Streamer;
use rocksdb::DB; use rocksdb::DB;
use log::info;
use crate::automaton::{self, DfaExt, AutomatonExt}; use crate::automaton::{self, DfaExt, AutomatonExt};
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
use crate::rank::criterion::Criteria; use crate::rank::criterion::Criteria;
use crate::database::DatabaseView; use crate::database::DatabaseView;
use crate::{Match, DocumentId}; use crate::{Match, DocumentId};
use crate::rank::Document; use crate::rank::{raw_documents_from_matches, RawDocument, Document};
fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> { fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
@ -34,34 +37,45 @@ fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
automatons automatons
} }
pub struct QueryBuilder<'a, D> pub type FilterFunc<D> = fn(DocumentId, &DatabaseView<D>) -> bool;
pub struct QueryBuilder<'a, D, FI>
where D: Deref<Target=DB> where D: Deref<Target=DB>
{ {
view: &'a DatabaseView<D>, view: &'a DatabaseView<D>,
criteria: Criteria<D>, criteria: Criteria,
filter: Option<FI>,
} }
impl<'a, D> QueryBuilder<'a, D> impl<'a, D> QueryBuilder<'a, D, FilterFunc<D>>
where D: Deref<Target=DB> where D: Deref<Target=DB>
{ {
pub fn new(view: &'a DatabaseView<D>) -> Result<Self, Box<Error>> { pub fn new(view: &'a DatabaseView<D>) -> Result<Self, Box<Error>> {
QueryBuilder::with_criteria(view, Criteria::default()) QueryBuilder::with_criteria(view, Criteria::default())
} }
pub fn with_criteria(view: &'a DatabaseView<D>, criteria: Criteria) -> Result<Self, Box<Error>> {
Ok(QueryBuilder { view, criteria, filter: None })
}
} }
impl<'a, D> QueryBuilder<'a, D> impl<'a, D, FI> QueryBuilder<'a, D, FI>
where D: Deref<Target=DB> where D: Deref<Target=DB>,
{ {
pub fn with_criteria(view: &'a DatabaseView<D>, criteria: Criteria<D>) -> Result<Self, Box<Error>> { pub fn with_filter<F>(self, function: F) -> QueryBuilder<'a, D, F>
Ok(QueryBuilder { view, criteria }) where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
{
QueryBuilder {
view: self.view,
criteria: self.criteria,
filter: Some(function)
}
} }
pub fn criteria(&mut self, criteria: Criteria<D>) -> &mut Self { pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'a, D, FI, F>
self.criteria = criteria; where F: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
self K: Hash + Eq,
} {
pub fn with_distinct<F>(self, function: F, size: usize) -> DistinctQueryBuilder<'a, D, F> {
DistinctQueryBuilder { DistinctQueryBuilder {
inner: self, inner: self,
function: function, function: function,
@ -69,19 +83,19 @@ where D: Deref<Target=DB>
} }
} }
fn query_all(&self, query: &str) -> Vec<Document> { fn query_all(&self, query: &str) -> Vec<RawDocument> {
let automatons = split_whitespace_automatons(query); let automatons = split_whitespace_automatons(query);
let mut stream = { let mut stream = {
let mut op_builder = fst::map::OpBuilder::new(); let mut op_builder = fst::map::OpBuilder::new();
for automaton in &automatons { for automaton in &automatons {
let stream = self.view.blob().as_map().search(automaton); let stream = self.view.index().positive.map().search(automaton);
op_builder.push(stream); op_builder.push(stream);
} }
op_builder.union() op_builder.union()
}; };
let mut matches = HashMap::new(); let mut matches = Vec::new();
while let Some((input, indexed_values)) = stream.next() { while let Some((input, indexed_values)) = stream.next() {
for iv in indexed_values { for iv in indexed_values {
@ -89,7 +103,7 @@ where D: Deref<Target=DB>
let distance = automaton.eval(input).to_u8(); let distance = automaton.eval(input).to_u8();
let is_exact = distance == 0 && input.len() == automaton.query_len(); let is_exact = distance == 0 && input.len() == automaton.query_len();
let doc_indexes = self.view.blob().as_indexes(); let doc_indexes = &self.view.index().positive.indexes();
let doc_indexes = &doc_indexes[iv.value as usize]; let doc_indexes = &doc_indexes[iv.value as usize];
for doc_index in doc_indexes { for doc_index in doc_indexes {
@ -97,31 +111,50 @@ where D: Deref<Target=DB>
query_index: iv.index as u32, query_index: iv.index as u32,
distance: distance, distance: distance,
attribute: doc_index.attribute, attribute: doc_index.attribute,
attribute_index: doc_index.attribute_index, word_index: doc_index.word_index,
is_exact: is_exact, is_exact: is_exact,
char_index: doc_index.char_index,
char_length: doc_index.char_length,
}; };
matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_); matches.push((doc_index.document_id, match_));
} }
} }
} }
matches.into_iter().map(|(id, matches)| Document::from_matches(id, matches)).collect() let total_matches = matches.len();
let raw_documents = raw_documents_from_matches(matches);
info!("{} total documents to classify", raw_documents.len());
info!("{} total matches to classify", total_matches);
raw_documents
} }
} }
impl<'a, D> QueryBuilder<'a, D> impl<'a, D, FI> QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>, where D: Deref<Target=DB>,
FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
{ {
pub fn query(&self, query: &str, range: Range<usize>) -> Vec<Document> { pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
let mut documents = self.query_all(query); // We delegate the filter work to the distinct query builder,
let mut groups = vec![documents.as_mut_slice()]; // specifying a distinct rule that has no effect.
let view = &self.view; if self.filter.is_some() {
let builder = self.with_distinct(|_, _| None as Option<()>, 1);
return builder.query(query, range);
}
'criteria: for criterion in self.criteria.as_ref() { let (elapsed, mut documents) = measure_time(|| self.query_all(query));
info!("query_all took {}", elapsed);
let mut groups = vec![documents.as_mut_slice()];
'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() {
let tmp_groups = mem::replace(&mut groups, Vec::new()); let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0; let mut documents_seen = 0;
for group in tmp_groups { for group in tmp_groups {
info!("criterion {}, documents group of size {}", ci, group.len());
// if this group does not overlap with the requested range, // if this group does not overlap with the requested range,
// push it without sorting and splitting it // push it without sorting and splitting it
if documents_seen + group.len() < range.start { if documents_seen + group.len() < range.start {
@ -130,9 +163,12 @@ where D: Deref<Target=DB>,
continue; continue;
} }
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view)); let (elapsed, _) = measure_time(|| {
group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
});
info!("criterion {} sort took {}", ci, elapsed);
for group in GroupByMut::new(group, |a, b| criterion.eq(a, b, view)) { for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
documents_seen += group.len(); documents_seen += group.len();
groups.push(group); groups.push(group);
@ -143,46 +179,63 @@ where D: Deref<Target=DB>,
} }
} }
// `drain` removes the documents efficiently using `ptr::copy`
// TODO it could be more efficient to have a custom iterator
let offset = cmp::min(documents.len(), range.start); let offset = cmp::min(documents.len(), range.start);
documents.drain(0..offset); let iter = documents.into_iter().skip(offset).take(range.len());
documents.truncate(range.len()); iter.map(|d| Document::from_raw(&d)).collect()
documents
} }
} }
pub struct DistinctQueryBuilder<'a, D, F> pub struct DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB> where D: Deref<Target=DB>
{ {
inner: QueryBuilder<'a, D>, inner: QueryBuilder<'a, D, FI>,
function: F, function: FD,
size: usize, size: usize,
} }
impl<'a, D, F, K> DistinctQueryBuilder<'a, D, F> impl<'a, D, FI, FD> DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>, where D: Deref<Target=DB>,
F: Fn(DocumentId, &DatabaseView<D>) -> Option<K>, {
pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'a, D, F, FD>
where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
{
DistinctQueryBuilder {
inner: self.inner.with_filter(function),
function: self.function,
size: self.size
}
}
}
impl<'a, D, FI, FD, K> DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>,
FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
FD: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
K: Hash + Eq, K: Hash + Eq,
{ {
pub fn query(&self, query: &str, range: Range<usize>) -> Vec<Document> { pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
let mut documents = self.inner.query_all(query); let (elapsed, mut documents) = measure_time(|| self.inner.query_all(query));
info!("query_all took {}", elapsed);
let mut groups = vec![documents.as_mut_slice()]; let mut groups = vec![documents.as_mut_slice()];
let mut key_cache = HashMap::new(); let mut key_cache = HashMap::new();
let view = &self.inner.view; let view = &self.inner.view;
let mut filter_map = HashMap::new();
// these two variables inform on the current distinct map and
// on the raw offset of the start of the group where the
// range.start bound is located according to the distinct function
let mut distinct_map = DistinctMap::new(self.size); let mut distinct_map = DistinctMap::new(self.size);
let mut distinct_raw_offset = 0; let mut distinct_raw_offset = 0;
'criteria: for criterion in self.inner.criteria.as_ref() { 'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() {
let tmp_groups = mem::replace(&mut groups, Vec::new()); let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
let mut documents_seen = 0; let mut documents_seen = 0;
for group in tmp_groups { for group in tmp_groups {
info!("criterion {}, documents group of size {}", ci, group.len());
// if this group does not overlap with the requested range, // if this group does not overlap with the requested range,
// push it without sorting and splitting it // push it without sorting and splitting it
if documents_seen + group.len() < distinct_raw_offset { if documents_seen + group.len() < distinct_raw_offset {
@ -191,11 +244,23 @@ where D: Deref<Target=DB>,
continue; continue;
} }
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view)); let (elapsed, _) = measure_time(|| {
group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b));
});
info!("criterion {} sort took {}", ci, elapsed);
for group in GroupByMut::new(group, |a, b| criterion.eq(a, b, view)) { for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
// we must compute the real distinct length of this sub-group
for document in group.iter() { for document in group.iter() {
let filter_accepted = match &self.inner.filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id, view))
},
None => true,
};
if filter_accepted {
let entry = key_cache.entry(document.id); let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (self.function)(document.id, view).map(Rc::new)); let key = entry.or_insert_with(|| (self.function)(document.id, view).map(Rc::new));
@ -203,6 +268,7 @@ where D: Deref<Target=DB>,
Some(key) => buf_distinct.register(key), Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(), None => buf_distinct.register_without_key(),
}; };
}
// the requested range end is reached: stop computing distinct // the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end { break } if buf_distinct.len() >= range.end { break }
@ -229,18 +295,24 @@ where D: Deref<Target=DB>,
let mut seen = BufferedDistinctMap::new(&mut distinct_map); let mut seen = BufferedDistinctMap::new(&mut distinct_map);
for document in documents.into_iter().skip(distinct_raw_offset) { for document in documents.into_iter().skip(distinct_raw_offset) {
let key = key_cache.remove(&document.id).expect("BUG: cached key not found"); let filter_accepted = match &self.inner.filter {
Some(_) => filter_map.remove(&document.id).expect("BUG: filtered not found"),
None => true,
};
let accepted = match key { if filter_accepted {
let key = key_cache.remove(&document.id).expect("BUG: cached key not found");
let distinct_accepted = match key {
Some(key) => seen.register(key), Some(key) => seen.register(key),
None => seen.register_without_key(), None => seen.register_without_key(),
}; };
if accepted && seen.len() > range.start { if distinct_accepted && seen.len() > range.start {
out_documents.push(document); out_documents.push(Document::from_raw(&document));
if out_documents.len() == range.len() { break } if out_documents.len() == range.len() { break }
} }
} }
}
out_documents out_documents
} }

View File

@ -2,7 +2,7 @@ use std::mem;
use self::Separator::*; use self::Separator::*;
pub trait TokenizerBuilder { pub trait TokenizerBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a>; fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
} }
pub struct DefaultBuilder; pub struct DefaultBuilder;
@ -13,22 +13,39 @@ impl DefaultBuilder {
} }
} }
#[derive(Debug, PartialEq, Eq)]
pub struct Token<'a> {
pub word: &'a str,
pub word_index: usize,
pub char_index: usize,
}
impl TokenizerBuilder for DefaultBuilder { impl TokenizerBuilder for DefaultBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a> { fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a> {
Box::new(Tokenizer::new(text)) Box::new(Tokenizer::new(text))
} }
} }
pub struct Tokenizer<'a> { pub struct Tokenizer<'a> {
index: usize, word_index: usize,
char_index: usize,
inner: &'a str, inner: &'a str,
} }
impl<'a> Tokenizer<'a> { impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer { pub fn new(string: &str) -> Tokenizer {
let mut char_advance = 0;
let mut index_advance = 0;
for (n, (i, c)) in string.char_indices().enumerate() {
char_advance = n;
index_advance = i;
if detect_separator(c).is_none() { break }
}
Tokenizer { Tokenizer {
index: 0, word_index: 0,
inner: string.trim_matches(&[' ', '.', ';', ',', '!', '?', '-', '\'', '"'][..]), char_index: char_advance,
inner: &string[index_advance..],
} }
} }
} }
@ -56,43 +73,58 @@ impl Separator {
} }
} }
fn detect_separator(c: char) -> Option<Separator> {
match c {
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
' ' | '\'' | '"' => Some(Short),
_ => None,
}
}
impl<'a> Iterator for Tokenizer<'a> { impl<'a> Iterator for Tokenizer<'a> {
type Item = (usize, &'a str); type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
let mut start_word = None; let mut start_word = None;
let mut distance = None; let mut distance = None;
for (i, c) in self.inner.char_indices() { for (i, c) in self.inner.char_indices() {
let separator = match c { match detect_separator(c) {
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long), Some(sep) => {
' ' | '\'' | '"' => Some(Short),
_ => None,
};
match separator {
Some(dist) => {
if let Some(start_word) = start_word { if let Some(start_word) = start_word {
let (word, tail) = self.inner.split_at(i); let (prefix, tail) = self.inner.split_at(i);
let (spaces, word) = prefix.split_at(start_word);
self.inner = tail; self.inner = tail;
self.index += distance.map(Separator::to_usize).unwrap_or(0); self.char_index += spaces.chars().count();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
let word = &word[start_word..]; let token = Token {
return Some((self.index, word)) word: word,
word_index: self.word_index,
char_index: self.char_index,
};
self.char_index += word.chars().count();
return Some(token)
} }
distance = Some(distance.map(|s| s.add(dist)).unwrap_or(dist));
distance.replace(distance.map_or(sep, |s| s.add(sep)));
}, },
None => { start_word.get_or_insert(i); }, None => { start_word.get_or_insert(i); },
} }
} }
if let Some(start_word) = start_word { if let Some(start_word) = start_word {
let word = mem::replace(&mut self.inner, ""); let prefix = mem::replace(&mut self.inner, "");
self.index += distance.map(Separator::to_usize).unwrap_or(0); let (spaces, word) = prefix.split_at(start_word);
let word = &word[start_word..]; let token = Token {
return Some((self.index, word)) word: word,
word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),
char_index: self.char_index + spaces.chars().count(),
};
return Some(token)
} }
None None
@ -107,12 +139,12 @@ mod tests {
fn easy() { fn easy() {
let mut tokenizer = Tokenizer::new("salut"); let mut tokenizer = Tokenizer::new("salut");
assert_eq!(tokenizer.next(), Some((0, "salut"))); assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo "); let mut tokenizer = Tokenizer::new("yo ");
assert_eq!(tokenizer.next(), Some((0, "yo"))); assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
} }
@ -120,18 +152,37 @@ mod tests {
fn hard() { fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe"); let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe");
assert_eq!(tokenizer.next(), Some((0, "yo"))); assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some((1, "lolo"))); assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some((9, "aïe"))); assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,"); let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
assert_eq!(tokenizer.next(), Some((0, "yo"))); assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some((8, "lolo"))); assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some((16, "wtf"))); assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some((24, "lol"))); assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
assert_eq!(tokenizer.next(), Some((32, "aïe"))); assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
assert_eq!(tokenizer.next(), None); assert_eq!(tokenizer.next(), None);
} }
} }
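A short usage sketch of the reworked tokenizer, assuming it is exported under a `meilidb::tokenizer` module (the module path is not visible in this diff):

```rust
use meilidb::tokenizer::{Token, Tokenizer};

fn main() {
    for Token { word, word_index, char_index } in Tokenizer::new("yo ! lolo") {
        // `word_index` jumps by the separator weight (8 for a long separator),
        // while `char_index` counts characters, which keeps highlighting
        // correct on multi-byte UTF-8 text
        println!("{:<6} word_index={:<2} char_index={}", word, word_index, char_index);
    }
}
```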

View File

@ -1,51 +0,0 @@
use std::ops::Deref;
use std::sync::Arc;
use std::fmt;
#[derive(Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct VecReadOnly<T> {
inner: Arc<Vec<T>>,
offset: usize,
len: usize,
}
impl<T> VecReadOnly<T> {
pub fn new(vec: Vec<T>) -> Self {
let len = vec.len();
Self {
inner: Arc::new(vec),
offset: 0,
len: len,
}
}
pub fn len(&self) -> usize {
self.len
}
pub fn range(&self, offset: usize, len: usize) -> Self {
Self {
inner: self.inner.clone(),
offset: self.offset + offset,
len: len,
}
}
pub fn as_slice(&self) -> &[T] {
&self.inner[self.offset..self.offset + self.len]
}
}
impl<T> Deref for VecReadOnly<T> {
type Target = [T];
fn deref(&self) -> &Self::Target {
self.as_slice()
}
}
impl<T: fmt::Debug> fmt::Debug for VecReadOnly<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.inner.fmt(f)
}
}