feat: Introduce RocksDB in this project

in order to save fields of the products
This commit is contained in:
Clément Renault
2018-07-06 21:26:07 +02:00
parent b9a4be10c6
commit d6e113c683
6 changed files with 354 additions and 23 deletions

View File

@ -1,3 +1,11 @@
[[package]]
name = "base64"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "bincode"
version = "1.0.1"
@ -7,11 +15,55 @@ dependencies = [
"serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "blob"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"base64 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "build_const"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "byteorder"
version = "1.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "bzip2-sys"
version = "0.1.6"
source = "git+https://github.com/alexcrichton/bzip2-rs.git#0ae38c2ccfea01625ae256e4fd483a15eb7ad62c"
dependencies = [
"cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "cc"
version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "cmake"
version = "0.1.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "crc"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "dtoa"
version = "0.4.2"
@ -26,6 +78,16 @@ dependencies = [
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "gcc"
version = "0.3.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "glob"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "group-by"
version = "0.1.0"
@ -49,6 +111,41 @@ name = "libc"
version = "0.2.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "librocksdb_sys"
version = "0.1.0"
source = "git+https://github.com/pingcap/rust-rocksdb.git#9a1c83c5382fbaee8a5102213c711bbe52d71470"
dependencies = [
"bzip2-sys 0.1.6 (git+https://github.com/alexcrichton/bzip2-rs.git)",
"cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
"cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
"libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)",
"lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)",
"snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)",
"zstd-sys 1.4.4+zstd.1.3.5 (git+https://github.com/gyscos/zstd-rs.git)",
]
[[package]]
name = "libz-sys"
version = "1.0.18"
source = "git+https://github.com/busyjay/libz-sys.git?branch=static-link#bb77b618ffc5ca41efd7a89d282d96e35e79dae4"
dependencies = [
"cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
"pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)",
"vcpkg 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "lz4-sys"
version = "1.8.0"
source = "git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build#41509fea212e9ca55c1f6c53d4fd1ddf28cdf689"
dependencies = [
"cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "memmap"
version = "0.6.2"
@ -58,6 +155,11 @@ dependencies = [
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "pkg-config"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "proc-macro2"
version = "0.4.6"
@ -91,12 +193,23 @@ name = "raptor-indexer"
version = "0.1.0"
dependencies = [
"raptor 0.1.0",
"rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)",
"serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.22 (registry+https://github.com/rust-lang/crates.io-index)",
"unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rocksdb"
version = "0.3.0"
source = "git+https://github.com/pingcap/rust-rocksdb.git#9a1c83c5382fbaee8a5102213c711bbe52d71470"
dependencies = [
"crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
"librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)",
]
[[package]]
name = "serde"
version = "1.0.68"
@ -122,6 +235,16 @@ dependencies = [
"serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "snappy-sys"
version = "0.1.0"
source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#be02178330bb17648d6ac605af249eba18b32b71"
dependencies = [
"cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
"pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "syn"
version = "0.14.2"
@ -142,6 +265,11 @@ name = "unidecode"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "vcpkg"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi"
version = "0.3.5"
@ -161,24 +289,52 @@ name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "zstd-sys"
version = "1.4.4+zstd.1.3.5"
source = "git+https://github.com/gyscos/zstd-rs.git#9ff4442c1977fad400f90d9c48e4f114c474117c"
dependencies = [
"blob 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)",
"glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
]
[metadata]
"checksum base64 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "30e93c03064e7590d0466209155251b90c22e37fab1daf2771582598b5827557"
"checksum bincode 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9f2fb9e29e72fd6bc12071533d5dc7664cb01480c59406f656d7ac25c7bd8ff7"
"checksum blob 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "122c3fa3949d822d2a51c648db9e8105d6e75b89dc628cc366901d3d396fa4f4"
"checksum build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39092a32794787acd8525ee150305ff051b0aa6cc2abaf193924f5ab05425f39"
"checksum byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "74c0b906e9446b0a2e4f760cdb3fa4b2c48cdc6db8766a845c54b6ff063fd2e9"
"checksum bzip2-sys 0.1.6 (git+https://github.com/alexcrichton/bzip2-rs.git)" = "<none>"
"checksum cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)" = "49ec142f5768efb5b7622aebc3fdbdbb8950a4b9ba996393cb76ef7466e8747d"
"checksum cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)" = "95470235c31c726d72bf2e1f421adc1e65b9d561bf5529612cbe1a72da1467b3"
"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb"
"checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab"
"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)" = "<none>"
"checksum gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5e33ec290da0d127825013597dbdfc28bee4964690c7ce1166cbc2a7bd08b1bb"
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
"checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "<none>"
"checksum itoa 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c069bbec61e1ca5a596166e55dfe4773ff745c3d16b700013bcaff9a6df2c682"
"checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>"
"checksum libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b685088df2b950fccadf07a7187c8ef846a959c142338a48f9dc0b94517eb5f1"
"checksum librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "<none>"
"checksum libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)" = "<none>"
"checksum lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)" = "<none>"
"checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff"
"checksum pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)" = "110d5ee3593dbb73f56294327fe5668bcc997897097cbc76b51e7aed3f52452f"
"checksum proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "effdb53b25cdad54f8f48843d67398f7ef2e14f12c1b4cb4effc549a6462a4d6"
"checksum quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e44651a0dc4cdd99f71c83b561e221f714912d11af1a4dff0631f923d53af035"
"checksum rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "<none>"
"checksum serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)" = "429fcc4efa8a11341b5422c2ace724daba276c1748467e869478f53c0ba4562e"
"checksum serde_derive 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)" = "6a25ad0bf818ed2d180c89addbe29198d1de6c89ed08a48aa6a4d3d16a63cbfe"
"checksum serde_json 1.0.22 (registry+https://github.com/rust-lang/crates.io-index)" = "84b8035cabe9b35878adec8ac5fe03d5f6bc97ff6edd7ccb96b44c1276ba390e"
"checksum snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)" = "<none>"
"checksum syn 0.14.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c67da57e61ebc7b7b6fff56bb34440ca3a83db037320b0507af4c10368deda7d"
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
"checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc"
"checksum vcpkg 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "cbe533e138811704c0e3cbde65a818b35d3240409b4346256c5ede403e082474"
"checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd"
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
"checksum zstd-sys 1.4.4+zstd.1.3.5 (git+https://github.com/gyscos/zstd-rs.git)" = "<none>"

View File

@ -10,6 +10,9 @@ serde_derive = "1.0"
serde_json = "1.0"
unidecode = "0.3"
[dependencies.rocksdb]
git = "https://github.com/pingcap/rust-rocksdb.git"
[profile.release]
debug = true
lto = true

View File

@ -2,6 +2,7 @@
// make only one binary
extern crate raptor;
extern crate rocksdb;
extern crate serde_json;
#[macro_use] extern crate serde_derive;
extern crate unidecode;
@ -13,6 +14,7 @@ use std::io::{self, BufReader, BufRead};
use std::iter;
use raptor::{DocIndexMapBuilder, DocIndexMap, DocIndex};
use rocksdb::{DB, WriteBatch, Writable};
use serde_json::from_str;
use unidecode::unidecode;
@ -62,8 +64,9 @@ fn main() {
let map_file = "map.fst";
let values_file = "values.vecs";
let rocksdb_file = "rocksdb/storage";
for file in &[map_file, values_file] {
for file in &[map_file, values_file, rocksdb_file] {
match is_readonly(file) {
Ok(true) => panic!("the {:?} file is readonly, please make it writeable", file),
Err(ref e) if e.kind() == io::ErrorKind::NotFound => (),
@ -72,6 +75,9 @@ fn main() {
}
}
fs::remove_file(rocksdb_file);
let db = DB::open_default(rocksdb_file).unwrap();
let mut builder = DocIndexMapBuilder::new();
for line in data.lines() {
let line = line.unwrap();
@ -81,6 +87,16 @@ fn main() {
let title = iter::repeat(0).zip(product.title.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate();
let description = iter::repeat(1).zip(product.ft.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate();
let mut batch = WriteBatch::new();
let title_key = format!("{}-title", product.product_id);
let _ = batch.put(title_key.as_bytes(), product.title.as_bytes());
let description_key = format!("{}-description", product.product_id);
let _ = batch.put(description_key.as_bytes(), product.ft.as_bytes());
db.write(batch).unwrap();
let words = title.chain(description);
for (i, (attr, word)) in words {
let doc_index = DocIndex {
@ -108,6 +124,7 @@ fn main() {
set_readonly(map_file, true).unwrap();
set_readonly(values_file, true).unwrap();
set_readonly(rocksdb_file, true).unwrap();
println!("Checking the dump consistency...");
unsafe { DocIndexMap::from_paths("map.fst", "values.vecs").unwrap() };