chore: Rename bin into indexer

This commit is contained in:
Kerollmops
2018-06-24 01:28:27 +02:00
committed by Clément Renault
parent d210e5d8db
commit 879e28fb7d
6 changed files with 93 additions and 51 deletions

3
raptor-indexer/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
/target
**/*.rs.bk

184
raptor-indexer/Cargo.lock generated Normal file
View File

@ -0,0 +1,184 @@
[[package]]
name = "bincode"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "byteorder"
version = "1.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "dtoa"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "fst"
version = "0.3.0"
source = "git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state#6e0ab4e4ee5443cc55079996bf9f703086322c33"
dependencies = [
"byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "group-by"
version = "0.1.0"
source = "git+https://github.com/Kerollmops/group-by.git#7e432aa232834b650ca85ecd46056a43a0094dec"
[[package]]
name = "itoa"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "levenshtein_automata"
version = "0.1.0"
source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#5e8183a7634c4a0182ea7bb398140b2fe9854f77"
dependencies = [
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)",
]
[[package]]
name = "libc"
version = "0.2.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "memmap"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "proc-macro2"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "quote"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "raptor"
version = "0.1.0"
dependencies = [
"bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)",
"group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)",
"levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)",
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "raptor-indexer"
version = "0.1.0"
dependencies = [
"raptor 0.1.0",
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
"unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "serde"
version = "1.0.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "serde_derive"
version = "1.0.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "serde_json"
version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
"itoa 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "syn"
version = "0.13.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "unicode-xid"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "unidecode"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[metadata]
"checksum bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bda13183df33055cbb84b847becce220d392df502ebe7a4a78d7021771ed94d0"
"checksum byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "73b5bdfe7ee3ad0b99c9801d58807a9dbc9e09196365b0203853b99889ab3c87"
"checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab"
"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)" = "<none>"
"checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "<none>"
"checksum itoa 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c069bbec61e1ca5a596166e55dfe4773ff745c3d16b700013bcaff9a6df2c682"
"checksum levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>"
"checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b"
"checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff"
"checksum proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "1b06e2f335f48d24442b35a19df506a835fb3547bc3c06ef27340da9acf5cae7"
"checksum quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9949cfe66888ffe1d53e6ec9d9f3b70714083854be20fd5e271b232a017401e8"
"checksum serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)" = "db9c1726bdebaed7ac8afb7028672e068e12cf1b0b97cddd742a3a7939159699"
"checksum serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5121751b76f5a2e6f51b4c0d07976f4f04e33ae7a981467c2845e7cd4b67a114"
"checksum serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)" = "f3ad6d546e765177cf3dded3c2e424a8040f870083a0e64064746b958ece9cb1"
"checksum syn 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)" = "505550dded6ff93eb63bd9d0ada380ffccd9f51c046a5e80a3078d53fcef0038"
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
"checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc"
"checksum winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "04e3bd221fcbe8a271359c04f21a76db7d0c6028862d1bb5512d85e1e2eb5bb3"
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

15
raptor-indexer/Cargo.toml Normal file
View File

@ -0,0 +1,15 @@
[package]
name = "raptor-indexer"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
[dependencies]
raptor = { path = ".." }
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
unidecode = "0.3"
[profile.release]
debug = true
lto = true

View File

@ -0,0 +1,97 @@
// TODO make the raptor binary expose multiple subcommand
// make only one binary
extern crate raptor;
extern crate serde_json;
#[macro_use] extern crate serde_derive;
extern crate unidecode;
use std::path::Path;
use std::collections::HashSet;
use std::fs::{self, File};
use std::io::{self, BufReader, BufRead};
use std::iter;
use raptor::{DocIndexMapBuilder, DocIndexMap, DocIndex};
use serde_json::from_str;
use unidecode::unidecode;
#[derive(Debug, Deserialize)]
struct Product {
title: String,
product_id: u64,
ft: String,
}
fn set_readonly<P>(path: P, readonly: bool) -> io::Result<()>
where P: AsRef<Path>
{
let mut perms = fs::metadata(&path)?.permissions();
perms.set_readonly(readonly);
fs::set_permissions(&path, perms)
}
fn main() {
let data = File::open("products.json_lines").unwrap();
let data = BufReader::new(data);
let common_words = {
match File::open("fr.stopwords.txt") {
Ok(file) => {
let file = BufReader::new(file);
let mut set = HashSet::new();
for line in file.lines().filter_map(|l| l.ok()) {
for word in line.split_whitespace() {
set.insert(word.to_owned());
}
}
set
},
Err(e) => {
eprintln!("{:?}", e);
HashSet::new()
},
}
};
let mut builder = DocIndexMapBuilder::new();
for line in data.lines() {
let line = line.unwrap();
let product: Product = from_str(&line).unwrap();
let title = iter::repeat(0).zip(product.title.split_whitespace()).enumerate();
let description = iter::repeat(1).zip(product.ft.split_whitespace()).enumerate();
let words = title.chain(description);
for (i, (attr, word)) in words {
if common_words.contains(word) { continue }
let doc_index = DocIndex {
document: product.product_id,
attribute: attr,
attribute_index: i as u32,
};
// insert the exact representation
let word_lower = word.to_lowercase();
// and the unidecoded lowercased version
let word_unidecoded = unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
builder.insert(word_unidecoded, doc_index);
}
builder.insert(word_lower, doc_index);
}
}
let map = File::create("map.fst").unwrap();
let values = File::create("values.vecs").unwrap();
let (map, values) = builder.build(map, values).unwrap();
set_readonly("map.fst", true).unwrap();
set_readonly("values.vecs", true).unwrap();
println!("Checking the dump consistency...");
unsafe { DocIndexMap::from_paths("map.fst", "values.vecs").unwrap() };
}