mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-28 01:01:00 +00:00
chore: Rename bin into indexer
This commit is contained in:
committed by
Clément Renault
parent
d210e5d8db
commit
879e28fb7d
3
raptor-indexer/.gitignore
vendored
Normal file
3
raptor-indexer/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
|
||||
/target
|
||||
**/*.rs.bk
|
184
raptor-indexer/Cargo.lock
generated
Normal file
184
raptor-indexer/Cargo.lock
generated
Normal file
@ -0,0 +1,184 @@
|
||||
[[package]]
|
||||
name = "bincode"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "dtoa"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "fst"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state#6e0ab4e4ee5443cc55079996bf9f703086322c33"
|
||||
dependencies = [
|
||||
"byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "group-by"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/Kerollmops/group-by.git#7e432aa232834b650ca85ecd46056a43a0094dec"
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "levenshtein_automata"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#5e8183a7634c4a0182ea7bb398140b2fe9854f77"
|
||||
dependencies = [
|
||||
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.40"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "memmap"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "0.3.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "raptor"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)",
|
||||
"group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)",
|
||||
"levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)",
|
||||
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "raptor-indexer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"raptor 0.1.0",
|
||||
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.54"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.54"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"syn 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"itoa 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "0.13.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "unidecode"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-i686-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[metadata]
|
||||
"checksum bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bda13183df33055cbb84b847becce220d392df502ebe7a4a78d7021771ed94d0"
|
||||
"checksum byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "73b5bdfe7ee3ad0b99c9801d58807a9dbc9e09196365b0203853b99889ab3c87"
|
||||
"checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab"
|
||||
"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)" = "<none>"
|
||||
"checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "<none>"
|
||||
"checksum itoa 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c069bbec61e1ca5a596166e55dfe4773ff745c3d16b700013bcaff9a6df2c682"
|
||||
"checksum levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>"
|
||||
"checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b"
|
||||
"checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff"
|
||||
"checksum proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "1b06e2f335f48d24442b35a19df506a835fb3547bc3c06ef27340da9acf5cae7"
|
||||
"checksum quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9949cfe66888ffe1d53e6ec9d9f3b70714083854be20fd5e271b232a017401e8"
|
||||
"checksum serde 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)" = "db9c1726bdebaed7ac8afb7028672e068e12cf1b0b97cddd742a3a7939159699"
|
||||
"checksum serde_derive 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5121751b76f5a2e6f51b4c0d07976f4f04e33ae7a981467c2845e7cd4b67a114"
|
||||
"checksum serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)" = "f3ad6d546e765177cf3dded3c2e424a8040f870083a0e64064746b958ece9cb1"
|
||||
"checksum syn 0.13.9 (registry+https://github.com/rust-lang/crates.io-index)" = "505550dded6ff93eb63bd9d0ada380ffccd9f51c046a5e80a3078d53fcef0038"
|
||||
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
|
||||
"checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc"
|
||||
"checksum winapi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "04e3bd221fcbe8a271359c04f21a76db7d0c6028862d1bb5512d85e1e2eb5bb3"
|
||||
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
15
raptor-indexer/Cargo.toml
Normal file
15
raptor-indexer/Cargo.toml
Normal file
@ -0,0 +1,15 @@
|
||||
[package]
|
||||
name = "raptor-indexer"
|
||||
version = "0.1.0"
|
||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||
|
||||
[dependencies]
|
||||
raptor = { path = ".." }
|
||||
serde = "1.0"
|
||||
serde_derive = "1.0"
|
||||
serde_json = "1.0"
|
||||
unidecode = "0.3"
|
||||
|
||||
[profile.release]
|
||||
debug = true
|
||||
lto = true
|
97
raptor-indexer/src/main.rs
Normal file
97
raptor-indexer/src/main.rs
Normal file
@ -0,0 +1,97 @@
|
||||
// TODO make the raptor binary expose multiple subcommand
|
||||
// make only one binary
|
||||
|
||||
extern crate raptor;
|
||||
extern crate serde_json;
|
||||
#[macro_use] extern crate serde_derive;
|
||||
extern crate unidecode;
|
||||
|
||||
use std::path::Path;
|
||||
use std::collections::HashSet;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, BufReader, BufRead};
|
||||
use std::iter;
|
||||
|
||||
use raptor::{DocIndexMapBuilder, DocIndexMap, DocIndex};
|
||||
use serde_json::from_str;
|
||||
use unidecode::unidecode;
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct Product {
|
||||
title: String,
|
||||
product_id: u64,
|
||||
ft: String,
|
||||
}
|
||||
|
||||
fn set_readonly<P>(path: P, readonly: bool) -> io::Result<()>
|
||||
where P: AsRef<Path>
|
||||
{
|
||||
let mut perms = fs::metadata(&path)?.permissions();
|
||||
perms.set_readonly(readonly);
|
||||
fs::set_permissions(&path, perms)
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let data = File::open("products.json_lines").unwrap();
|
||||
let data = BufReader::new(data);
|
||||
|
||||
let common_words = {
|
||||
match File::open("fr.stopwords.txt") {
|
||||
Ok(file) => {
|
||||
let file = BufReader::new(file);
|
||||
let mut set = HashSet::new();
|
||||
for line in file.lines().filter_map(|l| l.ok()) {
|
||||
for word in line.split_whitespace() {
|
||||
set.insert(word.to_owned());
|
||||
}
|
||||
}
|
||||
set
|
||||
},
|
||||
Err(e) => {
|
||||
eprintln!("{:?}", e);
|
||||
HashSet::new()
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
let mut builder = DocIndexMapBuilder::new();
|
||||
for line in data.lines() {
|
||||
let line = line.unwrap();
|
||||
|
||||
let product: Product = from_str(&line).unwrap();
|
||||
|
||||
let title = iter::repeat(0).zip(product.title.split_whitespace()).enumerate();
|
||||
let description = iter::repeat(1).zip(product.ft.split_whitespace()).enumerate();
|
||||
|
||||
let words = title.chain(description);
|
||||
for (i, (attr, word)) in words {
|
||||
if common_words.contains(word) { continue }
|
||||
let doc_index = DocIndex {
|
||||
document: product.product_id,
|
||||
attribute: attr,
|
||||
attribute_index: i as u32,
|
||||
};
|
||||
// insert the exact representation
|
||||
let word_lower = word.to_lowercase();
|
||||
|
||||
// and the unidecoded lowercased version
|
||||
let word_unidecoded = unidecode(word).to_lowercase();
|
||||
if word_lower != word_unidecoded {
|
||||
builder.insert(word_unidecoded, doc_index);
|
||||
}
|
||||
|
||||
builder.insert(word_lower, doc_index);
|
||||
}
|
||||
}
|
||||
|
||||
let map = File::create("map.fst").unwrap();
|
||||
let values = File::create("values.vecs").unwrap();
|
||||
|
||||
let (map, values) = builder.build(map, values).unwrap();
|
||||
|
||||
set_readonly("map.fst", true).unwrap();
|
||||
set_readonly("values.vecs", true).unwrap();
|
||||
|
||||
println!("Checking the dump consistency...");
|
||||
unsafe { DocIndexMap::from_paths("map.fst", "values.vecs").unwrap() };
|
||||
}
|
Reference in New Issue
Block a user