feat: Reintroduce stopwords for the serializer

This commit is contained in:
Clément Renault
2019-01-06 18:03:47 +01:00
parent a2f5e8aa25
commit 32f8908d71
7 changed files with 276 additions and 35 deletions

View File

@@ -1,12 +1,13 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::io::{self, BufRead, BufReader};
use std::path::{Path, PathBuf};
use std::error::Error;
use std::borrow::Cow;
use std::fs::File;
use hashbrown::HashMap;
use hashbrown::{HashMap, HashSet};
use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt;
@@ -26,6 +27,13 @@ pub struct Opt {
/// The path to the schema.
#[structopt(long = "schema", parse(from_os_str))]
pub schema_path: PathBuf,
/// The path to the list of stop words (one by line).
#[structopt(long = "stop-words", parse(from_os_str))]
pub stop_words_path: Option<PathBuf>,
#[structopt(long = "update-group-size")]
pub update_group_size: Option<usize>,
}
#[derive(Serialize, Deserialize)]
@@ -34,37 +42,75 @@ struct Document<'a> (
HashMap<Cow<'a, str>, Cow<'a, str>>
);
fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
fn index(
schema: Schema,
database_path: &Path,
csv_data_path: &Path,
update_group_size: Option<usize>,
stop_words: &HashSet<String>,
) -> Result<Database, Box<Error>>
{
let database = Database::create(database_path, &schema)?;
println!("start indexing...");
let tokenizer_builder = DefaultBuilder::new();
let update_path = tempfile::NamedTempFile::new()?;
let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema);
let mut rdr = csv::Reader::from_path(csv_data_path)?;
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();
while rdr.read_record(&mut raw_record)? {
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
}
};
let mut i = 0;
let mut end_of_file = false;
update.update_document(&document, &tokenizer_builder)?;
while !end_of_file {
let tokenizer_builder = DefaultBuilder::new();
let update_path = tempfile::NamedTempFile::new()?;
let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema.clone());
loop {
end_of_file = !rdr.read_record(&mut raw_record)?;
if end_of_file { break }
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
}
};
update.update_document(&document, &tokenizer_builder, &stop_words)?;
print!("\rindexing document {}", i);
i += 1;
if let Some(group_size) = update_group_size {
if i % group_size == 0 { break }
}
}
println!();
println!("building update...");
let update = update.build()?;
println!("ingesting update...");
database.ingest_update_file(update)?;
}
let update = update.build()?;
database.ingest_update_file(update)?;
Ok(database)
}
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
let f = File::open(path)?;
let reader = BufReader::new(f);
let mut words = HashSet::new();
for line in reader.lines() {
let line = line?;
let word = line.trim().to_string();
words.insert(word);
}
Ok(words)
}
fn main() -> Result<(), Box<Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
@@ -74,8 +120,13 @@ fn main() -> Result<(), Box<Error>> {
Schema::from_toml(file)?
};
let stop_words = match opt.stop_words_path {
Some(ref path) => retrieve_stop_words(path)?,
None => HashSet::new(),
};
let (elapsed, result) = elapsed::measure_time(|| {
index(schema, &opt.database_path, &opt.csv_data_path)
index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
});
if let Err(e) = result {