Mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-07-26 00:01:00 +00:00
feat: Move meilidb example into the meilidb workspace
meilidb/examples/create-database.rs (new file, 135 lines)
@@ -0,0 +1,135 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

use std::collections::{HashMap, HashSet};
use std::io::{self, BufRead, BufReader};
use std::path::{Path, PathBuf};
use std::time::Instant;
use std::error::Error;
use std::borrow::Cow;
use std::fs::File;

use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt;

use meilidb::database::{Database, Schema};

#[derive(Debug, StructOpt)]
pub struct Opt {
    /// The destination where the database must be created.
    #[structopt(parse(from_os_str))]
    pub database_path: PathBuf,

    /// The csv file to index.
    #[structopt(parse(from_os_str))]
    pub csv_data_path: PathBuf,

    /// The path to the schema.
    #[structopt(long = "schema", parse(from_os_str))]
    pub schema_path: PathBuf,

    /// The path to the list of stop words (one per line).
    #[structopt(long = "stop-words", parse(from_os_str))]
    pub stop_words_path: Option<PathBuf>,

    #[structopt(long = "update-group-size")]
    pub update_group_size: Option<usize>,
}

#[derive(Serialize, Deserialize)]
struct Document<'a> (
    #[serde(borrow)]
    HashMap<Cow<'a, str>, Cow<'a, str>>
);

fn index(
    schema: Schema,
    database_path: &Path,
    csv_data_path: &Path,
    update_group_size: Option<usize>,
    stop_words: &HashSet<String>,
) -> Result<Database, Box<Error>>
{
    let database = Database::create(database_path)?;

    database.create_index("default", &schema)?;

    let mut rdr = csv::Reader::from_path(csv_data_path)?;
    let mut raw_record = csv::StringRecord::new();
    let headers = rdr.headers()?.clone();

    let mut i = 0;
    let mut end_of_file = false;

    while !end_of_file {
        let mut update = database.start_update("default")?;

        loop {
            end_of_file = !rdr.read_record(&mut raw_record)?;
            if end_of_file { break }

            let document: Document = match raw_record.deserialize(Some(&headers)) {
                Ok(document) => document,
                Err(e) => {
                    eprintln!("{:?}", e);
                    continue;
                }
            };

            update.update_document(&document, &stop_words)?;

            print!("\rindexing document {}", i);
            i += 1;

            if let Some(group_size) = update_group_size {
                if i % group_size == 0 { break }
            }
        }

        println!();

        println!("committing update...");
        database.commit_update(update)?;
    }

    Ok(database)
}

fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
    let f = File::open(path)?;
    let reader = BufReader::new(f);
    let mut words = HashSet::new();

    for line in reader.lines() {
        let line = line?;
        let word = line.trim().to_string();
        words.insert(word);
    }

    Ok(words)
}

fn main() -> Result<(), Box<Error>> {
    let _ = env_logger::init();
    let opt = Opt::from_args();

    let schema = {
        let file = File::open(&opt.schema_path)?;
        Schema::from_toml(file)?
    };

    let stop_words = match opt.stop_words_path {
        Some(ref path) => retrieve_stop_words(path)?,
        None => HashSet::new(),
    };

    let start = Instant::now();
    let result = index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words);

    if let Err(e) = result {
        return Err(e.into())
    }

    println!("database created in {:.2?} at: {:?}", start.elapsed(), opt.database_path);
    Ok(())
}
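A quick note on the indexing example above: each CSV record is deserialized straight into the `Document<'a>` newtype, with `#[serde(borrow)]` tying the deserialized map's lifetime to the `StringRecord` buffer it came from. The following standalone sketch shows that same pattern on an in-memory CSV; the column names and values are made up, and only the `csv` and `serde_derive` crates the example already uses are assumed.

use std::borrow::Cow;
use std::collections::HashMap;
use std::error::Error;

use serde_derive::Deserialize;

// Same shape as the example's `Document` newtype: a map from CSV header to field value.
#[derive(Debug, Deserialize)]
struct Document<'a>(
    #[serde(borrow)]
    HashMap<Cow<'a, str>, Cow<'a, str>>,
);

fn main() -> Result<(), Box<dyn Error>> {
    // Hypothetical two-column CSV kept in memory instead of being read from disk.
    let data = "id,title\n1,hello world\n2,bonjour";
    let mut rdr = csv::Reader::from_reader(data.as_bytes());

    let headers = rdr.headers()?.clone();
    let mut raw_record = csv::StringRecord::new();

    while rdr.read_record(&mut raw_record)? {
        // Deserialize the current record against the saved headers,
        // exactly as the indexing loop above does before calling update_document.
        let document: Document = raw_record.deserialize(Some(&headers))?;
        println!("{:?}", document);
    }

    Ok(())
}

If the workspace registers these files as Cargo examples (which the new `meilidb/examples/` location suggests but the commit does not state), the binary would typically be launched with something like `cargo run --release --example create-database -- <database-path> <csv-path> --schema <schema.toml>`, following the StructOpt definition above.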
meilidb/examples/query-database.rs (new file, 210 lines)
@@ -0,0 +1,210 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

use std::collections::btree_map::{BTreeMap, Entry};
use std::iter::FromIterator;
use std::io::{self, Write};
use std::time::Instant;
use std::path::PathBuf;
use std::error::Error;

use hashbrown::{HashMap, HashSet};
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use structopt::StructOpt;
use meilidb_core::Match;

use meilidb::database::schema::SchemaAttr;
use meilidb::database::Database;

#[derive(Debug, StructOpt)]
pub struct Opt {
    /// The destination where the database must be created
    #[structopt(parse(from_os_str))]
    pub database_path: PathBuf,

    /// Fields that must be displayed.
    pub displayed_fields: Vec<String>,

    /// The number of returned results
    #[structopt(short = "n", long = "number-results", default_value = "10")]
    pub number_results: usize,

    /// The number of characters before and after the first match
    #[structopt(short = "C", long = "context", default_value = "35")]
    pub char_context: usize,
}

type Document = HashMap<String, String>;

fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
    let mut stdout = StandardStream::stdout(ColorChoice::Always);
    let mut highlighted = false;

    for range in ranges.windows(2) {
        let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
        if highlighted {
            stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
        }
        write!(&mut stdout, "{}", &text[start..end])?;
        stdout.reset()?;
        highlighted = !highlighted;
    }

    Ok(())
}

fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
    let mut byte_index = 0;
    let mut byte_length = 0;

    for (n, (i, c)) in text.char_indices().enumerate() {
        if n == index {
            byte_index = i;
        }

        if n + 1 == index + length {
            byte_length = i - byte_index + c.len_utf8();
            break;
        }
    }

    (byte_index, byte_length)
}

fn create_highlight_areas(text: &str, matches: &[Match]) -> Vec<usize> {
    let mut byte_indexes = BTreeMap::new();

    for match_ in matches {
        let char_index = match_.char_index as usize;
        let char_length = match_.char_length as usize;
        let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);

        match byte_indexes.entry(byte_index) {
            Entry::Vacant(entry) => { entry.insert(byte_length); },
            Entry::Occupied(mut entry) => {
                if *entry.get() < byte_length {
                    entry.insert(byte_length);
                }
            },
        }
    }

    let mut title_areas = Vec::new();
    title_areas.push(0);
    for (byte_index, length) in byte_indexes {
        title_areas.push(byte_index);
        title_areas.push(byte_index + length);
    }
    title_areas.push(text.len());
    title_areas.sort_unstable();
    title_areas
}

/// note: matches must have been sorted by `char_index` and `char_length` before being passed.
///
/// ```no_run
/// matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
///
/// let matches = matches.matches.iter().filter(|m| SchemaAttr::new(m.attribute) == attr).cloned();
///
/// let (text, matches) = crop_text(&text, matches, 35);
/// ```
fn crop_text(
    text: &str,
    matches: impl IntoIterator<Item=Match>,
    context: usize,
) -> (String, Vec<Match>)
{
    let mut matches = matches.into_iter().peekable();

    let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0);
    let start = char_index.saturating_sub(context);
    let text = text.chars().skip(start).take(context * 2).collect();

    let matches = matches
        .take_while(|m| {
            (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
        })
        .map(|match_| {
            Match { char_index: match_.char_index - start as u16, ..match_ }
        })
        .collect();

    (text, matches)
}

fn main() -> Result<(), Box<Error>> {
    let _ = env_logger::init();
    let opt = Opt::from_args();

    let start = Instant::now();
    let database = Database::open(&opt.database_path)?;
    println!("database prepared for you in {:.2?}", start.elapsed());

    let mut buffer = String::new();
    let input = io::stdin();

    loop {
        print!("Searching for: ");
        io::stdout().flush()?;

        if input.read_line(&mut buffer)? == 0 { break }
        let query = buffer.trim_end_matches('\n');

        let view = database.view("default")?;
        let schema = view.schema();

        let start = Instant::now();

        let builder = view.query_builder();
        let documents = builder.query(query, 0..opt.number_results);

        let number_of_documents = documents.len();
        for mut doc in documents {

            doc.matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));

            match view.document_by_id::<Document>(doc.id) {
                Ok(document) => {
                    for name in &opt.displayed_fields {
                        let attr = match schema.attribute(name) {
                            Some(attr) => attr,
                            None => continue,
                        };
                        let text = match document.get(name) {
                            Some(text) => text,
                            None => continue,
                        };

                        print!("{}: ", name);
                        let matches = doc.matches.iter()
                            .filter(|m| SchemaAttr::new(m.attribute) == attr)
                            .cloned();
                        let (text, matches) = crop_text(&text, matches, opt.char_context);
                        let areas = create_highlight_areas(&text, &matches);
                        display_highlights(&text, &areas)?;
                        println!();
                    }
                },
                Err(e) => eprintln!("{}", e),
            }

            let mut matching_attributes = HashSet::new();
            for _match in doc.matches {
                let attr = SchemaAttr::new(_match.attribute);
                let name = schema.attribute_name(attr);
                matching_attributes.insert(name);
            }

            let matching_attributes = Vec::from_iter(matching_attributes);
            println!("matching in: {:?}", matching_attributes);

            println!();
        }

        eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start.elapsed());
        buffer.clear();
    }

    Ok(())
}
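One detail of the query example worth calling out: `Match` positions are expressed in characters, while Rust string slicing works in bytes, and `char_to_byte_range` is what bridges the two before `create_highlight_areas` builds the highlight boundaries. The sketch below reuses the same function body on a made-up string with multi-byte characters to show the conversion in isolation; it is standalone and independent of meilidb.

// Mirrors `char_to_byte_range` from query-database.rs: convert a (char index, char length)
// pair into the matching (byte index, byte length) pair for slicing a UTF-8 &str.
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
    let mut byte_index = 0;
    let mut byte_length = 0;

    for (n, (i, c)) in text.char_indices().enumerate() {
        if n == index {
            byte_index = i;
        }

        if n + 1 == index + length {
            byte_length = i - byte_index + c.len_utf8();
            break;
        }
    }

    (byte_index, byte_length)
}

fn main() {
    // Made-up sample text; "é" and "ö" are 2 bytes each in UTF-8.
    let text = "héllo wörld";

    // "wörld" starts at character 6 and spans 5 characters,
    // but its byte range starts at offset 7 and spans 6 bytes.
    let (start, len) = char_to_byte_range(6, 5, text);
    assert_eq!(&text[start..start + len], "wörld");
    println!("chars (6, 5) -> bytes ({}, {})", start, len);
}

The byte offsets produced this way are then consumed pairwise by `display_highlights`, which writes every other slice in color.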