Mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-07-26 00:01:00 +00:00
feat: Move meilidb example into the meilidb workspace
meilidb/examples/create-database.rs (new file, 135 lines)
@@ -0,0 +1,135 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

use std::collections::{HashMap, HashSet};
use std::io::{self, BufRead, BufReader};
use std::path::{Path, PathBuf};
use std::time::Instant;
use std::error::Error;
use std::borrow::Cow;
use std::fs::File;

use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt;

use meilidb::database::{Database, Schema};

#[derive(Debug, StructOpt)]
pub struct Opt {
    /// The destination where the database must be created.
    #[structopt(parse(from_os_str))]
    pub database_path: PathBuf,

    /// The csv file to index.
    #[structopt(parse(from_os_str))]
    pub csv_data_path: PathBuf,

    /// The path to the schema.
    #[structopt(long = "schema", parse(from_os_str))]
    pub schema_path: PathBuf,

    /// The path to the list of stop words (one per line).
    #[structopt(long = "stop-words", parse(from_os_str))]
    pub stop_words_path: Option<PathBuf>,

    #[structopt(long = "update-group-size")]
    pub update_group_size: Option<usize>,
}

#[derive(Serialize, Deserialize)]
struct Document<'a> (
    #[serde(borrow)]
    HashMap<Cow<'a, str>, Cow<'a, str>>
);

fn index(
    schema: Schema,
    database_path: &Path,
    csv_data_path: &Path,
    update_group_size: Option<usize>,
    stop_words: &HashSet<String>,
) -> Result<Database, Box<Error>>
{
    let database = Database::create(database_path)?;

    database.create_index("default", &schema)?;

    let mut rdr = csv::Reader::from_path(csv_data_path)?;
    let mut raw_record = csv::StringRecord::new();
    let headers = rdr.headers()?.clone();

    let mut i = 0;
    let mut end_of_file = false;

    while !end_of_file {
        let mut update = database.start_update("default")?;

        loop {
            end_of_file = !rdr.read_record(&mut raw_record)?;
            if end_of_file { break }

            let document: Document = match raw_record.deserialize(Some(&headers)) {
                Ok(document) => document,
                Err(e) => {
                    eprintln!("{:?}", e);
                    continue;
                }
            };

            update.update_document(&document, &stop_words)?;

            print!("\rindexing document {}", i);
            i += 1;

            if let Some(group_size) = update_group_size {
                if i % group_size == 0 { break }
            }
        }

        println!();

        println!("committing update...");
        database.commit_update(update)?;
    }

    Ok(database)
}

fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
    let f = File::open(path)?;
    let reader = BufReader::new(f);
    let mut words = HashSet::new();

    for line in reader.lines() {
        let line = line?;
        let word = line.trim().to_string();
        words.insert(word);
    }

    Ok(words)
}

fn main() -> Result<(), Box<Error>> {
    let _ = env_logger::init();
    let opt = Opt::from_args();

    let schema = {
        let file = File::open(&opt.schema_path)?;
        Schema::from_toml(file)?
    };

    let stop_words = match opt.stop_words_path {
        Some(ref path) => retrieve_stop_words(path)?,
        None => HashSet::new(),
    };

    let start = Instant::now();
    let result = index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words);

    if let Err(e) = result {
        return Err(e.into())
    }

    println!("database created in {:.2?} at: {:?}", start.elapsed(), opt.database_path);
    Ok(())
}
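A quick note on the indexing example above: each CSV record is deserialized straight into the `Document<'a>` newtype, with `#[serde(borrow)]` tying the deserialized map's lifetime to the `StringRecord` buffer it came from. The following standalone sketch shows that same pattern on an in-memory CSV; the column names and values are made up, and only the `csv` and `serde_derive` crates the example already uses are assumed.

use std::borrow::Cow;
use std::collections::HashMap;
use std::error::Error;

use serde_derive::Deserialize;

// Same shape as the example's `Document` newtype: a map from CSV header to field value.
#[derive(Debug, Deserialize)]
struct Document<'a>(
    #[serde(borrow)]
    HashMap<Cow<'a, str>, Cow<'a, str>>,
);

fn main() -> Result<(), Box<dyn Error>> {
    // Hypothetical two-column CSV kept in memory instead of being read from disk.
    let data = "id,title\n1,hello world\n2,bonjour";
    let mut rdr = csv::Reader::from_reader(data.as_bytes());

    let headers = rdr.headers()?.clone();
    let mut raw_record = csv::StringRecord::new();

    while rdr.read_record(&mut raw_record)? {
        // Deserialize the current record against the saved headers,
        // exactly as the indexing loop above does before calling update_document.
        let document: Document = raw_record.deserialize(Some(&headers))?;
        println!("{:?}", document);
    }

    Ok(())
}

If the workspace registers these files as Cargo examples (which the new `meilidb/examples/` location suggests but the commit does not state), the binary would typically be launched with something like `cargo run --release --example create-database -- <database-path> <csv-path> --schema <schema.toml>`, following the StructOpt definition above.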
meilidb/examples/query-database.rs (new file, 210 lines)
@@ -0,0 +1,210 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

use std::collections::btree_map::{BTreeMap, Entry};
use std::iter::FromIterator;
use std::io::{self, Write};
use std::time::Instant;
use std::path::PathBuf;
use std::error::Error;

use hashbrown::{HashMap, HashSet};
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use structopt::StructOpt;
use meilidb_core::Match;

use meilidb::database::schema::SchemaAttr;
use meilidb::database::Database;

#[derive(Debug, StructOpt)]
pub struct Opt {
    /// The destination where the database must be created
    #[structopt(parse(from_os_str))]
    pub database_path: PathBuf,

    /// Fields that must be displayed.
    pub displayed_fields: Vec<String>,

    /// The number of returned results
    #[structopt(short = "n", long = "number-results", default_value = "10")]
    pub number_results: usize,

    /// The number of characters before and after the first match
    #[structopt(short = "C", long = "context", default_value = "35")]
    pub char_context: usize,
}

type Document = HashMap<String, String>;

fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
    let mut stdout = StandardStream::stdout(ColorChoice::Always);
    let mut highlighted = false;

    for range in ranges.windows(2) {
        let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
        if highlighted {
            stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
        }
        write!(&mut stdout, "{}", &text[start..end])?;
        stdout.reset()?;
        highlighted = !highlighted;
    }

    Ok(())
}

fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
    let mut byte_index = 0;
    let mut byte_length = 0;

    for (n, (i, c)) in text.char_indices().enumerate() {
        if n == index {
            byte_index = i;
        }

        if n + 1 == index + length {
            byte_length = i - byte_index + c.len_utf8();
            break;
        }
    }

    (byte_index, byte_length)
}

fn create_highlight_areas(text: &str, matches: &[Match]) -> Vec<usize> {
    let mut byte_indexes = BTreeMap::new();

    for match_ in matches {
        let char_index = match_.char_index as usize;
        let char_length = match_.char_length as usize;
        let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);

        match byte_indexes.entry(byte_index) {
            Entry::Vacant(entry) => { entry.insert(byte_length); },
            Entry::Occupied(mut entry) => {
                if *entry.get() < byte_length {
                    entry.insert(byte_length);
                }
            },
        }
    }

    let mut title_areas = Vec::new();
    title_areas.push(0);
    for (byte_index, length) in byte_indexes {
        title_areas.push(byte_index);
        title_areas.push(byte_index + length);
    }
    title_areas.push(text.len());
    title_areas.sort_unstable();
    title_areas
}

/// note: matches must have been sorted by `char_index` and `char_length` before being passed.
///
/// ```no_run
/// matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));
///
/// let matches = matches.matches.iter().filter(|m| SchemaAttr::new(m.attribute) == attr).cloned();
///
/// let (text, matches) = crop_text(&text, matches, 35);
/// ```
fn crop_text(
    text: &str,
    matches: impl IntoIterator<Item=Match>,
    context: usize,
) -> (String, Vec<Match>)
{
    let mut matches = matches.into_iter().peekable();

    let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0);
    let start = char_index.saturating_sub(context);
    let text = text.chars().skip(start).take(context * 2).collect();

    let matches = matches
        .take_while(|m| {
            (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
        })
        .map(|match_| {
            Match { char_index: match_.char_index - start as u16, ..match_ }
        })
        .collect();

    (text, matches)
}

fn main() -> Result<(), Box<Error>> {
    let _ = env_logger::init();
    let opt = Opt::from_args();

    let start = Instant::now();
    let database = Database::open(&opt.database_path)?;
    println!("database prepared for you in {:.2?}", start.elapsed());

    let mut buffer = String::new();
    let input = io::stdin();

    loop {
        print!("Searching for: ");
        io::stdout().flush()?;

        if input.read_line(&mut buffer)? == 0 { break }
        let query = buffer.trim_end_matches('\n');

        let view = database.view("default")?;
        let schema = view.schema();

        let start = Instant::now();

        let builder = view.query_builder();
        let documents = builder.query(query, 0..opt.number_results);

        let number_of_documents = documents.len();
        for mut doc in documents {

            doc.matches.sort_unstable_by_key(|m| (m.char_index, m.char_length));

            match view.document_by_id::<Document>(doc.id) {
                Ok(document) => {
                    for name in &opt.displayed_fields {
                        let attr = match schema.attribute(name) {
                            Some(attr) => attr,
                            None => continue,
                        };
                        let text = match document.get(name) {
                            Some(text) => text,
                            None => continue,
                        };

                        print!("{}: ", name);
                        let matches = doc.matches.iter()
                            .filter(|m| SchemaAttr::new(m.attribute) == attr)
                            .cloned();
                        let (text, matches) = crop_text(&text, matches, opt.char_context);
                        let areas = create_highlight_areas(&text, &matches);
                        display_highlights(&text, &areas)?;
                        println!();
                    }
                },
                Err(e) => eprintln!("{}", e),
            }

            let mut matching_attributes = HashSet::new();
            for _match in doc.matches {
                let attr = SchemaAttr::new(_match.attribute);
                let name = schema.attribute_name(attr);
                matching_attributes.insert(name);
            }

            let matching_attributes = Vec::from_iter(matching_attributes);
            println!("matching in: {:?}", matching_attributes);

            println!();
        }

        eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start.elapsed());
        buffer.clear();
    }

    Ok(())
}
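One detail of the query example worth calling out: `Match` positions are expressed in characters, while Rust string slicing works in bytes, and `char_to_byte_range` is what bridges the two before `create_highlight_areas` builds the highlight boundaries. The sketch below reuses the same function body on a made-up string with multi-byte characters to show the conversion in isolation; it is standalone and independent of meilidb.

// Mirrors `char_to_byte_range` from query-database.rs: convert a (char index, char length)
// pair into the matching (byte index, byte length) pair for slicing a UTF-8 &str.
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
    let mut byte_index = 0;
    let mut byte_length = 0;

    for (n, (i, c)) in text.char_indices().enumerate() {
        if n == index {
            byte_index = i;
        }

        if n + 1 == index + length {
            byte_length = i - byte_index + c.len_utf8();
            break;
        }
    }

    (byte_index, byte_length)
}

fn main() {
    // Made-up sample text; "é" and "ö" are 2 bytes each in UTF-8.
    let text = "héllo wörld";

    // "wörld" starts at character 6 and spans 5 characters,
    // but its byte range starts at offset 7 and spans 6 bytes.
    let (start, len) = char_to_byte_range(6, 5, text);
    assert_eq!(&text[start..start + len], "wörld");
    println!("chars (6, 5) -> bytes ({}, {})", start, len);
}

The byte offsets produced this way are then consumed pairwise by `display_highlights`, which writes every other slice in color.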