Compare commits


106 Commits
v0.1 ... v0.2.1

Author SHA1 Message Date
810dfdf656 Merge pull request #90 from Kerollmops/version-bump
Bump version to 0.2.1
2019-01-25 17:08:53 +01:00
f016652fca chore: Bump version to 0.2.1 2019-01-25 16:41:08 +01:00
6c99ebe3fa Merge pull request #89 from Kerollmops/no-more-compaction
Remove the manual compaction triggering
2019-01-25 16:40:08 +01:00
94d357985f feat: Remove the manual compaction triggering 2019-01-25 16:05:56 +01:00
fbc698567a Merge pull request #87 from Kerollmops/measure-index-loading
Display index loading times
2019-01-24 14:07:11 +01:00
aa9db14c09 chore: Display index loading times 2019-01-23 11:19:44 +01:00
61e83a1c21 Merge pull request #86 from Kerollmops/measure-indexation
Display timings of indexation operations
2019-01-16 13:32:44 +01:00
1316be5b09 chore: Display timings of indexation operations 2019-01-16 11:45:33 +01:00
4e8b0383dd Merge pull request #85 from Kerollmops/debug-more-stats
Display more stats infos
2019-01-15 14:20:28 +01:00
4fa10753c1 chore: Display more stats infos 2019-01-14 21:18:46 +01:00
2473e289e8 Merge pull request #84 from qdequele/create-server-example
Example HTTP server example can use stopwords
2019-01-14 18:55:58 +01:00
e0e5e87ed3 feat: HTTP server example can use stopwords 2019-01-14 18:21:58 +01:00
b13e61f40a Merge pull request #83 from qdequele/create-server-example
Create an example of HTTP server managing multiple databases
2019-01-14 14:35:33 +01:00
c023cb3065 feat: Create an example for HTTP server managing multiple databases 2019-01-14 13:39:54 +01:00
0a3d069fbc Merge pull request #79 from qdequele/master
Schema can be de/serialized from a json format
2019-01-12 21:50:02 +01:00
fa062ce2cf feat: Schema can be de/serialized from a json format 2019-01-12 21:05:48 +01:00
cdc6e47bf5 Merge pull request #81 from Kerollmops/update-readme
Simplify the examples command lines
2019-01-12 13:43:42 +01:00
d5f44838be doc: Simplify the examples command lines 2019-01-12 12:56:11 +01:00
5939f6e68a Merge pull request #80 from Kerollmops/version-bump
Bump version to 0.2.0
2019-01-12 12:52:08 +01:00
97edc987f8 chore: Bump version to 0.2.0 2019-01-12 12:18:29 +01:00
e4e50cecce Merge pull request #77 from Kerollmops/update-dependencies
Update the quickcheck dev-dependency
2019-01-10 22:09:44 +01:00
77e0c19749 chore: Update the quickcheck dev-dependency 2019-01-10 21:25:32 +01:00
251bccbbc3 Merge pull request #76 from Kerollmops/update-readme
Update readme
2019-01-10 21:20:39 +01:00
f7561f8552 doc: Update examples usages 2019-01-10 21:14:01 +01:00
05fd7e87ec doc: Add some wrk stats to the Readme 2019-01-10 21:13:54 +01:00
446d6a5455 Merge pull request #75 from Kerollmops/binary-group-by-mut-query-builder
Introduce binary group by in the query builder
2019-01-10 21:10:31 +01:00
78786a0007 feat: Introduce binary group by in the query builder 2019-01-10 20:13:40 +01:00
3d820a27ee Merge pull request #74 from Kerollmops/same-document-update-shadowed
Make multiple document updates shadow themselves
2019-01-10 15:57:49 +01:00
ac347d788c feat: Make multiple document updates shadow themselves 2019-01-10 15:25:24 +01:00
5627f15d41 Merge pull request #73 from Kerollmops/module-for-attribute-wordarea
Module for attribute wordarea
2019-01-10 15:23:03 +01:00
e31afc2da2 chore: Move the WordArea type to its own module 2019-01-10 13:37:22 +01:00
77c252e12a chore: Move the Attribute type to its own module 2019-01-10 11:59:42 +01:00
30c9c053c2 Merge pull request #72 from Kerollmops/wordarea-char-index
Make WordArea be based on char index and length
2019-01-09 20:53:59 +01:00
b53ef08d05 feat: Make WordArea be based on char index and length 2019-01-09 20:14:08 +01:00
86bfb173ef Merge pull request #70 from Kerollmops/fix-assert-new-attribute
Remove assert on Attribute::new()
2019-01-09 11:09:18 +01:00
8e5f834625 chore: remove assert on Attribute::new() 2019-01-08 18:46:55 +01:00
563b021679 Merge pull request #69 from tpayet/patch-1
Update README.md
2019-01-08 18:45:10 +01:00
681f721b1d Correct README typos 2019-01-08 17:09:48 +01:00
8a7c061539 Update README.md 2019-01-08 17:09:48 +01:00
8c781a4d05 Merge pull request #67 from Kerollmops/reintroduce-stop-words
Reintroduce stop words
2019-01-07 13:29:23 +01:00
de59ea495d feat: Log some update steps 2019-01-06 22:49:12 +01:00
966eda8ae5 feat: Do the sum of typos using usizes 2019-01-06 22:49:12 +01:00
32f8908d71 feat: Reintroduce stopwords for the serializer 2019-01-06 22:49:11 +01:00
a2f5e8aa25 Merge pull request #66 from Kerollmops/revert-precompute-query-index-groups
Revert precompute query index groups
2019-01-06 22:38:44 +01:00
f00b978801 Revert "feat: Pre-compute matches query index groups"
This reverts commit 039a9a4cc7.
2019-01-06 21:54:49 +01:00
a78b5d225f Revert "feat: Allow Matches to be constructed"
This reverts commit d21406a939.
2019-01-06 21:44:53 +01:00
f32a59720d Revert "feat: Introducing the Matches as_matches method"
This reverts commit ef7ba96d4a.
2019-01-06 21:44:53 +01:00
2cc5fbde1a Revert "feat: Introduce multiple Iterator impl for Matches"
This reverts commit c594597a01.
2019-01-06 21:44:53 +01:00
34d2850d28 Revert "feat: Prefer using ranges and not using unreachable!"
This reverts commit d899b86603.
2019-01-06 21:44:51 +01:00
023f62b0ce Merge pull request #65 from Kerollmops/logging
Add a little bit of logging
2019-01-06 15:55:48 +01:00
7f35b971f0 feat: Log the total number of documents to rank 2019-01-06 15:02:53 +01:00
3418adb06a feat: Add log libraries dependencies 2019-01-06 15:02:53 +01:00
510426c05c Merge pull request #64 from Kerollmops/precompute-query-index-groups
Precompute query index groups
2019-01-06 14:59:04 +01:00
c74caa0f82 feat: Sum usizes instead of little u16/u32 2019-01-06 13:54:14 +01:00
d899b86603 feat: Prefer using ranges and not using unreachable! 2019-01-06 13:54:14 +01:00
0d07af3caf fix: Filter and count the exact matching words 2019-01-06 13:54:13 +01:00
c594597a01 feat: Introduce multiple Iterator impl for Matches 2019-01-06 13:54:13 +01:00
ef7ba96d4a feat: Introducing the Matches as_matches method 2019-01-06 13:54:13 +01:00
d21406a939 feat: Allow Matches to be constructed 2019-01-06 13:54:13 +01:00
039a9a4cc7 feat: Pre-compute matches query index groups 2019-01-06 11:11:55 +01:00
40ab9e7a55 Merge pull request #63 from Kerollmops/update-rocksdb
Update RocksDB to Titan
2019-01-06 10:37:54 +01:00
d21abb50fa chore: Update RocksDB to Titan 2019-01-05 12:47:03 +01:00
3dd5e2445a Merge pull request #62 from Kerollmops/test-document-key-attr
Add tests to DocumentKeyAttr
2019-01-02 22:20:37 +01:00
7f5e6c5b6e test: Add test to the DocumentKeyAttr slice repr 2019-01-02 21:48:58 +01:00
e6d3840f12 Merge pull request #61 from Kerollmops/update-remove-kv-attributes
UpdateBuilder handles document attributes deletion
2019-01-02 18:20:14 +01:00
c05fab783a fix: Write and Read DocumentKeyAttr in big endian 2019-01-02 17:53:53 +01:00
95dc6fe904 feat: Rework the UpdateBuilder struct 2019-01-02 17:53:52 +01:00
b2e9ae4136 Merge pull request #60 from Kerollmops/improve-perfs
Improve performances
2019-01-01 17:03:41 +01:00
b070778d44 feat: Use the jemalloc global allocator in examples 2019-01-01 16:37:15 +01:00
6731025003 chore: Update group-by 2019-01-01 16:27:39 +01:00
04544c1531 feat: Expose nightly features of some dependencies 2019-01-01 16:27:08 +01:00
9dd68b4eaa Merge pull request #58 from Kerollmops/clean-up
Clean up some database functions
2019-01-01 11:43:27 +01:00
1d67012aa5 chore: Clean up some database functions 2019-01-01 01:40:20 +01:00
e723e01ec8 Merge pull request #57 from Kerollmops/clippy-pass
Clippy pass
2018-12-31 23:46:18 +01:00
7845292ea8 chore: Clippy pass 2018-12-31 23:20:30 +01:00
521df85c0d Merge pull request #55 from Kerollmops/add-benchmarks
Add benchmarks
2018-12-31 21:48:38 +01:00
dfa19582a2 test: Add benchmarks to mesure the words proximity criterion 2018-12-31 21:18:42 +01:00
87ec95f7a0 test: Add benchmarks to mesure the database 2018-12-31 21:18:37 +01:00
76ef2cceeb Merge pull request #49 from Kerollmops/serialize-any-map
Serialize any map
2018-12-31 21:11:17 +01:00
20b5a6a06e doc: Add examples for runtime defined data and Schema 2018-12-31 20:44:33 +01:00
a842e647f7 Merge pull request #56 from Kerollmops/new-index-struct
New Index structure
2018-12-31 19:55:18 +01:00
21bb38c3b0 test: Add more tests for updates ingestion 2018-12-31 19:27:21 +01:00
64d53ee1bd chore: Rework the data module structures
being able to be constructed from SharedData
2018-12-31 19:27:21 +01:00
c022fa3fca chore: Move serde related structs to their module 2018-12-31 19:26:28 +01:00
0080bf486f feat: Introduce the new Index structure
replacing the old ugly Blob system
2018-12-31 19:26:27 +01:00
6bd779f9ae feat: Improve the deserialization time of a Blob 2018-12-31 13:15:37 +01:00
a18401f47e Merge pull request #53 from Kerollmops/query-builder-filter
Distinct/QueryBuilder filtering
2018-12-29 23:11:43 +01:00
7132c3be89 feat: Allow filtering on QueryBuilder 2018-12-29 22:30:41 +01:00
aa3d059363 feat: Allow filtering on DistinctQueryBuilder 2018-12-29 22:30:41 +01:00
e2a9dbc404 feat: Introduce filtering methods for Distinct/QueryBuilder 2018-12-29 22:30:40 +01:00
a0a11faee5 Merge pull request #54 from Kerollmops/arccell-instead-of-rwlock
Prefer using ArcCell instead of RWLock for database updates
2018-12-29 22:29:35 +01:00
36ef9581aa feat: Return the database view for each update 2018-12-29 21:07:01 +01:00
f4b04dfb72 feat: Prefer doing DatabaseView updates atomically 2018-12-29 20:52:00 +01:00
cf5d56e63a Merge pull request #52 from Kerollmops/schema-toml
Schema can be de/serialized from a toml format
2018-12-28 19:59:40 +01:00
8412c14b5b feat: Schema can be toml de/serialized 2018-12-28 19:24:50 +01:00
70772eca5c Merge pull request #51 from Kerollmops/wordarea-attribute-fallible
Make the Attribute and WordArea errors recoverable
2018-12-28 18:26:19 +01:00
b27f632e14 feat: Make the Attribute and WordArea errors recoverable 2018-12-28 16:15:22 +01:00
e3bfb866e5 Merge pull request #46 from Kerollmops/schema-considers-id
Schema considers document ids
2018-12-27 12:26:57 +01:00
fa238f21ef feat: Move Database to its own module 2018-12-27 11:21:47 +01:00
444a4c1af7 feat: Make the schema consider document ids 2018-12-27 11:21:47 +01:00
2e5c5fad33 Merge pull request #45 from Kerollmops/index-length-in-docindex
Introduce the WordArea struct
2018-12-24 17:08:20 +01:00
b32c96cdc9 feat: Introduce a WordArea struct
Useful to highlight matching areas in the original text.
2018-12-24 15:58:46 +01:00
62521262e8 Merge pull request #44 from Kerollmops/real-document-id-type
Create a real DocumentId type
2018-12-24 15:41:47 +01:00
4ebae7784c feat: Create a strong DocumentId type
Forcing it to be something internal will permit to avoid possible miss comparisons to be done with other types.
2018-12-24 12:42:24 +01:00
a756ca5e3f Merge pull request #39 from Kerollmops/readme-badges
Add badges to the README
2018-12-19 14:42:54 +01:00
aa104fa253 doc: Add some funny badges to the README 2018-12-19 12:00:29 +01:00
55 changed files with 3846 additions and 2036 deletions


@ -1,39 +1,61 @@
[package]
edition = "2018"
name = "meilidb"
version = "0.1.0"
version = "0.2.1"
authors = ["Kerollmops <renault.cle@gmail.com>"]
[dependencies]
bincode = "1.0"
byteorder = "1.2"
crossbeam = "0.6"
elapsed = "0.1"
fst = "0.3"
hashbrown = "0.1"
hashbrown = { version = "0.1", features = ["serde"] }
lazy_static = "1.1"
levenshtein_automata = { version = "0.1", features = ["fst_automaton"] }
linked-hash-map = { version = "0.5", features = ["serde_impl"] }
log = "0.4"
sdset = "0.3"
serde = "1.0"
serde_derive = "1.0"
serde_json = { version = "1.0", features = ["preserve_order"] }
unidecode = "0.3"
[dependencies.toml]
git = "https://github.com/Kerollmops/toml-rs.git"
features = ["preserve_order"]
rev = "0372ba6"
[dependencies.rocksdb]
git = "https://github.com/pingcap/rust-rocksdb.git"
rev = "c2eb140"
rev = "306e201"
[dependencies.group-by]
git = "https://github.com/Kerollmops/group-by.git"
rev = "cab857b"
rev = "5a113fe"
[features]
default = ["simd"]
i128 = ["bincode/i128", "byteorder/i128"]
simd = ["rocksdb/sse"]
portable = ["rocksdb/portable"]
nightly = []
simd = ["rocksdb/sse"]
nightly = ["hashbrown/nightly", "group-by/nightly"]
[dev-dependencies]
csv = "1.0"
elapsed = "0.1"
env_logger = "0.6"
jemallocator = "0.1"
quickcheck = "0.8"
rand = "0.6"
rand_xorshift = "0.1"
structopt = "0.2"
tempfile = "3.0"
termcolor = "1.0"
warp = "0.1"
[dev-dependencies.chashmap]
git = "https://gitlab.redox-os.org/redox-os/tfs.git"
rev = "b3e7cae1"
[profile.release]
debug = true


@ -1,47 +1,60 @@
# MeiliDB
[![Build Status](https://travis-ci.org/Kerollmops/MeiliDB.svg?branch=master)](https://travis-ci.org/Kerollmops/MeiliDB)
[![dependency status](https://deps.rs/repo/github/Kerollmops/MeiliDB/status.svg)](https://deps.rs/repo/github/Kerollmops/MeiliDB)
[![License](https://img.shields.io/github/license/Kerollmops/MeiliDB.svg)](https://github.com/Kerollmops/MeiliDB)
[![Rust 1.31+](https://img.shields.io/badge/rust-1.31+-lightgray.svg)](
https://www.rust-lang.org)
A _full-text search database_ using a key-value store internally.
It uses [RocksDB](https://github.com/facebook/rocksdb) like a classic database, to store documents and internal data. The key-value store power allow us to handle updates and queries with small memory and CPU overheads.
It uses [RocksDB](https://github.com/facebook/rocksdb) as the internal key-value store. The key-value store allows us to handle updates and queries with small memory and CPU overheads.
You can [read the deep dive](deep-dive.md) if you want more informations on the engine, it describes the whole process of generating updates and handling queries.
You can [read the deep dive](deep-dive.md) if you want more information on the engine, it describes the whole process of generating updates and handling queries.
We will be proud if you send pull requests to help us grow this project, you can start with [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) to start !
We will be proud if you submit issues and pull requests. You can help to grow this project and start contributing by checking [issues tagged "good-first-issue"](https://github.com/Kerollmops/MeiliDB/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). It is a good start!
At the moment this is a library only, this means that binaries are not part of this repository but since I'm still nice I have made some examples for you in the `examples/` folder that works with the data located in the `misc/` folder.
The project is only a library yet. It means that there is no binary provided yet. To get started, you can check the examples wich are made to work with the data located in the `misc/` folder.
In a near future MeiliDB we be a binary like any database: updated and queried using some kind of protocol. It is the final goal, [see the milestones](https://github.com/Kerollmops/MeiliDB/milestones). MeiliDB will just be a bunch of network and protocols functions wrapping the library which itself will be published to https://crates.io, following the same update cycle.
MeiliDB will be a binary in a near future so you will be able to use it as a database out-of-the-box. We should be able to query it using a [to-be-defined](https://github.com/Kerollmops/MeiliDB/issues/38) protocol. This is our current goal, [see the milestones](https://github.com/Kerollmops/MeiliDB/milestones). In the end, the binary will be a bunch of network protocols and wrappers around the library - which will also be published on [crates.io](https://crates.io). Both the binary and the library will follow the same update cycle.
## Performances
_these informations have been made with a version dated of october 2018, we must update them_
With a database composed of _100 353_ documents with _352_ attributes each and _90_ of them indexed.
So nearly _9 million_ fields indexed for _35 million_ stored we can handle more than _1.2k req/sec_ on an Intel i7-7700 (8) @ 4.2GHz.
We made some tests on remote machines and found that we can handle with a dataset of near 280k products, on a server that cost 5$/month with 1vCPU and 1GB of ram and on the same index and with a simple query:
Requests are made using [wrk](https://github.com/wg/wrk) and scripted to generate real users queries.
- near 190 users with an average response time of 90ms
- 150 users with an average response time of 70ms
- 100 users with an average response time of 45ms
Network is mesured, servers are located in amsterdam and tests are made between two different datacenters.
```
Running 10s test @ http://localhost:2230
2 threads and 12 connections
Thread Stats Avg Stdev Max +/- Stdev
Latency 18.86ms 49.39ms 614.89ms 95.23%
Req/Sec 620.41 59.53 790.00 65.00%
12359 requests in 10.00s, 3.26MB read
Requests/sec: 1235.54
Transfer/sec: 334.22KB
```
### Notes
The default Rust allocator has recently been [changed to use the system allocator](https://github.com/rust-lang/rust/pull/51241/).
We have seen much better performances when [using jemalloc as the global allocator](https://github.com/alexcrichton/jemallocator#documentation).
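The examples in this repository already do this; a minimal sketch, assuming the `jemallocator` crate is declared as a dependency:
```rust
// Use jemalloc instead of the default system allocator for the whole binary.
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
```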
## Usage and examples
MeiliDB work with an index like most of the search engines.
MeiliDB runs with an index like most search engines.
So to test the library you can create one by indexing a simple csv file.
```bash
cargo run --release --example create-database -- test.mdb misc/kaggle.csv
cargo run --release --example create-database -- test.mdb misc/kaggle.csv --schema schema-example.toml
```
Once the command finished indexing the database should have been saved under the `test.mdb` folder.
Now you can easily run the `query-database` example to check what is stored in it.
Once the command is executed, the index should be in the `test.mdb` folder. You are now able to run the `query-database` example and play with MeiliDB.
```bash
cargo run --release --example query-database -- test.mdb
cargo run --release --example query-database -- test.mdb -n 10 id title
```


@ -1,91 +1,132 @@
use std::collections::hash_map::DefaultHasher;
use std::path::{Path, PathBuf};
use std::hash::{Hash, Hasher};
use std::error::Error;
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::io::{self, BufRead, BufReader};
use std::path::{Path, PathBuf};
use std::error::Error;
use std::borrow::Cow;
use std::fs::File;
use hashbrown::{HashMap, HashSet};
use serde_derive::{Serialize, Deserialize};
use structopt::StructOpt;
use meilidb::database::schema::{Schema, SchemaBuilder, STORED, INDEXED};
use meilidb::database::update::PositiveUpdateBuilder;
use meilidb::database::{Database, Schema, UpdateBuilder};
use meilidb::tokenizer::DefaultBuilder;
use meilidb::database::Database;
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created
/// The destination where the database must be created.
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// The csv file to index.
#[structopt(parse(from_os_str))]
pub csv_data_path: PathBuf,
/// The path to the schema.
#[structopt(long = "schema", parse(from_os_str))]
pub schema_path: PathBuf,
/// The path to the list of stop words (one by line).
#[structopt(long = "stop-words", parse(from_os_str))]
pub stop_words_path: Option<PathBuf>,
#[structopt(long = "update-group-size")]
pub update_group_size: Option<usize>,
}
#[derive(Debug, Serialize, Deserialize)]
struct Document<'a> {
id: &'a str,
title: &'a str,
description: &'a str,
image: &'a str,
}
#[derive(Serialize, Deserialize)]
struct Document<'a> (
#[serde(borrow)]
HashMap<Cow<'a, str>, Cow<'a, str>>
);
fn calculate_hash<T: Hash>(t: &T) -> u64 {
let mut s = DefaultHasher::new();
t.hash(&mut s);
s.finish()
}
fn create_schema() -> Schema {
let mut schema = SchemaBuilder::new();
schema.new_attribute("id", STORED);
schema.new_attribute("title", STORED | INDEXED);
schema.new_attribute("description", STORED | INDEXED);
schema.new_attribute("image", STORED);
schema.build()
}
fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<Database, Box<Error>> {
let database = Database::create(database_path, schema.clone())?;
println!("start indexing...");
let tokenizer_builder = DefaultBuilder::new();
let update_path = tempfile::NamedTempFile::new()?;
let mut update = PositiveUpdateBuilder::new(update_path.path(), schema, tokenizer_builder);
fn index(
schema: Schema,
database_path: &Path,
csv_data_path: &Path,
update_group_size: Option<usize>,
stop_words: &HashSet<String>,
) -> Result<Database, Box<Error>>
{
let database = Database::create(database_path, &schema)?;
let mut rdr = csv::Reader::from_path(csv_data_path)?;
let mut raw_record = csv::StringRecord::new();
let headers = rdr.headers()?.clone();
while rdr.read_record(&mut raw_record)? {
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
let mut i = 0;
let mut end_of_file = false;
while !end_of_file {
let tokenizer_builder = DefaultBuilder::new();
let update_path = tempfile::NamedTempFile::new()?;
let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema.clone());
loop {
end_of_file = !rdr.read_record(&mut raw_record)?;
if end_of_file { break }
let document: Document = match raw_record.deserialize(Some(&headers)) {
Ok(document) => document,
Err(e) => {
eprintln!("{:?}", e);
continue;
}
};
update.update_document(&document, &tokenizer_builder, &stop_words)?;
print!("\rindexing document {}", i);
i += 1;
if let Some(group_size) = update_group_size {
if i % group_size == 0 { break }
}
};
}
let document_id = calculate_hash(&document.id);
update.update(document_id, &document).unwrap();
println!();
println!("building update...");
let update = update.build()?;
println!("ingesting update...");
database.ingest_update_file(update)?;
}
let mut update = update.build()?;
update.set_move(true);
database.ingest_update_file(update)?;
Ok(database)
}
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
let f = File::open(path)?;
let reader = BufReader::new(f);
let mut words = HashSet::new();
for line in reader.lines() {
let line = line?;
let word = line.trim().to_string();
words.insert(word);
}
Ok(words)
}
fn main() -> Result<(), Box<Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let schema = create_schema();
let schema = {
let file = File::open(&opt.schema_path)?;
Schema::from_toml(file)?
};
let stop_words = match opt.stop_words_path {
Some(ref path) => retrieve_stop_words(path)?,
None => HashSet::new(),
};
let (elapsed, result) = elapsed::measure_time(|| {
index(schema, &opt.database_path, &opt.csv_data_path)
index(schema, &opt.database_path, &opt.csv_data_path, opt.update_group_size, &stop_words)
});
if let Err(e) = result {
@ -93,6 +134,5 @@ fn main() -> Result<(), Box<Error>> {
}
println!("database created in {} at: {:?}", elapsed, opt.database_path);
Ok(())
}

examples/http-server.rs Normal file (435 lines)

@ -0,0 +1,435 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use log::{error, info};
use std::error::Error;
use std::ffi::OsStr;
use std::fmt;
use std::fs::{self, File};
use std::io::{self, BufRead, BufReader};
use std::net::SocketAddr;
use std::path::{PathBuf, Path};
use std::sync::Arc;
use std::time::SystemTime;
use hashbrown::{HashMap, HashSet};
use chashmap::CHashMap;
use chashmap::ReadGuard;
use elapsed::measure_time;
use meilidb::database::Database;
use meilidb::database::UpdateBuilder;
use meilidb::database::schema::Schema;
use meilidb::database::schema::SchemaBuilder;
use meilidb::tokenizer::DefaultBuilder;
use serde_derive::Deserialize;
use serde_derive::Serialize;
use structopt::StructOpt;
use warp::{Rejection, Filter};
#[derive(Debug, StructOpt)]
pub struct Opt {
/// The destination where the database must be created.
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// The address and port to bind the server to.
#[structopt(short = "l", default_value = "127.0.0.1:8080")]
pub listen_addr: SocketAddr,
/// The path to the list of stop words (one by line).
#[structopt(long = "stop-words", parse(from_os_str))]
pub stop_words: PathBuf,
}
//
// ERRORS FOR THE MULTIDATABASE
//
#[derive(Debug)]
pub enum DatabaseError {
AlreadyExist,
NotExist,
NotFound(String),
Unknown(Box<Error>),
}
impl fmt::Display for DatabaseError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DatabaseError::AlreadyExist => write!(f, "File already exist"),
DatabaseError::NotExist => write!(f, "File not exist"),
DatabaseError::NotFound(ref name) => write!(f, "Database {} not found", name),
DatabaseError::Unknown(e) => write!(f, "{}", e),
}
}
}
impl Error for DatabaseError {}
impl From<Box<Error>> for DatabaseError {
fn from(e: Box<Error>) -> DatabaseError {
DatabaseError::Unknown(e)
}
}
//
// MULTIDATABASE DEFINITION
//
pub struct MultiDatabase {
databases: CHashMap<String, Database>,
db_path: PathBuf,
stop_words: HashSet<String>,
}
impl MultiDatabase {
pub fn new(path: PathBuf, stop_words: HashSet<String>) -> MultiDatabase {
MultiDatabase {
databases: CHashMap::new(),
db_path: path,
stop_words: stop_words
}
}
pub fn create(&self, name: String, schema: Schema) -> Result<(), DatabaseError> {
let rdb_name = format!("{}.mdb", name);
let database_path = self.db_path.join(rdb_name);
if database_path.exists() {
return Err(DatabaseError::AlreadyExist.into());
}
let index = Database::create(database_path, &schema)?;
self.databases.insert_new(name, index);
Ok(())
}
pub fn load(&self, name: String) -> Result<(), DatabaseError> {
let rdb_name = format!("{}.mdb", name);
let index_path = self.db_path.join(rdb_name);
if !index_path.exists() {
return Err(DatabaseError::NotExist.into());
}
let index = Database::open(index_path)?;
self.databases.insert_new(name, index);
Ok(())
}
pub fn load_existing(&self) {
let paths = match fs::read_dir(self.db_path.clone()){
Ok(p) => p,
Err(e) => {
error!("{}", e);
return
}
};
for path in paths {
let path = match path {
Ok(p) => p.path(),
Err(_) => continue
};
let path_str = match path.to_str() {
Some(p) => p,
None => continue
};
let extension = match get_extension_from_path(path_str) {
Some(e) => e,
None => continue
};
if extension != "mdb" {
continue
}
let name = match get_file_name_from_path(path_str) {
Some(f) => f,
None => continue
};
let db = match Database::open(path.clone()) {
Ok(db) => db,
Err(_) => continue
};
self.databases.insert_new(name.to_string(), db);
info!("Load database {}", name);
}
}
pub fn create_or_load(&self, name: String, schema: Schema) -> Result<(), DatabaseError> {
match self.create(name.clone(), schema) {
Err(DatabaseError::AlreadyExist) => self.load(name),
x => x,
}
}
pub fn get(&self, name: String) -> Result<ReadGuard<String, Database>, Box<Error>> {
Ok(self.databases.get(&name).ok_or(DatabaseError::NotFound(name))?)
}
}
fn get_extension_from_path(path: &str) -> Option<&str> {
Path::new(path).extension().and_then(OsStr::to_str)
}
fn get_file_name_from_path(path: &str) -> Option<&str> {
Path::new(path).file_stem().and_then(OsStr::to_str)
}
fn retrieve_stop_words(path: &Path) -> io::Result<HashSet<String>> {
let f = File::open(path)?;
let reader = BufReader::new(f);
let mut words = HashSet::new();
for line in reader.lines() {
let line = line?;
let word = line.trim().to_string();
words.insert(word);
}
Ok(words)
}
//
// PARAMS & BODY FOR HTTPS HANDLERS
//
#[derive(Deserialize)]
struct CreateBody {
name: String,
schema: SchemaBuilder,
}
#[derive(Deserialize)]
struct IngestBody {
insert: Option<Vec<HashMap<String, String>>>,
delete: Option<Vec<HashMap<String, String>>>
}
#[derive(Serialize)]
struct IngestResponse {
inserted: usize,
deleted: usize
}
#[derive(Deserialize)]
struct SearchQuery {
q: String,
limit: Option<usize>,
}
//
// HTTP ROUTES
//
// Create a new index.
// The index name should be unused and the schema valid.
//
// POST /create
// Body:
// - name: String
// - schema: JSON
// - stopwords: Vec<String>
fn create(body: CreateBody, db: Arc<MultiDatabase>) -> Result<String, Rejection> {
let schema = body.schema.build();
match db.create(body.name.clone(), schema) {
Ok(_) => Ok(format!("{} created ", body.name)),
Err(e) => {
error!("{:?}", e);
return Err(warp::reject::not_found())
}
}
}
// Ingest new document.
// It's possible to have positive or/and negative updates.
//
// PUT /:name/ingest
// Body:
// - insert: Option<Vec<JSON>>
// - delete: Option<Vec<String>>
fn ingest(index_name: String, body: IngestBody, db: Arc<MultiDatabase>) -> Result<String, Rejection> {
let schema = {
let index = match db.get(index_name.clone()){
Ok(i) => i,
Err(_) => return Err(warp::reject::not_found()),
};
let view = index.view();
view.schema().clone()
};
let tokenizer_builder = DefaultBuilder::new();
let now = match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) {
Ok(n) => n.as_secs(),
Err(_) => panic!("SystemTime before UNIX EPOCH!"),
};
let sst_name = format!("update-{}-{}.sst", index_name, now);
let sst_path = db.db_path.join(sst_name);
let mut response = IngestResponse{inserted: 0, deleted: 0};
let mut update = UpdateBuilder::new(sst_path, schema);
if let Some(documents) = body.delete {
for doc in documents {
if let Err(e) = update.remove_document(doc) {
error!("Impossible to remove document; {:?}", e);
} else {
response.deleted += 1;
}
}
}
let stop_words = &db.stop_words;
if let Some(documents) = body.insert {
for doc in documents {
if let Err(e) = update.update_document(doc, &tokenizer_builder, &stop_words) {
error!("Impossible to update document; {:?}", e);
} else {
response.inserted += 1;
}
}
}
let update = match update.build() {
Ok(u) => u,
Err(e) => {
error!("Impossible to create an update file; {:?}", e);
return Err(warp::reject::not_found())
}
};
{
let index = match db.get(index_name.clone()){
Ok(i) => i,
Err(_) => return Err(warp::reject::not_found()),
};
if let Err(e) = index.ingest_update_file(update) {
error!("Impossible to ingest sst file; {:?}", e);
return Err(warp::reject::not_found())
};
}
if let Ok(response) = serde_json::to_string(&response) {
return Ok(response);
};
return Err(warp::reject::not_found())
}
// Search in a specific index
// The default limit is 20
//
// GET /:name/search
// Params:
// - query: String
// - limit: Option<usize>
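// Example request (hypothetical index name), as mounted under `/index` by `start_server` below:
//   GET /index/movies/search?q=batman&limit=5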
fn search(index_name: String, query: SearchQuery, db: Arc<MultiDatabase>) -> Result<String, Rejection> {
let view = {
let index = match db.get(index_name.clone()){
Ok(i) => i,
Err(_) => return Err(warp::reject::not_found()),
};
index.view()
};
let limit = query.limit.unwrap_or(20);
let query_builder = match view.query_builder() {
Ok(q) => q,
Err(_err) => return Err(warp::reject::not_found()),
};
let (time, responses) = measure_time(|| {
let docs = query_builder.query(&query.q, 0..limit);
let mut results: Vec<HashMap<String, String>> = Vec::with_capacity(limit);
for doc in docs {
match view.document_by_id(doc.id) {
Ok(val) => results.push(val),
Err(e) => println!("{:?}", e),
}
}
results
});
let response = match serde_json::to_string(&responses) {
Ok(val) => val,
Err(err) => format!("{:?}", err),
};
info!("index: {} - search: {:?} - limit: {} - time: {}", index_name, query.q, limit, time);
Ok(response)
}
fn start_server(listen_addr: SocketAddr, db: Arc<MultiDatabase>) {
let index_path = warp::path("index").and(warp::path::param::<String>());
let db = warp::any().map(move || db.clone());
let create_path = warp::path("create").and(warp::path::end());
let ingest_path = index_path.and(warp::path("ingest")).and(warp::path::end());
let search_path = index_path.and(warp::path("search")).and(warp::path::end());
let create = warp::post2()
.and(create_path)
.and(warp::body::json())
.and(db.clone())
.and_then(create);
let ingest = warp::put2()
.and(ingest_path)
.and(warp::body::json())
.and(db.clone())
.and_then(ingest);
let search = warp::get2()
.and(search_path)
.and(warp::query())
.and(db.clone())
.and_then(search);
let api = create
.or(ingest)
.or(search);
let logs = warp::log("server");
let headers = warp::reply::with::header("Content-Type", "application/json");
let routes = api.with(logs).with(headers);
info!("Server is started on {}", listen_addr);
warp::serve(routes).run(listen_addr);
}
fn main() {
env_logger::init();
let opt = Opt::from_args();
let stop_words = match retrieve_stop_words(&opt.stop_words) {
Ok(s) => s,
Err(_) => HashSet::new(),
};
let db = Arc::new(MultiDatabase::new(opt.database_path.clone(), stop_words));
db.load_existing();
start_server(opt.listen_addr, db);
}


@ -1,11 +1,19 @@
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::btree_map::{BTreeMap, Entry};
use std::iter::FromIterator;
use std::io::{self, Write};
use std::path::PathBuf;
use std::error::Error;
use serde_derive::{Serialize, Deserialize};
use hashbrown::{HashMap, HashSet};
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use structopt::StructOpt;
use meilidb::database::schema::SchemaAttr;
use meilidb::database::Database;
use meilidb::Match;
#[derive(Debug, StructOpt)]
pub struct Opt {
@ -13,20 +21,87 @@ pub struct Opt {
#[structopt(parse(from_os_str))]
pub database_path: PathBuf,
/// Fields that must be displayed.
pub displayed_fields: Vec<String>,
/// The number of returned results
#[structopt(short = "n", long = "number-results", default_value = "10")]
pub number_results: usize,
}
#[derive(Debug, Serialize, Deserialize)]
struct Document {
id: String,
title: String,
description: String,
image: String,
type Document = HashMap<String, String>;
fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> {
let mut stdout = StandardStream::stdout(ColorChoice::Always);
let mut highlighted = false;
for range in ranges.windows(2) {
let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() };
if highlighted {
stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?;
}
write!(&mut stdout, "{}", &text[start..end])?;
stdout.reset()?;
highlighted = !highlighted;
}
Ok(())
}
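// Convert a character-based (index, length) span into the corresponding (byte index, byte length)
// span of `text`, so that highlight boundaries always fall on valid UTF-8 boundaries.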
fn char_to_byte_range(index: usize, length: usize, text: &str) -> (usize, usize) {
let mut byte_index = 0;
let mut byte_length = 0;
for (n, (i, c)) in text.char_indices().enumerate() {
if n == index {
byte_index = i;
}
if n + 1 == index + length {
byte_length = i - byte_index + c.len_utf8();
break;
}
}
(byte_index, byte_length)
}
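// Collect, for the given attribute, the byte offsets delimiting matched words; the sorted
// boundary list returned here is what `display_highlights` walks two entries at a time.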
fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr) -> Vec<usize> {
let mut byte_indexes = BTreeMap::new();
for match_ in matches {
let match_attribute = match_.attribute.attribute();
if SchemaAttr::new(match_attribute) == attribute {
let word_area = match_.word_area;
let char_index = word_area.char_index() as usize;
let char_length = word_area.length() as usize;
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
match byte_indexes.entry(byte_index) {
Entry::Vacant(entry) => { entry.insert(byte_length); },
Entry::Occupied(mut entry) => {
if *entry.get() < byte_length {
entry.insert(byte_length);
}
},
}
}
}
let mut title_areas = Vec::new();
title_areas.push(0);
for (byte_index, length) in byte_indexes {
title_areas.push(byte_index);
title_areas.push(byte_index + length);
}
title_areas.push(text.len());
title_areas.sort_unstable();
title_areas
}
fn main() -> Result<(), Box<Error>> {
let _ = env_logger::init();
let opt = Opt::from_args();
let (elapsed, result) = elapsed::measure_time(|| Database::open(&opt.database_path));
@ -41,26 +116,53 @@ fn main() -> Result<(), Box<Error>> {
io::stdout().flush()?;
if input.read_line(&mut buffer)? == 0 { break }
let query = buffer.trim_end_matches('\n');
let view = database.view();
let schema = view.schema();
let (elapsed, documents) = elapsed::measure_time(|| {
let builder = view.query_builder().unwrap();
builder.query(&buffer, 0..opt.number_results)
builder.query(query, 0..opt.number_results)
});
let mut full_documents = Vec::with_capacity(documents.len());
let number_of_documents = documents.len();
for doc in documents {
match view.document_by_id::<Document>(doc.id) {
Ok(document) => {
for name in &opt.displayed_fields {
let attr = match schema.attribute(name) {
Some(attr) => attr,
None => continue,
};
let text = match document.get(name) {
Some(text) => text,
None => continue,
};
for document in documents {
match view.retrieve_document::<Document>(document.id) {
Ok(document) => full_documents.push(document),
print!("{}: ", name);
let areas = create_highlight_areas(&text, &doc.matches, attr);
display_highlights(&text, &areas)?;
println!();
}
},
Err(e) => eprintln!("{}", e),
}
let mut matching_attributes = HashSet::new();
for _match in doc.matches {
let attr = SchemaAttr::new(_match.attribute.attribute());
let name = schema.attribute_name(attr);
matching_attributes.insert(name);
}
let matching_attributes = Vec::from_iter(matching_attributes);
println!("matching in: {:?}", matching_attributes);
println!();
}
println!("{:#?}", full_documents);
println!("Found {} results in {}", full_documents.len(), elapsed);
eprintln!("===== Found {} results in {} =====", number_of_documents, elapsed);
buffer.clear();
}


@ -0,0 +1,19 @@
# This schema has been generated ...
# The order in which the attributes are declared is important,
# it specify the attribute xxx...
identifier = "id"
[attributes.id]
stored = true
[attributes.title]
stored = true
indexed = true
[attributes.description]
stored = true
indexed = true
[attributes.image]
stored = true
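This schema file is the one passed to the `create-database` example through its `--schema` flag; a minimal loading sketch, reusing the `Schema::from_toml` call shown in `examples/create-database.rs` above:
```rust
use std::error::Error;
use std::fs::File;

use meilidb::database::Schema;

fn main() -> Result<(), Box<Error>> {
    // Parse the TOML schema exactly as examples/create-database.rs does.
    let file = File::open("schema-example.toml")?;
    let _schema = Schema::from_toml(file)?;
    Ok(())
}
```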


@ -95,7 +95,8 @@ or
other
ought
our
ours ourselves
ours
ourselves
out
over
own

misc/fr.stopwords.txt Normal file (163 lines)

@ -0,0 +1,163 @@
au
aux
avec
ce
ces
dans
de
des
du
elle
en
et
eux
il
je
la
le
leur
lui
ma
mais
me
même
mes
moi
mon
ne
nos
notre
nous
on
ou
par
pas
pour
qu
que
qui
sa
se
ses
son
sur
ta
te
tes
toi
ton
tu
un
une
vos
votre
vous
c
d
j
l
à
m
n
s
t
y
été
étée
étées
étés
étant
suis
es
est
sommes
êtes
sont
serai
seras
sera
serons
serez
seront
serais
serait
serions
seriez
seraient
étais
était
étions
étiez
étaient
fus
fut
fûmes
fûtes
furent
sois
soit
soyons
soyez
soient
fusse
fusses
fût
fussions
fussiez
fussent
ayant
eu
eue
eues
eus
ai
as
avons
avez
ont
aurai
auras
aura
aurons
aurez
auront
aurais
aurait
aurions
auriez
auraient
avais
avait
avions
aviez
avaient
eut
eûmes
eûtes
eurent
aie
aies
ait
ayons
ayez
aient
eusse
eusses
eût
eussions
eussiez
eussent
ceci
celà
cet
cette
ici
ils
les
leurs
quel
quels
quelle
quelles
sans
soi

src/attribute.rs Normal file (105 lines)

@ -0,0 +1,105 @@
use std::fmt;
/// Represent an attribute number along with the word index
/// according to the tokenizer used.
///
/// It can accept up to 1024 attributes and word positions
/// can be maximum 2^22.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Attribute(u32);
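// Packed layout (see `new` below): the attribute number lives in the 10 high bits and the
// word index in the 22 low bits, e.g. `Attribute::new(3, 11)` stores `(3 << 22) | 11`,
// so `attribute()` returns 3 and `word_index()` returns 11.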
impl Attribute {
/// Construct an `Attribute` from an attribute number and
/// the word position of a match according to the tokenizer used.
pub(crate) fn new(attribute: u16, index: u32) -> Result<Attribute, AttributeError> {
if attribute & 0b1111_1100_0000_0000 != 0 {
return Err(AttributeError::AttributeTooBig)
}
if index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 {
return Err(AttributeError::IndexTooBig)
}
let attribute = u32::from(attribute) << 22;
Ok(Attribute(attribute | index))
}
/// Construct an `Attribute` from an attribute number and
/// the word position of a match according to the tokenizer used.
///
/// # Panics
///
/// The attribute must not be greater than 1024
/// and the word index not greater than 2^22.
pub(crate) fn new_faillible(attribute: u16, index: u32) -> Attribute {
match Attribute::new(attribute, index) {
Ok(attribute) => attribute,
Err(AttributeError::AttributeTooBig) => {
panic!("attribute must not be greater than 1024")
},
Err(AttributeError::IndexTooBig) => {
panic!("attribute word index must not be greater than 2^22")
},
}
}
pub(crate) fn max_value() -> Attribute {
Attribute(u32::max_value())
}
#[inline]
pub fn attribute(self) -> u16 {
(self.0 >> 22) as u16
}
#[inline]
pub fn word_index(self) -> u32 {
self.0 & 0b0000_0000_0011_1111_1111_1111_1111
}
}
impl fmt::Debug for Attribute {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("Attribute")
.field("attribute", &self.attribute())
.field("word_index", &self.word_index())
.finish()
}
}
pub enum AttributeError {
AttributeTooBig,
IndexTooBig,
}
#[cfg(test)]
mod tests {
use super::*;
use quickcheck::{quickcheck, TestResult};
quickcheck! {
fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult {
if gen_attr > 2_u16.pow(10) || gen_index > 2_u32.pow(22) {
return TestResult::discard()
}
let attribute = Attribute::new_faillible(gen_attr, gen_index);
let valid_attribute = attribute.attribute() == gen_attr;
let valid_index = attribute.word_index() == gen_index;
TestResult::from_bool(valid_attribute && valid_index)
}
fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult {
if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) {
return TestResult::discard()
}
let a = Attribute::new_faillible(gen_attr, gen_index);
let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1);
TestResult::from_bool(a < b)
}
}
}


@ -50,6 +50,7 @@ impl AutomatonExt for DfaExt {
}
}
#[derive(Copy, Clone)]
enum PrefixSetting {
Prefix,
NoPrefix,


@ -1,59 +1,54 @@
use std::io::{self, Cursor, BufRead};
use std::slice::from_raw_parts;
use std::error::Error;
use std::path::Path;
use std::sync::Arc;
use std::{io, mem};
use std::mem::size_of;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use sdset::Set;
use fst::raw::MmapReadOnly;
use serde::ser::{Serialize, Serializer};
use crate::DocumentId;
use crate::data::Data;
use crate::data::SharedData;
use super::into_u8_slice;
#[derive(Default, Clone)]
pub struct DocIds {
data: Data,
}
pub struct DocIds(SharedData);
impl DocIds {
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let mmap = MmapReadOnly::open_path(path)?;
let data = Data::Mmap(mmap);
Ok(DocIds { data })
pub fn new(ids: &Set<DocumentId>) -> DocIds {
let bytes = unsafe { into_u8_slice(ids.as_slice()) };
let data = SharedData::from_bytes(bytes.to_vec());
DocIds(data)
}
pub fn from_bytes(vec: Vec<u8>) -> Result<Self, Box<Error>> {
// FIXME check if modulo DocumentId
let len = vec.len();
let data = Data::Shared {
bytes: Arc::new(vec),
offset: 0,
len: len
};
Ok(DocIds { data })
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIds> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let doc_ids = cursor.get_ref().range(offset, len);
cursor.consume(len);
Ok(DocIds(doc_ids))
}
pub fn from_document_ids(vec: Vec<DocumentId>) -> Self {
DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let len = self.0.len() as u64;
bytes.write_u64::<LittleEndian>(len).unwrap();
bytes.extend_from_slice(&self.0);
}
pub fn contains(&self, doc: DocumentId) -> bool {
// FIXME prefer using the sdset::exponential_search function
self.doc_ids().binary_search(&doc).is_ok()
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
pub fn doc_ids(&self) -> &Set<DocumentId> {
let slice = &self.data;
pub fn as_bytes(&self) -> &[u8] {
&self.0
}
}
impl AsRef<Set<DocumentId>> for DocIds {
fn as_ref(&self) -> &Set<DocumentId> {
let slice = &self.0;
let ptr = slice.as_ptr() as *const DocumentId;
let len = slice.len() / mem::size_of::<DocumentId>();
let len = slice.len() / size_of::<DocumentId>();
let slice = unsafe { from_raw_parts(ptr, len) };
Set::new_unchecked(slice)
}
}
impl Serialize for DocIds {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.data.as_ref().serialize(serializer)
}
}


@ -1,16 +1,15 @@
use std::io::{self, Write, Cursor, BufRead};
use std::slice::from_raw_parts;
use std::io::{self, Write};
use std::mem::size_of;
use std::ops::Index;
use std::path::Path;
use std::sync::Arc;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::raw::MmapReadOnly;
use sdset::Set;
use crate::DocIndex;
use crate::data::Data;
use crate::data::SharedData;
use super::into_u8_slice;
#[derive(Debug)]
#[repr(C)]
@ -21,52 +20,45 @@ struct Range {
#[derive(Clone, Default)]
pub struct DocIndexes {
ranges: Data,
indexes: Data,
ranges: SharedData,
indexes: SharedData,
}
impl DocIndexes {
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let mmap = MmapReadOnly::open_path(path)?;
DocIndexes::from_data(Data::Mmap(mmap))
pub fn from_bytes(bytes: Vec<u8>) -> io::Result<DocIndexes> {
let bytes = Arc::new(bytes);
let len = bytes.len();
let data = SharedData::new(bytes, 0, len);
let mut cursor = Cursor::new(data);
DocIndexes::from_cursor(&mut cursor)
}
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
let len = vec.len();
DocIndexes::from_shared_bytes(Arc::new(vec), 0, len)
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIndexes> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let ranges = cursor.get_ref().range(offset, len);
cursor.consume(len);
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> {
let data = Data::Shared { bytes, offset, len };
DocIndexes::from_data(data)
}
fn from_data(data: Data) -> io::Result<Self> {
let ranges_len_offset = data.len() - size_of::<u64>();
let ranges_len = (&data[ranges_len_offset..]).read_u64::<LittleEndian>()?;
let ranges_len = ranges_len as usize;
let ranges_offset = ranges_len_offset - ranges_len;
let ranges = data.range(ranges_offset, ranges_len);
let indexes = data.range(0, ranges_offset);
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let indexes = cursor.get_ref().range(offset, len);
cursor.consume(len);
Ok(DocIndexes { ranges, indexes })
}
pub fn to_vec(&self) -> Vec<u8> {
let capacity = self.indexes.len() + self.ranges.len() + size_of::<u64>();
let mut bytes = Vec::with_capacity(capacity);
bytes.extend_from_slice(&self.indexes);
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let ranges_len = self.ranges.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(ranges_len);
bytes.extend_from_slice(&self.ranges);
bytes.write_u64::<LittleEndian>(self.ranges.len() as u64).unwrap();
bytes
let indexes_len = self.indexes.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
bytes.extend_from_slice(&self.indexes);
}
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
self.ranges().get(index as usize).map(|Range { start, end }| {
self.ranges().get(index).map(|Range { start, end }| {
let start = *start as usize;
let end = *end as usize;
let slice = &self.indexes()[start..end];
@ -102,12 +94,17 @@ impl Index<usize> for DocIndexes {
pub struct DocIndexesBuilder<W> {
ranges: Vec<Range>,
indexes: Vec<DocIndex>,
wtr: W,
}
impl DocIndexesBuilder<Vec<u8>> {
pub fn memory() -> Self {
DocIndexesBuilder::new(Vec::new())
DocIndexesBuilder {
ranges: Vec::new(),
indexes: Vec::new(),
wtr: Vec::new(),
}
}
}
@ -115,19 +112,18 @@ impl<W: Write> DocIndexesBuilder<W> {
pub fn new(wtr: W) -> Self {
DocIndexesBuilder {
ranges: Vec::new(),
indexes: Vec::new(),
wtr: wtr,
}
}
pub fn insert(&mut self, indexes: &Set<DocIndex>) -> io::Result<()> {
pub fn insert(&mut self, indexes: &Set<DocIndex>) {
let len = indexes.len() as u64;
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
let range = Range { start, end: start + len };
self.ranges.push(range);
// write the values
let indexes = unsafe { into_u8_slice(indexes) };
self.wtr.write_all(indexes)
self.indexes.extend_from_slice(indexes);
}
pub fn finish(self) -> io::Result<()> {
@ -135,40 +131,52 @@ impl<W: Write> DocIndexesBuilder<W> {
}
pub fn into_inner(mut self) -> io::Result<W> {
// write the ranges
let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
self.wtr.write_all(ranges)?;
// write the length of the ranges
let ranges = unsafe { into_u8_slice(&self.ranges) };
let len = ranges.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(ranges)?;
let indexes = unsafe { into_u8_slice(&self.indexes) };
let len = indexes.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
self.wtr.write_all(indexes)?;
Ok(self.wtr)
}
}
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
let ptr = slice.as_ptr() as *const u8;
let len = slice.len() * size_of::<T>();
from_raw_parts(ptr, len)
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
use crate::{Attribute, WordArea};
use crate::DocumentId;
#[test]
fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let a = DocIndex {
document_id: DocumentId(0),
attribute: Attribute::new_faillible(3, 11),
word_area: WordArea::new_faillible(30, 4)
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: Attribute::new_faillible(4, 21),
word_area: WordArea::new_faillible(35, 6)
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: Attribute::new_faillible(8, 2),
word_area: WordArea::new_faillible(89, 6)
};
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?)?;
builder.insert(Set::new(&[a, b, c])?)?;
builder.insert(Set::new(&[a, c])?)?;
builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?);
let bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(bytes)?;
@ -183,19 +191,33 @@ mod tests {
#[test]
fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let a = DocIndex {
document_id: DocumentId(0),
attribute: Attribute::new_faillible(3, 11),
word_area: WordArea::new_faillible(30, 4)
};
let b = DocIndex {
document_id: DocumentId(1),
attribute: Attribute::new_faillible(4, 21),
word_area: WordArea::new_faillible(35, 6)
};
let c = DocIndex {
document_id: DocumentId(2),
attribute: Attribute::new_faillible(8, 2),
word_area: WordArea::new_faillible(89, 6)
};
let mut builder = DocIndexesBuilder::memory();
builder.insert(Set::new(&[a])?)?;
builder.insert(Set::new(&[a, b, c])?)?;
builder.insert(Set::new(&[a, c])?)?;
builder.insert(Set::new(&[a])?);
builder.insert(Set::new(&[a, b, c])?);
builder.insert(Set::new(&[a, c])?);
let builder_bytes = builder.into_inner()?;
let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
let bytes = docs.to_vec();
let mut bytes = Vec::new();
docs.write_to_bytes(&mut bytes);
assert_eq!(builder_bytes, bytes);


@ -1,51 +1,43 @@
mod doc_ids;
mod doc_indexes;
use std::slice::from_raw_parts;
use std::mem::size_of;
use std::ops::Deref;
use std::sync::Arc;
use fst::raw::MmapReadOnly;
pub use self::doc_ids::DocIds;
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
#[derive(Clone)]
enum Data {
Shared {
bytes: Arc<Vec<u8>>,
offset: usize,
len: usize,
},
Mmap(MmapReadOnly),
#[derive(Default, Clone)]
pub struct SharedData {
pub bytes: Arc<Vec<u8>>,
pub offset: usize,
pub len: usize,
}
impl Data {
pub fn range(&self, off: usize, l: usize) -> Data {
match self {
Data::Shared { bytes, offset, len } => {
assert!(off + l <= *len);
Data::Shared {
bytes: bytes.clone(),
offset: offset + off,
len: l,
}
},
Data::Mmap(mmap) => Data::Mmap(mmap.range(off, l)),
impl SharedData {
pub fn from_bytes(vec: Vec<u8>) -> SharedData {
let len = vec.len();
let bytes = Arc::new(vec);
SharedData::new(bytes, 0, len)
}
pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
SharedData { bytes, offset, len }
}
pub fn range(&self, offset: usize, len: usize) -> SharedData {
assert!(offset + len <= self.len);
SharedData {
bytes: self.bytes.clone(),
offset: self.offset + offset,
len: len,
}
}
}
impl Default for Data {
fn default() -> Data {
Data::Shared {
bytes: Arc::default(),
offset: 0,
len: 0,
}
}
}
impl Deref for Data {
impl Deref for SharedData {
type Target = [u8];
fn deref(&self) -> &Self::Target {
@ -53,13 +45,14 @@ impl Deref for Data {
}
}
impl AsRef<[u8]> for Data {
impl AsRef<[u8]> for SharedData {
fn as_ref(&self) -> &[u8] {
match self {
Data::Shared { bytes, offset, len } => {
&bytes[*offset..offset + len]
},
Data::Mmap(m) => m.as_slice(),
}
&self.bytes[self.offset..self.offset + self.len]
}
}
unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
let ptr = slice.as_ptr() as *const u8;
let len = slice.len() * size_of::<T>();
from_raw_parts(ptr, len)
}


@ -1,110 +0,0 @@
mod ops;
pub mod positive;
pub mod negative;
pub use self::positive::{PositiveBlob, PositiveBlobBuilder};
pub use self::negative::NegativeBlob;
pub use self::ops::OpBuilder;
use std::fmt;
use serde_derive::{Serialize, Deserialize};
use serde::ser::{Serialize, Serializer, SerializeTuple};
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
#[derive(Debug)]
pub enum Blob {
Positive(PositiveBlob),
Negative(NegativeBlob),
}
impl Blob {
pub fn is_negative(&self) -> bool {
self.sign() == Sign::Negative
}
pub fn is_positive(&self) -> bool {
self.sign() == Sign::Positive
}
pub fn sign(&self) -> Sign {
match self {
Blob::Positive(_) => Sign::Positive,
Blob::Negative(_) => Sign::Negative,
}
}
}
impl Serialize for Blob {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
match self {
Blob::Positive(blob) => {
let mut tuple = serializer.serialize_tuple(2)?;
tuple.serialize_element(&Sign::Positive)?;
tuple.serialize_element(&blob)?;
tuple.end()
},
Blob::Negative(blob) => {
let mut tuple = serializer.serialize_tuple(2)?;
tuple.serialize_element(&Sign::Negative)?;
tuple.serialize_element(&blob)?;
tuple.end()
},
}
}
}
impl<'de> Deserialize<'de> for Blob {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Blob, D::Error> {
struct TupleVisitor;
impl<'de> Visitor<'de> for TupleVisitor {
type Value = Blob;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a Blob struct")
}
#[inline]
fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
let sign = match seq.next_element()? {
Some(value) => value,
None => return Err(de::Error::invalid_length(0, &self)),
};
match sign {
Sign::Positive => {
let blob = match seq.next_element()? {
Some(value) => value,
None => return Err(de::Error::invalid_length(1, &self)),
};
Ok(Blob::Positive(blob))
},
Sign::Negative => {
let blob = match seq.next_element()? {
Some(value) => value,
None => return Err(de::Error::invalid_length(1, &self)),
};
Ok(Blob::Negative(blob))
},
}
}
}
deserializer.deserialize_tuple(2, TupleVisitor)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Sign {
Positive,
Negative,
}
impl Sign {
pub fn invert(self) -> Sign {
match self {
Sign::Positive => Sign::Negative,
Sign::Negative => Sign::Positive,
}
}
}


@ -1,67 +0,0 @@
use std::error::Error;
use std::path::Path;
use std::fmt;
use sdset::Set;
use serde::de::{self, Deserialize, Deserializer};
use serde::ser::{Serialize, Serializer};
use crate::data::DocIds;
use crate::DocumentId;
#[derive(Default)]
pub struct NegativeBlob {
doc_ids: DocIds,
}
impl NegativeBlob {
pub unsafe fn from_path<P>(doc_ids: P) -> Result<Self, Box<Error>>
where P: AsRef<Path>,
{
let doc_ids = DocIds::from_path(doc_ids)?;
Ok(NegativeBlob { doc_ids })
}
pub fn from_bytes(doc_ids: Vec<u8>) -> Result<Self, Box<Error>> {
let doc_ids = DocIds::from_bytes(doc_ids)?;
Ok(NegativeBlob { doc_ids })
}
pub fn from_raw(doc_ids: DocIds) -> Self {
NegativeBlob { doc_ids }
}
pub fn as_ids(&self) -> &DocIds {
&self.doc_ids
}
pub fn into_doc_ids(self) -> DocIds {
self.doc_ids
}
}
impl AsRef<Set<DocumentId>> for NegativeBlob {
fn as_ref(&self) -> &Set<DocumentId> {
self.as_ids().doc_ids()
}
}
impl fmt::Debug for NegativeBlob {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "NegativeBlob(")?;
f.debug_list().entries(self.as_ref().as_slice()).finish()?;
write!(f, ")")
}
}
impl Serialize for NegativeBlob {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
self.doc_ids.serialize(serializer)
}
}
impl<'de> Deserialize<'de> for NegativeBlob {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<NegativeBlob, D::Error> {
let bytes = Vec::deserialize(deserializer)?;
NegativeBlob::from_bytes(bytes).map_err(de::Error::custom)
}
}


@ -1,5 +0,0 @@
mod blob;
mod ops;
pub use self::blob::NegativeBlob;
pub use self::ops::OpBuilder;


@ -1,73 +0,0 @@
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::Set;
use crate::database::blob::NegativeBlob;
use crate::data::DocIds;
use crate::DocumentId;
pub struct OpBuilder<'a> {
inner: SdOpBuilder<'a, DocumentId>,
}
/// Do a set operation on multiple negative blobs.
impl<'a> OpBuilder<'a> {
pub fn new() -> Self {
Self { inner: SdOpBuilder::new() }
}
pub fn with_capacity(cap: usize) -> Self {
Self { inner: SdOpBuilder::with_capacity(cap) }
}
pub fn add(mut self, blob: &'a NegativeBlob) -> Self {
self.push(blob);
self
}
pub fn push(&mut self, blob: &'a NegativeBlob) {
let set = Set::new_unchecked(blob.as_ref());
self.inner.push(set);
}
pub fn union(self) -> Union<'a> {
Union::new(self.inner.union())
}
pub fn intersection(self) -> Intersection<'a> {
Intersection::new(self.inner.intersection())
}
pub fn difference(self) -> Difference<'a> {
Difference::new(self.inner.difference())
}
pub fn symmetric_difference(self) -> SymmetricDifference<'a> {
SymmetricDifference::new(self.inner.symmetric_difference())
}
}
macro_rules! logical_operation {
(struct $name:ident, $operation:ident) => {
pub struct $name<'a> {
op: sdset::multi::$name<'a, DocumentId>,
}
impl<'a> $name<'a> {
fn new(op: sdset::multi::$name<'a, DocumentId>) -> Self {
$name { op }
}
pub fn into_negative_blob(self) -> NegativeBlob {
let document_ids = sdset::SetOperation::into_set_buf(self.op);
let doc_ids = DocIds::from_document_ids(document_ids.into_vec());
NegativeBlob::from_raw(doc_ids)
}
}
}}
logical_operation!(struct Union, union);
logical_operation!(struct Intersection, intersection);
logical_operation!(struct Difference, difference);
logical_operation!(struct SymmetricDifference, symmetric_difference);
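A hypothetical sketch (not part of the diff): merging two deletion sets with the builder above; the result is again a NegativeBlob.

// Illustrative only; module paths assumed from this changeset.
use crate::database::blob::NegativeBlob;
use crate::database::blob::negative::OpBuilder;

fn merge_deletions(a: &NegativeBlob, b: &NegativeBlob) -> NegativeBlob {
    OpBuilder::new().add(a).add(b).union().into_negative_blob()
}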


@ -1,109 +0,0 @@
use std::error::Error;
use fst::{IntoStreamer, Streamer};
use sdset::duo::DifferenceByKey;
use sdset::{Set, SetOperation};
use group_by::GroupBy;
use crate::database::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob};
use crate::database::blob::{positive, negative};
fn blob_same_sign(a: &Blob, b: &Blob) -> bool {
a.sign() == b.sign()
}
fn unwrap_positive(blob: &Blob) -> &PositiveBlob {
match blob {
Blob::Positive(blob) => blob,
Blob::Negative(_) => panic!("called `unwrap_positive()` on a `Negative` value"),
}
}
fn unwrap_negative(blob: &Blob) -> &NegativeBlob {
match blob {
Blob::Negative(blob) => blob,
Blob::Positive(_) => panic!("called `unwrap_negative()` on a `Positive` value"),
}
}
pub struct OpBuilder {
blobs: Vec<Blob>,
}
impl OpBuilder {
pub fn new() -> OpBuilder {
OpBuilder { blobs: Vec::new() }
}
pub fn with_capacity(cap: usize) -> OpBuilder {
OpBuilder { blobs: Vec::with_capacity(cap) }
}
pub fn push(&mut self, blob: Blob) {
if self.blobs.is_empty() && blob.is_negative() { return }
self.blobs.push(blob);
}
pub fn merge(self) -> Result<PositiveBlob, Box<Error>> {
let groups = GroupBy::new(&self.blobs, blob_same_sign);
let mut aggregated = Vec::new();
for blobs in groups {
match blobs[0].sign() {
Sign::Positive => {
let mut op_builder = positive::OpBuilder::with_capacity(blobs.len());
for blob in blobs {
op_builder.push(unwrap_positive(blob));
}
let mut stream = op_builder.union().into_stream();
let mut builder = PositiveBlobBuilder::memory();
while let Some((input, doc_indexes)) = stream.next() {
// FIXME empty doc_indexes must be handled by OpBuilder
if !doc_indexes.is_empty() {
builder.insert(input, doc_indexes).unwrap();
}
}
let (map, doc_indexes) = builder.into_inner().unwrap();
let blob = PositiveBlob::from_bytes(map, doc_indexes).unwrap();
aggregated.push(Blob::Positive(blob));
},
Sign::Negative => {
let mut op_builder = negative::OpBuilder::with_capacity(blobs.len());
for blob in blobs {
op_builder.push(unwrap_negative(blob));
}
let blob = op_builder.union().into_negative_blob();
aggregated.push(Blob::Negative(blob));
},
}
}
let mut buffer = Vec::new();
aggregated.chunks(2).try_fold(PositiveBlob::default(), |base, slice| {
let negative = NegativeBlob::default();
let (positive, negative) = match slice {
[a, b] => (unwrap_positive(a), unwrap_negative(b)),
[a] => (unwrap_positive(a), &negative),
_ => unreachable!(),
};
let mut builder = PositiveBlobBuilder::memory();
let op_builder = positive::OpBuilder::new().add(&base).add(&positive);
let mut stream = op_builder.union().into_stream();
while let Some((input, doc_indexes)) = stream.next() {
let op = DifferenceByKey::new(doc_indexes, negative.as_ref(), |x| x.document_id, |x| *x);
buffer.clear();
op.extend_vec(&mut buffer);
if !buffer.is_empty() {
builder.insert(input, Set::new_unchecked(&buffer))?;
}
}
let (map, doc_indexes) = builder.into_inner()?;
PositiveBlob::from_bytes(map, doc_indexes)
})
}
}
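A hypothetical usage sketch (not part of the diff): the merge above groups consecutive blobs by sign, unions each group, then applies each negative group as a difference against everything accumulated before it.

// Illustrative only; module paths assumed from this changeset.
use std::error::Error;
use crate::database::blob::{Blob, OpBuilder, PositiveBlob};

fn fold_updates(blobs: Vec<Blob>) -> Result<PositiveBlob, Box<Error>> {
    let mut builder = OpBuilder::with_capacity(blobs.len());
    for blob in blobs {
        // per push() above, a leading negative blob has nothing to subtract
        // from and is silently dropped
        builder.push(blob);
    }
    builder.merge()
}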


@ -1,254 +0,0 @@
use std::fmt;
use std::io::Write;
use std::path::Path;
use std::error::Error;
use fst::{map, Map, Streamer, IntoStreamer};
use sdset::Set;
use crate::DocIndex;
use crate::data::{DocIndexes, DocIndexesBuilder};
use serde::ser::{Serialize, Serializer, SerializeTuple};
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
#[derive(Default)]
pub struct PositiveBlob {
map: Map,
indexes: DocIndexes,
}
impl PositiveBlob {
pub unsafe fn from_paths<P, Q>(map: P, indexes: Q) -> Result<Self, Box<Error>>
where P: AsRef<Path>,
Q: AsRef<Path>,
{
let map = Map::from_path(map)?;
let indexes = DocIndexes::from_path(indexes)?;
Ok(PositiveBlob { map, indexes })
}
pub fn from_bytes(map: Vec<u8>, indexes: Vec<u8>) -> Result<Self, Box<Error>> {
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Ok(PositiveBlob { map, indexes })
}
pub fn from_raw(map: Map, indexes: DocIndexes) -> Self {
PositiveBlob { map, indexes }
}
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
self.map.get(key).map(|index| &self.indexes[index as usize])
}
pub fn as_map(&self) -> &Map {
&self.map
}
pub fn as_indexes(&self) -> &DocIndexes {
&self.indexes
}
pub fn explode(self) -> (Map, DocIndexes) {
(self.map, self.indexes)
}
}
impl fmt::Debug for PositiveBlob {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "PositiveBlob([")?;
let mut stream = self.into_stream();
let mut first = true;
while let Some((k, v)) = stream.next() {
if !first {
write!(f, ", ")?;
}
first = false;
write!(f, "({}, {:?})", String::from_utf8_lossy(k), v)?;
}
write!(f, "])")
}
}
impl<'m, 'a> IntoStreamer<'a> for &'m PositiveBlob {
type Item = (&'a [u8], &'a [DocIndex]);
/// The type of the stream to be constructed.
type Into = PositiveBlobStream<'m>;
/// Construct a stream from `Self`.
fn into_stream(self) -> Self::Into {
PositiveBlobStream {
map_stream: self.map.into_stream(),
doc_indexes: &self.indexes,
}
}
}
pub struct PositiveBlobStream<'m> {
map_stream: map::Stream<'m>,
doc_indexes: &'m DocIndexes,
}
impl<'m, 'a> Streamer<'a> for PositiveBlobStream<'m> {
type Item = (&'a [u8], &'a [DocIndex]);
fn next(&'a mut self) -> Option<Self::Item> {
match self.map_stream.next() {
Some((input, index)) => {
let doc_indexes = &self.doc_indexes[index as usize];
Some((input, doc_indexes))
},
None => None,
}
}
}
impl Serialize for PositiveBlob {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
let mut tuple = serializer.serialize_tuple(2)?;
tuple.serialize_element(&self.map.as_fst().to_vec())?;
tuple.serialize_element(&self.indexes.to_vec())?;
tuple.end()
}
}
impl<'de> Deserialize<'de> for PositiveBlob {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<PositiveBlob, D::Error> {
struct TupleVisitor;
impl<'de> Visitor<'de> for TupleVisitor {
type Value = PositiveBlob;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a PositiveBlob struct")
}
#[inline]
fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
let map = match seq.next_element()? {
Some(bytes) => match Map::from_bytes(bytes) {
Ok(value) => value,
Err(err) => return Err(de::Error::custom(err)),
},
None => return Err(de::Error::invalid_length(0, &self)),
};
let indexes = match seq.next_element()? {
Some(bytes) => match DocIndexes::from_bytes(bytes) {
Ok(value) => value,
Err(err) => return Err(de::Error::custom(err)),
},
None => return Err(de::Error::invalid_length(1, &self)),
};
Ok(PositiveBlob { map, indexes })
}
}
deserializer.deserialize_tuple(2, TupleVisitor)
}
}
pub struct PositiveBlobBuilder<W, X> {
map: fst::MapBuilder<W>,
indexes: DocIndexesBuilder<X>,
value: u64,
}
impl PositiveBlobBuilder<Vec<u8>, Vec<u8>> {
pub fn memory() -> Self {
PositiveBlobBuilder {
map: fst::MapBuilder::memory(),
indexes: DocIndexesBuilder::memory(),
value: 0,
}
}
}
impl<W: Write, X: Write> PositiveBlobBuilder<W, X> {
pub fn new(map: W, indexes: X) -> Result<Self, Box<Error>> {
Ok(PositiveBlobBuilder {
map: fst::MapBuilder::new(map)?,
indexes: DocIndexesBuilder::new(indexes),
value: 0,
})
}
/// If a key is inserted that is less than or equal to any previous key added,
/// then an error is returned. Similarly, if there was a problem writing
/// to the underlying writer, an error is returned.
// FIXME what if one write doesn't work but the other does?
pub fn insert<K>(&mut self, key: K, doc_indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
where K: AsRef<[u8]>,
{
self.map.insert(key, self.value)?;
self.indexes.insert(doc_indexes)?;
self.value += 1;
Ok(())
}
pub fn finish(self) -> Result<(), Box<Error>> {
self.into_inner().map(drop)
}
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
let map = self.map.into_inner()?;
let indexes = self.indexes.into_inner()?;
Ok((map, indexes))
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
#[test]
fn serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let mut builder = PositiveBlobBuilder::memory();
builder.insert("aaa", Set::new(&[a])?)?;
builder.insert("aab", Set::new(&[a, b, c])?)?;
builder.insert("aac", Set::new(&[a, c])?)?;
let (map_bytes, indexes_bytes) = builder.into_inner()?;
let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?;
assert_eq!(positive_blob.get("aaa"), Some(&[a][..]));
assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..]));
assert_eq!(positive_blob.get("aac"), Some(&[a, c][..]));
assert_eq!(positive_blob.get("aad"), None);
Ok(())
}
#[test]
fn serde_serialize_deserialize() -> Result<(), Box<Error>> {
let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
let b = DocIndex { document_id: 1, attribute: 4, attribute_index: 21 };
let c = DocIndex { document_id: 2, attribute: 8, attribute_index: 2 };
let mut builder = PositiveBlobBuilder::memory();
builder.insert("aaa", Set::new(&[a])?)?;
builder.insert("aab", Set::new(&[a, b, c])?)?;
builder.insert("aac", Set::new(&[a, c])?)?;
let (map_bytes, indexes_bytes) = builder.into_inner()?;
let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?;
let bytes = bincode::serialize(&positive_blob)?;
let positive_blob: PositiveBlob = bincode::deserialize(&bytes)?;
assert_eq!(positive_blob.get("aaa"), Some(&[a][..]));
assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..]));
assert_eq!(positive_blob.get("aac"), Some(&[a, c][..]));
assert_eq!(positive_blob.get("aad"), None);
Ok(())
}
}
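A hypothetical sketch (not part of the diff), reusing the imports from the file above: the builder forwards keys to an fst::MapBuilder, so keys must be inserted in strictly increasing byte order.

fn ordered_inserts() -> Result<(), Box<Error>> {
    let a = DocIndex { document_id: 0, attribute: 3, attribute_index: 11 };
    let mut builder = PositiveBlobBuilder::memory();
    builder.insert("aaa", Set::new(&[a])?)?;                   // ok
    builder.insert("aab", Set::new(&[a])?)?;                   // ok, "aab" > "aaa"
    assert!(builder.insert("aaa", Set::new(&[a])?).is_err());  // out of order
    Ok(())
}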


@ -1,5 +0,0 @@
mod blob;
mod ops;
pub use self::blob::{PositiveBlob, PositiveBlobBuilder};
pub use self::ops::OpBuilder;


@ -1,128 +0,0 @@
use sdset::multi::OpBuilder as SdOpBuilder;
use sdset::{SetOperation, Set};
use crate::database::blob::PositiveBlob;
use crate::data::DocIndexes;
use crate::DocIndex;
pub struct OpBuilder<'m> {
// the operation on the maps is always a union.
map_op: fst::map::OpBuilder<'m>,
indexes: Vec<&'m DocIndexes>,
}
/// Do a set operation on multiple positive blobs.
impl<'m> OpBuilder<'m> {
pub fn new() -> Self {
Self {
map_op: fst::map::OpBuilder::new(),
indexes: Vec::new(),
}
}
pub fn with_capacity(cap: usize) -> Self {
Self {
map_op: fst::map::OpBuilder::new(), // TODO patch fst to add with_capacity
indexes: Vec::with_capacity(cap),
}
}
pub fn add(mut self, blob: &'m PositiveBlob) -> Self {
self.push(blob);
self
}
pub fn push(&mut self, blob: &'m PositiveBlob) {
self.map_op.push(blob.as_map());
self.indexes.push(blob.as_indexes());
}
pub fn union(self) -> Union<'m> {
Union::new(self.map_op.union(), self.indexes)
}
pub fn intersection(self) -> Intersection<'m> {
Intersection::new(self.map_op.union(), self.indexes)
}
pub fn difference(self) -> Difference<'m> {
Difference::new(self.map_op.union(), self.indexes)
}
pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
SymmetricDifference::new(self.map_op.union(), self.indexes)
}
}
macro_rules! logical_operation {
(struct $name:ident, $operation:ident) => {
pub struct $name<'m> {
stream: fst::map::Union<'m>,
indexes: Vec<&'m DocIndexes>,
outs: Vec<DocIndex>,
}
impl<'m> $name<'m> {
fn new(stream: fst::map::Union<'m>, indexes: Vec<&'m DocIndexes>) -> Self {
$name {
stream: stream,
indexes: indexes,
outs: Vec::new(),
}
}
}
impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
type Item = (&'a [u8], &'a Set<DocIndex>);
fn next(&'a mut self) -> Option<Self::Item> {
// loop {
// let (input, ivalues) = match self.stream.next() {
// Some(value) => value,
// None => return None,
// };
// self.outs.clear();
// let mut builder = SdOpBuilder::with_capacity(ivalues.len());
// for ivalue in ivalues {
// let indexes = self.indexes[ivalue.index];
// let indexes = indexes.get(ivalue.value).expect("BUG: could not find document indexes");
// let set = Set::new_unchecked(indexes);
// builder.push(set);
// }
// builder.$operation().extend_vec(&mut self.outs);
// if self.outs.is_empty() { continue }
// return Some((input, &self.outs))
// }
// FIXME make the above code compile
match self.stream.next() {
Some((input, ivalues)) => {
self.outs.clear();
let mut builder = SdOpBuilder::with_capacity(ivalues.len());
for ivalue in ivalues {
let doc_indexes = &self.indexes[ivalue.index][ivalue.value as usize];
let set = Set::new_unchecked(doc_indexes);
builder.push(set);
}
builder.$operation().extend_vec(&mut self.outs);
if self.outs.is_empty() { return None }
return Some((input, Set::new_unchecked(&self.outs)))
},
None => None
}
}
}
}}
logical_operation!(struct Union, union);
logical_operation!(struct Intersection, intersection);
logical_operation!(struct Difference, difference);
logical_operation!(struct SymmetricDifference, symmetric_difference);
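A hypothetical sketch (not part of the diff): streaming the union of two positive blobs; each yielded item is a word together with its merged set of doc indexes.

// Illustrative only; module paths assumed from this changeset.
use fst::Streamer;
use crate::database::blob::PositiveBlob;
use crate::database::blob::positive::OpBuilder;

fn count_words_in_union(a: &PositiveBlob, b: &PositiveBlob) -> usize {
    let mut stream = OpBuilder::new().add(a).add(b).union();
    let mut count = 0;
    while let Some((_word, _doc_indexes)) = stream.next() {
        count += 1;
    }
    count
}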


@ -2,13 +2,13 @@ use std::io::{Cursor, Read, Write};
use std::mem::size_of;
use std::fmt;
use byteorder::{NativeEndian, WriteBytesExt, ReadBytesExt};
use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};
use crate::database::schema::SchemaAttr;
use crate::DocumentId;
const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u32>();
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>();
#[derive(Copy, Clone)]
pub struct DocumentKey([u8; DOC_KEY_LEN]);
@ -19,7 +19,7 @@ impl DocumentKey {
let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(b"doc-").unwrap();
wtr.write_u64::<NativeEndian>(id).unwrap();
wtr.write_u64::<BigEndian>(id.0).unwrap();
DocumentKey(buffer)
}
@ -43,7 +43,8 @@ impl DocumentKey {
}
pub fn document_id(&self) -> DocumentId {
(&self.0[4..]).read_u64::<NativeEndian>().unwrap()
let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
DocumentId(id)
}
}
@ -72,11 +73,19 @@ impl DocumentKeyAttr {
let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(&raw_key).unwrap();
wtr.write_all(b"-").unwrap();
wtr.write_u32::<NativeEndian>(attr.as_u32()).unwrap();
wtr.write_u16::<BigEndian>(attr.0).unwrap();
DocumentKeyAttr(buffer)
}
pub fn with_attribute_min(id: DocumentId) -> DocumentKeyAttr {
DocumentKeyAttr::new(id, SchemaAttr::min())
}
pub fn with_attribute_max(id: DocumentId) -> DocumentKeyAttr {
DocumentKeyAttr::new(id, SchemaAttr::max())
}
pub fn from_bytes(mut bytes: &[u8]) -> DocumentKeyAttr {
assert!(bytes.len() >= DOC_KEY_ATTR_LEN);
assert_eq!(&bytes[..4], b"doc-");
@ -88,12 +97,13 @@ impl DocumentKeyAttr {
}
pub fn document_id(&self) -> DocumentId {
(&self.0[4..]).read_u64::<NativeEndian>().unwrap()
let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
DocumentId(id)
}
pub fn attribute(&self) -> SchemaAttr {
let offset = 4 + size_of::<u64>() + 1;
let value = (&self.0[offset..]).read_u32::<NativeEndian>().unwrap();
let value = (&self.0[offset..]).read_u16::<BigEndian>().unwrap();
SchemaAttr::new(value)
}
@ -112,7 +122,24 @@ impl fmt::Debug for DocumentKeyAttr {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("DocumentKeyAttr")
.field("document_id", &self.document_id())
.field("attribute", &self.attribute().as_u32())
.field("attribute", &self.attribute().0)
.finish()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn keep_as_ref_order() {
for (a, b) in (0..).zip(1..).take(u16::max_value() as usize - 1) {
let id = DocumentId(0);
let a = DocumentKeyAttr::new(id, SchemaAttr(a));
let b = DocumentKeyAttr::new(id, SchemaAttr(b));
assert!(a < b);
assert!(a.as_ref() < b.as_ref());
}
}
}
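The keep_as_ref_order test above relies on the raw key bytes sorting the same way as the logical key; a hedged summary of the layout, as read from the code in this file:

// DocumentKey     : "doc-" (4 bytes) | document id, u64 big-endian (8 bytes)
// DocumentKeyAttr : DocumentKey | "-" (1 byte) | attribute, u16 big-endian (2 bytes)
//
// With big-endian integers, lexicographic comparison of the raw bytes matches
// numeric comparison of (document_id, attribute), which is what RocksDB
// iteration order and the with_attribute_min/max range bounds depend on.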

src/database/index/mod.rs (new file, 82 lines)

@ -0,0 +1,82 @@
mod negative;
mod positive;
pub(crate) use self::negative::Negative;
pub(crate) use self::positive::{Positive, PositiveBuilder};
use std::error::Error;
use std::io::Cursor;
use std::sync::Arc;
use fst::{IntoStreamer, Streamer};
use sdset::duo::DifferenceByKey;
use sdset::{Set, SetOperation};
use fst::Map;
use crate::data::{SharedData, DocIndexes};
#[derive(Default)]
pub struct Index {
pub(crate) negative: Negative,
pub(crate) positive: Positive,
}
impl Index {
pub fn from_bytes(bytes: Vec<u8>) -> Result<Index, Box<Error>> {
let len = bytes.len();
Index::from_shared_bytes(Arc::new(bytes), 0, len)
}
pub fn from_shared_bytes(
bytes: Arc<Vec<u8>>,
offset: usize,
len: usize,
) -> Result<Index, Box<Error>>
{
let data = SharedData::new(bytes, offset, len);
let mut cursor = Cursor::new(data);
let negative = Negative::from_cursor(&mut cursor)?;
let positive = Positive::from_cursor(&mut cursor)?;
Ok(Index { negative, positive })
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
self.negative.write_to_bytes(bytes);
self.positive.write_to_bytes(bytes);
}
pub fn merge(&self, other: &Index) -> Result<Index, Box<Error>> {
if other.negative.is_empty() {
let negative = Negative::default();
let positive = self.positive.union(&other.positive)?;
return Ok(Index { negative, positive })
}
let mut buffer = Vec::new();
let mut builder = PositiveBuilder::memory();
let mut stream = self.positive.into_stream();
while let Some((key, indexes)) = stream.next() {
let op = DifferenceByKey::new(indexes, &other.negative, |x| x.document_id, |x| *x);
buffer.clear();
op.extend_vec(&mut buffer);
if !buffer.is_empty() {
let indexes = Set::new_unchecked(&buffer);
builder.insert(key, indexes)?;
}
}
let positive = {
let (map, indexes) = builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Positive::new(map, indexes)
};
let negative = Negative::default();
let positive = positive.union(&other.positive)?;
Ok(Index { negative, positive })
}
}
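In plain terms, the incoming negative set acts purely as deletions against what is already indexed; a hedged summary of the merge semantics above (not part of the diff):

// merged.positive = (self.positive with other.negative's document ids removed)
//                   unioned with other.positive
// merged.negative = empty
//
// Deletions are applied once, at merge time, and never carried forward.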


@ -0,0 +1,43 @@
use std::error::Error;
use std::io::Cursor;
use std::ops::Deref;
use sdset::Set;
use byteorder::{LittleEndian, WriteBytesExt};
use crate::data::SharedData;
use crate::data::DocIds;
use crate::DocumentId;
#[derive(Default)]
pub struct Negative(DocIds);
impl Negative {
pub fn new(doc_ids: DocIds) -> Negative {
Negative(doc_ids)
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Negative, Box<Error>> {
let doc_ids = DocIds::from_cursor(cursor)?;
Ok(Negative(doc_ids))
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let slice = self.0.as_bytes();
let len = slice.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(len);
bytes.extend_from_slice(slice);
}
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
}
impl Deref for Negative {
type Target = Set<DocumentId>;
fn deref(&self) -> &Self::Target {
self.0.as_ref()
}
}
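For reference, a hedged sketch of the framing this writer appears to produce (not part of the diff):

// [ len: u64 little-endian ][ DocIds payload, `len` bytes ]
//
// from_cursor reads the same framing back, so the Negative and Positive parts
// can be laid out back-to-back in one buffer and re-read by
// Index::from_shared_bytes.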


@ -0,0 +1,166 @@
use std::io::{Write, BufRead, Cursor};
use std::error::Error;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use fst::{map, Map, Streamer, IntoStreamer};
use sdset::{Set, SetOperation};
use sdset::duo::Union;
use fst::raw::Fst;
use crate::data::{DocIndexes, DocIndexesBuilder};
use crate::data::SharedData;
use crate::DocIndex;
#[derive(Default)]
pub struct Positive {
map: Map,
indexes: DocIndexes,
}
impl Positive {
pub fn new(map: Map, indexes: DocIndexes) -> Positive {
Positive { map, indexes }
}
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Positive, Box<Error>> {
let len = cursor.read_u64::<LittleEndian>()? as usize;
let offset = cursor.position() as usize;
let data = cursor.get_ref().range(offset, len);
let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
let map = Map::from(fst);
cursor.consume(len);
let indexes = DocIndexes::from_cursor(cursor)?;
Ok(Positive { map, indexes})
}
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
let slice = self.map.as_fst().as_bytes();
let len = slice.len() as u64;
let _ = bytes.write_u64::<LittleEndian>(len);
bytes.extend_from_slice(slice);
self.indexes.write_to_bytes(bytes);
}
pub fn map(&self) -> &Map {
&self.map
}
pub fn indexes(&self) -> &DocIndexes {
&self.indexes
}
pub fn union(&self, other: &Positive) -> Result<Positive, Box<Error>> {
let mut builder = PositiveBuilder::memory();
let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
let mut buffer = Vec::new();
while let Some((key, ivalues)) = stream.next() {
buffer.clear();
match ivalues {
[a, b] => {
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
let a = Set::new_unchecked(indexes);
let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
let indexes = indexes.get(b.value as usize).ok_or(format!("index not found"))?;
let b = Set::new_unchecked(indexes);
let op = Union::new(a, b);
op.extend_vec(&mut buffer);
},
[a] => {
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
buffer.extend_from_slice(indexes)
},
_ => continue,
}
if !buffer.is_empty() {
let indexes = Set::new_unchecked(&buffer);
builder.insert(key, indexes)?;
}
}
let (map, indexes) = builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Ok(Positive { map, indexes })
}
}
impl<'m, 'a> IntoStreamer<'a> for &'m Positive {
type Item = (&'a [u8], &'a Set<DocIndex>);
/// The type of the stream to be constructed.
type Into = Stream<'m>;
/// Construct a stream from `Self`.
fn into_stream(self) -> Self::Into {
Stream {
map_stream: self.map.into_stream(),
indexes: &self.indexes,
}
}
}
pub struct Stream<'m> {
map_stream: map::Stream<'m>,
indexes: &'m DocIndexes,
}
impl<'m, 'a> Streamer<'a> for Stream<'m> {
type Item = (&'a [u8], &'a Set<DocIndex>);
fn next(&'a mut self) -> Option<Self::Item> {
match self.map_stream.next() {
Some((input, index)) => {
let indexes = &self.indexes[index as usize];
let indexes = Set::new_unchecked(indexes);
Some((input, indexes))
},
None => None,
}
}
}
pub struct PositiveBuilder<W, X> {
map: fst::MapBuilder<W>,
indexes: DocIndexesBuilder<X>,
value: u64,
}
impl PositiveBuilder<Vec<u8>, Vec<u8>> {
pub fn memory() -> Self {
PositiveBuilder {
map: fst::MapBuilder::memory(),
indexes: DocIndexesBuilder::memory(),
value: 0,
}
}
}
impl<W: Write, X: Write> PositiveBuilder<W, X> {
/// If a key is inserted that is less than or equal to any previous key added,
/// then an error is returned. Similarly, if there was a problem writing
/// to the underlying writer, an error is returned.
// FIXME what if one write doesn't work but the other does?
pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
where K: AsRef<[u8]>,
{
self.map.insert(key, self.value)?;
self.indexes.insert(indexes);
self.value += 1;
Ok(())
}
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
let map = self.map.into_inner()?;
let indexes = self.indexes.into_inner()?;
Ok((map, indexes))
}
}
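A hypothetical sketch (not part of the diff): building a Positive part in memory and turning the raw buffers back into searchable structures, the same way union() does above. Positive and PositiveBuilder are the types defined in this file; the other paths are assumed from this changeset.

use std::error::Error;
use fst::Map;
use sdset::Set;
use crate::data::DocIndexes;
use crate::DocIndex;

fn build_positive(entries: &[(&str, &Set<DocIndex>)]) -> Result<Positive, Box<Error>> {
    let mut builder = PositiveBuilder::memory();
    for &(word, indexes) in entries {
        builder.insert(word, indexes)?; // words must arrive in increasing byte order
    }
    let (map, indexes) = builder.into_inner()?;
    Ok(Positive::new(Map::from_bytes(map)?, DocIndexes::from_bytes(indexes)?))
}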


@ -1,45 +1,81 @@
use std::sync::{Arc, Mutex, RwLock, RwLockReadGuard};
use std::sync::{Arc, Mutex};
use std::error::Error;
use std::path::Path;
use std::ops::Deref;
use std::path::Path;
use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{Writable, Snapshot};
use rocksdb::{DB, DBVector, MergeOperands};
use crossbeam::atomic::ArcCell;
use log::info;
pub use self::document_key::{DocumentKey, DocumentKeyAttr};
pub use self::database_view::{DatabaseView, DocumentIter};
use self::blob::positive::PositiveBlob;
use self::update::Update;
use self::schema::Schema;
use self::blob::Blob;
pub mod blob;
pub mod schema;
pub mod update;
mod document_key;
mod database_view;
mod deserializer;
pub use self::view::{DatabaseView, DocumentIter};
pub use self::update::{Update, UpdateBuilder};
pub use self::serde::SerializerError;
pub use self::schema::Schema;
pub use self::index::Index;
const DATA_INDEX: &[u8] = b"data-index";
const DATA_SCHEMA: &[u8] = b"data-schema";
pub fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
pub mod schema;
pub(crate) mod index;
mod deserializer;
mod document_key;
mod serde;
mod update;
mod view;
fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
where D: Deref<Target=DB>
{
match snapshot.get(DATA_SCHEMA)? {
Some(vector) => Ok(Schema::read_from(&*vector)?),
Some(vector) => Ok(Schema::read_from_bin(&*vector)?),
None => Err(String::from("BUG: no schema found in the database").into()),
}
}
pub fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<PositiveBlob, Box<Error>>
fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<Index, Box<Error>>
where D: Deref<Target=DB>
{
match snapshot.get(DATA_INDEX)? {
Some(vector) => Ok(bincode::deserialize(&*vector)?),
None => Ok(PositiveBlob::default()),
let (elapsed, vector) = elapsed::measure_time(|| snapshot.get(DATA_INDEX));
info!("loading index from kv-store took {}", elapsed);
let index = match vector? {
Some(vector) => {
let bytes = vector.as_ref().to_vec();
info!("index size if {} MiB", bytes.len() / 1024 / 1024);
let (elapsed, index) = elapsed::measure_time(|| Index::from_bytes(bytes));
info!("loading index from bytes took {}", elapsed);
index?
},
None => Index::default(),
};
Ok(index)
}
fn merge_indexes(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
assert_eq!(key, DATA_INDEX, "The merge operator only supports \"data-index\" merging");
let mut index: Option<Index> = None;
for bytes in existing.into_iter().chain(operands) {
let operand = Index::from_bytes(bytes.to_vec()).unwrap();
let merged = match index {
Some(ref index) => index.merge(&operand).unwrap(),
None => operand,
};
index.replace(merged);
}
let index = index.unwrap_or_default();
let mut bytes = Vec::new();
index.write_to_bytes(&mut bytes);
bytes
}
pub struct Database {
@ -49,11 +85,11 @@ pub struct Database {
db: Mutex<Arc<DB>>,
// This view is updated each time the DB ingests an update
view: RwLock<DatabaseView<Arc<DB>>>,
view: ArcCell<DatabaseView<Arc<DB>>>,
}
impl Database {
pub fn create<P: AsRef<Path>>(path: P, schema: Schema) -> Result<Database, Box<Error>> {
pub fn create<P: AsRef<Path>>(path: P, schema: &Schema) -> Result<Database, Box<Error>> {
let path = path.as_ref();
if path.exists() {
return Err(format!("File already exists at path: {}, cannot create database.",
@ -71,12 +107,12 @@ impl Database {
let db = DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
let mut schema_bytes = Vec::new();
schema.write_to(&mut schema_bytes)?;
schema.write_to_bin(&mut schema_bytes)?;
db.put(DATA_SCHEMA, &schema_bytes)?;
let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone());
let view = RwLock::new(DatabaseView::new(snapshot)?);
let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
Ok(Database { db: Mutex::new(db), view })
}
@ -94,18 +130,18 @@ impl Database {
// FIXME create a generic function to do that !
let _schema = match db.get(DATA_SCHEMA)? {
Some(value) => Schema::read_from(&*value)?,
Some(value) => Schema::read_from_bin(&*value)?,
None => return Err(String::from("Database does not contain a schema").into()),
};
let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone());
let view = RwLock::new(DatabaseView::new(snapshot)?);
let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
Ok(Database { db: Mutex::new(db), view })
}
pub fn ingest_update_file(&self, update: Update) -> Result<(), Box<Error>> {
pub fn ingest_update_file(&self, update: Update) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
let snapshot = {
// We must have a mutex here to ensure that update ingestions and compactions
// are done atomically and in the right order.
@ -116,32 +152,24 @@ impl Database {
Err(e) => return Err(e.to_string().into()),
};
let move_update = update.can_be_moved();
let path = update.into_path_buf();
let path = path.to_string_lossy();
let path = update.path().to_string_lossy();
let options = IngestExternalFileOptions::new();
// options.move_files(move_update);
let mut options = IngestExternalFileOptions::new();
options.move_files(move_update);
let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
db.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;
// Compacting to trigger the merge operator only one time
// while ingesting the update and not each time searching
db.compact_range(Some(DATA_INDEX), Some(DATA_INDEX));
let (elapsed, result) = elapsed::measure_time(|| {
let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
db.ingest_external_file_optimized(&cf_handle, &options, &[&path])
});
let _ = result?;
info!("ingesting update file took {}", elapsed);
Snapshot::new(db.clone())
};
// Here we will block the view creation for the minimum amount of time:
// updating the DatabaseView itself with the new database snapshot
let view = DatabaseView::new(snapshot)?;
match self.view.write() {
Ok(mut lock) => *lock = view,
Err(e) => return Err(e.to_string().into()),
}
let view = Arc::new(DatabaseView::new(snapshot)?);
self.view.set(view.clone());
Ok(())
Ok(view)
}
pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
@ -155,105 +183,508 @@ impl Database {
}
}
pub fn view(&self) -> RwLockReadGuard<DatabaseView<Arc<DB>>> {
self.view.read().unwrap()
pub fn view(&self) -> Arc<DatabaseView<Arc<DB>>> {
self.view.get()
}
}
fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
if key != DATA_INDEX {
panic!("The merge operator only supports \"data-index\" merging")
}
let capacity = {
let remaining = operands.size_hint().0;
let already_exist = usize::from(existing_value.is_some());
remaining + already_exist
};
let mut op = blob::OpBuilder::with_capacity(capacity);
if let Some(existing_value) = existing_value {
let blob = bincode::deserialize(existing_value).expect("BUG: could not deserialize data-index");
op.push(Blob::Positive(blob));
}
for bytes in operands {
let blob = bincode::deserialize(bytes).expect("BUG: could not deserialize blob");
op.push(blob);
}
let blob = op.merge().expect("BUG: could not merge blobs");
bincode::serialize(&blob).expect("BUG: could not serialize merged blob")
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
use serde_derive::{Serialize, Deserialize};
use hashbrown::HashSet;
use tempfile::tempdir;
use crate::tokenizer::DefaultBuilder;
use crate::database::update::PositiveUpdateBuilder;
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
use crate::database::update::UpdateBuilder;
use crate::tokenizer::DefaultBuilder;
#[test]
fn ingest_update_file() -> Result<(), Box<Error>> {
fn ingest_one_update_file() -> Result<(), Box<Error>> {
let dir = tempdir()?;
let stop_words = HashSet::new();
let rocksdb_path = dir.path().join("rocksdb.rdb");
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct SimpleDoc {
id: u64,
title: String,
description: String,
timestamp: u64,
}
let schema = {
let mut builder = SchemaBuilder::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("id", STORED);
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED);
builder.build()
};
let database = Database::create(&rocksdb_path, schema.clone())?;
let tokenizer_builder = DefaultBuilder::new();
let database = Database::create(&rocksdb_path, &schema)?;
let update_path = dir.path().join("update.sst");
let doc0 = SimpleDoc {
id: 0,
title: String::from("I am a title"),
description: String::from("I am a description"),
timestamp: 1234567,
};
let doc1 = SimpleDoc {
id: 1,
title: String::from("I am the second title"),
description: String::from("I am the second description"),
timestamp: 7654321,
};
let mut update = {
let mut builder = PositiveUpdateBuilder::new(update_path, schema, tokenizer_builder);
let docid0;
let docid1;
let update = {
let tokenizer_builder = DefaultBuilder::new();
let mut builder = UpdateBuilder::new(update_path, schema);
builder.update(0, &doc0).unwrap();
builder.update(1, &doc1).unwrap();
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
builder.build()?
};
update.set_move(true);
database.ingest_update_file(update)?;
let view = database.view();
let de_doc0: SimpleDoc = view.retrieve_document(0)?;
let de_doc1: SimpleDoc = view.retrieve_document(1)?;
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1);
Ok(dir.close()?)
}
#[test]
fn ingest_two_update_files() -> Result<(), Box<Error>> {
let dir = tempdir()?;
let stop_words = HashSet::new();
let rocksdb_path = dir.path().join("rocksdb.rdb");
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct SimpleDoc {
id: u64,
title: String,
description: String,
timestamp: u64,
}
let schema = {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("id", STORED);
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED);
builder.build()
};
let database = Database::create(&rocksdb_path, &schema)?;
let doc0 = SimpleDoc {
id: 0,
title: String::from("I am a title"),
description: String::from("I am a description"),
timestamp: 1234567,
};
let doc1 = SimpleDoc {
id: 1,
title: String::from("I am the second title"),
description: String::from("I am the second description"),
timestamp: 7654321,
};
let doc2 = SimpleDoc {
id: 2,
title: String::from("I am the third title"),
description: String::from("I am the third description"),
timestamp: 7654321,
};
let doc3 = SimpleDoc {
id: 3,
title: String::from("I am the fourth title"),
description: String::from("I am the fourth description"),
timestamp: 7654321,
};
let docid0;
let docid1;
let update1 = {
let tokenizer_builder = DefaultBuilder::new();
let update_path = dir.path().join("update-000.sst");
let mut builder = UpdateBuilder::new(update_path, schema.clone());
docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
builder.build()?
};
let docid2;
let docid3;
let update2 = {
let tokenizer_builder = DefaultBuilder::new();
let update_path = dir.path().join("update-001.sst");
let mut builder = UpdateBuilder::new(update_path, schema);
docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
builder.build()?
};
database.ingest_update_file(update1)?;
database.ingest_update_file(update2)?;
let view = database.view();
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1);
let de_doc2: SimpleDoc = view.document_by_id(docid2)?;
let de_doc3: SimpleDoc = view.document_by_id(docid3)?;
assert_eq!(doc2, de_doc2);
assert_eq!(doc3, de_doc3);
Ok(dir.close()?)
}
}
#[cfg(all(feature = "nightly", test))]
mod bench {
extern crate test;
use super::*;
use std::error::Error;
use std::iter::repeat_with;
use self::test::Bencher;
use rand::distributions::Alphanumeric;
use rand_xorshift::XorShiftRng;
use rand::{Rng, SeedableRng};
use serde_derive::Serialize;
use rand::seq::SliceRandom;
use hashbrown::HashSet;
use crate::tokenizer::DefaultBuilder;
use crate::database::update::UpdateBuilder;
use crate::database::schema::*;
fn random_sentences<R: Rng>(number: usize, rng: &mut R) -> String {
let mut words = String::new();
for i in 0..number {
let word_len = rng.gen_range(1, 12);
let iter = repeat_with(|| rng.sample(Alphanumeric)).take(word_len);
words.extend(iter);
if i == number - 1 { // last word
let final_ = [".", "?", "!", "..."].choose(rng).cloned();
words.extend(final_);
} else {
let middle = [",", ", "].choose(rng).cloned();
words.extend(middle);
}
}
words
}
#[bench]
fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..300 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
database.ingest_update_file(update)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..3000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
database.ingest_update_file(update)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
#[ignore]
fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..30_000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
database.ingest_update_file(update)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
test::black_box(|| database);
});
Ok(())
}
#[bench]
fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..300 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
let view = database.ingest_update_file(update)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().unwrap().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
#[bench]
fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..3000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
let view = database.ingest_update_file(update)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().unwrap().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
#[bench]
#[ignore]
fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let database = Database::create(db_path.clone(), &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let path = dir.path().join("update-000.sst");
let tokenizer_builder = DefaultBuilder;
let mut builder = UpdateBuilder::new(path, schema);
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..30_000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let update = builder.build()?;
let view = database.ingest_update_file(update)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().unwrap().query(q, 0..20);
test::black_box(|| documents);
}
});
Ok(())
}
}
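A hypothetical usage sketch (not part of the diff) of the lock-free view swap introduced above, assuming `database` and `update` are built as in the tests in this file:

// Readers take a cheap Arc clone of the current snapshot; ingestion only swaps
// the ArcCell pointer, so searches in flight are never blocked and keep
// reading the snapshot they started with.
let view = database.view();                            // Arc<DatabaseView<Arc<DB>>>
let documents = view.query_builder()?.query("second title", 0..20);

// elsewhere, possibly on another thread:
let new_view = database.ingest_update_file(update)?;   // publishes a fresh view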


@ -1,29 +1,36 @@
use std::collections::{HashMap, BTreeMap};
use std::io::{Read, Write};
use std::{fmt, u32};
use std::path::Path;
use std::error::Error;
use std::{fmt, u16};
use std::ops::BitOr;
use std::sync::Arc;
use std::fs::File;
use serde_derive::{Serialize, Deserialize};
use linked_hash_map::LinkedHashMap;
use serde::Serialize;
use crate::database::serde::find_id::FindDocumentIdSerializer;
use crate::database::serde::SerializerError;
use crate::DocumentId;
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false };
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true };
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps {
#[serde(default)]
stored: bool,
#[serde(default)]
indexed: bool,
}
impl SchemaProps {
pub fn is_stored(&self) -> bool {
pub fn is_stored(self) -> bool {
self.stored
}
pub fn is_indexed(&self) -> bool {
pub fn is_indexed(self) -> bool {
self.indexed
}
}
@ -39,33 +46,39 @@ impl BitOr for SchemaProps {
}
}
#[derive(Serialize, Deserialize)]
pub struct SchemaBuilder {
attrs: LinkedHashMap<String, SchemaProps>,
identifier: String,
attributes: LinkedHashMap<String, SchemaProps>,
}
impl SchemaBuilder {
pub fn new() -> SchemaBuilder {
SchemaBuilder { attrs: LinkedHashMap::new() }
pub fn with_identifier<S: Into<String>>(name: S) -> SchemaBuilder {
SchemaBuilder {
identifier: name.into(),
attributes: LinkedHashMap::new(),
}
}
pub fn new_attribute<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr {
let len = self.attrs.len();
if self.attrs.insert(name.into(), props).is_some() {
let len = self.attributes.len();
if self.attributes.insert(name.into(), props).is_some() {
panic!("Field already inserted.")
}
SchemaAttr(len as u32)
SchemaAttr(len as u16)
}
pub fn build(self) -> Schema {
let mut attrs = HashMap::new();
let mut props = Vec::new();
for (i, (name, prop)) in self.attrs.into_iter().enumerate() {
attrs.insert(name.clone(), SchemaAttr(i as u32));
for (i, (name, prop)) in self.attributes.into_iter().enumerate() {
attrs.insert(name.clone(), SchemaAttr(i as u16));
props.push((name, prop));
}
Schema { inner: Arc::new(InnerSchema { attrs, props }) }
let identifier = self.identifier;
Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) }
}
}
@ -76,69 +89,124 @@ pub struct Schema {
#[derive(Debug, Clone, PartialEq, Eq)]
struct InnerSchema {
identifier: String,
attrs: HashMap<String, SchemaAttr>,
props: Vec<(String, SchemaProps)>,
}
impl Schema {
pub fn open<P: AsRef<Path>>(path: P) -> bincode::Result<Schema> {
let file = File::open(path)?;
Schema::read_from(file)
}
pub fn read_from<R: Read>(reader: R) -> bincode::Result<Schema> {
let attrs = bincode::deserialize_from(reader)?;
let builder = SchemaBuilder { attrs };
pub fn from_toml<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
let mut buffer = Vec::new();
reader.read_to_end(&mut buffer)?;
let builder: SchemaBuilder = toml::from_slice(&buffer)?;
Ok(builder.build())
}
pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> {
pub fn to_toml<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
let string = toml::to_string_pretty(&builder)?;
writer.write_all(string.as_bytes())?;
Ok(())
}
pub fn from_json<R: Read>(mut reader: R) -> Result<Schema, Box<Error>> {
let mut buffer = Vec::new();
reader.read_to_end(&mut buffer)?;
let builder: SchemaBuilder = serde_json::from_slice(&buffer)?;
Ok(builder.build())
}
pub fn to_json<W: Write>(&self, mut writer: W) -> Result<(), Box<Error>> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
let string = serde_json::to_string_pretty(&builder)?;
writer.write_all(string.as_bytes())?;
Ok(())
}
pub(crate) fn read_from_bin<R: Read>(reader: R) -> bincode::Result<Schema> {
let builder: SchemaBuilder = bincode::deserialize_from(reader)?;
Ok(builder.build())
}
pub(crate) fn write_to_bin<W: Write>(&self, writer: W) -> bincode::Result<()> {
let identifier = self.inner.identifier.clone();
let attributes = self.attributes_ordered();
let builder = SchemaBuilder { identifier, attributes };
bincode::serialize_into(writer, &builder)
}
fn attributes_ordered(&self) -> LinkedHashMap<String, SchemaProps> {
let mut ordered = BTreeMap::new();
for (name, field) in &self.inner.attrs {
let index = field.as_u32();
let (_, props) = self.inner.props[index as usize];
ordered.insert(index, (name, props));
for (name, attr) in &self.inner.attrs {
let (_, props) = self.inner.props[attr.0 as usize];
ordered.insert(attr.0, (name, props));
}
let mut attrs = LinkedHashMap::with_capacity(ordered.len());
let mut attributes = LinkedHashMap::with_capacity(ordered.len());
for (_, (name, props)) in ordered {
attrs.insert(name, props);
attributes.insert(name.clone(), props);
}
bincode::serialize_into(writer, &attrs)
attributes
}
pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
where T: Serialize,
{
let id_attribute_name = &self.inner.identifier;
let serializer = FindDocumentIdSerializer { id_attribute_name };
document.serialize(serializer)
}
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
let index = attr.as_u32();
let (_, props) = self.inner.props[index as usize];
let (_, props) = self.inner.props[attr.0 as usize];
props
}
pub fn identifier_name(&self) -> &str {
&self.inner.identifier
}
pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> {
self.inner.attrs.get(name.as_ref()).cloned()
}
pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
let index = attr.as_u32();
let (name, _) = &self.inner.props[index as usize];
let (name, _) = &self.inner.props[attr.0 as usize];
name
}
}
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
pub struct SchemaAttr(u32);
pub struct SchemaAttr(pub(crate) u16);
impl SchemaAttr {
pub fn new(value: u32) -> SchemaAttr {
pub fn new(value: u16) -> SchemaAttr {
SchemaAttr(value)
}
pub fn max() -> SchemaAttr {
SchemaAttr(u32::MAX)
pub fn min() -> SchemaAttr {
SchemaAttr(0)
}
pub fn as_u32(&self) -> u32 {
self.0
pub fn next(self) -> Option<SchemaAttr> {
self.0.checked_add(1).map(SchemaAttr)
}
pub fn prev(self) -> Option<SchemaAttr> {
self.0.checked_sub(1).map(SchemaAttr)
}
pub fn max() -> SchemaAttr {
SchemaAttr(u16::MAX)
}
}
@ -151,22 +219,92 @@ impl fmt::Display for SchemaAttr {
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
#[test]
fn serialize_deserialize() -> bincode::Result<()> {
let mut builder = SchemaBuilder::new();
builder.new_attribute("alphabet", STORED);
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut buffer = Vec::new();
schema.write_to(&mut buffer)?;
let schema2 = Schema::read_from(buffer.as_slice())?;
schema.write_to_bin(&mut buffer)?;
let schema2 = Schema::read_from_bin(buffer.as_slice())?;
assert_eq!(schema, schema2);
Ok(())
}
#[test]
fn serialize_deserialize_toml() -> Result<(), Box<Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut buffer = Vec::new();
schema.to_toml(&mut buffer)?;
let schema2 = Schema::from_toml(buffer.as_slice())?;
assert_eq!(schema, schema2);
let data = r#"
identifier = "id"
[attributes."alpha"]
stored = true
[attributes."beta"]
stored = true
indexed = true
[attributes."gamma"]
indexed = true
"#;
let schema2 = Schema::from_toml(data.as_bytes())?;
assert_eq!(schema, schema2);
Ok(())
}
#[test]
fn serialize_deserialize_json() -> Result<(), Box<Error>> {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("alpha", STORED);
builder.new_attribute("beta", STORED | INDEXED);
builder.new_attribute("gamma", INDEXED);
let schema = builder.build();
let mut buffer = Vec::new();
schema.to_json(&mut buffer)?;
let schema2 = Schema::from_json(buffer.as_slice())?;
assert_eq!(schema, schema2);
let data = r#"
{
"identifier": "id",
"attributes": {
"alpha": {
"stored": true
},
"beta": {
"stored": true,
"indexed": true
},
"gamma": {
"indexed": true
}
}
}"#;
let schema2 = Schema::from_json(data.as_bytes())?;
assert_eq!(schema, schema2);
Ok(())
}
}
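A hypothetical sketch (not part of the diff), assuming a schema like the ones built in the tests above, where "description" is declared right after "title":

let title = schema.attribute("title").expect("title is declared");
let props = schema.props(title);
assert!(props.is_stored() && props.is_indexed());

// attributes are now dense u16 indexes, so neighbours are a next()/prev() away
if let Some(following) = title.next() {
    println!("declared after title: {}", schema.attribute_name(following));
}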


@ -0,0 +1,243 @@
use serde::Serialize;
use serde::ser;
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::serde::{SerializerError, calculate_hash};
use crate::DocumentId;
pub struct FindDocumentIdSerializer<'a> {
pub id_attribute_name: &'a str,
}
impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
type Ok = DocumentId;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = FindDocumentIdMapSerializer<'a>;
type SerializeStruct = FindDocumentIdStructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(FindDocumentIdMapSerializer {
id_attribute_name: self.id_attribute_name,
document_id: None,
current_key_name: None,
})
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(FindDocumentIdStructSerializer {
id_attribute_name: self.id_attribute_name,
document_id: None,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
pub struct FindDocumentIdMapSerializer<'a> {
id_attribute_name: &'a str,
document_id: Option<DocumentId>,
current_key_name: Option<String>,
}
impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> {
type Ok = DocumentId;
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V
) -> Result<(), Self::Error>
where K: Serialize, V: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
if self.id_attribute_name == key {
// TODO is it possible to have multiple ids?
let id = bincode::serialize(value).unwrap();
let hash = calculate_hash(&id);
self.document_id = Some(DocumentId(hash));
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
match self.document_id {
Some(document_id) => Ok(document_id),
None => Err(SerializerError::DocumentIdNotFound)
}
}
}
pub struct FindDocumentIdStructSerializer<'a> {
id_attribute_name: &'a str,
document_id: Option<DocumentId>,
}
impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
type Ok = DocumentId;
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
if self.id_attribute_name == key {
// TODO is it possible to have multiple ids?
let id = bincode::serialize(value).unwrap();
let hash = calculate_hash(&id);
self.document_id = Some(DocumentId(hash));
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
match self.document_id {
Some(document_id) => Ok(document_id),
None => Err(SerializerError::DocumentIdNotFound)
}
}
}
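For context, this is how a DocumentId comes to exist: the value stored under the schema's id attribute is bincode-encoded and hashed. A minimal sketch of that derivation, reusing the calculate_hash helper defined in src/database/serde/mod.rs further down; the "movie-42" value is purely illustrative.

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

fn calculate_hash<T: Hash>(t: &T) -> u64 {
    let mut s = DefaultHasher::new();
    t.hash(&mut s);
    s.finish()
}

fn main() {
    // the document declares an id of "movie-42": bincode-encode it, then hash the bytes,
    // exactly like serialize_entry and serialize_field above
    let id_bytes = bincode::serialize(&"movie-42").unwrap();
    let hash = calculate_hash(&id_bytes); // the serializer wraps this in DocumentId(hash)

    // the same id value always maps to the same DocumentId
    assert_eq!(hash, calculate_hash(&bincode::serialize(&"movie-42").unwrap()));
}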


@ -0,0 +1,196 @@
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::Token;
use crate::{DocumentId, DocIndex, Attribute, WordArea};
use hashbrown::HashSet;
use serde::Serialize;
use serde::ser;
pub struct IndexerSerializer<'a, B> {
pub tokenizer_builder: &'a B,
pub update: &'a mut DocumentUpdate,
pub document_id: DocumentId,
pub attribute: SchemaAttr,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for token in self.tokenizer_builder.build(v) {
let Token { word, word_index, char_index } = token;
let document_id = self.document_id;
// FIXME should use u32::try_from instead
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
Ok(attribute) => attribute,
Err(_) => return Ok(()),
};
// insert the exact representation
let word_lower = word.to_lowercase();
let length = word.chars().count() as u16;
if self.stop_words.contains(&word_lower) { continue }
// and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
let word_area = match WordArea::new(char_index as u32, length) {
Ok(word_area) => word_area,
Err(_) => return Ok(()),
};
let doc_index = DocIndex { document_id, attribute, word_area };
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
}
let word_area = match WordArea::new(char_index as u32, length) {
Ok(word_area) => word_area,
Err(_) => return Ok(()),
};
let doc_index = DocIndex { document_id, attribute, word_area };
self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
}
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "seq" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
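The interesting part of serialize_str above is the dual insertion: every non-stop-word token is indexed under its lowercased form and, when they differ, also under its unidecoded lowercased form, so an ASCII query can still match accented text. A reduced, self-contained sketch of that decision; the actual insert_doc_index calls are only hinted at in comments.

fn main() {
    let word = "Été";
    let word_lower = word.to_lowercase();                            // "été"
    let word_unidecoded = unidecode::unidecode(word).to_lowercase(); // "ete"

    if word_lower != word_unidecoded {
        // insert_doc_index(word_unidecoded.into_bytes(), doc_index) would run here
        println!("also indexing the unaccented form: {}", word_unidecoded);
    }
    // insert_doc_index(word_lower.into_bytes(), doc_index) always runs
    println!("indexing: {}", word_lower);
}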


@ -0,0 +1,146 @@
use serde::Serialize;
use serde::ser;
use crate::database::serde::SerializerError;
pub struct KeyToStringSerializer;
impl ser::Serializer for KeyToStringSerializer {
type Ok = String;
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
Ok(value.to_string())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}

src/database/serde/mod.rs

@ -0,0 +1,57 @@
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::error::Error;
use std::fmt;
use serde::ser;
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
// stringify! is needed here: "$ty" inside a string literal is not expanded,
// so the error would otherwise always report the literal name "$ty"
Err(SerializerError::UnserializableType { name: stringify!($ty) })
}
)*
}
}
pub mod find_id;
pub mod key_to_string;
pub mod serializer;
pub mod indexer_serializer;
pub fn calculate_hash<T: Hash>(t: &T) -> u64 {
let mut s = DefaultHasher::new();
t.hash(&mut s);
s.finish()
}
#[derive(Debug)]
pub enum SerializerError {
DocumentIdNotFound,
UnserializableType { name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::DocumentIdNotFound => {
write!(f, "serialized document does not have an id according to the schema")
}
SerializerError::UnserializableType { name } => {
write!(f, "Only struct and map types are considered valid documents and
can be serialized, not {} types directly.", name)
},
SerializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for SerializerError {}
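For reference, with the stringify!($ty) form a single arm of forward_to_unserializable_type! expands to a plain serializer method that simply rejects the primitive by name, for example:

fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
    Err(SerializerError::UnserializableType { name: "bool" })
}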


@ -0,0 +1,286 @@
use hashbrown::HashSet;
use serde::Serialize;
use serde::ser;
use crate::database::serde::indexer_serializer::IndexerSerializer;
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::schema::Schema;
use crate::DocumentId;
pub struct Serializer<'a, B> {
pub schema: &'a Schema,
pub update: &'a mut DocumentUpdate,
pub document_id: DocumentId,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::Serializer for Serializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapSerializer<'a, B>;
type SerializeStruct = StructSerializer<'a, B>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer {
schema: self.schema,
document_id: self.document_id,
update: self.update,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
current_key_name: None,
})
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
update: self.update,
document_id: self.document_id,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
pub struct MapSerializer<'a, B> {
pub schema: &'a Schema,
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
pub current_key_name: Option<String>,
}
impl<'a, B> ser::SerializeMap for MapSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V
) -> Result<(), Self::Error>
where K: Serialize, V: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
if let Some(attr) = self.schema.attribute(key) {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
self.update.insert_attribute_value(attr, value);
}
if props.is_indexed() {
let serializer = IndexerSerializer {
update: self.update,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
pub struct StructSerializer<'a, B> {
pub schema: &'a Schema,
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
if let Some(attr) = self.schema.attribute(key) {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
self.update.insert_attribute_value(attr, value);
}
if props.is_indexed() {
let serializer = IndexerSerializer {
update: self.update,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
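The per-field dispatch in MapSerializer and StructSerializer above is the heart of indexing: a field unknown to the schema is silently skipped, a stored field is kept as bincode bytes on the DocumentUpdate, and an indexed field is re-serialized through IndexerSerializer so its text gets tokenized. A reduced sketch of that dispatch using a hypothetical in-memory schema; Props and the field names are illustrative, not the real Schema API.

use std::collections::HashMap;

#[derive(Clone, Copy)]
struct Props { stored: bool, indexed: bool }

fn main() {
    let mut schema = HashMap::new();
    schema.insert("title", Props { stored: true, indexed: true });
    schema.insert("popularity", Props { stored: true, indexed: false });

    for (field, value) in [("title", "Geox SpA"), ("popularity", "42"), ("internal", "x")] {
        match schema.get(field) {
            Some(props) => {
                if props.stored {
                    // kept verbatim on the document update, like insert_attribute_value
                    let _bytes = bincode::serialize(&value).unwrap();
                }
                if props.indexed {
                    // the value would be handed to IndexerSerializer for tokenization here
                }
            }
            None => { /* field not declared in the schema: silently ignored */ }
        }
    }
}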


@ -0,0 +1,64 @@
use std::path::PathBuf;
use std::error::Error;
use hashbrown::HashSet;
use serde::Serialize;
use crate::database::serde::serializer::Serializer;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::Schema;
use crate::DocumentId;
use super::{Update, RawUpdateBuilder};
pub struct UpdateBuilder {
schema: Schema,
raw_builder: RawUpdateBuilder,
}
impl UpdateBuilder {
pub fn new(path: PathBuf, schema: Schema) -> UpdateBuilder {
UpdateBuilder {
schema: schema,
raw_builder: RawUpdateBuilder::new(path),
}
}
pub fn update_document<T, B>(
&mut self,
document: T,
tokenizer_builder: &B,
stop_words: &HashSet<String>,
) -> Result<DocumentId, SerializerError>
where T: Serialize,
B: TokenizerBuilder,
{
let document_id = self.schema.document_id(&document)?;
let update = self.raw_builder.document_update(document_id);
let serializer = Serializer {
schema: &self.schema,
document_id: document_id,
tokenizer_builder: tokenizer_builder,
update: update,
stop_words: stop_words,
};
document.serialize(serializer)?;
Ok(document_id)
}
pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
where T: Serialize,
{
let document_id = self.schema.document_id(&document)?;
self.raw_builder.document_update(document_id).remove();
Ok(document_id)
}
pub fn build(self) -> Result<Update, Box<Error>> {
self.raw_builder.build()
}
}
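A hypothetical end-to-end use of UpdateBuilder tying the pieces above together; only update_document, remove_document and build come from the code above, while the Movie type, the DefaultBuilder tokenizer and the update.sst path are assumptions made for illustration.

use std::error::Error;
use hashbrown::HashSet;
use serde::Serialize;
use crate::database::update::UpdateBuilder;
use crate::database::Schema;
use crate::tokenizer::DefaultBuilder; // assumed: any TokenizerBuilder implementation works

#[derive(Serialize)]
struct Movie { id: u64, title: String, description: String }

fn index_one(schema: Schema) -> Result<(), Box<Error>> {
    let stop_words = HashSet::new();
    let tokenizer_builder = DefaultBuilder::new();
    let mut builder = UpdateBuilder::new("update.sst".into(), schema);

    let movie = Movie {
        id: 1,
        title: "Geox SpA".into(),
        description: "CEO and Executive".into(),
    };
    // finds the document id through the schema, then indexes every stored/indexed field
    let _document_id = builder.update_document(movie, &tokenizer_builder, &stop_words)?;

    // the resulting Update wraps the SST file the database will ingest
    let _update = builder.build()?;
    Ok(())
}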


@ -1,35 +1,17 @@
use std::path::PathBuf;
use std::error::Error;
use std::path::{Path, PathBuf};
mod negative;
mod positive;
mod builder;
mod raw_builder;
pub use self::positive::{PositiveUpdateBuilder, NewState};
pub use self::negative::NegativeUpdateBuilder;
pub use self::builder::UpdateBuilder;
pub use self::raw_builder::{RawUpdateBuilder, DocumentUpdate};
pub struct Update {
path: PathBuf,
can_be_moved: bool,
sst_file: PathBuf,
}
impl Update {
pub fn open<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
Ok(Update { path: path.into(), can_be_moved: false })
}
pub fn open_and_move<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
Ok(Update { path: path.into(), can_be_moved: true })
}
pub fn set_move(&mut self, can_be_moved: bool) {
self.can_be_moved = can_be_moved
}
pub fn can_be_moved(&self) -> bool {
self.can_be_moved
}
pub fn into_path_buf(self) -> PathBuf {
self.path
pub fn path(&self) -> &Path {
&self.sst_file
}
}


@ -1,4 +0,0 @@
mod update;
mod unordered_builder;
pub use self::update::NegativeUpdateBuilder;


@ -1,37 +0,0 @@
use std::collections::BTreeSet;
use std::io;
use byteorder::{NativeEndian, WriteBytesExt};
use crate::DocumentId;
pub struct UnorderedNegativeBlobBuilder<W> {
doc_ids: BTreeSet<DocumentId>, // TODO: prefer a linked-list
wrt: W,
}
impl UnorderedNegativeBlobBuilder<Vec<u8>> {
pub fn memory() -> Self {
UnorderedNegativeBlobBuilder::new(Vec::new())
}
}
impl<W: io::Write> UnorderedNegativeBlobBuilder<W> {
pub fn new(wrt: W) -> Self {
Self {
doc_ids: BTreeSet::new(),
wrt: wrt,
}
}
pub fn insert(&mut self, doc: DocumentId) -> bool {
self.doc_ids.insert(doc)
}
pub fn into_inner(mut self) -> io::Result<W> {
for id in self.doc_ids {
self.wrt.write_u64::<NativeEndian>(id)?;
}
Ok(self.wrt)
}
}


@ -1,60 +0,0 @@
use std::path::PathBuf;
use std::error::Error;
use ::rocksdb::rocksdb_options;
use crate::database::update::negative::unordered_builder::UnorderedNegativeBlobBuilder;
use crate::database::blob::{Blob, NegativeBlob};
use crate::database::update::Update;
use crate::database::DocumentKey;
use crate::database::DATA_INDEX;
use crate::DocumentId;
pub struct NegativeUpdateBuilder {
path: PathBuf,
doc_ids: UnorderedNegativeBlobBuilder<Vec<u8>>,
}
impl NegativeUpdateBuilder {
pub fn new<P: Into<PathBuf>>(path: P) -> NegativeUpdateBuilder {
NegativeUpdateBuilder {
path: path.into(),
doc_ids: UnorderedNegativeBlobBuilder::memory(),
}
}
pub fn remove(&mut self, id: DocumentId) -> bool {
self.doc_ids.insert(id)
}
pub fn build(self) -> Result<Update, Box<Error>> {
let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.path.to_string_lossy())?;
let bytes = self.doc_ids.into_inner()?;
let negative_blob = NegativeBlob::from_bytes(bytes)?;
let blob = Blob::Negative(negative_blob);
// write the data-index aka negative blob
let bytes = bincode::serialize(&blob)?;
file_writer.merge(DATA_INDEX, &bytes)?;
// FIXME remove this ugly thing !
// let Blob::Negative(negative_blob) = blob;
let negative_blob = match blob {
Blob::Negative(blob) => blob,
Blob::Positive(_) => unreachable!(),
};
for &document_id in negative_blob.as_ref().as_slice() {
let start = DocumentKey::new(document_id);
let end = start.with_attribute_max();
file_writer.delete_range(start.as_ref(), end.as_ref())?;
}
file_writer.finish()?;
Update::open(self.path)
}
}


@ -1,4 +0,0 @@
mod update;
mod unordered_builder;
pub use self::update::{PositiveUpdateBuilder, NewState};


@ -1,49 +0,0 @@
#![allow(unused)]
use std::collections::BTreeMap;
use std::error::Error;
use std::io::Write;
use sdset::Set;
use crate::database::blob::positive::PositiveBlobBuilder;
use crate::DocIndex;
pub struct UnorderedPositiveBlobBuilder<W, X> {
builder: PositiveBlobBuilder<W, X>,
map: BTreeMap<Vec<u8>, Vec<DocIndex>>,
}
impl UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>> {
pub fn memory() -> Self {
Self {
builder: PositiveBlobBuilder::memory(),
map: BTreeMap::new(),
}
}
}
impl<W: Write, X: Write> UnorderedPositiveBlobBuilder<W, X> {
pub fn new(map_wtr: W, doc_wtr: X) -> Result<Self, Box<Error>> {
Ok(UnorderedPositiveBlobBuilder {
builder: PositiveBlobBuilder::new(map_wtr, doc_wtr)?,
map: BTreeMap::new(),
})
}
pub fn insert<K: Into<Vec<u8>>>(&mut self, input: K, doc_index: DocIndex) {
self.map.entry(input.into()).or_insert_with(Vec::new).push(doc_index);
}
pub fn finish(self) -> Result<(), Box<Error>> {
self.into_inner().map(drop)
}
pub fn into_inner(mut self) -> Result<(W, X), Box<Error>> {
for (key, mut doc_indexes) in self.map {
doc_indexes.sort_unstable();
self.builder.insert(&key, Set::new_unchecked(&doc_indexes))?;
}
self.builder.into_inner()
}
}


@ -1,514 +0,0 @@
use std::collections::BTreeMap;
use std::path::PathBuf;
use std::error::Error;
use std::fmt;
use ::rocksdb::rocksdb_options;
use serde::ser::{self, Serialize};
use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
use crate::database::blob::positive::PositiveBlob;
use crate::database::schema::{Schema, SchemaAttr};
use crate::tokenizer::TokenizerBuilder;
use crate::database::DocumentKeyAttr;
use crate::database::update::Update;
use crate::{DocumentId, DocIndex};
use crate::database::DATA_INDEX;
use crate::database::blob::Blob;
pub enum NewState {
Updated { value: Vec<u8> },
Removed,
}
pub struct PositiveUpdateBuilder<B> {
path: PathBuf,
schema: Schema,
tokenizer_builder: B,
builder: UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: BTreeMap<DocumentKeyAttr, NewState>,
}
impl<B> PositiveUpdateBuilder<B> {
pub fn new<P: Into<PathBuf>>(path: P, schema: Schema, tokenizer_builder: B) -> PositiveUpdateBuilder<B> {
PositiveUpdateBuilder {
path: path.into(),
schema: schema,
tokenizer_builder: tokenizer_builder,
builder: UnorderedPositiveBlobBuilder::memory(),
new_states: BTreeMap::new(),
}
}
pub fn update<T: Serialize>(&mut self, id: DocumentId, document: &T) -> Result<(), Box<Error>>
where B: TokenizerBuilder
{
let serializer = Serializer {
schema: &self.schema,
document_id: id,
tokenizer_builder: &self.tokenizer_builder,
builder: &mut self.builder,
new_states: &mut self.new_states
};
Ok(ser::Serialize::serialize(document, serializer)?)
}
// TODO value must be a field that can be indexed
pub fn update_field(&mut self, id: DocumentId, attr: SchemaAttr, value: String) {
let value = bincode::serialize(&value).unwrap();
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Updated { value });
}
pub fn remove_field(&mut self, id: DocumentId, attr: SchemaAttr) {
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Removed);
}
}
#[derive(Debug)]
pub enum SerializerError {
SchemaDontMatch { attribute: String },
UnserializableType { name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::SchemaDontMatch { attribute } => {
write!(f, "serialized document try to specify the \
{:?} attribute that is not known by the schema", attribute)
},
SerializerError::UnserializableType { name } => {
write!(f, "Only struct and map types are considered valid documents and
can be serialized, not {} types directly.", name)
},
SerializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for SerializerError {}
struct Serializer<'a, B> {
schema: &'a Schema,
tokenizer_builder: &'a B,
document_id: DocumentId,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
}
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "$ty" })
}
)*
}
}
impl<'a, B> ser::Serializer for Serializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = StructSerializer<'a, B>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
// Ok(MapSerializer {
// schema: self.schema,
// document_id: self.document_id,
// new_states: self.new_states,
// })
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
builder: self.builder,
new_states: self.new_states,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
struct StructSerializer<'a, B> {
schema: &'a Schema,
tokenizer_builder: &'a B,
document_id: DocumentId,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
}
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
match self.schema.attribute(key) {
Some(attr) => {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
let key = DocumentKeyAttr::new(self.document_id, attr);
self.new_states.insert(key, NewState::Updated { value });
}
if props.is_indexed() {
let serializer = IndexerSerializer {
builder: self.builder,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
};
value.serialize(serializer)?;
}
Ok(())
},
None => Err(SerializerError::SchemaDontMatch { attribute: key.to_owned() }),
}
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
struct IndexerSerializer<'a, B> {
tokenizer_builder: &'a B,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
document_id: DocumentId,
attribute: SchemaAttr,
}
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for (index, word) in self.tokenizer_builder.build(v) {
let doc_index = DocIndex {
document_id: self.document_id,
attribute: self.attribute.as_u32() as u8,
attribute_index: index as u32,
};
// insert the exact representation
let word_lower = word.to_lowercase();
// and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
self.builder.insert(word_unidecoded, doc_index);
}
self.builder.insert(word_lower, doc_index);
}
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "seq" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
impl<B> PositiveUpdateBuilder<B> {
pub fn build(self) -> Result<Update, Box<Error>> {
let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.path.to_string_lossy())?;
let (blob_fst_map, blob_doc_idx) = self.builder.into_inner()?;
let positive_blob = PositiveBlob::from_bytes(blob_fst_map, blob_doc_idx)?;
let blob = Blob::Positive(positive_blob);
// write the data-index aka positive blob
let bytes = bincode::serialize(&blob)?;
file_writer.merge(DATA_INDEX, &bytes)?;
// write all the documents fields updates
for (key, state) in self.new_states {
match state {
NewState::Updated { value } => {
file_writer.put(key.as_ref(), &value)?
},
NewState::Removed => file_writer.delete(key.as_ref())?,
}
}
file_writer.finish()?;
Update::open(self.path)
}
}


@ -0,0 +1,168 @@
use std::collections::btree_map::{BTreeMap, Entry};
use std::path::PathBuf;
use std::error::Error;
use rocksdb::rocksdb_options;
use hashbrown::HashMap;
use fst::map::Map;
use sdset::Set;
use crate::database::index::{Index, Positive, PositiveBuilder, Negative};
use crate::database::{DATA_INDEX, DocumentKeyAttr};
use crate::database::schema::SchemaAttr;
use crate::data::{DocIds, DocIndexes};
use crate::{DocumentId, DocIndex};
use super::Update;
type Token = Vec<u8>; // TODO could be replaced by a SmallVec
type Value = Vec<u8>;
pub struct RawUpdateBuilder {
sst_file: PathBuf,
document_updates: BTreeMap<DocumentId, DocumentUpdate>,
}
pub struct DocumentUpdate {
cleared: bool,
words_indexes: HashMap<Token, Vec<DocIndex>>,
attributes: BTreeMap<SchemaAttr, Value>,
}
impl DocumentUpdate {
pub fn new() -> DocumentUpdate {
DocumentUpdate {
cleared: false,
words_indexes: HashMap::new(),
attributes: BTreeMap::new(),
}
}
pub fn remove(&mut self) {
self.cleared = true;
self.clear();
}
pub fn clear(&mut self) {
self.words_indexes.clear();
self.attributes.clear();
}
pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: Vec<u8>) {
self.attributes.insert(attr, value);
}
pub fn insert_doc_index(&mut self, token: Vec<u8>, doc_index: DocIndex) {
self.words_indexes.entry(token).or_insert_with(Vec::new).push(doc_index)
}
}
impl RawUpdateBuilder {
pub fn new(path: PathBuf) -> RawUpdateBuilder {
RawUpdateBuilder {
sst_file: path,
document_updates: BTreeMap::new(),
}
}
pub fn document_update(&mut self, document_id: DocumentId) -> &mut DocumentUpdate {
match self.document_updates.entry(document_id) {
Entry::Occupied(mut occupied) => {
occupied.get_mut().clear();
occupied.into_mut()
},
Entry::Vacant(vacant) => vacant.insert(DocumentUpdate::new()),
}
}
pub fn build(mut self) -> Result<Update, Box<Error>> {
let mut removed_document_ids = Vec::new();
let mut words_indexes = BTreeMap::new();
for (&id, update) in self.document_updates.iter_mut() {
if update.cleared { removed_document_ids.push(id) }
for (token, indexes) in &update.words_indexes {
words_indexes.entry(token).or_insert_with(Vec::new).extend_from_slice(indexes)
}
}
let negative = {
let removed_document_ids = Set::new_unchecked(&removed_document_ids);
let doc_ids = DocIds::new(removed_document_ids);
Negative::new(doc_ids)
};
let positive = {
let mut positive_builder = PositiveBuilder::memory();
for (key, mut indexes) in words_indexes {
indexes.sort_unstable();
let indexes = Set::new_unchecked(&indexes);
positive_builder.insert(key, indexes)?;
}
let (map, indexes) = positive_builder.into_inner()?;
let map = Map::from_bytes(map)?;
let indexes = DocIndexes::from_bytes(indexes)?;
Positive::new(map, indexes)
};
let index = Index { negative, positive };
let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.sst_file.to_string_lossy())?;
// write the data-index
let mut bytes = Vec::new();
index.write_to_bytes(&mut bytes);
file_writer.merge(DATA_INDEX, &bytes)?;
// write all the documents attributes updates
for (id, update) in self.document_updates {
let mut last_attr: Option<SchemaAttr> = None;
for (attr, value) in update.attributes {
if update.cleared {
// if there is no last attribute, remove from the first attribute
let start_attr = match last_attr {
Some(attr) => attr.next(),
None => Some(SchemaAttr::min())
};
let start = start_attr.map(|a| DocumentKeyAttr::new(id, a));
let end = attr.prev().map(|a| DocumentKeyAttr::new(id, a));
// delete_range between (last_attr + 1) and (attr - 1)
if let (Some(start), Some(end)) = (start, end) {
file_writer.delete_range(start.as_ref(), end.as_ref())?;
}
}
let key = DocumentKeyAttr::new(id, attr);
file_writer.put(key.as_ref(), &value)?;
last_attr = Some(attr);
}
if update.cleared {
// if there is no last attribute, remove from the first attribute
let start_attr = match last_attr {
Some(attr) => attr.next(),
None => Some(SchemaAttr::min())
};
let start = start_attr.map(|a| DocumentKeyAttr::new(id, a));
let end = DocumentKeyAttr::with_attribute_max(id);
// delete_range between (last_attr + 1) and attr_max
if let Some(start) = start {
file_writer.delete_range(start.as_ref(), end.as_ref())?;
}
}
}
file_writer.finish()?;
Ok(Update { sst_file: self.sst_file })
}
}
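The first half of RawUpdateBuilder::build above merges the per-document word indexes into one map keyed by token and sorts every posting list before it reaches the positive index builder. A self-contained sketch of that merge step, with (document_id, word_index) pairs standing in for real DocIndex values:

use std::collections::BTreeMap;

fn main() {
    // two document updates, each with its own token -> indexes list
    let doc0 = vec![(b"ceo".to_vec(), (0u64, 2u32)), (b"geox".to_vec(), (0, 0))];
    let doc1 = vec![(b"geox".to_vec(), (1, 0))];

    let mut words_indexes: BTreeMap<Vec<u8>, Vec<(u64, u32)>> = BTreeMap::new();
    for (token, index) in doc0.into_iter().chain(doc1) {
        words_indexes.entry(token).or_insert_with(Vec::new).push(index);
    }
    // each posting list is sorted before insertion into the positive builder
    for indexes in words_indexes.values_mut() {
        indexes.sort_unstable();
    }
    assert_eq!(words_indexes[&b"geox".to_vec()], vec![(0, 0), (1, 0)]);
}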


@ -9,17 +9,17 @@ use serde::de::DeserializeOwned;
use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::database::{retrieve_data_schema, retrieve_data_index};
use crate::database::blob::positive::PositiveBlob;
use crate::database::deserializer::Deserializer;
use crate::database::schema::Schema;
use crate::rank::QueryBuilder;
use crate::database::index::Index;
use crate::rank::{QueryBuilder, FilterFunc};
use crate::DocumentId;
pub struct DatabaseView<D>
where D: Deref<Target=DB>
{
snapshot: Snapshot<D>,
blob: PositiveBlob,
index: Index,
schema: Schema,
}
@ -28,16 +28,16 @@ where D: Deref<Target=DB>
{
pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> {
let schema = retrieve_data_schema(&snapshot)?;
let blob = retrieve_data_index(&snapshot)?;
Ok(DatabaseView { snapshot, blob, schema })
let index = retrieve_data_index(&snapshot)?;
Ok(DatabaseView { snapshot, index, schema })
}
pub fn schema(&self) -> &Schema {
&self.schema
}
pub fn blob(&self) -> &PositiveBlob {
&self.blob
pub fn index(&self) -> &Index {
&self.index
}
pub fn into_snapshot(self) -> Snapshot<D> {
@ -71,19 +71,18 @@ where D: Deref<Target=DB>
Ok(())
}
pub fn query_builder(&self) -> Result<QueryBuilder<D>, Box<Error>> {
pub fn query_builder(&self) -> Result<QueryBuilder<D, FilterFunc<D>>, Box<Error>> {
QueryBuilder::new(self)
}
// TODO create an enum error type
pub fn retrieve_document<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
pub fn document_by_id<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
where T: DeserializeOwned
{
let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id);
Ok(T::deserialize(&mut deserializer)?)
}
pub fn retrieve_documents<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter>
pub fn documents_by_id<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter>
where T: DeserializeOwned,
I: IntoIterator<Item=DocumentId>,
{
@ -100,7 +99,7 @@ where D: Deref<Target=DB>
{
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut options = ReadOptions::new();
let lower = DocumentKey::new(0);
let lower = DocumentKey::new(DocumentId(0));
options.set_iterate_lower_bound(lower.as_ref());
let mut iter = self.snapshot.iter_opt(options);
@ -149,7 +148,7 @@ where D: Deref<Target=DB>,
fn next(&mut self) -> Option<Self::Item> {
match self.document_ids.next() {
Some(id) => Some(self.database_view.retrieve_document(id)),
Some(id) => Some(self.database_view.document_by_id(id)),
None => None
}
}
@ -168,7 +167,7 @@ where D: Deref<Target=DB>,
{
fn next_back(&mut self) -> Option<Self::Item> {
match self.document_ids.next_back() {
Some(id) => Some(self.database_view.retrieve_document(id)),
Some(id) => Some(self.database_view.document_by_id(id)),
None => None
}
}
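The renamed retrieval methods above are what user code calls to read documents back out of a snapshot. A hypothetical sketch; the Movie type, the module path of DatabaseView and the way the view is obtained are assumptions, only document_by_id and documents_by_id come from the diff above.

use std::error::Error;
use rocksdb::DB;
use serde::Deserialize;
use crate::database::database_view::DatabaseView; // module path assumed
use crate::DocumentId;

#[derive(Debug, Deserialize)]
struct Movie { id: u64, title: String, description: String }

fn print_movies(view: &DatabaseView<&DB>, ids: Vec<DocumentId>) -> Result<(), Box<Error>> {
    // fetch one document back by its DocumentId (assumes ids is non-empty)
    let first: Movie = view.document_by_id(ids[0])?;
    println!("{:?}", first);

    // or iterate over several ids lazily
    for movie in view.documents_by_id::<Movie, _>(ids) {
        println!("{:?}", movie?);
    }
    Ok(())
}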


@ -1,40 +1,49 @@
#![cfg_attr(feature = "nightly", feature(test))]
pub mod automaton;
pub mod database;
pub mod data;
pub mod rank;
pub mod tokenizer;
pub mod vec_read_only;
mod attribute;
mod word_area;
mod common_words;
pub use rocksdb;
pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;
pub use self::attribute::{Attribute, AttributeError};
pub use self::word_area::{WordArea, WordAreaError};
pub type DocumentId = u64;
/// Represents an internally generated, unique document identifier.
///
/// It is used to tell the database which document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64);
/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute identifier in the document
/// where the word was found.
///
/// This is an `u8` therefore a document
/// can not have more than `2^8` attributes.
pub attribute: u8,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: Attribute,
/// The index where the word was found in the attribute.
/// The position in bytes where the word was found
/// along with the length of it.
///
/// Only the first 1000 words are indexed.
pub attribute_index: u32,
/// It gives the original word area in the indexed text
/// without needing to run the tokenizer again.
pub word_area: WordArea,
}
/// This structure represents a matching word with information
@ -45,7 +54,7 @@ pub struct DocIndex {
///
/// The word in itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
/// The word index in the query sentence.
/// Same as the `attribute_index` but for the query words.
@ -57,23 +66,19 @@ pub struct Match {
/// (i.e. the Levenshtein distance).
pub distance: u8,
/// The attribute in which the word is located
/// (i.e. Title is 0, Description is 1).
///
/// This is an `u8` therefore a document
/// can not have more than `2^8` attributes.
pub attribute: u8,
/// Where does this word is located in the attribute string
/// (i.e. at the start or the end of the attribute).
///
/// The index in the attribute is limited to a maximum of `2^32`
/// this is because we index only the first 1000 words
/// in an attribute.
pub attribute_index: u32,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: Attribute,
/// Whether the word that match is an exact match or a prefix.
pub is_exact: bool,
/// The position in bytes where the word was found
/// along with the length of it.
///
/// It gives the original word area in the indexed text
/// without needing to run the tokenizer again.
pub word_area: WordArea,
}
impl Match {
@ -81,9 +86,9 @@ impl Match {
Match {
query_index: 0,
distance: 0,
attribute: 0,
attribute_index: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 0),
}
}
@ -91,9 +96,20 @@ impl Match {
Match {
query_index: u32::max_value(),
distance: u8::max_value(),
attribute: u8::max_value(),
attribute_index: u32::max_value(),
attribute: Attribute::max_value(),
is_exact: true,
word_area: WordArea::max_value(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 16);
}
}
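As an illustration of the new types, a small test-style sketch that could sit in the tests module above, building one DocIndex by hand the way IndexerSerializer produces them per token; the concrete values are made up, and it only compiles inside this crate since the field of DocumentId is private.

#[test]
fn build_one_doc_index() {
    let doc_index = DocIndex {
        document_id: DocumentId(42),
        attribute: Attribute::new_faillible(0, 2), // attribute 0 of the schema, word index 2
        word_area: WordArea::new_faillible(9, 4),  // word starting at character 9, 4 characters long
    };
    // matches the docindex_mem_size test above
    assert_eq!(std::mem::size_of_val(&doc_index), 16);
}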


@ -10,13 +10,13 @@ use crate::database::DatabaseView;
use crate::Match;
#[inline]
fn contains_exact(matches: &[Match]) -> bool {
fn contains_exact(matches: &&[Match]) -> bool {
matches.iter().any(|m| m.is_exact)
}
#[inline]
fn number_exact_matches(matches: &[Match]) -> usize {
GroupBy::new(matches, match_query_index).map(contains_exact).count()
GroupBy::new(matches, match_query_index).filter(contains_exact).count()
}
#[derive(Debug, Clone, Copy)]


@ -29,7 +29,6 @@ pub use self::{
pub trait Criterion<D>
where D: Deref<Target=DB>
{
#[inline]
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering;
#[inline]
@ -62,6 +61,7 @@ where D: Deref<Target=DB>
}
}
#[derive(Default)]
pub struct CriteriaBuilder<D>
where D: Deref<Target=DB>
{


@ -46,13 +46,18 @@ use crate::rank::Document;
/// let criterion = builder.build();
///
/// ```
#[derive(Default)]
pub struct SortBy<T> {
_phantom: marker::PhantomData<T>,
}
impl<T> SortBy<T> {
pub fn new() -> Self {
SortBy::default()
}
}
impl<T> Default for SortBy<T> {
fn default() -> SortBy<T> {
SortBy { _phantom: marker::PhantomData }
}
}
@ -62,12 +67,12 @@ where D: Deref<Target=DB>,
T: DeserializeOwned + Ord,
{
fn evaluate(&self, lhs: &Document, rhs: &Document, view: &DatabaseView<D>) -> Ordering {
let lhs = match view.retrieve_document::<T>(lhs.id) {
let lhs = match view.document_by_id::<T>(lhs.id) {
Ok(doc) => Some(doc),
Err(e) => { eprintln!("{}", e); None },
};
let rhs = match view.retrieve_document::<T>(rhs.id) {
let rhs = match view.document_by_id::<T>(rhs.id) {
Ok(doc) => Some(doc),
Err(e) => { eprintln!("{}", e); None },
};


@ -11,14 +11,14 @@ use crate::database::DatabaseView;
use crate::Match;
#[inline]
fn sum_matches_typos(matches: &[Match]) -> i8 {
fn sum_matches_typos(matches: &[Match]) -> isize {
let mut sum_typos = 0;
let mut number_words = 0;
// note that GroupBy will never return an empty group
// so we can make this assumption safely
for group in GroupBy::new(matches, match_query_index) {
sum_typos += unsafe { group.get_unchecked(0).distance } as i8;
sum_typos += unsafe { group.get_unchecked(0).distance as isize };
number_words += 1;
}
@ -44,6 +44,8 @@ where D: Deref<Target=DB>
mod tests {
use super::*;
use crate::{DocumentId, Attribute, WordArea};
// typing: "Geox CEO"
//
// doc0: "Geox SpA: CEO and Executive"
@ -52,22 +54,46 @@ mod tests {
fn one_typo_reference() {
let doc0 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 2),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: 0,
id: DocumentId(0),
matches: matches,
}
};
let doc1 = {
let matches = vec![
Match { query_index: 0, distance: 1, attribute: 0, attribute_index: 0, is_exact: false },
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false },
Match {
query_index: 0,
distance: 1,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 2),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: 1,
id: DocumentId(1),
matches: matches,
}
};
@ -85,21 +111,39 @@ mod tests {
fn no_typo() {
let doc0 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 1, is_exact: false },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 0,
attribute: Attribute::new_faillible(0, 1),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: 0,
id: DocumentId(0),
matches: matches,
}
};
let doc1 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: 1,
id: DocumentId(1),
matches: matches,
}
};
@ -117,21 +161,39 @@ mod tests {
fn one_typo() {
let doc0 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
Match { query_index: 1, distance: 1, attribute: 0, attribute_index: 1, is_exact: false },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
Match {
query_index: 1,
distance: 1,
attribute: Attribute::new_faillible(0, 1),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: 0,
id: DocumentId(0),
matches: matches,
}
};
let doc1 = {
let matches = vec![
Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false },
Match {
query_index: 0,
distance: 0,
attribute: Attribute::new_faillible(0, 0),
is_exact: false,
word_area: WordArea::new_faillible(0, 6)
},
];
Document {
id: 1,
id: DocumentId(1),
matches: matches,
}
};
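sum_matches_typos above relies on the matches being grouped by query_index: only the first match of each group contributes its distance, so several matches for the same query word are never double-counted. A reduced, dependency-free sketch of that accumulation, with (query_index, distance) pairs standing in for full Match values:

fn main() {
    // matches are already sorted by query_index, as in the real ranking pipeline
    let matches = [(0u32, 0isize), (0, 2), (1, 1)];

    let mut sum_typos = 0isize;
    let mut last_query_index = None;
    for &(query_index, distance) in &matches {
        if last_query_index != Some(query_index) {
            // first match of a new group: its distance is the one that counts
            sum_typos += distance;
            last_query_index = Some(query_index);
        }
    }
    assert_eq!(sum_typos, 1); // 0 for the first query word + 1 for the second
}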


@ -10,11 +10,11 @@ use crate::rank::criterion::Criterion;
use crate::Match;
#[inline]
fn sum_matches_attributes(matches: &[Match]) -> u8 {
fn sum_matches_attributes(matches: &[Match]) -> usize {
// note that GroupBy will never return an empty group
// so we can make this assumption safely
GroupBy::new(matches, match_query_index).map(|group| unsafe {
group.get_unchecked(0).attribute
GroupBy::new(matches, match_query_index).map(|group| {
unsafe { group.get_unchecked(0).attribute.attribute() as usize }
}).sum()
}


@ -10,11 +10,11 @@ use crate::rank::criterion::Criterion;
use crate::Match;
#[inline]
fn sum_matches_attribute_index(matches: &[Match]) -> u32 {
fn sum_matches_attribute_index(matches: &[Match]) -> usize {
// note that GroupBy will never return an empty group
// so we can make this assumption safely
GroupBy::new(matches, match_query_index).map(|group| unsafe {
group.get_unchecked(0).attribute_index
GroupBy::new(matches, match_query_index).map(|group| {
unsafe { group.get_unchecked(0).attribute.word_index() as usize }
}).sum()
}


@ -20,8 +20,8 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
}
fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 {
if lhs.attribute != rhs.attribute { return MAX_DISTANCE }
index_proximity(lhs.attribute_index, rhs.attribute_index)
if lhs.attribute.attribute() != rhs.attribute.attribute() { return MAX_DISTANCE }
index_proximity(lhs.attribute.word_index(), rhs.attribute.word_index())
}
fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 {
@ -67,6 +67,8 @@ where D: Deref<Target=DB>
mod tests {
use super::*;
use crate::Attribute;
#[test]
fn three_different_attributes() {
@ -79,11 +81,11 @@ mod tests {
// { id: 3, attr: 3, attr_index: 1 }
let matches = &[
Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() },
Match { query_index: 1, attribute: 1, attribute_index: 0, ..Match::zero() },
Match { query_index: 2, attribute: 1, attribute_index: 1, ..Match::zero() },
Match { query_index: 2, attribute: 2, attribute_index: 0, ..Match::zero() },
Match { query_index: 3, attribute: 3, attribute_index: 1, ..Match::zero() },
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
Match { query_index: 1, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(2, 0), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(3, 1), ..Match::zero() },
];
// soup -> of = 8
@ -105,12 +107,12 @@ mod tests {
// { id: 3, attr: 1, attr_index: 3 }
let matches = &[
Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() },
Match { query_index: 0, attribute: 1, attribute_index: 0, ..Match::zero() },
Match { query_index: 1, attribute: 1, attribute_index: 1, ..Match::zero() },
Match { query_index: 2, attribute: 1, attribute_index: 2, ..Match::zero() },
Match { query_index: 3, attribute: 0, attribute_index: 1, ..Match::zero() },
Match { query_index: 3, attribute: 1, attribute_index: 3, ..Match::zero() },
Match { query_index: 0, attribute: Attribute::new_faillible(0, 0), ..Match::zero() },
Match { query_index: 0, attribute: Attribute::new_faillible(1, 0), ..Match::zero() },
Match { query_index: 1, attribute: Attribute::new_faillible(1, 1), ..Match::zero() },
Match { query_index: 2, attribute: Attribute::new_faillible(1, 2), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(0, 1), ..Match::zero() },
Match { query_index: 3, attribute: Attribute::new_faillible(1, 3), ..Match::zero() },
];
// soup -> of = 1
@ -119,3 +121,42 @@ mod tests {
assert_eq!(matches_proximity(matches), 3);
}
}
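A self-contained sketch of the proximity rule these tests exercise, written over plain `(attribute, word_index)` pairs. The exact `index_proximity` body and the value `MAX_DISTANCE = 8` are assumptions chosen to be consistent with the `soup -> of = 8` and `soup -> of = 1` expectations above, not a copy of the crate's implementation.

```rust
const MAX_DISTANCE: u32 = 8;

// proximity between two word indexes; reading words in reverse order costs one more
fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    if lhs < rhs {
        (rhs - lhs).min(MAX_DISTANCE)
    } else {
        (lhs - rhs + 1).min(MAX_DISTANCE)
    }
}

// matches located in different attributes are always "far away"
fn attribute_proximity(lhs: (u16, u32), rhs: (u16, u32)) -> u32 {
    if lhs.0 != rhs.0 { return MAX_DISTANCE }
    index_proximity(lhs.1, rhs.1)
}

fn main() {
    assert_eq!(attribute_proximity((0, 0), (1, 0)), 8); // different attributes
    assert_eq!(attribute_proximity((1, 0), (1, 1)), 1); // adjacent words
    assert_eq!(attribute_proximity((1, 5), (1, 2)), 4); // reversed order
}
```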
#[cfg(all(feature = "nightly", test))]
mod bench {
extern crate test;
use super::*;
use std::error::Error;
use self::test::Bencher;
use rand_xorshift::XorShiftRng;
use rand::{Rng, SeedableRng};
use crate::Attribute;
#[bench]
fn evaluate_proximity(bench: &mut Bencher) -> Result<(), Box<Error>> {
let number_matches = 30_000;
let mut matches = Vec::with_capacity(number_matches);
let mut rng = XorShiftRng::seed_from_u64(42);
for _ in 0..number_matches {
let query_index = rng.gen_range(0, 4);
let attribute = rng.gen_range(0, 5);
let word_index = rng.gen_range(0, 15);
let attribute = Attribute::new_faillible(attribute, word_index);
let match_ = Match { query_index, attribute, ..Match::zero() };
matches.push(match_);
}
bench.iter(|| {
let proximity = matches_proximity(&matches);
test::black_box(move || proximity)
});
Ok(())
}
}


@ -4,7 +4,7 @@ mod distinct_map;
use crate::{Match, DocumentId};
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder};
pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
#[inline]
fn match_query_index(a: &Match, b: &Match) -> bool {


@ -4,10 +4,11 @@ use std::error::Error;
use std::hash::Hash;
use std::rc::Rc;
use group_by::GroupByMut;
use group_by::BinaryGroupByMut;
use hashbrown::HashMap;
use fst::Streamer;
use rocksdb::DB;
use log::info;
use crate::automaton::{self, DfaExt, AutomatonExt};
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
@ -34,14 +35,17 @@ fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
automatons
}
pub struct QueryBuilder<'a, D>
pub type FilterFunc<D> = fn(DocumentId, &DatabaseView<D>) -> bool;
pub struct QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>
{
view: &'a DatabaseView<D>,
criteria: Criteria<D>,
filter: Option<FI>,
}
impl<'a, D> QueryBuilder<'a, D>
impl<'a, D> QueryBuilder<'a, D, FilterFunc<D>>
where D: Deref<Target=DB>
{
pub fn new(view: &'a DatabaseView<D>) -> Result<Self, Box<Error>> {
@ -49,19 +53,27 @@ where D: Deref<Target=DB>
}
}
impl<'a, D> QueryBuilder<'a, D>
where D: Deref<Target=DB>
impl<'a, D, FI> QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>,
{
pub fn with_criteria(view: &'a DatabaseView<D>, criteria: Criteria<D>) -> Result<Self, Box<Error>> {
Ok(QueryBuilder { view, criteria })
Ok(QueryBuilder { view, criteria, filter: None })
}
pub fn criteria(&mut self, criteria: Criteria<D>) -> &mut Self {
self.criteria = criteria;
self
pub fn with_filter<F>(self, function: F) -> QueryBuilder<'a, D, F>
where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
{
QueryBuilder {
view: self.view,
criteria: self.criteria,
filter: Some(function)
}
}
pub fn with_distinct<F>(self, function: F, size: usize) -> DistinctQueryBuilder<'a, D, F> {
pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'a, D, FI, F>
where F: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
K: Hash + Eq,
{
DistinctQueryBuilder {
inner: self,
function: function,
@ -75,12 +87,13 @@ where D: Deref<Target=DB>
let mut stream = {
let mut op_builder = fst::map::OpBuilder::new();
for automaton in &automatons {
let stream = self.view.blob().as_map().search(automaton);
let stream = self.view.index().positive.map().search(automaton);
op_builder.push(stream);
}
op_builder.union()
};
let mut number_matches = 0;
let mut matches = HashMap::new();
while let Some((input, indexed_values)) = stream.next() {
@ -89,39 +102,55 @@ where D: Deref<Target=DB>
let distance = automaton.eval(input).to_u8();
let is_exact = distance == 0 && input.len() == automaton.query_len();
let doc_indexes = self.view.blob().as_indexes();
let doc_indexes = &self.view.index().positive.indexes();
let doc_indexes = &doc_indexes[iv.value as usize];
number_matches += doc_indexes.len();
for doc_index in doc_indexes {
let match_ = Match {
query_index: iv.index as u32,
distance: distance,
attribute: doc_index.attribute,
attribute_index: doc_index.attribute_index,
is_exact: is_exact,
word_area: doc_index.word_area,
};
matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_);
}
}
}
matches.into_iter().map(|(id, matches)| Document::from_matches(id, matches)).collect()
info!("{} total documents to classify", matches.len());
info!("{} total matches to classify", number_matches);
matches.into_iter().map(|(i, m)| Document::from_matches(i, m)).collect()
}
}
impl<'a, D> QueryBuilder<'a, D>
impl<'a, D, FI> QueryBuilder<'a, D, FI>
where D: Deref<Target=DB>,
FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
{
pub fn query(&self, query: &str, range: Range<usize>) -> Vec<Document> {
let mut documents = self.query_all(query);
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
// We give the filtering work to the query distinct builder,
// specifying a distinct rule that has no effect.
if self.filter.is_some() {
let builder = self.with_distinct(|_, _| None as Option<()>, 1);
return builder.query(query, range);
}
let (elapsed, mut documents) = elapsed::measure_time(|| self.query_all(query));
info!("query_all took {}", elapsed);
let mut groups = vec![documents.as_mut_slice()];
let view = &self.view;
'criteria: for criterion in self.criteria.as_ref() {
'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
for group in tmp_groups {
info!("criterion {}, documents group of size {}", ci, group.len());
// if this group does not overlap with the requested range,
// push it without sorting and splitting it
if documents_seen + group.len() < range.start {
@ -130,9 +159,12 @@ where D: Deref<Target=DB>,
continue;
}
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
let (elapsed, ()) = elapsed::measure_time(|| {
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
});
info!("criterion {} sort took {}", ci, elapsed);
for group in GroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
for group in BinaryGroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
documents_seen += group.len();
groups.push(group);
@ -152,25 +184,41 @@ where D: Deref<Target=DB>,
}
}
pub struct DistinctQueryBuilder<'a, D, F>
pub struct DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>
{
inner: QueryBuilder<'a, D>,
function: F,
inner: QueryBuilder<'a, D, FI>,
function: FD,
size: usize,
}
impl<'a, D, F, K> DistinctQueryBuilder<'a, D, F>
impl<'a, D, FI, FD> DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>,
F: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
{
pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'a, D, F, FD>
where F: Fn(DocumentId, &DatabaseView<D>) -> bool,
{
DistinctQueryBuilder {
inner: self.inner.with_filter(function),
function: self.function,
size: self.size
}
}
}
impl<'a, D, FI, FD, K> DistinctQueryBuilder<'a, D, FI, FD>
where D: Deref<Target=DB>,
FI: Fn(DocumentId, &DatabaseView<D>) -> bool,
FD: Fn(DocumentId, &DatabaseView<D>) -> Option<K>,
K: Hash + Eq,
{
pub fn query(&self, query: &str, range: Range<usize>) -> Vec<Document> {
pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
let mut documents = self.inner.query_all(query);
let mut groups = vec![documents.as_mut_slice()];
let mut key_cache = HashMap::new();
let view = &self.inner.view;
let mut filter_map = HashMap::new();
// these two variables inform us about the current distinct map and
// on the raw offset of the start of the group where the
// range.start bound is located according to the distinct function
@ -193,17 +241,27 @@ where D: Deref<Target=DB>,
group.sort_unstable_by(|a, b| criterion.evaluate(a, b, view));
for group in GroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
for group in BinaryGroupByMut::new(group, |a, b| criterion.eq(a, b, view)) {
// we must compute the real distinguished len of this sub-group
for document in group.iter() {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (self.function)(document.id, view).map(Rc::new));
match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
let filter_accepted = match &self.inner.filter {
Some(filter) => {
let entry = filter_map.entry(document.id);
*entry.or_insert_with(|| (filter)(document.id, view))
},
None => true,
};
if filter_accepted {
let entry = key_cache.entry(document.id);
let key = entry.or_insert_with(|| (self.function)(document.id, view).map(Rc::new));
match key.clone() {
Some(key) => buf_distinct.register(key),
None => buf_distinct.register_without_key(),
};
}
// the requested range end is reached: stop computing distinct
if buf_distinct.len() >= range.end { break }
}
@ -229,16 +287,22 @@ where D: Deref<Target=DB>,
let mut seen = BufferedDistinctMap::new(&mut distinct_map);
for document in documents.into_iter().skip(distinct_raw_offset) {
let key = key_cache.remove(&document.id).expect("BUG: cached key not found");
let accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
let filter_accepted = match &self.inner.filter {
Some(_) => filter_map.remove(&document.id).expect("BUG: filtered not found"),
None => true,
};
if accepted && seen.len() > range.start {
out_documents.push(document);
if out_documents.len() == range.len() { break }
if filter_accepted {
let key = key_cache.remove(&document.id).expect("BUG: cached key not found");
let distinct_accepted = match key {
Some(key) => seen.register(key),
None => seen.register_without_key(),
};
if distinct_accepted && seen.len() > range.start {
out_documents.push(document);
if out_documents.len() == range.len() { break }
}
}
}
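To make the new filter plumbing concrete, a hedged usage sketch follows. `FilterFunc<D>` acts as the default value of the new `FI` type parameter, which is what lets `QueryBuilder::new` keep its previous signature; the closure bodies, the `view` argument and the imports below are illustrative assumptions, and only the `new`, `with_filter`, `with_distinct` and `query` calls come from this diff.

```rust
use std::ops::Deref;
use rocksdb::DB;

// Hypothetical caller combining the new filter with the existing distinct
// support; `DatabaseView`, `Document` and `DocumentId` are the crate's types
// and are assumed to be in scope.
fn filtered_search<D>(view: &DatabaseView<D>) -> Vec<Document>
where
    D: Deref<Target = DB>,
{
    QueryBuilder::new(view)
        .expect("failed to create the query builder")
        // keep only documents accepted by some (hypothetical) predicate
        .with_filter(|_id: DocumentId, _view: &DatabaseView<D>| true)
        // deduplicate documents sharing the same (hypothetical) distinct key
        .with_distinct(|id: DocumentId, _view: &DatabaseView<D>| Some(id), 1)
        .query("hello world", 0..20)
}
```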


@ -2,7 +2,7 @@ use std::mem;
use self::Separator::*;
pub trait TokenizerBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a>;
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
}
pub struct DefaultBuilder;
@ -13,22 +13,39 @@ impl DefaultBuilder {
}
}
#[derive(Debug, PartialEq, Eq)]
pub struct Token<'a> {
pub word: &'a str,
pub word_index: usize,
pub char_index: usize,
}
impl TokenizerBuilder for DefaultBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a> {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a> {
Box::new(Tokenizer::new(text))
}
}
pub struct Tokenizer<'a> {
index: usize,
word_index: usize,
char_index: usize,
inner: &'a str,
}
impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer {
let mut char_advance = 0;
let mut index_advance = 0;
for (n, (i, c)) in string.char_indices().enumerate() {
char_advance = n;
index_advance = i;
if detect_separator(c).is_none() { break }
}
Tokenizer {
index: 0,
inner: string.trim_matches(&[' ', '.', ';', ',', '!', '?', '-', '\'', '"'][..]),
word_index: 0,
char_index: char_advance,
inner: &string[index_advance..],
}
}
}
@ -56,43 +73,58 @@ impl Separator {
}
}
fn detect_separator(c: char) -> Option<Separator> {
match c {
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
' ' | '\'' | '"' => Some(Short),
_ => None,
}
}
impl<'a> Iterator for Tokenizer<'a> {
type Item = (usize, &'a str);
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
let mut start_word = None;
let mut distance = None;
for (i, c) in self.inner.char_indices() {
let separator = match c {
'.' | ';' | ',' | '!' | '?' | '-' => Some(Long),
' ' | '\'' | '"' => Some(Short),
_ => None,
};
match separator {
Some(dist) => {
match detect_separator(c) {
Some(sep) => {
if let Some(start_word) = start_word {
let (word, tail) = self.inner.split_at(i);
let (prefix, tail) = self.inner.split_at(i);
let (spaces, word) = prefix.split_at(start_word);
self.inner = tail;
self.index += distance.map(Separator::to_usize).unwrap_or(0);
self.char_index += spaces.chars().count();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
let word = &word[start_word..];
return Some((self.index, word))
let token = Token {
word: word,
word_index: self.word_index,
char_index: self.char_index,
};
self.char_index += word.chars().count();
return Some(token)
}
distance = Some(distance.map(|s| s.add(dist)).unwrap_or(dist));
distance.replace(distance.map_or(sep, |s| s.add(sep)));
},
None => { start_word.get_or_insert(i); },
}
}
if let Some(start_word) = start_word {
let word = mem::replace(&mut self.inner, "");
self.index += distance.map(Separator::to_usize).unwrap_or(0);
let prefix = mem::replace(&mut self.inner, "");
let (spaces, word) = prefix.split_at(start_word);
let word = &word[start_word..];
return Some((self.index, word))
let token = Token {
word: word,
word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),
char_index: self.char_index + spaces.chars().count(),
};
return Some(token)
}
None
@ -107,12 +139,12 @@ mod tests {
fn easy() {
let mut tokenizer = Tokenizer::new("salut");
assert_eq!(tokenizer.next(), Some((0, "salut")));
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ");
assert_eq!(tokenizer.next(), Some((0, "yo")));
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
}
@ -120,18 +152,37 @@ mod tests {
fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe");
assert_eq!(tokenizer.next(), Some((0, "yo")));
assert_eq!(tokenizer.next(), Some((1, "lolo")));
assert_eq!(tokenizer.next(), Some((9, "aïe")));
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
assert_eq!(tokenizer.next(), Some((0, "yo")));
assert_eq!(tokenizer.next(), Some((8, "lolo")));
assert_eq!(tokenizer.next(), Some((16, "wtf")));
assert_eq!(tokenizer.next(), Some((24, "lol")));
assert_eq!(tokenizer.next(), Some((32, "aïe")));
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
assert_eq!(tokenizer.next(), None);
}
}
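A short usage sketch of the Token-based API above. Judging from these tests, a run of separators containing a long one (`.`, `;`, `,`, `!`, `?`, `-`) advances `word_index` by 8 while a run of short ones (space, `'`, `"`) advances it by 1, and `char_index` counts characters rather than bytes. The snippet only assumes `Tokenizer` and `Token` are in scope.

```rust
fn print_tokens(text: &str) {
    // every Token carries the word together with its word and character positions
    for token in Tokenizer::new(text) {
        println!(
            "{:>6} word_index={:<3} char_index={}",
            token.word, token.word_index, token.char_index
        );
    }
}

fn main() {
    // with the weights above, "lolo" should come out with word_index 8
    print_tokens("yo ! lolo ? wtf");
}
```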


@ -1,51 +0,0 @@
use std::ops::Deref;
use std::sync::Arc;
use std::fmt;
#[derive(Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct VecReadOnly<T> {
inner: Arc<Vec<T>>,
offset: usize,
len: usize,
}
impl<T> VecReadOnly<T> {
pub fn new(vec: Vec<T>) -> Self {
let len = vec.len();
Self {
inner: Arc::new(vec),
offset: 0,
len: len,
}
}
pub fn len(&self) -> usize {
self.len
}
pub fn range(&self, offset: usize, len: usize) -> Self {
Self {
inner: self.inner.clone(),
offset: self.offset + offset,
len: len,
}
}
pub fn as_slice(&self) -> &[T] {
&self.inner[self.offset..self.offset + self.len]
}
}
impl<T> Deref for VecReadOnly<T> {
type Target = [T];
fn deref(&self) -> &Self::Target {
self.as_slice()
}
}
impl<T: fmt::Debug> fmt::Debug for VecReadOnly<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.inner.fmt(f)
}
}

src/word_area.rs (new file, 102 lines)

@ -0,0 +1,102 @@
use std::fmt;
/// Represent a word position in characters along with the length of it.
///
/// It can represent character indexes up to 2^22 and
/// word lengths up to 1024.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct WordArea(u32);
impl WordArea {
/// Construct a `WordArea` from a word position expressed as
/// a number of characters and the length of it.
///
/// # Panics
///
/// The char index must not be greater than 2^22
/// and the length not greater than 1024.
pub(crate) fn new(char_index: u32, length: u16) -> Result<WordArea, WordAreaError> {
if char_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 {
return Err(WordAreaError::ByteIndexTooBig)
}
if length & 0b1111_1100_0000_0000 != 0 {
return Err(WordAreaError::LengthTooBig)
}
let char_index = char_index << 10;
Ok(WordArea(char_index | u32::from(length)))
}
pub(crate) fn new_faillible(char_index: u32, length: u16) -> WordArea {
match WordArea::new(char_index, length) {
Ok(word_area) => word_area,
Err(WordAreaError::ByteIndexTooBig) => {
panic!("word area byte index must not be greater than 2^22")
},
Err(WordAreaError::LengthTooBig) => {
panic!("word area length must not be greater than 1024")
},
}
}
pub(crate) fn max_value() -> WordArea {
WordArea(u32::max_value())
}
#[inline]
pub fn char_index(self) -> u32 {
self.0 >> 10
}
#[inline]
pub fn length(self) -> u16 {
(self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16
}
}
impl fmt::Debug for WordArea {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("WordArea")
.field("char_index", &self.char_index())
.field("length", &self.length())
.finish()
}
}
pub enum WordAreaError {
ByteIndexTooBig,
LengthTooBig,
}
#[cfg(test)]
mod tests {
use super::*;
use quickcheck::{quickcheck, TestResult};
quickcheck! {
fn qc_word_area(gen_char_index: u32, gen_length: u16) -> TestResult {
if gen_char_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) {
return TestResult::discard()
}
let word_area = WordArea::new_faillible(gen_char_index, gen_length);
let valid_char_index = word_area.char_index() == gen_char_index;
let valid_length = word_area.length() == gen_length;
TestResult::from_bool(valid_char_index && valid_length)
}
fn qc_word_area_ord(gen_char_index: u32, gen_length: u16) -> TestResult {
if gen_char_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) {
return TestResult::discard()
}
let a = WordArea::new_faillible(gen_char_index, gen_length);
let b = WordArea::new_faillible(gen_char_index + 1, gen_length + 1);
TestResult::from_bool(a < b)
}
}
}
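Finally, a standalone sketch of the bit layout `WordArea` relies on: the 10 low bits hold the length and the remaining 22 bits hold the character index, so the packed `u32` orders first by position and then by length. The `pack` and `unpack` helpers below only illustrate that layout and are not part of the crate's API.

```rust
fn pack(char_index: u32, length: u16) -> u32 {
    // same limits as WordArea: char_index < 2^22, length < 2^10
    assert!(char_index < (1 << 22) && u32::from(length) < (1 << 10));
    (char_index << 10) | u32::from(length)
}

fn unpack(area: u32) -> (u32, u16) {
    (area >> 10, (area & 0b11_1111_1111) as u16)
}

fn main() {
    let area = pack(42, 6); // a 6-character word starting at character 42
    assert_eq!(unpack(area), (42, 6));
    // ordering follows char_index first, mirroring WordArea's derived Ord
    assert!(pack(42, 6) < pack(43, 1));
    println!("packed: {:#034b}", area);
}
```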