Put the documents MTBL back into LMDB

We make sure to write the documents into a file before
memory mapping it and putting it into LMDB; this way we avoid
moving it to RAM.
This commit is contained in:
Clément Renault
2020-08-28 15:38:05 +02:00
parent d784d87880
commit 0a44ff86ab
10 changed files with 100 additions and 110 deletions

View File

@ -1,5 +1,5 @@
use std::convert::{TryFrom, TryInto};
use std::fs::{File, OpenOptions};
use std::fs::File;
use std::io::{self, Read, Write};
use std::iter::FromIterator;
use std::path::PathBuf;
@ -14,7 +14,7 @@ use flate2::read::GzDecoder;
use fst::IntoStreamer;
use heed::EnvOpenOptions;
use heed::types::*;
use log::debug;
use log::{debug, info};
use memmap::Mmap;
use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType};
use rayon::prelude::*;
@ -486,9 +486,9 @@ fn main() -> anyhow::Result<()> {
.max_dbs(10)
.open(&opt.database)?;
let mut index = Index::new(&env, &opt.database)?;
let before_indexing = Instant::now();
let index = Index::new(&env)?;
let documents_path = opt.database.join("documents.mtbl");
let num_threads = rayon::current_num_threads();
let arc_cache_size = opt.indexer.arc_cache_size;
let max_nb_chunks = opt.indexer.max_nb_chunks;
@ -566,32 +566,28 @@ fn main() -> anyhow::Result<()> {
docs_stores.push(d);
});
debug!("We are writing into LMDB and MTBL...");
debug!("We are writing the documents into MTBL on disk...");
// We also merge the documents into its own MTBL store.
let file = tempfile::tempfile()?;
let mut writer = Writer::builder()
.compression_type(documents_compression_type)
.compression_level(documents_compression_level)
.build(file);
let mut builder = Merger::builder(docs_merge);
builder.extend(docs_stores);
builder.build().write_into(&mut writer)?;
let file = writer.into_inner()?;
let documents_mmap = unsafe { memmap::Mmap::map(&file)? };
// We run both merging steps in parallel.
let (lmdb, mtbl) = rayon::join(|| {
// We merge the postings lists into LMDB.
let mut wtxn = env.write_txn()?;
merge_into_lmdb(stores, |k, v| lmdb_writer(&mut wtxn, &index, k, v))?;
Ok(wtxn.commit()?) as anyhow::Result<_>
}, || {
// We also merge the documents into its own MTBL store.
let file = OpenOptions::new().create(true).truncate(true).write(true).read(true).open(documents_path)?;
let mut writer = Writer::builder()
.compression_type(documents_compression_type)
.compression_level(documents_compression_level)
.build(file);
let mut builder = Merger::builder(docs_merge);
builder.extend(docs_stores);
builder.build().write_into(&mut writer)?;
Ok(writer.finish()?) as anyhow::Result<_>
});
debug!("We are writing the postings lists and documents into LMDB on disk...");
// We merge the postings lists into LMDB.
let mut wtxn = env.write_txn()?;
merge_into_lmdb(stores, |k, v| lmdb_writer(&mut wtxn, &index, k, v))?;
index.put_documents(&mut wtxn, &documents_mmap)?;
let count = index.number_of_documents(&wtxn)?;
wtxn.commit()?;
lmdb.and(mtbl)?;
index.refresh_documents()?;
let count = index.number_of_documents();
debug!("Wrote {} documents into LMDB", count);
info!("Wrote {} documents in {:.02?}", count, before_indexing.elapsed());
Ok(())
}

View File

@ -96,7 +96,7 @@ fn main() -> anyhow::Result<()> {
.open(&opt.database)?;
// Open the LMDB database.
let index = Index::new(&env, opt.database)?;
let index = Index::new(&env)?;
let rtxn = env.read_txn()?;
match opt.command {
@ -200,6 +200,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
if heap.len() > limit { heap.pop(); }
}
if let Some(documents) = index.main.get::<_, ByteSlice, ByteSlice>(rtxn, b"documents")? {
heap.push(Reverse((documents.len(), format!("documents"), main_name)));
if heap.len() > limit { heap.pop(); }
}
for result in index.word_positions.as_polymorph().iter::<_, Str, ByteSlice>(rtxn)? {
let (word, value) = result?;
heap.push(Reverse((value.len(), word.to_string(), word_positions_name)));

View File

@ -49,7 +49,7 @@ fn main() -> anyhow::Result<()> {
.open(&opt.database)?;
// Open the LMDB database.
let index = Index::new(&env, opt.database)?;
let index = Index::new(&env)?;
let rtxn = env.read_txn()?;
let stdin = io::stdin();
@ -68,7 +68,7 @@ fn main() -> anyhow::Result<()> {
Some(headers) => headers,
None => return Ok(()),
};
let documents = index.documents(result.documents_ids.iter().cloned())?;
let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?;
let mut stdout = io::stdout();
stdout.write_all(&headers)?;

View File

@ -62,7 +62,6 @@ fn highlight_string(string: &str, words: &HashSet<String>) -> String {
struct IndexTemplate {
db_name: String,
db_size: usize,
docs_size: usize,
docs_count: usize,
}
@ -83,28 +82,23 @@ async fn main() -> anyhow::Result<()> {
.open(&opt.database)?;
// Open the LMDB database.
let index = Index::new(&env, &opt.database)?;
let index = Index::new(&env)?;
// Retrieve the database the file stem (w/o the extension),
// the disk file size and the number of documents in the database.
let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string();
let db_size = File::open(opt.database.join("data.mdb"))?.metadata()?.len() as usize;
let docs_size = File::open(opt.database.join("documents.mtbl"))?.metadata()?.len() as usize;
let docs_count = index.number_of_documents();
let rtxn = env.read_txn()?;
let docs_count = index.number_of_documents(&rtxn)? as usize;
drop(rtxn);
// We run and wait on the HTTP server
// Expose an HTML page to debug the search in a browser
let dash_html_route = warp::filters::method::get()
.and(warp::filters::path::end())
.map(move || {
IndexTemplate {
db_name: db_name.clone(),
db_size,
docs_size,
docs_count: docs_count as usize,
}
});
.map(move || IndexTemplate { db_name: db_name.clone(), db_size, docs_count });
let dash_bulma_route = warp::filters::method::get()
.and(warp::path!("bulma.min.css"))
@ -192,7 +186,7 @@ async fn main() -> anyhow::Result<()> {
if let Some(headers) = index.headers(&rtxn).unwrap() {
// We write the headers
body.extend_from_slice(headers);
let documents = index.documents(documents_ids).unwrap();
let documents = index.documents(&rtxn, documents_ids).unwrap();
for (_id, content) in documents {
let content = std::str::from_utf8(content.as_ref()).unwrap();