Mirror of https://github.com/meilisearch/meilisearch.git
Put the documents MTBL back into LMDB
We make sure to write the documents into a file before memory-mapping it and putting it into LMDB; this way we avoid moving them to RAM.
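In practice that means: stream the merged documents into a temporary file, memory-map that file, and hand the mapped bytes to LMDB, so the payload is backed by the OS page cache rather than by a heap allocation. Below is a minimal, self-contained sketch of this pattern using the same heed, memmap, and tempfile crates as the diff; the key name and map size are illustrative, and this is not milli's actual code.

use std::io::Write;

use heed::types::{ByteSlice, Str};
use heed::{Database, EnvOpenOptions};

fn main() -> anyhow::Result<()> {
    // Open a throwaway LMDB environment (the 100 MiB map size is arbitrary).
    let dir = tempfile::tempdir()?;
    let env = EnvOpenOptions::new()
        .map_size(100 * 1024 * 1024)
        .open(dir.path())?;
    let db: Database<Str, ByteSlice> = env.create_database(None)?;

    // 1. Stream the (potentially huge) payload into an anonymous temp file
    //    instead of accumulating it in a Vec<u8> in memory.
    let mut file = tempfile::tempfile()?;
    file.write_all(b"imagine many megabytes of serialized documents here")?;

    // 2. Memory-map the file: pages are faulted in lazily, on first touch.
    let mmap = unsafe { memmap::Mmap::map(&file)? };

    // 3. Hand the mapped bytes to LMDB, which copies them straight out of the
    //    mapping, so the process never holds the whole payload on its heap.
    let mut wtxn = env.write_txn()?;
    db.put(&mut wtxn, "documents", &mmap)?;
    wtxn.commit()?;
    Ok(())
}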
@@ -1,5 +1,5 @@
 use std::convert::{TryFrom, TryInto};
-use std::fs::{File, OpenOptions};
+use std::fs::File;
 use std::io::{self, Read, Write};
 use std::iter::FromIterator;
 use std::path::PathBuf;
@@ -14,7 +14,7 @@ use flate2::read::GzDecoder;
 use fst::IntoStreamer;
 use heed::EnvOpenOptions;
 use heed::types::*;
-use log::debug;
+use log::{debug, info};
 use memmap::Mmap;
 use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType};
 use rayon::prelude::*;
@@ -486,9 +486,9 @@ fn main() -> anyhow::Result<()> {
         .max_dbs(10)
         .open(&opt.database)?;
 
-    let mut index = Index::new(&env, &opt.database)?;
+    let before_indexing = Instant::now();
+    let index = Index::new(&env)?;
 
-    let documents_path = opt.database.join("documents.mtbl");
     let num_threads = rayon::current_num_threads();
     let arc_cache_size = opt.indexer.arc_cache_size;
     let max_nb_chunks = opt.indexer.max_nb_chunks;
@@ -566,32 +566,28 @@ fn main() -> anyhow::Result<()> {
         docs_stores.push(d);
     });
 
-    debug!("We are writing into LMDB and MTBL...");
+    debug!("We are writing the documents into MTBL on disk...");
+    // We also merge the documents into its own MTBL store.
+    let file = tempfile::tempfile()?;
+    let mut writer = Writer::builder()
+        .compression_type(documents_compression_type)
+        .compression_level(documents_compression_level)
+        .build(file);
+    let mut builder = Merger::builder(docs_merge);
+    builder.extend(docs_stores);
+    builder.build().write_into(&mut writer)?;
+    let file = writer.into_inner()?;
+    let documents_mmap = unsafe { memmap::Mmap::map(&file)? };
 
-    // We run both merging steps in parallel.
-    let (lmdb, mtbl) = rayon::join(|| {
-        // We merge the postings lists into LMDB.
-        let mut wtxn = env.write_txn()?;
-        merge_into_lmdb(stores, |k, v| lmdb_writer(&mut wtxn, &index, k, v))?;
-        Ok(wtxn.commit()?) as anyhow::Result<_>
-    }, || {
-        // We also merge the documents into its own MTBL store.
-        let file = OpenOptions::new().create(true).truncate(true).write(true).read(true).open(documents_path)?;
-        let mut writer = Writer::builder()
-            .compression_type(documents_compression_type)
-            .compression_level(documents_compression_level)
-            .build(file);
-        let mut builder = Merger::builder(docs_merge);
-        builder.extend(docs_stores);
-        builder.build().write_into(&mut writer)?;
-        Ok(writer.finish()?) as anyhow::Result<_>
-    });
+    debug!("We are writing the postings lists and documents into LMDB on disk...");
+    // We merge the postings lists into LMDB.
+    let mut wtxn = env.write_txn()?;
+    merge_into_lmdb(stores, |k, v| lmdb_writer(&mut wtxn, &index, k, v))?;
+    index.put_documents(&mut wtxn, &documents_mmap)?;
+    let count = index.number_of_documents(&wtxn)?;
+    wtxn.commit()?;
 
-    lmdb.and(mtbl)?;
-    index.refresh_documents()?;
-    let count = index.number_of_documents();
-
-    debug!("Wrote {} documents into LMDB", count);
+    info!("Wrote {} documents in {:.02?}", count, before_indexing.elapsed());
 
     Ok(())
 }
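Note that `Index::put_documents` itself is not part of this diff. Judging from the `biggest_value_sizes` hunk further down, which reads the entry back from `index.main` under the key `b"documents"`, it plausibly reduces to a single polymorphic put. A hypothetical reconstruction, not the actual milli source:

use heed::types::ByteSlice;
use heed::{PolyDatabase, RwTxn};

pub struct Index {
    /// The untyped main database (which also holds the headers, words fst, ...).
    pub main: PolyDatabase,
    // ... word_positions and the other typed databases elided ...
}

impl Index {
    /// Store the whole memory-mapped documents MTBL under a single key;
    /// LMDB copies the bytes directly out of the mapping.
    pub fn put_documents(&self, wtxn: &mut RwTxn, documents: &[u8]) -> heed::Result<()> {
        self.main.put::<_, ByteSlice, ByteSlice>(wtxn, b"documents", documents)
    }
}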
@@ -96,7 +96,7 @@ fn main() -> anyhow::Result<()> {
         .open(&opt.database)?;
 
     // Open the LMDB database.
-    let index = Index::new(&env, opt.database)?;
+    let index = Index::new(&env)?;
     let rtxn = env.read_txn()?;
 
     match opt.command {
@@ -200,6 +200,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
         if heap.len() > limit { heap.pop(); }
     }
 
+    if let Some(documents) = index.main.get::<_, ByteSlice, ByteSlice>(rtxn, b"documents")? {
+        heap.push(Reverse((documents.len(), format!("documents"), main_name)));
+        if heap.len() > limit { heap.pop(); }
+    }
+
     for result in index.word_positions.as_polymorph().iter::<_, Str, ByteSlice>(rtxn)? {
         let (word, value) = result?;
         heap.push(Reverse((value.len(), word.to_string(), word_positions_name)));
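The hunk above also shows how the blob comes back out of LMDB: a read transaction plus a polymorphic get on `main`. A hedged sketch of just that fetch (the helper name is made up; the real read path presumably opens an oxidized_mtbl Reader over the returned bytes):

use heed::types::ByteSlice;
use heed::{PolyDatabase, RoTxn};

/// Fetch the raw documents MTBL bytes back out of LMDB. The returned slice
/// borrows from the read transaction's memory map, so nothing is copied here.
fn documents_bytes<'t>(main: PolyDatabase, rtxn: &'t RoTxn) -> heed::Result<Option<&'t [u8]>> {
    main.get::<_, ByteSlice, ByteSlice>(rtxn, b"documents")
}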
@@ -49,7 +49,7 @@ fn main() -> anyhow::Result<()> {
         .open(&opt.database)?;
 
     // Open the LMDB database.
-    let index = Index::new(&env, opt.database)?;
+    let index = Index::new(&env)?;
     let rtxn = env.read_txn()?;
 
     let stdin = io::stdin();
@@ -68,7 +68,7 @@ fn main() -> anyhow::Result<()> {
         Some(headers) => headers,
         None => return Ok(()),
     };
-    let documents = index.documents(result.documents_ids.iter().cloned())?;
+    let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?;
 
     let mut stdout = io::stdout();
     stdout.write_all(&headers)?;
@@ -62,7 +62,6 @@ fn highlight_string(string: &str, words: &HashSet<String>) -> String {
 struct IndexTemplate {
     db_name: String,
     db_size: usize,
-    docs_size: usize,
     docs_count: usize,
 }
 
@@ -83,28 +82,23 @@ async fn main() -> anyhow::Result<()> {
         .open(&opt.database)?;
 
     // Open the LMDB database.
-    let index = Index::new(&env, &opt.database)?;
+    let index = Index::new(&env)?;
 
     // Retrieve the database the file stem (w/o the extension),
     // the disk file size and the number of documents in the database.
     let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string();
     let db_size = File::open(opt.database.join("data.mdb"))?.metadata()?.len() as usize;
-    let docs_size = File::open(opt.database.join("documents.mtbl"))?.metadata()?.len() as usize;
-    let docs_count = index.number_of_documents();
+
+    let rtxn = env.read_txn()?;
+    let docs_count = index.number_of_documents(&rtxn)? as usize;
+    drop(rtxn);
 
     // We run and wait on the HTTP server
 
     // Expose an HTML page to debug the search in a browser
     let dash_html_route = warp::filters::method::get()
         .and(warp::filters::path::end())
-        .map(move || {
-            IndexTemplate {
-                db_name: db_name.clone(),
-                db_size,
-                docs_size,
-                docs_count: docs_count as usize,
-            }
-        });
+        .map(move || IndexTemplate { db_name: db_name.clone(), db_size, docs_count });
 
     let dash_bulma_route = warp::filters::method::get()
         .and(warp::path!("bulma.min.css"))
@@ -192,7 +186,7 @@ async fn main() -> anyhow::Result<()> {
             if let Some(headers) = index.headers(&rtxn).unwrap() {
                 // We write the headers
                 body.extend_from_slice(headers);
-                let documents = index.documents(documents_ids).unwrap();
+                let documents = index.documents(&rtxn, documents_ids).unwrap();
 
                 for (_id, content) in documents {
                     let content = std::str::from_utf8(content.as_ref()).unwrap();