mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-26 00:01:00 +00:00
feat: Introduce the new Index system
This commit is contained in:
@ -8,6 +8,7 @@ edition = "2018"
|
||||
arc-swap = "0.3.11"
|
||||
bincode = "1.1.2"
|
||||
byteorder = "1.3.1"
|
||||
deunicode = "1.0.0"
|
||||
hashbrown = { version = "0.2.2", features = ["serde"] }
|
||||
linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
|
||||
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
|
||||
@ -18,8 +19,12 @@ serde = { version = "1.0.90", features = ["derive"] }
|
||||
serde_json = { version = "1.0.39", features = ["preserve_order"] }
|
||||
sled = "0.23.0"
|
||||
toml = { version = "0.5.0", features = ["preserve_order"] }
|
||||
deunicode = "1.0.0"
|
||||
zerocopy = "0.2.2"
|
||||
|
||||
[dependencies.rmp-serde]
|
||||
git = "https://github.com/3Hren/msgpack-rust.git"
|
||||
rev = "40b3d48"
|
||||
|
||||
[dependencies.fst]
|
||||
git = "https://github.com/Kerollmops/fst.git"
|
||||
branch = "arc-byte-slice"
|
||||
|
@ -12,7 +12,7 @@ use meilidb_core::criterion::Criteria;
|
||||
use meilidb_core::QueryBuilder;
|
||||
use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor};
|
||||
use meilidb_core::write_to_bytes::WriteToBytes;
|
||||
use meilidb_core::{DocumentId, Index as WordIndex};
|
||||
use meilidb_core::DocumentId;
|
||||
use rmp_serde::decode::{Error as RmpError};
|
||||
use sdset::SetBuf;
|
||||
use serde::de;
|
||||
@ -20,7 +20,9 @@ use sled::IVec;
|
||||
|
||||
use crate::{Schema, SchemaAttr, RankedMap};
|
||||
use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError};
|
||||
use crate::indexer::Indexer;
|
||||
use crate::indexer::{Indexer, WordIndexTree};
|
||||
|
||||
pub type WordIndex = meilidb_core::Index<WordIndexTree>;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Error {
|
||||
@ -72,6 +74,10 @@ fn index_name(name: &str) -> Vec<u8> {
|
||||
format!("index-{}", name).into_bytes()
|
||||
}
|
||||
|
||||
/// Returns the name of the tree that stores the word index belonging to the
/// index called `name`: the UTF-8 bytes of `word-index-{name}`.
fn word_index_name(name: &str) -> Vec<u8> {
    format!("word-index-{}", name).into_bytes()
}
|
||||
|
||||
fn document_key(id: DocumentId, attr: SchemaAttr) -> Vec<u8> {
|
||||
let DocumentId(document_id) = id;
|
||||
let SchemaAttr(schema_attr) = attr;
|
||||
@ -136,7 +142,8 @@ impl Database {
|
||||
let raw_name = index_name(name);
|
||||
if self.inner.tree_names().into_iter().any(|tn| tn == raw_name) {
|
||||
let tree = self.inner.open_tree(raw_name)?;
|
||||
let raw_index = RawIndex::from_raw(tree)?;
|
||||
let word_index_tree = self.inner.open_tree(word_index_name(name))?;
|
||||
let raw_index = RawIndex::from_raw(tree, word_index_tree)?;
|
||||
|
||||
self.opened.rcu(|opened| {
|
||||
let mut opened = HashMap::clone(opened);
|
||||
@ -162,7 +169,8 @@ impl Database {
|
||||
None => {
|
||||
let raw_name = index_name(&name);
|
||||
let tree = self.inner.open_tree(raw_name)?;
|
||||
let raw_index = RawIndex::new_from_raw(tree, schema)?;
|
||||
let word_index_tree = self.inner.open_tree(word_index_name(&name))?;
|
||||
let raw_index = RawIndex::new_from_raw(tree, word_index_tree, schema)?;
|
||||
|
||||
self.opened.rcu(|opened| {
|
||||
let mut opened = HashMap::clone(opened);
|
||||
@ -185,25 +193,16 @@ pub struct RawIndex {
|
||||
}
|
||||
|
||||
impl RawIndex {
|
||||
fn from_raw(inner: Arc<sled::Tree>) -> Result<RawIndex, Error> {
|
||||
fn from_raw(inner: Arc<sled::Tree>, word_index: Arc<sled::Tree>) -> Result<RawIndex, Error> {
|
||||
let schema = {
|
||||
let bytes = inner.get("schema")?;
|
||||
let bytes = bytes.ok_or(Error::SchemaMissing)?;
|
||||
Schema::read_from_bin(bytes.as_ref())?
|
||||
};
|
||||
|
||||
let bytes = inner.get("word-index")?;
|
||||
let bytes = bytes.ok_or(Error::WordIndexMissing)?;
|
||||
let word_index = {
|
||||
let len = bytes.len();
|
||||
let bytes: Arc<[u8]> = Into::into(bytes);
|
||||
let mut cursor = SharedDataCursor::from_shared_bytes(bytes, 0, len);
|
||||
|
||||
// TODO must handle this error
|
||||
let word_index = WordIndex::from_shared_data_cursor(&mut cursor).unwrap();
|
||||
|
||||
Arc::new(ArcSwap::new(Arc::new(word_index)))
|
||||
};
|
||||
let store = WordIndexTree(word_index);
|
||||
let word_index = WordIndex::from_store(store)?;
|
||||
let word_index = Arc::new(ArcSwap::new(Arc::new(word_index)));
|
||||
|
||||
let ranked_map = {
|
||||
let map = match inner.get("ranked-map")? {
|
||||
@ -217,13 +216,18 @@ impl RawIndex {
|
||||
Ok(RawIndex { schema, word_index, ranked_map, inner })
|
||||
}
|
||||
|
||||
fn new_from_raw(inner: Arc<sled::Tree>, schema: Schema) -> Result<RawIndex, Error> {
|
||||
fn new_from_raw(
|
||||
inner: Arc<sled::Tree>,
|
||||
word_index: Arc<sled::Tree>,
|
||||
schema: Schema,
|
||||
) -> Result<RawIndex, Error>
|
||||
{
|
||||
let mut schema_bytes = Vec::new();
|
||||
schema.write_to_bin(&mut schema_bytes)?;
|
||||
inner.set("schema", schema_bytes)?;
|
||||
|
||||
let word_index = WordIndex::default();
|
||||
inner.set("word-index", word_index.into_bytes())?;
|
||||
let store = WordIndexTree(word_index);
|
||||
let word_index = WordIndex::from_store(store)?;
|
||||
let word_index = Arc::new(ArcSwap::new(Arc::new(word_index)));
|
||||
|
||||
let ranked_map = Arc::new(ArcSwap::new(Arc::new(RankedMap::default())));
|
||||
@ -243,12 +247,8 @@ impl RawIndex {
|
||||
self.ranked_map.lease()
|
||||
}
|
||||
|
||||
pub fn update_word_index(&self, word_index: Arc<WordIndex>) -> sled::Result<()> {
|
||||
let data = word_index.into_bytes();
|
||||
self.inner.set("word-index", data).map(drop)?;
|
||||
self.word_index.store(word_index);
|
||||
|
||||
Ok(())
|
||||
pub fn update_word_index(&self, word_index: Arc<WordIndex>) {
|
||||
self.word_index.store(word_index)
|
||||
}
|
||||
|
||||
pub fn update_ranked_map(&self, ranked_map: Arc<RankedMap>) -> sled::Result<()> {
|
||||
@ -417,14 +417,15 @@ impl DocumentsAddition {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finalize(self) -> sled::Result<()> {
|
||||
let delta_index = self.indexer.build();
|
||||
|
||||
let index = self.inner.word_index();
|
||||
let new_index = index.r#union(&delta_index);
|
||||
let new_index = index.insert_indexes(delta_index)?;
|
||||
|
||||
let new_index = Arc::from(new_index);
|
||||
self.inner.update_word_index(new_index)?;
|
||||
self.inner.update_word_index(new_index);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@ -454,10 +455,10 @@ impl DocumentsDeletion {
|
||||
let idset = SetBuf::new_unchecked(self.documents);
|
||||
let index = self.inner.word_index();
|
||||
|
||||
let new_index = index.remove_documents(&idset);
|
||||
let new_index = index.remove_documents(&idset)?;
|
||||
let new_index = Arc::from(new_index);
|
||||
|
||||
self.inner.update_word_index(new_index)?;
|
||||
self.inner.update_word_index(new_index);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -7,12 +7,12 @@ use meilidb_core::data::DocIds;
|
||||
use meilidb_core::write_to_bytes::WriteToBytes;
|
||||
use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
|
||||
|
||||
enum NewIndexEvent<'a> {
|
||||
enum NewIndexEvent<'a, S> {
|
||||
RemovedDocuments(&'a DocIds),
|
||||
UpdatedDocuments(&'a WordIndex),
|
||||
UpdatedDocuments(&'a WordIndex<S>),
|
||||
}
|
||||
|
||||
impl<'a> WriteToBytes for NewIndexEvent<'a> {
|
||||
impl<'a, S> WriteToBytes for NewIndexEvent<'a, S> {
|
||||
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
match self {
|
||||
NewIndexEvent::RemovedDocuments(doc_ids) => {
|
||||
@ -21,24 +21,24 @@ impl<'a> WriteToBytes for NewIndexEvent<'a> {
|
||||
},
|
||||
NewIndexEvent::UpdatedDocuments(index) => {
|
||||
let _ = bytes.write_u8(1);
|
||||
index.write_to_bytes(bytes);
|
||||
// index.write_to_bytes(bytes);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum IndexEvent {
|
||||
enum IndexEvent<S> {
|
||||
RemovedDocuments(DocIds),
|
||||
UpdatedDocuments(WordIndex),
|
||||
UpdatedDocuments(WordIndex<S>),
|
||||
}
|
||||
|
||||
impl FromSharedDataCursor for IndexEvent {
|
||||
impl<S> FromSharedDataCursor for IndexEvent<S> {
|
||||
type Error = Box<Error>;
|
||||
|
||||
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
|
||||
match cursor.read_u8()? {
|
||||
0 => DocIds::from_shared_data_cursor(cursor).map(IndexEvent::RemovedDocuments),
|
||||
1 => WordIndex::from_shared_data_cursor(cursor).map(IndexEvent::UpdatedDocuments),
|
||||
// 1 => WordIndex::from_shared_data_cursor(cursor).map(IndexEvent::UpdatedDocuments),
|
||||
_ => Err("invalid index event type".into()),
|
||||
}
|
||||
}
|
||||
|
@ -1,14 +1,78 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::convert::TryFrom;
|
||||
use std::sync::Arc;
|
||||
|
||||
use deunicode::deunicode_with_tofu;
|
||||
use meilidb_core::{DocumentId, DocIndex};
|
||||
use meilidb_core::{Index as WordIndex, IndexBuilder as WordIndexBuilder};
|
||||
use meilidb_core::{DocumentId, DocIndex, Store};
|
||||
use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
|
||||
use sdset::Set;
|
||||
use sdset::{Set, SetBuf};
|
||||
use sled::Tree;
|
||||
use zerocopy::{AsBytes, LayoutVerified};
|
||||
|
||||
use crate::SchemaAttr;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct WordIndexTree(pub Arc<Tree>);
|
||||
|
||||
impl Store for WordIndexTree {
|
||||
type Error = sled::Error;
|
||||
|
||||
fn get_fst(&self) -> Result<fst::Set, Self::Error> {
|
||||
match self.0.get("fst")? {
|
||||
Some(bytes) => {
|
||||
let bytes: Arc<[u8]> = bytes.into();
|
||||
let len = bytes.len();
|
||||
let raw = fst::raw::Fst::from_shared_bytes(bytes, 0, len).unwrap();
|
||||
Ok(fst::Set::from(raw))
|
||||
},
|
||||
None => Ok(fst::Set::default()),
|
||||
}
|
||||
}
|
||||
|
||||
fn set_fst(&self, set: &fst::Set) -> Result<(), Self::Error> {
|
||||
let bytes = set.as_fst().to_vec();
|
||||
self.0.set("fst", bytes)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_indexes(&self, word: &[u8]) -> Result<Option<SetBuf<DocIndex>>, Self::Error> {
|
||||
let mut word_bytes = Vec::from("word-");
|
||||
word_bytes.extend_from_slice(word);
|
||||
|
||||
match self.0.get(word_bytes)? {
|
||||
Some(bytes) => {
|
||||
let layout = LayoutVerified::new_slice(bytes.as_ref()).unwrap();
|
||||
let slice = layout.into_slice();
|
||||
let setbuf = SetBuf::new_unchecked(slice.to_vec());
|
||||
Ok(Some(setbuf))
|
||||
},
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
fn set_indexes(&self, word: &[u8], indexes: &Set<DocIndex>) -> Result<(), Self::Error> {
|
||||
let mut word_bytes = Vec::from("word-");
|
||||
word_bytes.extend_from_slice(word);
|
||||
|
||||
let slice = indexes.as_slice();
|
||||
let bytes = slice.as_bytes();
|
||||
|
||||
self.0.set(word_bytes, bytes)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn del_indexes(&self, word: &[u8]) -> Result<(), Self::Error> {
|
||||
let mut word_bytes = Vec::from("word-");
|
||||
word_bytes.extend_from_slice(word);
|
||||
|
||||
self.0.del(word_bytes)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
type Word = Vec<u8>; // TODO make it be a SmallVec
|
||||
|
||||
pub struct Indexer {
|
||||
@ -48,18 +112,11 @@ impl Indexer {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build(self) -> WordIndex {
|
||||
let mut builder = WordIndexBuilder::new();
|
||||
|
||||
for (key, mut indexes) in self.indexed {
|
||||
pub fn build(self) -> BTreeMap<Word, SetBuf<DocIndex>> {
|
||||
self.indexed.into_iter().map(|(word, mut indexes)| {
|
||||
indexes.sort_unstable();
|
||||
indexes.dedup();
|
||||
|
||||
let indexes = Set::new_unchecked(&indexes);
|
||||
builder.insert(key, indexes).unwrap();
|
||||
}
|
||||
|
||||
builder.build()
|
||||
(word, SetBuf::new_unchecked(indexes))
|
||||
}).collect()
|
||||
}
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user