feat: Introduce the "data-index" entry with merge compaction

This commit is contained in:
Clément Renault
2018-11-26 17:30:19 +01:00
parent 0e856db4e6
commit b636e5fe57
12 changed files with 251 additions and 231 deletions

View File

@@ -8,10 +8,9 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
bincode = "1.0" bincode = "1.0"
byteorder = "1.2" byteorder = "1.2"
fnv = "1.0" fnv = "1.0"
fs2 = "0.4"
lazy_static = "1.1" lazy_static = "1.1"
linked-hash-map = { version = "0.5", features = ["serde_impl"] } linked-hash-map = { version = "0.5", features = ["serde_impl"] }
sdset = "0.2" sdset = "0.3"
serde = "1.0" serde = "1.0"
serde_derive = "1.0" serde_derive = "1.0"
unidecode = "0.3" unidecode = "0.3"

View File

@@ -1,17 +1,17 @@
use crate::vec_read_only::VecReadOnly;
use std::collections::BinaryHeap; use std::collections::BinaryHeap;
use std::{mem, cmp};
use std::rc::Rc; use std::rc::Rc;
use std::cmp;
use fst::{Automaton, Streamer}; use fst::{Automaton, Streamer};
use fst::automaton::AlwaysMatch; use fst::automaton::AlwaysMatch;
use sdset::{Set, SetBuf, SetOperation}; use sdset::{Set, SetOperation};
use sdset::duo::OpBuilder as SdOpBuilder; use sdset::duo::OpBuilder as SdOpBuilder;
use group_by::GroupBy; use group_by::GroupBy;
use crate::blob::{Blob, Sign};
use crate::blob::ops::{OpBuilder, Union, IndexedDocIndexes};
use crate::DocIndex; use crate::DocIndex;
use crate::blob::{Blob, Sign};
use crate::vec_read_only::VecReadOnly;
use crate::blob::ops::{OpBuilder, Union, IndexedDocIndexes};
fn group_is_negative(blobs: &&[Blob]) -> bool { fn group_is_negative(blobs: &&[Blob]) -> bool {
blobs[0].sign() == Sign::Negative blobs[0].sign() == Sign::Negative

View File

@@ -12,7 +12,6 @@ use std::error::Error;
use std::io::{Write, Read}; use std::io::{Write, Read};
use std::{io, fmt, mem}; use std::{io, fmt, mem};
use fst::Map;
use uuid::Uuid; use uuid::Uuid;
use rocksdb::rocksdb::{DB, Snapshot}; use rocksdb::rocksdb::{DB, Snapshot};
use serde::ser::{Serialize, Serializer, SerializeTuple}; use serde::ser::{Serialize, Serializer, SerializeTuple};
@@ -108,100 +107,3 @@ impl Sign {
} }
} }
} }
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub struct BlobName(Uuid);
impl BlobName {
pub fn new() -> BlobName {
BlobName(Uuid::new_v4())
}
pub fn as_bytes(&self) -> &[u8; 16] {
self.0.as_bytes()
}
}
impl fmt::Display for BlobName {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_tuple("BlobName")
.field(&self.0.to_hyphenated().to_string())
.finish()
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct BlobInfo {
pub sign: Sign,
pub name: BlobName,
}
impl BlobInfo {
pub fn new_positive() -> BlobInfo {
BlobInfo {
sign: Sign::Positive,
name: BlobName::new(),
}
}
pub fn new_negative() -> BlobInfo {
BlobInfo {
sign: Sign::Negative,
name: BlobName::new(),
}
}
pub fn read_from<R: Read>(reader: R) -> bincode::Result<BlobInfo> {
bincode::deserialize_from(reader)
}
pub fn read_from_slice(slice: &[u8]) -> bincode::Result<Vec<BlobInfo>> {
let len = slice.len() / mem::size_of::<BlobInfo>();
let mut blob_infos = Vec::with_capacity(len);
let mut cursor = io::Cursor::new(slice);
while blob_infos.len() != len {
let blob_info = BlobInfo::read_from(&mut cursor)?;
blob_infos.push(blob_info);
}
Ok(blob_infos)
}
pub fn write_into<W: Write>(&self, writer: W) -> bincode::Result<()> {
bincode::serialize_into(writer, self)
}
}
pub fn blobs_from_blob_infos(infos: &[BlobInfo], snapshot: &Snapshot<&DB>) -> Result<Vec<Blob>, Box<Error>> {
let mut blobs = Vec::with_capacity(infos.len());
for info in infos {
let blob = match info.sign {
Sign::Positive => {
let blob_key = Identifier::blob(info.name).fst_map().build();
let map = match snapshot.get(&blob_key)? {
Some(value) => value.to_vec(),
None => return Err(format!("No fst entry found for blob {}", info.name).into()),
};
let blob_key = Identifier::blob(info.name).document_indexes().build();
let doc_idx = match snapshot.get(&blob_key)? {
Some(value) => value.to_vec(),
None => return Err(format!("No doc-idx entry found for blob {}", info.name).into()),
};
PositiveBlob::from_bytes(map, doc_idx).map(Blob::Positive)?
},
Sign::Negative => {
let blob_key = Identifier::blob(info.name).document_ids().build();
let doc_ids = match snapshot.get(&blob_key)? {
Some(value) => value.to_vec(),
None => return Err(format!("No doc-ids entry found for blob {}", info.name).into()),
};
NegativeBlob::from_bytes(doc_ids).map(Blob::Negative)?
},
};
blobs.push(blob);
}
Ok(blobs)
}

View File

@@ -1,6 +1,6 @@
use std::io::{Read, Write}; use std::io::Write;
use std::error::Error;
use std::path::Path; use std::path::Path;
use std::error::Error;
use crate::DocumentId; use crate::DocumentId;
use crate::data::{DocIds, DocIdsBuilder}; use crate::data::{DocIds, DocIdsBuilder};
@@ -24,6 +24,10 @@ impl NegativeBlob {
Ok(NegativeBlob { doc_ids }) Ok(NegativeBlob { doc_ids })
} }
pub fn from_raw(doc_ids: DocIds) -> Self {
NegativeBlob { doc_ids }
}
pub fn as_ids(&self) -> &DocIds { pub fn as_ids(&self) -> &DocIds {
&self.doc_ids &self.doc_ids
} }

View File

@@ -1,7 +1,7 @@
use std::io::{Read, Write};
use std::error::Error;
use std::path::Path;
use std::fmt; use std::fmt;
use std::io::Write;
use std::path::Path;
use std::error::Error;
use fst::{Map, MapBuilder}; use fst::{Map, MapBuilder};
@@ -10,6 +10,7 @@ use crate::data::{DocIndexes, DocIndexesBuilder};
use serde::ser::{Serialize, Serializer, SerializeTuple}; use serde::ser::{Serialize, Serializer, SerializeTuple};
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor}; use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
#[derive(Default)]
pub struct PositiveBlob { pub struct PositiveBlob {
map: Map, map: Map,
indexes: DocIndexes, indexes: DocIndexes,
@@ -31,6 +32,10 @@ impl PositiveBlob {
Ok(PositiveBlob { map, indexes }) Ok(PositiveBlob { map, indexes })
} }
pub fn from_raw(map: Map, indexes: DocIndexes) -> Self {
PositiveBlob { map, indexes }
}
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> { pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
self.map.get(key).and_then(|index| self.indexes.get(index)) self.map.get(key).and_then(|index| self.indexes.get(index))
} }
@@ -109,7 +114,7 @@ impl<W: Write, X: Write> PositiveBlobBuilder<W, X> {
} }
pub fn finish(self) -> Result<(), Box<Error>> { pub fn finish(self) -> Result<(), Box<Error>> {
self.into_inner().map(|_| ()) self.into_inner().map(drop)
} }
pub fn into_inner(self) -> Result<(W, X), Box<Error>> { pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
@@ -130,6 +135,10 @@ impl<W: Write, X: Write> PositiveBlobBuilder<W, X> {
} }
impl PositiveBlobBuilder<Vec<u8>, Vec<u8>> { impl PositiveBlobBuilder<Vec<u8>, Vec<u8>> {
pub fn memory() -> Self {
PositiveBlobBuilder::new(Vec::new(), Vec::new())
}
pub fn build(self) -> Result<PositiveBlob, Box<Error>> { pub fn build(self) -> Result<PositiveBlob, Box<Error>> {
self.into_inner().and_then(|(m, i)| PositiveBlob::from_bytes(m, i)) self.into_inner().and_then(|(m, i)| PositiveBlob::from_bytes(m, i))
} }

View File

@@ -35,6 +35,10 @@ impl DocIds {
Ok(DocIds { data }) Ok(DocIds { data })
} }
pub fn from_document_ids(vec: Vec<DocumentId>) -> Self {
DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
}
pub fn contains(&self, doc: DocumentId) -> bool { pub fn contains(&self, doc: DocumentId) -> bool {
// FIXME prefer using the sdset::exponential_search function // FIXME prefer using the sdset::exponential_search function
self.doc_ids().binary_search(&doc).is_ok() self.doc_ids().binary_search(&doc).is_ok()

View File

@@ -19,7 +19,7 @@ struct Range {
end: u64, end: u64,
} }
#[derive(Clone)] #[derive(Clone, Default)]
pub struct DocIndexes { pub struct DocIndexes {
ranges: Data, ranges: Data,
indexes: Data, indexes: Data,
@@ -29,15 +29,14 @@ impl DocIndexes {
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> { pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
let mmap = MmapReadOnly::open_path(path)?; let mmap = MmapReadOnly::open_path(path)?;
let range_len = mmap.as_slice().read_u64::<LittleEndian>()?; let ranges_len_offset = mmap.as_slice().len() - mem::size_of::<u64>();
let range_len = range_len as usize * mem::size_of::<Range>(); let ranges_len = (&mmap.as_slice()[ranges_len_offset..]).read_u64::<LittleEndian>()?;
let ranges_len = ranges_len as usize * mem::size_of::<Range>();
let offset = mem::size_of::<u64>() as usize; let ranges_offset = ranges_len_offset - ranges_len;
let ranges = Data::Mmap(mmap.range(offset, range_len)); let ranges = Data::Mmap(mmap.range(ranges_offset, ranges_len));
let len = mmap.len() - range_len - offset; let indexes = Data::Mmap(mmap.range(0, ranges_offset));
let offset = offset + range_len;
let indexes = Data::Mmap(mmap.range(offset, len));
Ok(DocIndexes { ranges, indexes }) Ok(DocIndexes { ranges, indexes })
} }
@@ -45,19 +44,22 @@ impl DocIndexes {
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> { pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
let vec = Arc::new(vec); let vec = Arc::new(vec);
let range_len = vec.as_slice().read_u64::<LittleEndian>()?; let ranges_len_offset = vec.len() - mem::size_of::<u64>();
let range_len = range_len as usize * mem::size_of::<Range>(); let ranges_len = (&vec[ranges_len_offset..]).read_u64::<LittleEndian>()?;
let ranges_len = ranges_len as usize * mem::size_of::<Range>();
let offset = mem::size_of::<u64>() as usize; let ranges_offset = ranges_len_offset - ranges_len;
let ranges = Data::Shared { let ranges = Data::Shared {
vec: vec.clone(), vec: vec.clone(),
offset, offset: ranges_offset,
len: range_len len: ranges_len,
}; };
let len = vec.len() - range_len - offset; let indexes = Data::Shared {
let offset = offset + range_len; vec: vec,
let indexes = Data::Shared { vec, offset, len }; offset: 0,
len: ranges_offset,
};
Ok(DocIndexes { ranges, indexes }) Ok(DocIndexes { ranges, indexes })
} }
@@ -94,6 +96,53 @@ impl Serialize for DocIndexes {
} }
} }
pub struct RawDocIndexesBuilder<W> {
ranges: Vec<Range>,
wtr: W,
}
impl RawDocIndexesBuilder<Vec<u8>> {
pub fn memory() -> Self {
RawDocIndexesBuilder::new(Vec::new())
}
}
impl<W: Write> RawDocIndexesBuilder<W> {
pub fn new(wtr: W) -> Self {
RawDocIndexesBuilder {
ranges: Vec::new(),
wtr: wtr,
}
}
pub fn insert(&mut self, indexes: &[DocIndex]) -> io::Result<()> {
let len = indexes.len() as u64;
let start = self.ranges.last().map(|r| r.start).unwrap_or(0);
let range = Range { start, end: start + len };
self.ranges.push(range);
// write the values
let indexes = unsafe { into_u8_slice(indexes) };
self.wtr.write_all(indexes)
}
pub fn finish(self) -> io::Result<()> {
self.into_inner().map(drop)
}
pub fn into_inner(mut self) -> io::Result<W> {
// write the ranges
let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
self.wtr.write_all(ranges)?;
// write the length of the ranges
let len = ranges.len() as u64;
self.wtr.write_u64::<LittleEndian>(len)?;
Ok(self.wtr)
}
}
pub struct DocIndexesBuilder<W> { pub struct DocIndexesBuilder<W> {
keys: BTreeMap<String, u64>, keys: BTreeMap<String, u64>,
indexes: Vec<Vec<DocIndex>>, indexes: Vec<Vec<DocIndex>>,
@@ -136,29 +185,27 @@ impl<W: Write> DocIndexesBuilder<W> {
} }
pub fn finish(self) -> io::Result<()> { pub fn finish(self) -> io::Result<()> {
self.into_inner().map(|_| ()) self.into_inner().map(drop)
} }
pub fn into_inner(mut self) -> io::Result<W> { pub fn into_inner(mut self) -> io::Result<W> {
for vec in &mut self.indexes { for vec in &mut self.indexes {
vec.sort_unstable(); vec.sort_unstable();
} }
let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs); let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs);
// write values first
let slice = unsafe { into_u8_slice(values.as_slice()) };
self.wtr.write_all(slice)?;
// write ranges after
let slice = unsafe { into_u8_slice(ranges.as_slice()) };
self.wtr.write_all(slice)?;
// write the length of the ranges
let len = ranges.len() as u64; let len = ranges.len() as u64;
// TODO check if this is correct
self.wtr.write_u64::<LittleEndian>(len)?; self.wtr.write_u64::<LittleEndian>(len)?;
unsafe {
// write Ranges first
let slice = into_u8_slice(ranges.as_slice());
self.wtr.write_all(slice)?;
// write Values after
let slice = into_u8_slice(values.as_slice());
self.wtr.write_all(slice)?;
}
self.wtr.flush()?; self.wtr.flush()?;
Ok(self.wtr) Ok(self.wtr)

View File

@@ -7,7 +7,7 @@ use std::sync::Arc;
use fst::raw::MmapReadOnly; use fst::raw::MmapReadOnly;
pub use self::doc_ids::{DocIds, DocIdsBuilder}; pub use self::doc_ids::{DocIds, DocIdsBuilder};
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder}; pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder, RawDocIndexesBuilder};
#[derive(Clone)] #[derive(Clone)]
enum Data { enum Data {
@@ -19,6 +19,16 @@ enum Data {
Mmap(MmapReadOnly), Mmap(MmapReadOnly),
} }
impl Default for Data {
fn default() -> Data {
Data::Shared {
vec: Arc::default(),
offset: 0,
len: 0,
}
}
}
impl Deref for Data { impl Deref for Data {
type Target = [u8]; type Target = [u8];

View File

@@ -3,7 +3,6 @@ use std::io::Write;
use byteorder::{NetworkEndian, WriteBytesExt}; use byteorder::{NetworkEndian, WriteBytesExt};
use crate::index::schema::SchemaAttr; use crate::index::schema::SchemaAttr;
use crate::blob::BlobName;
use crate::DocumentId; use crate::DocumentId;
pub struct Identifier { pub struct Identifier {
@@ -17,13 +16,6 @@ impl Identifier {
Data { inner } Data { inner }
} }
pub fn blob(name: BlobName) -> Blob {
let mut inner = Vec::new();
let _ = inner.write(b"blob");
let _ = inner.write(name.as_bytes());
Blob { inner }
}
pub fn document(id: DocumentId) -> Document { pub fn document(id: DocumentId) -> Document {
let mut inner = Vec::new(); let mut inner = Vec::new();
let _ = inner.write(b"docu"); let _ = inner.write(b"docu");
@@ -38,9 +30,9 @@ pub struct Data {
} }
impl Data { impl Data {
pub fn blobs_order(mut self) -> Self { pub fn index(mut self) -> Self {
let _ = self.inner.write(b"-"); let _ = self.inner.write(b"-");
let _ = self.inner.write(b"blobs-order"); let _ = self.inner.write(b"index");
self self
} }
@@ -55,34 +47,6 @@ impl Data {
} }
} }
pub struct Blob {
inner: Vec<u8>,
}
impl Blob {
pub fn document_indexes(mut self) -> Self {
let _ = self.inner.write(b"-");
let _ = self.inner.write(b"doc-idx");
self
}
pub fn document_ids(mut self) -> Self {
let _ = self.inner.write(b"-");
let _ = self.inner.write(b"doc-ids");
self
}
pub fn fst_map(mut self) -> Self {
let _ = self.inner.write(b"-");
let _ = self.inner.write(b"fst");
self
}
pub fn build(self) -> Vec<u8> {
self.inner
}
}
pub struct Document { pub struct Document {
inner: Vec<u8>, inner: Vec<u8>,
} }

View File

@@ -2,76 +2,163 @@ pub mod identifier;
pub mod schema; pub mod schema;
pub mod update; pub mod update;
use std::io;
use std::rc::Rc;
use std::error::Error; use std::error::Error;
use std::fs::{self, File}; use std::path::Path;
use std::fmt::{self, Write};
use std::ops::{Deref, BitOr};
use std::path::{Path, PathBuf};
use std::collections::{BTreeSet, BTreeMap};
use fs2::FileExt; use fst::map::{Map, MapBuilder, OpBuilder};
use fst::{IntoStreamer, Streamer};
use sdset::duo::Union as SdUnion;
use sdset::duo::DifferenceByKey;
use sdset::{Set, SetOperation};
use ::rocksdb::rocksdb::Writable; use ::rocksdb::rocksdb::Writable;
use ::rocksdb::{rocksdb, rocksdb_options}; use ::rocksdb::{rocksdb, rocksdb_options};
use ::rocksdb::merge_operator::MergeOperands; use ::rocksdb::merge_operator::MergeOperands;
use crate::DocIndex;
use crate::automaton;
use crate::rank::Document; use crate::rank::Document;
use crate::data::DocIdsBuilder;
use crate::{DocIndex, DocumentId};
use crate::index::schema::Schema; use crate::index::schema::Schema;
use crate::index::update::Update; use crate::index::update::Update;
use crate::tokenizer::TokenizerBuilder;
use crate::index::identifier::Identifier; use crate::index::identifier::Identifier;
use crate::blob::{PositiveBlobBuilder, PositiveBlob, BlobInfo, Sign, Blob, blobs_from_blob_infos};
use crate::tokenizer::{TokenizerBuilder, DefaultBuilder, Tokenizer};
use crate::rank::{criterion, Config, RankedStream}; use crate::rank::{criterion, Config, RankedStream};
use crate::automaton; use crate::data::{DocIds, DocIndexes, RawDocIndexesBuilder};
use crate::blob::{PositiveBlob, NegativeBlob, Blob};
fn simple_vec_append(key: &[u8], value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> { fn union_positives(a: &PositiveBlob, b: &PositiveBlob) -> Result<PositiveBlob, Box<Error>> {
let mut output = Vec::new(); let (a_map, a_indexes) = (a.as_map(), a.as_indexes());
for bytes in operands.chain(value) { let (b_map, b_indexes) = (b.as_map(), b.as_indexes());
output.extend_from_slice(bytes);
let mut map_builder = MapBuilder::memory();
let mut indexes_builder = RawDocIndexesBuilder::memory();
let op_builder = OpBuilder::new().add(a_map).add(b_map);
let mut stream = op_builder.union();
let mut i = 0;
while let Some((key, indexed)) = stream.next() {
let doc_idx: Vec<DocIndex> = match indexed {
[a, b] => {
let a_doc_idx = a_indexes.get(a.value).expect("BUG: could not find document indexes");
let b_doc_idx = b_indexes.get(b.value).expect("BUG: could not find document indexes");
let a_doc_idx = Set::new_unchecked(a_doc_idx);
let b_doc_idx = Set::new_unchecked(b_doc_idx);
let sd_union = SdUnion::new(a_doc_idx, b_doc_idx);
sd_union.into_set_buf().into_vec()
},
[a] => {
let indexes = if a.index == 0 { a_indexes } else { b_indexes };
let doc_idx = indexes.get(a.value).expect("BUG: could not find document indexes");
doc_idx.to_vec()
},
_ => unreachable!(),
};
if !doc_idx.is_empty() {
map_builder.insert(key, i)?;
indexes_builder.insert(&doc_idx)?;
i += 1;
} }
output
} }
pub struct MergeBuilder { let inner = map_builder.into_inner()?;
blobs: Vec<Blob>, let map = Map::from_bytes(inner)?;
let inner = indexes_builder.into_inner()?;
let indexes = DocIndexes::from_bytes(inner)?;
Ok(PositiveBlob::from_raw(map, indexes))
} }
impl MergeBuilder { fn union_negatives(a: &NegativeBlob, b: &NegativeBlob) -> NegativeBlob {
pub fn new() -> MergeBuilder { let a_doc_ids = a.as_ids().doc_ids();
MergeBuilder { blobs: Vec::new() } let b_doc_ids = b.as_ids().doc_ids();
let a_doc_ids = Set::new_unchecked(a_doc_ids);
let b_doc_ids = Set::new_unchecked(b_doc_ids);
let sd_union = SdUnion::new(a_doc_ids, b_doc_ids);
let doc_ids = sd_union.into_set_buf().into_vec();
let doc_ids = DocIds::from_document_ids(doc_ids);
NegativeBlob::from_raw(doc_ids)
} }
pub fn push(&mut self, blob: Blob) { fn merge_positive_negative(pos: &PositiveBlob, neg: &NegativeBlob) -> Result<PositiveBlob, Box<Error>> {
if blob.sign() == Sign::Negative && self.blobs.is_empty() { return } let (map, indexes) = (pos.as_map(), pos.as_indexes());
self.blobs.push(blob); let doc_ids = neg.as_ids().doc_ids();
let doc_ids = Set::new_unchecked(doc_ids);
let mut map_builder = MapBuilder::memory();
let mut indexes_builder = RawDocIndexesBuilder::memory();
let mut stream = map.into_stream();
let mut i = 0;
while let Some((key, index)) = stream.next() {
let doc_idx = indexes.get(index).expect("BUG: could not find document indexes");
let doc_idx = Set::new_unchecked(doc_idx);
let diff = DifferenceByKey::new(doc_idx, doc_ids, |&d| d.document_id, |id| *id);
let doc_idx: Vec<DocIndex> = diff.into_set_buf().into_vec();
map_builder.insert(key, i)?;
indexes_builder.insert(&doc_idx)?;
i += 1;
} }
pub fn merge(self) -> PositiveBlob { let inner = map_builder.into_inner()?;
unimplemented!() let map = Map::from_bytes(inner)?;
let inner = indexes_builder.into_inner()?;
let indexes = DocIndexes::from_bytes(inner)?;
Ok(PositiveBlob::from_raw(map, indexes))
}
#[derive(Default)]
struct Merge {
blob: PositiveBlob,
}
impl Merge {
fn new(blob: PositiveBlob) -> Merge {
Merge { blob }
}
fn merge(&mut self, blob: Blob) {
self.blob = match blob {
Blob::Positive(blob) => union_positives(&self.blob, &blob).unwrap(),
Blob::Negative(blob) => merge_positive_negative(&self.blob, &blob).unwrap(),
};
}
fn build(self) -> PositiveBlob {
self.blob
} }
} }
fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> { fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
if key != b"data-index" { panic!("The merge operator only allow \"data-index\" merging") } if key != b"data-index" { panic!("The merge operator only supports \"data-index\" merging") }
let mut merge_builder = MergeBuilder::new(); let mut merge = match existing_value {
Some(existing_value) => {
if let Some(existing_value) = existing_value { let blob = bincode::deserialize(existing_value).expect("BUG: could not deserialize data-index");
let base: PositiveBlob = bincode::deserialize(existing_value).unwrap(); // FIXME what do we do here ? Merge::new(blob)
merge_builder.push(Blob::Positive(base)); },
} None => Merge::default(),
};
for bytes in operands { for bytes in operands {
let blob: Blob = bincode::deserialize(bytes).unwrap(); let blob = bincode::deserialize(bytes).expect("BUG: could not deserialize blobs");
merge_builder.push(blob); merge.merge(blob);
} }
let blob = merge_builder.merge(); let blob = merge.build();
// blob.to_vec() bincode::serialize(&blob).expect("BUG: could not serialize merged blob")
unimplemented!()
} }
pub struct Index { pub struct Index {
@@ -95,7 +182,7 @@ impl Index {
opts.create_if_missing(true); opts.create_if_missing(true);
let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new(); let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new();
cf_opts.add_merge_operator("blobs order operator", simple_vec_append); cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?; let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
@@ -114,7 +201,7 @@ impl Index {
opts.create_if_missing(false); opts.create_if_missing(false);
let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new(); let mut cf_opts = rocksdb_options::ColumnFamilyOptions::new();
cf_opts.add_merge_operator("blobs order operator", simple_vec_append); cf_opts.add_merge_operator("data-index merge operator", merge_indexes);
let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?; let database = rocksdb::DB::open_cf(opts, &path, vec![("default", cf_opts)])?;
@@ -150,12 +237,9 @@ impl Index {
// this snapshot will allow consistent reads for the whole search operation // this snapshot will allow consistent reads for the whole search operation
let snapshot = self.database.snapshot(); let snapshot = self.database.snapshot();
let data_key = Identifier::data().blobs_order().build(); let index_key = Identifier::data().index().build();
let blobs = match snapshot.get(&data_key)? { let map = match snapshot.get(&index_key)? {
Some(value) => { Some(value) => bincode::deserialize(&value)?,
let blob_infos = BlobInfo::read_from_slice(&value)?;
blobs_from_blob_infos(&blob_infos, &snapshot)?
},
None => Vec::new(), None => Vec::new(),
}; };
@@ -166,7 +250,7 @@ impl Index {
} }
let config = Config { let config = Config {
blobs: &blobs, map: map,
automatons: automatons, automatons: automatons,
criteria: criterion::default(), criteria: criterion::default(),
distinct: ((), 1), distinct: ((), 1),

View File

@@ -1,8 +1,6 @@
use std::path::PathBuf; use std::path::PathBuf;
use std::error::Error; use std::error::Error;
use ::rocksdb::rocksdb_options;
use crate::blob::{BlobName, Sign}; use crate::blob::{BlobName, Sign};
mod negative_update; mod negative_update;

View File

@@ -1,7 +1,6 @@
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::path::PathBuf; use std::path::PathBuf;
use std::error::Error; use std::error::Error;
use std::fmt::Write;
use ::rocksdb::rocksdb_options; use ::rocksdb::rocksdb_options;