mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-08-02 03:40:00 +00:00
chore: Move index related things to the meilidb-core workspace member
This commit is contained in:
61
meilidb-core/src/data/doc_ids.rs
Normal file
61
meilidb-core/src/data/doc_ids.rs
Normal file
@ -0,0 +1,61 @@
|
||||
use std::slice::from_raw_parts;
|
||||
use std::mem::size_of;
|
||||
use std::error::Error;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use sdset::Set;
|
||||
|
||||
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
|
||||
use crate::write_to_bytes::WriteToBytes;
|
||||
use crate::data::SharedData;
|
||||
use crate::DocumentId;
|
||||
|
||||
use super::into_u8_slice;
|
||||
|
||||
#[derive(Default, Clone)]
|
||||
pub struct DocIds(SharedData);
|
||||
|
||||
impl DocIds {
|
||||
pub fn new(ids: &Set<DocumentId>) -> DocIds {
|
||||
let bytes = unsafe { into_u8_slice(ids.as_slice()) };
|
||||
let data = SharedData::from_bytes(bytes.to_vec());
|
||||
DocIds(data)
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<Set<DocumentId>> for DocIds {
|
||||
fn as_ref(&self) -> &Set<DocumentId> {
|
||||
let slice = &self.0;
|
||||
let ptr = slice.as_ptr() as *const DocumentId;
|
||||
let len = slice.len() / size_of::<DocumentId>();
|
||||
let slice = unsafe { from_raw_parts(ptr, len) };
|
||||
Set::new_unchecked(slice)
|
||||
}
|
||||
}
|
||||
|
||||
impl FromSharedDataCursor for DocIds {
|
||||
type Error = Box<Error>;
|
||||
|
||||
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIds, Self::Error> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let data = cursor.extract(len);
|
||||
|
||||
Ok(DocIds(data))
|
||||
}
|
||||
}
|
||||
|
||||
impl WriteToBytes for DocIds {
|
||||
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let len = self.0.len() as u64;
|
||||
bytes.write_u64::<LittleEndian>(len).unwrap();
|
||||
bytes.extend_from_slice(&self.0);
|
||||
}
|
||||
}
|
231
meilidb-core/src/data/doc_indexes.rs
Normal file
231
meilidb-core/src/data/doc_indexes.rs
Normal file
@ -0,0 +1,231 @@
|
||||
use std::io::{self, Write};
|
||||
use std::slice::from_raw_parts;
|
||||
use std::mem::size_of;
|
||||
use std::ops::Index;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use sdset::Set;
|
||||
|
||||
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
|
||||
use crate::write_to_bytes::WriteToBytes;
|
||||
use crate::data::SharedData;
|
||||
use crate::DocIndex;
|
||||
|
||||
use super::into_u8_slice;
|
||||
|
||||
#[derive(Debug)]
|
||||
#[repr(C)]
|
||||
struct Range {
|
||||
start: u64,
|
||||
end: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct DocIndexes {
|
||||
ranges: SharedData,
|
||||
indexes: SharedData,
|
||||
}
|
||||
|
||||
impl DocIndexes {
|
||||
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
|
||||
self.ranges().get(index).map(|Range { start, end }| {
|
||||
let start = *start as usize;
|
||||
let end = *end as usize;
|
||||
let slice = &self.indexes()[start..end];
|
||||
Set::new_unchecked(slice)
|
||||
})
|
||||
}
|
||||
|
||||
fn ranges(&self) -> &[Range] {
|
||||
let slice = &self.ranges;
|
||||
let ptr = slice.as_ptr() as *const Range;
|
||||
let len = slice.len() / size_of::<Range>();
|
||||
unsafe { from_raw_parts(ptr, len) }
|
||||
}
|
||||
|
||||
fn indexes(&self) -> &[DocIndex] {
|
||||
let slice = &self.indexes;
|
||||
let ptr = slice.as_ptr() as *const DocIndex;
|
||||
let len = slice.len() / size_of::<DocIndex>();
|
||||
unsafe { from_raw_parts(ptr, len) }
|
||||
}
|
||||
}
|
||||
|
||||
impl Index<usize> for DocIndexes {
|
||||
type Output = [DocIndex];
|
||||
|
||||
fn index(&self, index: usize) -> &Self::Output {
|
||||
match self.get(index) {
|
||||
Some(indexes) => indexes,
|
||||
None => panic!("index {} out of range for a maximum of {} ranges", index, self.ranges().len()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromSharedDataCursor for DocIndexes {
|
||||
type Error = io::Error;
|
||||
|
||||
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIndexes, Self::Error> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let ranges = cursor.extract(len);
|
||||
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let indexes = cursor.extract(len);
|
||||
|
||||
Ok(DocIndexes { ranges, indexes })
|
||||
}
|
||||
}
|
||||
|
||||
impl WriteToBytes for DocIndexes {
|
||||
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let ranges_len = self.ranges.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(ranges_len);
|
||||
bytes.extend_from_slice(&self.ranges);
|
||||
|
||||
let indexes_len = self.indexes.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
|
||||
bytes.extend_from_slice(&self.indexes);
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocIndexesBuilder<W> {
|
||||
ranges: Vec<Range>,
|
||||
indexes: Vec<DocIndex>,
|
||||
wtr: W,
|
||||
}
|
||||
|
||||
impl DocIndexesBuilder<Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
DocIndexesBuilder {
|
||||
ranges: Vec::new(),
|
||||
indexes: Vec::new(),
|
||||
wtr: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write> DocIndexesBuilder<W> {
|
||||
pub fn new(wtr: W) -> Self {
|
||||
DocIndexesBuilder {
|
||||
ranges: Vec::new(),
|
||||
indexes: Vec::new(),
|
||||
wtr: wtr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, indexes: &Set<DocIndex>) {
|
||||
let len = indexes.len() as u64;
|
||||
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
|
||||
let range = Range { start, end: start + len };
|
||||
self.ranges.push(range);
|
||||
|
||||
self.indexes.extend_from_slice(indexes);
|
||||
}
|
||||
|
||||
pub fn finish(self) -> io::Result<()> {
|
||||
self.into_inner().map(drop)
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
let ranges = unsafe { into_u8_slice(&self.ranges) };
|
||||
let len = ranges.len() as u64;
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
self.wtr.write_all(ranges)?;
|
||||
|
||||
let indexes = unsafe { into_u8_slice(&self.indexes) };
|
||||
let len = indexes.len() as u64;
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
self.wtr.write_all(indexes)?;
|
||||
|
||||
Ok(self.wtr)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::error::Error;
|
||||
use crate::DocumentId;
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
|
||||
let a = DocIndex {
|
||||
document_id: DocumentId(0),
|
||||
attribute: 3,
|
||||
word_index: 11,
|
||||
char_index: 30,
|
||||
char_length: 4,
|
||||
};
|
||||
let b = DocIndex {
|
||||
document_id: DocumentId(1),
|
||||
attribute: 4,
|
||||
word_index: 21,
|
||||
char_index: 35,
|
||||
char_length: 6,
|
||||
};
|
||||
let c = DocIndex {
|
||||
document_id: DocumentId(2),
|
||||
attribute: 8,
|
||||
word_index: 2,
|
||||
char_index: 89,
|
||||
char_length: 6,
|
||||
};
|
||||
|
||||
let mut builder = DocIndexesBuilder::memory();
|
||||
|
||||
builder.insert(Set::new(&[a])?);
|
||||
builder.insert(Set::new(&[a, b, c])?);
|
||||
builder.insert(Set::new(&[a, c])?);
|
||||
|
||||
let bytes = builder.into_inner()?;
|
||||
let docs = DocIndexes::from_bytes(bytes)?;
|
||||
|
||||
assert_eq!(docs.get(0), Some(Set::new(&[a])?));
|
||||
assert_eq!(docs.get(1), Some(Set::new(&[a, b, c])?));
|
||||
assert_eq!(docs.get(2), Some(Set::new(&[a, c])?));
|
||||
assert_eq!(docs.get(3), None);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn serialize_deserialize() -> Result<(), Box<Error>> {
|
||||
let a = DocIndex {
|
||||
document_id: DocumentId(0),
|
||||
attribute: 3,
|
||||
word_index: 11,
|
||||
char_index: 30,
|
||||
char_length: 4,
|
||||
};
|
||||
let b = DocIndex {
|
||||
document_id: DocumentId(1),
|
||||
attribute: 4,
|
||||
word_index: 21,
|
||||
char_index: 35,
|
||||
char_length: 6,
|
||||
};
|
||||
let c = DocIndex {
|
||||
document_id: DocumentId(2),
|
||||
attribute: 8,
|
||||
word_index: 2,
|
||||
char_index: 89,
|
||||
char_length: 6,
|
||||
};
|
||||
|
||||
let mut builder = DocIndexesBuilder::memory();
|
||||
|
||||
builder.insert(Set::new(&[a])?);
|
||||
builder.insert(Set::new(&[a, b, c])?);
|
||||
builder.insert(Set::new(&[a, c])?);
|
||||
|
||||
let builder_bytes = builder.into_inner()?;
|
||||
let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
|
||||
|
||||
let mut bytes = Vec::new();
|
||||
docs.write_to_bytes(&mut bytes);
|
||||
|
||||
assert_eq!(builder_bytes, bytes);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
16
meilidb-core/src/data/mod.rs
Normal file
16
meilidb-core/src/data/mod.rs
Normal file
@ -0,0 +1,16 @@
|
||||
mod doc_ids;
|
||||
mod doc_indexes;
|
||||
mod shared_data;
|
||||
|
||||
use std::slice::from_raw_parts;
|
||||
use std::mem::size_of;
|
||||
|
||||
pub use self::doc_ids::DocIds;
|
||||
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
|
||||
pub use self::shared_data::SharedData;
|
||||
|
||||
unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
|
||||
let ptr = slice.as_ptr() as *const u8;
|
||||
let len = slice.len() * size_of::<T>();
|
||||
from_raw_parts(ptr, len)
|
||||
}
|
48
meilidb-core/src/data/shared_data.rs
Normal file
48
meilidb-core/src/data/shared_data.rs
Normal file
@ -0,0 +1,48 @@
|
||||
use std::sync::Arc;
|
||||
use std::ops::Deref;
|
||||
|
||||
#[derive(Default, Clone)]
|
||||
pub struct SharedData {
|
||||
pub bytes: Arc<Vec<u8>>,
|
||||
pub offset: usize,
|
||||
pub len: usize,
|
||||
}
|
||||
|
||||
impl SharedData {
|
||||
pub fn from_bytes(vec: Vec<u8>) -> SharedData {
|
||||
let len = vec.len();
|
||||
let bytes = Arc::from(vec);
|
||||
SharedData::new(bytes, 0, len)
|
||||
}
|
||||
|
||||
pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
|
||||
SharedData { bytes, offset, len }
|
||||
}
|
||||
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
&self.bytes[self.offset..self.offset + self.len]
|
||||
}
|
||||
|
||||
pub fn range(&self, offset: usize, len: usize) -> SharedData {
|
||||
assert!(offset + len <= self.len);
|
||||
SharedData {
|
||||
bytes: self.bytes.clone(),
|
||||
offset: self.offset + offset,
|
||||
len: len,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for SharedData {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.as_slice()
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for SharedData {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.as_slice()
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user