mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-24 20:46:27 +00:00 
			
		
		
		
	chore: Move index related things to the meilidb-core workspace member
This commit is contained in:
		
							
								
								
									
										60
									
								
								Cargo.toml
									
									
									
									
									
								
							
							
						
						
									
										60
									
								
								Cargo.toml
									
									
									
									
									
								
							| @@ -1,55 +1,5 @@ | |||||||
| [package] | [workspace] | ||||||
| edition = "2018" | members = [ | ||||||
| name = "meilidb" |     "meilidb", | ||||||
| version = "0.3.2" |     "meilidb-core", | ||||||
| authors = ["Kerollmops <renault.cle@gmail.com>"] | ] | ||||||
|  |  | ||||||
| [dependencies] |  | ||||||
| arc-swap = "0.3.7" |  | ||||||
| bincode = "1.1.2" |  | ||||||
| byteorder = "1.3.1" |  | ||||||
| fst = "0.3.3" |  | ||||||
| hashbrown = { version = "0.1.8", features = ["serde"] } |  | ||||||
| lazy_static = "1.2.0" |  | ||||||
| levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } |  | ||||||
| linked-hash-map = { version = "0.5.1", features = ["serde_impl"] } |  | ||||||
| lockfree = "0.5.1" |  | ||||||
| log = "0.4.6" |  | ||||||
| rayon = "1.0.3" |  | ||||||
| sdset = "0.3.1" |  | ||||||
| serde = "1.0.88" |  | ||||||
| serde_derive = "1.0.88" |  | ||||||
| serde_json = { version = "1.0.38", features = ["preserve_order"] } |  | ||||||
| size_format = "1.0.2" |  | ||||||
| slice-group-by = "0.2.4" |  | ||||||
| unidecode = "0.3.0" |  | ||||||
|  |  | ||||||
| [dependencies.toml] |  | ||||||
| git = "https://github.com/Kerollmops/toml-rs.git" |  | ||||||
| features = ["preserve_order"] |  | ||||||
| rev = "0372ba6" |  | ||||||
|  |  | ||||||
| [dependencies.rocksdb] |  | ||||||
| git = "https://github.com/pingcap/rust-rocksdb.git" |  | ||||||
| rev = "306e201" |  | ||||||
|  |  | ||||||
| [features] |  | ||||||
| default = ["simd"] |  | ||||||
| i128 = ["bincode/i128", "byteorder/i128"] |  | ||||||
| portable = ["rocksdb/portable"] |  | ||||||
| simd = ["rocksdb/sse"] |  | ||||||
| nightly = ["hashbrown/nightly", "slice-group-by/nightly"] |  | ||||||
|  |  | ||||||
| [dev-dependencies] |  | ||||||
| csv = "1.0.5" |  | ||||||
| env_logger = "0.6.0" |  | ||||||
| jemallocator = "0.1.9" |  | ||||||
| quickcheck = "0.8.2" |  | ||||||
| rand = "0.6.5" |  | ||||||
| rand_xorshift = "0.1.1" |  | ||||||
| structopt = "0.2.14" |  | ||||||
| tempfile = "3.0.7" |  | ||||||
| termcolor = "1.0.4" |  | ||||||
|  |  | ||||||
| [profile.release] |  | ||||||
| debug = true |  | ||||||
|   | |||||||
							
								
								
									
										21
									
								
								meilidb-core/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								meilidb-core/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | |||||||
|  | [package] | ||||||
|  | name = "meilidb-core" | ||||||
|  | version = "0.1.0" | ||||||
|  | authors = ["Kerollmops <renault.cle@gmail.com>"] | ||||||
|  | edition = "2018" | ||||||
|  |  | ||||||
|  | [dependencies] | ||||||
|  | byteorder = "1.3.1" | ||||||
|  | fst = "0.3.3" | ||||||
|  | hashbrown = "0.1.8" | ||||||
|  | lazy_static = "1.2.0" | ||||||
|  | levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } | ||||||
|  | log = "0.4.6" | ||||||
|  | rayon = "1.0.3" | ||||||
|  | sdset = "0.3.1" | ||||||
|  | serde = "1.0.88" | ||||||
|  | serde_derive = "1.0.88" | ||||||
|  | slice-group-by = "0.2.4" | ||||||
|  |  | ||||||
|  | [features] | ||||||
|  | i128 = ["byteorder/i128"] | ||||||
| @@ -1,7 +1,6 @@ | |||||||
| use std::cmp::Ordering; | use std::cmp::Ordering; | ||||||
| 
 | use crate::criterion::Criterion; | ||||||
| use crate::rank::criterion::Criterion; | use crate::RawDocument; | ||||||
| use crate::rank::RawDocument; |  | ||||||
| 
 | 
 | ||||||
| #[derive(Debug, Clone, Copy)] | #[derive(Debug, Clone, Copy)] | ||||||
| pub struct DocumentId; | pub struct DocumentId; | ||||||
| @@ -1,9 +1,7 @@ | |||||||
| use std::cmp::Ordering; | use std::cmp::Ordering; | ||||||
| 
 |  | ||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
| 
 | use crate::criterion::Criterion; | ||||||
| use crate::rank::criterion::Criterion; | use crate::RawDocument; | ||||||
| use crate::rank::RawDocument; |  | ||||||
| 
 | 
 | ||||||
| #[inline] | #[inline] | ||||||
| fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize { | fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize { | ||||||
| @@ -4,11 +4,11 @@ mod words_proximity; | |||||||
| mod sum_of_words_attribute; | mod sum_of_words_attribute; | ||||||
| mod sum_of_words_position; | mod sum_of_words_position; | ||||||
| mod exact; | mod exact; | ||||||
| mod sort_by_attr; | // mod sort_by_attr;
 | ||||||
| mod document_id; | mod document_id; | ||||||
| 
 | 
 | ||||||
| use std::cmp::Ordering; | use std::cmp::Ordering; | ||||||
| use crate::rank::RawDocument; | use crate::RawDocument; | ||||||
| 
 | 
 | ||||||
| pub use self::{ | pub use self::{ | ||||||
|     sum_of_typos::SumOfTypos, |     sum_of_typos::SumOfTypos, | ||||||
| @@ -17,7 +17,7 @@ pub use self::{ | |||||||
|     sum_of_words_attribute::SumOfWordsAttribute, |     sum_of_words_attribute::SumOfWordsAttribute, | ||||||
|     sum_of_words_position::SumOfWordsPosition, |     sum_of_words_position::SumOfWordsPosition, | ||||||
|     exact::Exact, |     exact::Exact, | ||||||
|     sort_by_attr::SortByAttr, |     // sort_by_attr::SortByAttr,
 | ||||||
|     document_id::DocumentId, |     document_id::DocumentId, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| @@ -1,9 +1,7 @@ | |||||||
| use std::cmp::Ordering; | use std::cmp::Ordering; | ||||||
| 
 |  | ||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
| 
 | use crate::criterion::Criterion; | ||||||
| use crate::rank::criterion::Criterion; | use crate::RawDocument; | ||||||
| use crate::rank::RawDocument; |  | ||||||
| 
 | 
 | ||||||
| #[inline] | #[inline] | ||||||
| fn number_of_query_words(query_index: &[u32]) -> usize { | fn number_of_query_words(query_index: &[u32]) -> usize { | ||||||
| @@ -3,9 +3,9 @@ use std::error::Error; | |||||||
| use std::fmt; | use std::fmt; | ||||||
| 
 | 
 | ||||||
| use crate::database::schema::{Schema, SchemaAttr}; | use crate::database::schema::{Schema, SchemaAttr}; | ||||||
| use crate::rank::criterion::Criterion; | use crate::criterion::Criterion; | ||||||
| use crate::database::RankedMap; | use crate::database::RankedMap; | ||||||
| use crate::rank::RawDocument; | use crate::RawDocument; | ||||||
| 
 | 
 | ||||||
| /// An helper struct that permit to sort documents by
 | /// An helper struct that permit to sort documents by
 | ||||||
| /// some of their stored attributes.
 | /// some of their stored attributes.
 | ||||||
| @@ -2,8 +2,8 @@ use std::cmp::Ordering; | |||||||
| 
 | 
 | ||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
| 
 | 
 | ||||||
| use crate::rank::criterion::Criterion; | use crate::criterion::Criterion; | ||||||
| use crate::rank::RawDocument; | use crate::RawDocument; | ||||||
| 
 | 
 | ||||||
| // This function is a wrong logarithmic 10 function.
 | // This function is a wrong logarithmic 10 function.
 | ||||||
| // It is safe to panic on input number higher than 3,
 | // It is safe to panic on input number higher than 3,
 | ||||||
| @@ -1,9 +1,7 @@ | |||||||
| use std::cmp::Ordering; | use std::cmp::Ordering; | ||||||
| 
 |  | ||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
| 
 | use crate::criterion::Criterion; | ||||||
| use crate::rank::criterion::Criterion; | use crate::RawDocument; | ||||||
| use crate::rank::RawDocument; |  | ||||||
| 
 | 
 | ||||||
| #[inline] | #[inline] | ||||||
| fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { | fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { | ||||||
| @@ -1,9 +1,7 @@ | |||||||
| use std::cmp::Ordering; | use std::cmp::Ordering; | ||||||
| 
 |  | ||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
| 
 | use crate::criterion::Criterion; | ||||||
| use crate::rank::criterion::Criterion; | use crate::RawDocument; | ||||||
| use crate::rank::RawDocument; |  | ||||||
| 
 | 
 | ||||||
| #[inline] | #[inline] | ||||||
| fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize { | fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize { | ||||||
| @@ -1,9 +1,7 @@ | |||||||
| use std::cmp::{self, Ordering}; | use std::cmp::{self, Ordering}; | ||||||
| 
 |  | ||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
| 
 | use crate::criterion::Criterion; | ||||||
| use crate::rank::criterion::Criterion; | use crate::RawDocument; | ||||||
| use crate::rank::RawDocument; |  | ||||||
| 
 | 
 | ||||||
| const MAX_DISTANCE: u16 = 8; | const MAX_DISTANCE: u16 = 8; | ||||||
| 
 | 
 | ||||||
| @@ -1,16 +1,118 @@ | |||||||
| pub mod criterion; | pub mod criterion; | ||||||
|  | pub mod data; | ||||||
|  | mod index; | ||||||
|  | mod automaton; | ||||||
| mod query_builder; | mod query_builder; | ||||||
| mod distinct_map; | mod distinct_map; | ||||||
| 
 | 
 | ||||||
|  | pub mod shared_data_cursor; | ||||||
|  | pub mod write_to_bytes; | ||||||
|  | 
 | ||||||
| use std::sync::Arc; | use std::sync::Arc; | ||||||
|  | use serde_derive::{Serialize, Deserialize}; | ||||||
| 
 | 
 | ||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
| use rayon::slice::ParallelSliceMut; | use rayon::slice::ParallelSliceMut; | ||||||
| 
 | 
 | ||||||
| use crate::{Match, DocumentId}; | pub use self::index::{Index, IndexBuilder}; | ||||||
| 
 |  | ||||||
| pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder}; | pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder}; | ||||||
| 
 | 
 | ||||||
|  | /// Represent an internally generated document unique identifier.
 | ||||||
|  | ///
 | ||||||
|  | /// It is used to inform the database the document you want to deserialize.
 | ||||||
|  | /// Helpful for custom ranking.
 | ||||||
|  | #[derive(Serialize, Deserialize)] | ||||||
|  | #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] | ||||||
|  | pub struct DocumentId(pub u64); | ||||||
|  | 
 | ||||||
|  | /// This structure represent the position of a word
 | ||||||
|  | /// in a document and its attributes.
 | ||||||
|  | ///
 | ||||||
|  | /// This is stored in the map, generated at index time,
 | ||||||
|  | /// extracted and interpreted at search time.
 | ||||||
|  | #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||||
|  | #[repr(C)] | ||||||
|  | pub struct DocIndex { | ||||||
|  |     /// The document identifier where the word was found.
 | ||||||
|  |     pub document_id: DocumentId, | ||||||
|  | 
 | ||||||
|  |     /// The attribute in the document where the word was found
 | ||||||
|  |     /// along with the index in it.
 | ||||||
|  |     pub attribute: u16, | ||||||
|  |     pub word_index: u16, | ||||||
|  | 
 | ||||||
|  |     /// The position in bytes where the word was found
 | ||||||
|  |     /// along with the length of it.
 | ||||||
|  |     ///
 | ||||||
|  |     /// It informs on the original word area in the text indexed
 | ||||||
|  |     /// without needing to run the tokenizer again.
 | ||||||
|  |     pub char_index: u16, | ||||||
|  |     pub char_length: u16, | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /// This structure represent a matching word with informations
 | ||||||
|  | /// on the location of the word in the document.
 | ||||||
|  | ///
 | ||||||
|  | /// The order of the field is important because it defines
 | ||||||
|  | /// the way these structures are ordered between themselves.
 | ||||||
|  | ///
 | ||||||
|  | /// The word in itself is not important.
 | ||||||
|  | // TODO do data oriented programming ? very arrays ?
 | ||||||
|  | #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||||
|  | pub struct Match { | ||||||
|  |     /// The word index in the query sentence.
 | ||||||
|  |     /// Same as the `attribute_index` but for the query words.
 | ||||||
|  |     ///
 | ||||||
|  |     /// Used to retrieve the automaton that match this word.
 | ||||||
|  |     pub query_index: u32, | ||||||
|  | 
 | ||||||
|  |     /// The distance the word has with the query word
 | ||||||
|  |     /// (i.e. the Levenshtein distance).
 | ||||||
|  |     pub distance: u8, | ||||||
|  | 
 | ||||||
|  |     /// The attribute in the document where the word was found
 | ||||||
|  |     /// along with the index in it.
 | ||||||
|  |     pub attribute: u16, | ||||||
|  |     pub word_index: u16, | ||||||
|  | 
 | ||||||
|  |     /// Whether the word that match is an exact match or a prefix.
 | ||||||
|  |     pub is_exact: bool, | ||||||
|  | 
 | ||||||
|  |     /// The position in bytes where the word was found
 | ||||||
|  |     /// along with the length of it.
 | ||||||
|  |     ///
 | ||||||
|  |     /// It informs on the original word area in the text indexed
 | ||||||
|  |     /// without needing to run the tokenizer again.
 | ||||||
|  |     pub char_index: u16, | ||||||
|  |     pub char_length: u16, | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | impl Match { | ||||||
|  |     pub fn zero() -> Self { | ||||||
|  |         Match { | ||||||
|  |             query_index: 0, | ||||||
|  |             distance: 0, | ||||||
|  |             attribute: 0, | ||||||
|  |             word_index: 0, | ||||||
|  |             is_exact: false, | ||||||
|  |             char_index: 0, | ||||||
|  |             char_length: 0, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     pub fn max() -> Self { | ||||||
|  |         Match { | ||||||
|  |             query_index: u32::max_value(), | ||||||
|  |             distance: u8::max_value(), | ||||||
|  |             attribute: u16::max_value(), | ||||||
|  |             word_index: u16::max_value(), | ||||||
|  |             is_exact: true, | ||||||
|  |             char_index: u16::max_value(), | ||||||
|  |             char_length: u16::max_value(), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
| #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||||
| pub struct Document { | pub struct Document { | ||||||
|     pub id: DocumentId, |     pub id: DocumentId, | ||||||
| @@ -181,3 +283,15 @@ impl Matches { | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | #[cfg(test)] | ||||||
|  | mod tests { | ||||||
|  |     use super::*; | ||||||
|  |     use std::mem; | ||||||
|  | 
 | ||||||
|  |     #[test] | ||||||
|  |     fn docindex_mem_size() { | ||||||
|  |         assert_eq!(mem::size_of::<DocIndex>(), 24); | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -11,11 +11,23 @@ use fst::Streamer; | |||||||
| use log::info; | use log::info; | ||||||
| 
 | 
 | ||||||
| use crate::automaton::{self, DfaExt, AutomatonExt}; | use crate::automaton::{self, DfaExt, AutomatonExt}; | ||||||
| use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap}; | use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; | ||||||
| use crate::rank::criterion::Criteria; | use crate::criterion::Criteria; | ||||||
| use crate::database::Index; | use crate::{raw_documents_from_matches, RawDocument, Document}; | ||||||
| use crate::rank::{raw_documents_from_matches, RawDocument, Document}; | use crate::{Index, Match, DocumentId}; | ||||||
| use crate::{is_cjk, Match, DocumentId}; | 
 | ||||||
|  | // query splitting must move out of this crate
 | ||||||
|  | pub fn is_cjk(c: char) -> bool { | ||||||
|  |     (c >= '\u{2e80}' && c <= '\u{2eff}') || | ||||||
|  |     (c >= '\u{2f00}' && c <= '\u{2fdf}') || | ||||||
|  |     (c >= '\u{3040}' && c <= '\u{309f}') || | ||||||
|  |     (c >= '\u{30a0}' && c <= '\u{30ff}') || | ||||||
|  |     (c >= '\u{3100}' && c <= '\u{312f}') || | ||||||
|  |     (c >= '\u{3200}' && c <= '\u{32ff}') || | ||||||
|  |     (c >= '\u{3400}' && c <= '\u{4dbf}') || | ||||||
|  |     (c >= '\u{4e00}' && c <= '\u{9fff}') || | ||||||
|  |     (c >= '\u{f900}' && c <= '\u{faff}') | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| #[derive(Debug, PartialEq, Eq)] | #[derive(Debug, PartialEq, Eq)] | ||||||
| enum CharCategory { | enum CharCategory { | ||||||
							
								
								
									
										1072
									
								
								meilidb/Cargo.lock
									
									
									
										generated
									
									
									
										Normal file
									
								
							
							
						
						
									
										1072
									
								
								meilidb/Cargo.lock
									
									
									
										generated
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										50
									
								
								meilidb/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								meilidb/Cargo.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,50 @@ | |||||||
|  | [package] | ||||||
|  | edition = "2018" | ||||||
|  | name = "meilidb" | ||||||
|  | version = "0.3.1" | ||||||
|  | authors = ["Kerollmops <renault.cle@gmail.com>"] | ||||||
|  |  | ||||||
|  | [dependencies] | ||||||
|  | arc-swap = "0.3.7" | ||||||
|  | bincode = "1.1.2" | ||||||
|  | byteorder = "1.3.1" | ||||||
|  | fst = "0.3.3" | ||||||
|  | hashbrown = { version = "0.1.8", features = ["serde"] } | ||||||
|  | linked-hash-map = { version = "0.5.1", features = ["serde_impl"] } | ||||||
|  | lockfree = "0.5.1" | ||||||
|  | log = "0.4.6" | ||||||
|  | sdset = "0.3.1" | ||||||
|  | serde = "1.0.88" | ||||||
|  | serde_derive = "1.0.88" | ||||||
|  | serde_json = { version = "1.0.38", features = ["preserve_order"] } | ||||||
|  | size_format = "1.0.2" | ||||||
|  | slice-group-by = "0.2.4" | ||||||
|  | unidecode = "0.3.0" | ||||||
|  | meilidb-core = { path = "../meilidb-core", version = "0.1.0" } | ||||||
|  |  | ||||||
|  | [dependencies.toml] | ||||||
|  | git = "https://github.com/Kerollmops/toml-rs.git" | ||||||
|  | features = ["preserve_order"] | ||||||
|  | rev = "0372ba6" | ||||||
|  |  | ||||||
|  | [dependencies.rocksdb] | ||||||
|  | git = "https://github.com/pingcap/rust-rocksdb.git" | ||||||
|  | rev = "306e201" | ||||||
|  |  | ||||||
|  | [features] | ||||||
|  | default = ["simd"] | ||||||
|  | i128 = ["bincode/i128"] | ||||||
|  | portable = ["rocksdb/portable"] | ||||||
|  | simd = ["rocksdb/sse"] | ||||||
|  | nightly = ["hashbrown/nightly", "slice-group-by/nightly"] | ||||||
|  |  | ||||||
|  | [dev-dependencies] | ||||||
|  | csv = "1.0.5" | ||||||
|  | env_logger = "0.6.0" | ||||||
|  | jemallocator = "0.1.9" | ||||||
|  | quickcheck = "0.8.2" | ||||||
|  | rand = "0.6.5" | ||||||
|  | rand_xorshift = "0.1.1" | ||||||
|  | structopt = "0.2.14" | ||||||
|  | tempfile = "3.0.7" | ||||||
|  | termcolor = "1.0.4" | ||||||
| @@ -5,7 +5,7 @@ use std::fmt; | |||||||
| use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt}; | use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt}; | ||||||
| 
 | 
 | ||||||
| use crate::database::schema::SchemaAttr; | use crate::database::schema::SchemaAttr; | ||||||
| use crate::DocumentId; | use meilidb_core::DocumentId; | ||||||
| 
 | 
 | ||||||
| const DOC_KEY_LEN:      usize = 4 + size_of::<u64>(); | const DOC_KEY_LEN:      usize = 4 + size_of::<u64>(); | ||||||
| const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>(); | const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>(); | ||||||
| @@ -17,9 +17,9 @@ use hashbrown::HashMap; | |||||||
| use log::{info, error, warn}; | use log::{info, error, warn}; | ||||||
| 
 | 
 | ||||||
| use crate::database::schema::SchemaAttr; | use crate::database::schema::SchemaAttr; | ||||||
| use crate::shared_data_cursor::FromSharedDataCursor; | use meilidb_core::shared_data_cursor::FromSharedDataCursor; | ||||||
| use crate::write_to_bytes::WriteToBytes; | use meilidb_core::write_to_bytes::WriteToBytes; | ||||||
| use crate::DocumentId; | use meilidb_core::{Index, DocumentId}; | ||||||
| 
 | 
 | ||||||
| use self::update::{ReadIndexEvent, ReadRankedMapEvent}; | use self::update::{ReadIndexEvent, ReadRankedMapEvent}; | ||||||
| 
 | 
 | ||||||
| @@ -29,7 +29,6 @@ pub use self::view::{DatabaseView, DocumentIter}; | |||||||
| pub use self::update::Update; | pub use self::update::Update; | ||||||
| pub use self::serde::SerializerError; | pub use self::serde::SerializerError; | ||||||
| pub use self::schema::Schema; | pub use self::schema::Schema; | ||||||
| pub use self::index::Index; |  | ||||||
| pub use self::number::{Number, ParseNumberError}; | pub use self::number::{Number, ParseNumberError}; | ||||||
| 
 | 
 | ||||||
| pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>; | pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>; | ||||||
| @@ -41,7 +40,6 @@ const CONFIG:          &[u8] = b"config"; | |||||||
| 
 | 
 | ||||||
| pub mod config; | pub mod config; | ||||||
| pub mod schema; | pub mod schema; | ||||||
| pub(crate) mod index; |  | ||||||
| mod number; | mod number; | ||||||
| mod document_key; | mod document_key; | ||||||
| mod serde; | mod serde; | ||||||
| @@ -10,7 +10,7 @@ use linked_hash_map::LinkedHashMap; | |||||||
| 
 | 
 | ||||||
| use crate::database::serde::find_id::FindDocumentIdSerializer; | use crate::database::serde::find_id::FindDocumentIdSerializer; | ||||||
| use crate::database::serde::SerializerError; | use crate::database::serde::SerializerError; | ||||||
| use crate::DocumentId; | use meilidb_core::DocumentId; | ||||||
| 
 | 
 | ||||||
| pub const STORED: SchemaProps  = SchemaProps { stored: true,  indexed: false, ranked: false }; | pub const STORED: SchemaProps  = SchemaProps { stored: true,  indexed: false, ranked: false }; | ||||||
| pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true,  ranked: false }; | pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true,  ranked: false }; | ||||||
| @@ -10,7 +10,7 @@ use serde::de::{self, Visitor, IntoDeserializer}; | |||||||
| 
 | 
 | ||||||
| use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; | use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; | ||||||
| use crate::database::schema::Schema; | use crate::database::schema::Schema; | ||||||
| use crate::DocumentId; | use meilidb_core::DocumentId; | ||||||
| 
 | 
 | ||||||
| pub struct Deserializer<'a, D> | pub struct Deserializer<'a, D> | ||||||
| where D: Deref<Target=DB> | where D: Deref<Target=DB> | ||||||
| @@ -3,7 +3,7 @@ use serde::ser; | |||||||
| 
 | 
 | ||||||
| use crate::database::serde::key_to_string::KeyToStringSerializer; | use crate::database::serde::key_to_string::KeyToStringSerializer; | ||||||
| use crate::database::serde::{SerializerError, calculate_hash}; | use crate::database::serde::{SerializerError, calculate_hash}; | ||||||
| use crate::DocumentId; | use meilidb_core::DocumentId; | ||||||
| 
 | 
 | ||||||
| pub struct FindDocumentIdSerializer<'a> { | pub struct FindDocumentIdSerializer<'a> { | ||||||
|     pub id_attribute_name: &'a str, |     pub id_attribute_name: &'a str, | ||||||
| @@ -2,13 +2,14 @@ use std::collections::HashSet; | |||||||
| 
 | 
 | ||||||
| use serde::Serialize; | use serde::Serialize; | ||||||
| use serde::ser; | use serde::ser; | ||||||
|  | use meilidb_core::{DocumentId, DocIndex}; | ||||||
| 
 | 
 | ||||||
| use crate::database::update::DocumentUpdate; | use crate::database::update::DocumentUpdate; | ||||||
| use crate::database::serde::SerializerError; | use crate::database::serde::SerializerError; | ||||||
| use crate::database::schema::SchemaAttr; | use crate::database::schema::SchemaAttr; | ||||||
| use crate::tokenizer::TokenizerBuilder; | use crate::tokenizer::TokenizerBuilder; | ||||||
| use crate::tokenizer::Token; | use crate::tokenizer::Token; | ||||||
| use crate::{is_cjk, DocumentId, DocIndex}; | use crate::is_cjk; | ||||||
| 
 | 
 | ||||||
| pub struct IndexerSerializer<'a, 'b, B> { | pub struct IndexerSerializer<'a, 'b, B> { | ||||||
|     pub tokenizer_builder: &'a B, |     pub tokenizer_builder: &'a B, | ||||||
| @@ -10,7 +10,7 @@ use crate::database::update::DocumentUpdate; | |||||||
| use crate::database::serde::SerializerError; | use crate::database::serde::SerializerError; | ||||||
| use crate::tokenizer::TokenizerBuilder; | use crate::tokenizer::TokenizerBuilder; | ||||||
| use crate::database::schema::Schema; | use crate::database::schema::Schema; | ||||||
| use crate::DocumentId; | use meilidb_core::DocumentId; | ||||||
| 
 | 
 | ||||||
| pub struct Serializer<'a, 'b, B> { | pub struct Serializer<'a, 'b, B> { | ||||||
|     pub schema: &'a Schema, |     pub schema: &'a Schema, | ||||||
| @@ -1,11 +1,11 @@ | |||||||
| use std::error::Error; | use std::error::Error; | ||||||
| 
 | 
 | ||||||
| use byteorder::{ReadBytesExt, WriteBytesExt}; | use byteorder::{ReadBytesExt, WriteBytesExt}; | ||||||
|  | use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; | ||||||
|  | use meilidb_core::write_to_bytes::WriteToBytes; | ||||||
|  | use meilidb_core::data::DocIds; | ||||||
| 
 | 
 | ||||||
| use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; |  | ||||||
| use crate::write_to_bytes::WriteToBytes; |  | ||||||
| use crate::database::Index; | use crate::database::Index; | ||||||
| use crate::data::DocIds; |  | ||||||
| 
 | 
 | ||||||
| pub enum WriteIndexEvent<'a> { | pub enum WriteIndexEvent<'a> { | ||||||
|     RemovedDocuments(&'a DocIds), |     RemovedDocuments(&'a DocIds), | ||||||
| @@ -5,19 +5,18 @@ use rocksdb::rocksdb::{Writable, WriteBatch}; | |||||||
| use hashbrown::hash_map::HashMap; | use hashbrown::hash_map::HashMap; | ||||||
| use sdset::{Set, SetBuf}; | use sdset::{Set, SetBuf}; | ||||||
| use serde::Serialize; | use serde::Serialize; | ||||||
|  | use meilidb_core::write_to_bytes::WriteToBytes; | ||||||
|  | use meilidb_core::data::DocIds; | ||||||
|  | use meilidb_core::{IndexBuilder, DocumentId, DocIndex}; | ||||||
| 
 | 
 | ||||||
| use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; | use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; | ||||||
| use crate::database::serde::serializer::Serializer; | use crate::database::serde::serializer::Serializer; | ||||||
| use crate::database::serde::SerializerError; | use crate::database::serde::SerializerError; | ||||||
| use crate::database::schema::SchemaAttr; | use crate::database::schema::SchemaAttr; | ||||||
| use crate::database::schema::Schema; | use crate::database::schema::Schema; | ||||||
| use crate::database::index::IndexBuilder; |  | ||||||
| use crate::database::{DATA_INDEX, DATA_RANKED_MAP}; | use crate::database::{DATA_INDEX, DATA_RANKED_MAP}; | ||||||
| use crate::database::{RankedMap, Number}; | use crate::database::{RankedMap, Number}; | ||||||
| use crate::tokenizer::TokenizerBuilder; | use crate::tokenizer::TokenizerBuilder; | ||||||
| use crate::write_to_bytes::WriteToBytes; |  | ||||||
| use crate::data::DocIds; |  | ||||||
| use crate::{DocumentId, DocIndex}; |  | ||||||
| 
 | 
 | ||||||
| pub use self::index_event::{ReadIndexEvent, WriteIndexEvent}; | pub use self::index_event::{ReadIndexEvent, WriteIndexEvent}; | ||||||
| pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent}; | pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent}; | ||||||
| @@ -1,11 +1,11 @@ | |||||||
| use std::error::Error; | use std::error::Error; | ||||||
| 
 | 
 | ||||||
| use byteorder::{ReadBytesExt, WriteBytesExt}; | use byteorder::{ReadBytesExt, WriteBytesExt}; | ||||||
|  | use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; | ||||||
|  | use meilidb_core::write_to_bytes::WriteToBytes; | ||||||
|  | use meilidb_core::data::DocIds; | ||||||
| 
 | 
 | ||||||
| use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; |  | ||||||
| use crate::write_to_bytes::WriteToBytes; |  | ||||||
| use crate::database::RankedMap; | use crate::database::RankedMap; | ||||||
| use crate::data::DocIds; |  | ||||||
| 
 | 
 | ||||||
| pub enum WriteRankedMapEvent<'a> { | pub enum WriteRankedMapEvent<'a> { | ||||||
|     RemovedDocuments(&'a DocIds), |     RemovedDocuments(&'a DocIds), | ||||||
| @@ -6,16 +6,15 @@ use std::{fmt, marker}; | |||||||
| use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions}; | use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions}; | ||||||
| use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter}; | use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter}; | ||||||
| use serde::de::DeserializeOwned; | use serde::de::DeserializeOwned; | ||||||
|  | use meilidb_core::{Index, QueryBuilder, FilterFunc}; | ||||||
|  | use meilidb_core::DocumentId; | ||||||
| 
 | 
 | ||||||
| use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config}; | use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config}; | ||||||
| use crate::database::serde::deserializer::Deserializer; | use crate::database::serde::deserializer::Deserializer; | ||||||
| use crate::database::{DocumentKey, DocumentKeyAttr}; | use crate::database::{DocumentKey, DocumentKeyAttr}; | ||||||
| use crate::rank::{QueryBuilder, FilterFunc}; |  | ||||||
| use crate::database::schema::Schema; | use crate::database::schema::Schema; | ||||||
| use crate::database::index::Index; |  | ||||||
| use crate::database::RankedMap; | use crate::database::RankedMap; | ||||||
| use crate::database::Config; | use crate::database::Config; | ||||||
| use crate::DocumentId; |  | ||||||
| 
 | 
 | ||||||
| pub struct DatabaseView<D> | pub struct DatabaseView<D> | ||||||
| where D: Deref<Target=DB> | where D: Deref<Target=DB> | ||||||
							
								
								
									
										22
									
								
								meilidb/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								meilidb/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,22 @@ | |||||||
|  | #![cfg_attr(feature = "nightly", feature(test))] | ||||||
|  |  | ||||||
|  | pub mod database; | ||||||
|  | pub mod tokenizer; | ||||||
|  | mod common_words; | ||||||
|  |  | ||||||
|  | pub use rocksdb; | ||||||
|  |  | ||||||
|  | pub use self::tokenizer::Tokenizer; | ||||||
|  | pub use self::common_words::CommonWords; | ||||||
|  |  | ||||||
|  | pub fn is_cjk(c: char) -> bool { | ||||||
|  |     (c >= '\u{2e80}' && c <= '\u{2eff}') || | ||||||
|  |     (c >= '\u{2f00}' && c <= '\u{2fdf}') || | ||||||
|  |     (c >= '\u{3040}' && c <= '\u{309f}') || | ||||||
|  |     (c >= '\u{30a0}' && c <= '\u{30ff}') || | ||||||
|  |     (c >= '\u{3100}' && c <= '\u{312f}') || | ||||||
|  |     (c >= '\u{3200}' && c <= '\u{32ff}') || | ||||||
|  |     (c >= '\u{3400}' && c <= '\u{4dbf}') || | ||||||
|  |     (c >= '\u{4e00}' && c <= '\u{9fff}') || | ||||||
|  |     (c >= '\u{f900}' && c <= '\u{faff}') | ||||||
|  | } | ||||||
							
								
								
									
										136
									
								
								src/lib.rs
									
									
									
									
									
								
							
							
						
						
									
										136
									
								
								src/lib.rs
									
									
									
									
									
								
							| @@ -1,136 +0,0 @@ | |||||||
| #![cfg_attr(feature = "nightly", feature(test))] |  | ||||||
|  |  | ||||||
| pub mod automaton; |  | ||||||
| pub mod database; |  | ||||||
| pub mod data; |  | ||||||
| pub mod rank; |  | ||||||
| pub mod tokenizer; |  | ||||||
| mod common_words; |  | ||||||
| mod shared_data_cursor; |  | ||||||
| mod write_to_bytes; |  | ||||||
|  |  | ||||||
| use serde_derive::{Serialize, Deserialize}; |  | ||||||
|  |  | ||||||
| pub use rocksdb; |  | ||||||
|  |  | ||||||
| pub use self::tokenizer::Tokenizer; |  | ||||||
| pub use self::common_words::CommonWords; |  | ||||||
|  |  | ||||||
| pub fn is_cjk(c: char) -> bool { |  | ||||||
|     (c >= '\u{2e80}' && c <= '\u{2eff}') || |  | ||||||
|     (c >= '\u{2f00}' && c <= '\u{2fdf}') || |  | ||||||
|     (c >= '\u{3040}' && c <= '\u{309f}') || |  | ||||||
|     (c >= '\u{30a0}' && c <= '\u{30ff}') || |  | ||||||
|     (c >= '\u{3100}' && c <= '\u{312f}') || |  | ||||||
|     (c >= '\u{3200}' && c <= '\u{32ff}') || |  | ||||||
|     (c >= '\u{3400}' && c <= '\u{4dbf}') || |  | ||||||
|     (c >= '\u{4e00}' && c <= '\u{9fff}') || |  | ||||||
|     (c >= '\u{f900}' && c <= '\u{faff}') |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// Represent an internally generated document unique identifier. |  | ||||||
| /// |  | ||||||
| /// It is used to inform the database the document you want to deserialize. |  | ||||||
| /// Helpful for custom ranking. |  | ||||||
| #[derive(Serialize, Deserialize)] |  | ||||||
| #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] |  | ||||||
| pub struct DocumentId(u64); |  | ||||||
|  |  | ||||||
| /// This structure represent the position of a word |  | ||||||
| /// in a document and its attributes. |  | ||||||
| /// |  | ||||||
| /// This is stored in the map, generated at index time, |  | ||||||
| /// extracted and interpreted at search time. |  | ||||||
| #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] |  | ||||||
| #[repr(C)] |  | ||||||
| pub struct DocIndex { |  | ||||||
|     /// The document identifier where the word was found. |  | ||||||
|     pub document_id: DocumentId, |  | ||||||
|  |  | ||||||
|     /// The attribute in the document where the word was found |  | ||||||
|     /// along with the index in it. |  | ||||||
|     pub attribute: u16, |  | ||||||
|     pub word_index: u16, |  | ||||||
|  |  | ||||||
|     /// The position in bytes where the word was found |  | ||||||
|     /// along with the length of it. |  | ||||||
|     /// |  | ||||||
|     /// It informs on the original word area in the text indexed |  | ||||||
|     /// without needing to run the tokenizer again. |  | ||||||
|     pub char_index: u16, |  | ||||||
|     pub char_length: u16, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// This structure represent a matching word with informations |  | ||||||
| /// on the location of the word in the document. |  | ||||||
| /// |  | ||||||
| /// The order of the field is important because it defines |  | ||||||
| /// the way these structures are ordered between themselves. |  | ||||||
| /// |  | ||||||
| /// The word in itself is not important. |  | ||||||
| // TODO do data oriented programming ? very arrays ? |  | ||||||
| #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] |  | ||||||
| pub struct Match { |  | ||||||
|     /// The word index in the query sentence. |  | ||||||
|     /// Same as the `attribute_index` but for the query words. |  | ||||||
|     /// |  | ||||||
|     /// Used to retrieve the automaton that match this word. |  | ||||||
|     pub query_index: u32, |  | ||||||
|  |  | ||||||
|     /// The distance the word has with the query word |  | ||||||
|     /// (i.e. the Levenshtein distance). |  | ||||||
|     pub distance: u8, |  | ||||||
|  |  | ||||||
|     /// The attribute in the document where the word was found |  | ||||||
|     /// along with the index in it. |  | ||||||
|     pub attribute: u16, |  | ||||||
|     pub word_index: u16, |  | ||||||
|  |  | ||||||
|     /// Whether the word that match is an exact match or a prefix. |  | ||||||
|     pub is_exact: bool, |  | ||||||
|  |  | ||||||
|     /// The position in bytes where the word was found |  | ||||||
|     /// along with the length of it. |  | ||||||
|     /// |  | ||||||
|     /// It informs on the original word area in the text indexed |  | ||||||
|     /// without needing to run the tokenizer again. |  | ||||||
|     pub char_index: u16, |  | ||||||
|     pub char_length: u16, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl Match { |  | ||||||
|     pub fn zero() -> Self { |  | ||||||
|         Match { |  | ||||||
|             query_index: 0, |  | ||||||
|             distance: 0, |  | ||||||
|             attribute: 0, |  | ||||||
|             word_index: 0, |  | ||||||
|             is_exact: false, |  | ||||||
|             char_index: 0, |  | ||||||
|             char_length: 0, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     pub fn max() -> Self { |  | ||||||
|         Match { |  | ||||||
|             query_index: u32::max_value(), |  | ||||||
|             distance: u8::max_value(), |  | ||||||
|             attribute: u16::max_value(), |  | ||||||
|             word_index: u16::max_value(), |  | ||||||
|             is_exact: true, |  | ||||||
|             char_index: u16::max_value(), |  | ||||||
|             char_length: u16::max_value(), |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[cfg(test)] |  | ||||||
| mod tests { |  | ||||||
|     use super::*; |  | ||||||
|     use std::mem; |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn docindex_mem_size() { |  | ||||||
|         assert_eq!(mem::size_of::<DocIndex>(), 16); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
		Reference in New Issue
	
	Block a user