mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	feat: Introduce a WordArea struct
Useful to highlight matching areas in the original text.
This commit is contained in:
		| @@ -35,5 +35,7 @@ nightly = [] | |||||||
| [dev-dependencies] | [dev-dependencies] | ||||||
| csv = "1.0" | csv = "1.0" | ||||||
| elapsed = "0.1" | elapsed = "0.1" | ||||||
|  | quickcheck = "0.7" | ||||||
| structopt = "0.2" | structopt = "0.2" | ||||||
| tempfile = "3.0" | tempfile = "3.0" | ||||||
|  | termcolor = "1.0" | ||||||
|   | |||||||
| @@ -2,10 +2,12 @@ use std::io::{self, Write}; | |||||||
| use std::path::PathBuf; | use std::path::PathBuf; | ||||||
| use std::error::Error; | use std::error::Error; | ||||||
|  |  | ||||||
|  | use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; | ||||||
| use serde_derive::{Serialize, Deserialize}; | use serde_derive::{Serialize, Deserialize}; | ||||||
| use structopt::StructOpt; | use structopt::StructOpt; | ||||||
|  |  | ||||||
| use meilidb::database::Database; | use meilidb::database::Database; | ||||||
|  | use meilidb::Match; | ||||||
|  |  | ||||||
| #[derive(Debug, StructOpt)] | #[derive(Debug, StructOpt)] | ||||||
| pub struct Opt { | pub struct Opt { | ||||||
| @@ -26,6 +28,40 @@ struct Document { | |||||||
|     image: String, |     image: String, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | fn display_highlights(text: &str, ranges: &[usize]) -> io::Result<()> { | ||||||
|  |     let mut stdout = StandardStream::stdout(ColorChoice::Always); | ||||||
|  |     let mut highlighted = false; | ||||||
|  |  | ||||||
|  |     for range in ranges.windows(2) { | ||||||
|  |         let [start, end] = match range { [start, end] => [*start, *end], _ => unreachable!() }; | ||||||
|  |         if highlighted { | ||||||
|  |             stdout.set_color(ColorSpec::new().set_fg(Some(Color::Yellow)))?; | ||||||
|  |         } | ||||||
|  |         write!(&mut stdout, "{}", &text[start..end])?; | ||||||
|  |         stdout.reset()?; | ||||||
|  |         highlighted = !highlighted; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     Ok(()) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn create_highlight_areas(text: &str, matches: &[Match], attribute: u16) -> Vec<usize> { | ||||||
|  |     let mut title_areas = Vec::new(); | ||||||
|  |  | ||||||
|  |     title_areas.push(0); | ||||||
|  |     for match_ in matches { | ||||||
|  |         if match_.attribute.attribute() == attribute { | ||||||
|  |             let word_area = match_.word_area; | ||||||
|  |             let byte_index = word_area.byte_index() as usize; | ||||||
|  |             let length = word_area.length() as usize; | ||||||
|  |             title_areas.push(byte_index); | ||||||
|  |             title_areas.push(byte_index + length); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     title_areas.push(text.len()); | ||||||
|  |     title_areas | ||||||
|  | } | ||||||
|  |  | ||||||
| fn main() -> Result<(), Box<Error>> { | fn main() -> Result<(), Box<Error>> { | ||||||
|     let opt = Opt::from_args(); |     let opt = Opt::from_args(); | ||||||
|  |  | ||||||
| @@ -41,26 +77,35 @@ fn main() -> Result<(), Box<Error>> { | |||||||
|         io::stdout().flush()?; |         io::stdout().flush()?; | ||||||
|  |  | ||||||
|         if input.read_line(&mut buffer)? == 0 { break } |         if input.read_line(&mut buffer)? == 0 { break } | ||||||
|  |         let query = buffer.trim_end_matches('\n'); | ||||||
|  |  | ||||||
|         let view = database.view(); |         let view = database.view(); | ||||||
|  |  | ||||||
|         let (elapsed, documents) = elapsed::measure_time(|| { |         let (elapsed, documents) = elapsed::measure_time(|| { | ||||||
|             let builder = view.query_builder().unwrap(); |             let builder = view.query_builder().unwrap(); | ||||||
|             builder.query(&buffer, 0..opt.number_results) |             builder.query(query, 0..opt.number_results) | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         let mut full_documents = Vec::with_capacity(documents.len()); |         let number_of_documents = documents.len(); | ||||||
|  |         for doc in documents { | ||||||
|  |             match view.retrieve_document::<Document>(doc.id) { | ||||||
|  |                 Ok(document) => { | ||||||
|  |  | ||||||
|         for document in documents { |                     print!("title: "); | ||||||
|             match view.retrieve_document::<Document>(document.id) { |                     let title_areas = create_highlight_areas(&document.title, &doc.matches, 1); | ||||||
|                 Ok(document) => full_documents.push(document), |                     display_highlights(&document.title, &title_areas)?; | ||||||
|  |                     println!(); | ||||||
|  |  | ||||||
|  |                     print!("description: "); | ||||||
|  |                     let description_areas = create_highlight_areas(&document.description, &doc.matches, 2); | ||||||
|  |                     display_highlights(&document.description, &description_areas)?; | ||||||
|  |                     println!(); | ||||||
|  |                 }, | ||||||
|                 Err(e) => eprintln!("{}", e), |                 Err(e) => eprintln!("{}", e), | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         println!("{:#?}", full_documents); |         println!("Found {} results in {}", number_of_documents, elapsed); | ||||||
|         println!("Found {} results in {}", full_documents.len(), elapsed); |  | ||||||
|  |  | ||||||
|         buffer.clear(); |         buffer.clear(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -158,14 +158,15 @@ mod tests { | |||||||
|     use super::*; |     use super::*; | ||||||
|  |  | ||||||
|     use std::error::Error; |     use std::error::Error; | ||||||
|  |     use crate::{Attribute, WordArea}; | ||||||
|  |  | ||||||
|     use crate::DocumentId; |     use crate::DocumentId; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn builder_serialize_deserialize() -> Result<(), Box<Error>> { |     fn builder_serialize_deserialize() -> Result<(), Box<Error>> { | ||||||
|         let a = DocIndex { document_id: DocumentId(0), attribute: 3, attribute_index: 11 }; |         let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) }; | ||||||
|         let b = DocIndex { document_id: DocumentId(1), attribute: 4, attribute_index: 21 }; |         let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) }; | ||||||
|         let c = DocIndex { document_id: DocumentId(2), attribute: 8, attribute_index: 2 }; |         let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) }; | ||||||
|  |  | ||||||
|         let mut builder = DocIndexesBuilder::memory(); |         let mut builder = DocIndexesBuilder::memory(); | ||||||
|  |  | ||||||
| @@ -186,9 +187,9 @@ mod tests { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn serialize_deserialize() -> Result<(), Box<Error>> { |     fn serialize_deserialize() -> Result<(), Box<Error>> { | ||||||
|         let a = DocIndex { document_id: DocumentId(0), attribute: 3, attribute_index: 11 }; |         let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) }; | ||||||
|         let b = DocIndex { document_id: DocumentId(1), attribute: 4, attribute_index: 21 }; |         let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) }; | ||||||
|         let c = DocIndex { document_id: DocumentId(2), attribute: 8, attribute_index: 2 }; |         let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) }; | ||||||
|  |  | ||||||
|         let mut builder = DocIndexesBuilder::memory(); |         let mut builder = DocIndexesBuilder::memory(); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -203,14 +203,15 @@ mod tests { | |||||||
|     use super::*; |     use super::*; | ||||||
|  |  | ||||||
|     use std::error::Error; |     use std::error::Error; | ||||||
|  |     use crate::{Attribute, WordArea}; | ||||||
|  |  | ||||||
|     use crate::DocumentId; |     use crate::DocumentId; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn serialize_deserialize() -> Result<(), Box<Error>> { |     fn serialize_deserialize() -> Result<(), Box<Error>> { | ||||||
|         let a = DocIndex { document_id: DocumentId(0), attribute: 3, attribute_index: 11 }; |         let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) }; | ||||||
|         let b = DocIndex { document_id: DocumentId(1), attribute: 4, attribute_index: 21 }; |         let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) }; | ||||||
|         let c = DocIndex { document_id: DocumentId(2), attribute: 8, attribute_index: 2 }; |         let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) }; | ||||||
|  |  | ||||||
|         let mut builder = PositiveBlobBuilder::memory(); |         let mut builder = PositiveBlobBuilder::memory(); | ||||||
|  |  | ||||||
| @@ -231,9 +232,9 @@ mod tests { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn serde_serialize_deserialize() -> Result<(), Box<Error>> { |     fn serde_serialize_deserialize() -> Result<(), Box<Error>> { | ||||||
|         let a = DocIndex { document_id: DocumentId(0), attribute: 3, attribute_index: 11 }; |         let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) }; | ||||||
|         let b = DocIndex { document_id: DocumentId(1), attribute: 4, attribute_index: 21 }; |         let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) }; | ||||||
|         let c = DocIndex { document_id: DocumentId(2), attribute: 8, attribute_index: 2 }; |         let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) }; | ||||||
|  |  | ||||||
|         let mut builder = PositiveBlobBuilder::memory(); |         let mut builder = PositiveBlobBuilder::memory(); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -73,7 +73,7 @@ impl DocumentKeyAttr { | |||||||
|         let mut wtr = Cursor::new(&mut buffer[..]); |         let mut wtr = Cursor::new(&mut buffer[..]); | ||||||
|         wtr.write_all(&raw_key).unwrap(); |         wtr.write_all(&raw_key).unwrap(); | ||||||
|         wtr.write_all(b"-").unwrap(); |         wtr.write_all(b"-").unwrap(); | ||||||
|         wtr.write_u32::<NativeEndian>(attr.as_u32()).unwrap(); |         wtr.write_u16::<NativeEndian>(attr.0).unwrap(); | ||||||
|  |  | ||||||
|         DocumentKeyAttr(buffer) |         DocumentKeyAttr(buffer) | ||||||
|     } |     } | ||||||
| @@ -95,7 +95,7 @@ impl DocumentKeyAttr { | |||||||
|  |  | ||||||
|     pub fn attribute(&self) -> SchemaAttr { |     pub fn attribute(&self) -> SchemaAttr { | ||||||
|         let offset = 4 + size_of::<u64>() + 1; |         let offset = 4 + size_of::<u64>() + 1; | ||||||
|         let value = (&self.0[offset..]).read_u32::<NativeEndian>().unwrap(); |         let value = (&self.0[offset..]).read_u16::<NativeEndian>().unwrap(); | ||||||
|         SchemaAttr::new(value) |         SchemaAttr::new(value) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -114,7 +114,7 @@ impl fmt::Debug for DocumentKeyAttr { | |||||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||||
|         f.debug_struct("DocumentKeyAttr") |         f.debug_struct("DocumentKeyAttr") | ||||||
|             .field("document_id", &self.document_id()) |             .field("document_id", &self.document_id()) | ||||||
|             .field("attribute", &self.attribute().as_u32()) |             .field("attribute", &self.attribute().0) | ||||||
|             .finish() |             .finish() | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| use std::collections::{HashMap, BTreeMap}; | use std::collections::{HashMap, BTreeMap}; | ||||||
| use std::io::{Read, Write}; | use std::io::{Read, Write}; | ||||||
| use std::{fmt, u32}; | use std::{fmt, u16}; | ||||||
| use std::path::Path; | use std::path::Path; | ||||||
| use std::ops::BitOr; | use std::ops::BitOr; | ||||||
| use std::sync::Arc; | use std::sync::Arc; | ||||||
| @@ -53,7 +53,7 @@ impl SchemaBuilder { | |||||||
|         if self.attrs.insert(name.into(), props).is_some() { |         if self.attrs.insert(name.into(), props).is_some() { | ||||||
|             panic!("Field already inserted.") |             panic!("Field already inserted.") | ||||||
|         } |         } | ||||||
|         SchemaAttr(len as u32) |         SchemaAttr(len as u16) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn build(self) -> Schema { |     pub fn build(self) -> Schema { | ||||||
| @@ -61,7 +61,7 @@ impl SchemaBuilder { | |||||||
|         let mut props = Vec::new(); |         let mut props = Vec::new(); | ||||||
|  |  | ||||||
|         for (i, (name, prop)) in self.attrs.into_iter().enumerate() { |         for (i, (name, prop)) in self.attrs.into_iter().enumerate() { | ||||||
|             attrs.insert(name.clone(), SchemaAttr(i as u32)); |             attrs.insert(name.clone(), SchemaAttr(i as u16)); | ||||||
|             props.push((name, prop)); |             props.push((name, prop)); | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -94,10 +94,9 @@ impl Schema { | |||||||
|  |  | ||||||
|     pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> { |     pub fn write_to<W: Write>(&self, writer: W) -> bincode::Result<()> { | ||||||
|         let mut ordered = BTreeMap::new(); |         let mut ordered = BTreeMap::new(); | ||||||
|         for (name, field) in &self.inner.attrs { |         for (name, attr) in &self.inner.attrs { | ||||||
|             let index = field.as_u32(); |             let (_, props) = self.inner.props[attr.0 as usize]; | ||||||
|             let (_, props) = self.inner.props[index as usize]; |             ordered.insert(attr.0, (name, props)); | ||||||
|             ordered.insert(index, (name, props)); |  | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let mut attrs = LinkedHashMap::with_capacity(ordered.len()); |         let mut attrs = LinkedHashMap::with_capacity(ordered.len()); | ||||||
| @@ -109,8 +108,7 @@ impl Schema { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn props(&self, attr: SchemaAttr) -> SchemaProps { |     pub fn props(&self, attr: SchemaAttr) -> SchemaProps { | ||||||
|         let index = attr.as_u32(); |         let (_, props) = self.inner.props[attr.0 as usize]; | ||||||
|         let (_, props) = self.inner.props[index as usize]; |  | ||||||
|         props |         props | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -119,26 +117,21 @@ impl Schema { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn attribute_name(&self, attr: SchemaAttr) -> &str { |     pub fn attribute_name(&self, attr: SchemaAttr) -> &str { | ||||||
|         let index = attr.as_u32(); |         let (name, _) = &self.inner.props[attr.0 as usize]; | ||||||
|         let (name, _) = &self.inner.props[index as usize]; |  | ||||||
|         name |         name | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)] | #[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)] | ||||||
| pub struct SchemaAttr(u32); | pub struct SchemaAttr(pub(crate) u16); | ||||||
|  |  | ||||||
| impl SchemaAttr { | impl SchemaAttr { | ||||||
|     pub fn new(value: u32) -> SchemaAttr { |     pub fn new(value: u16) -> SchemaAttr { | ||||||
|         SchemaAttr(value) |         SchemaAttr(value) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn max() -> SchemaAttr { |     pub fn max() -> SchemaAttr { | ||||||
|         SchemaAttr(u32::MAX) |         SchemaAttr(u16::MAX) | ||||||
|     } |  | ||||||
|  |  | ||||||
|     pub fn as_u32(&self) -> u32 { |  | ||||||
|         self.0 |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -9,12 +9,12 @@ use serde::ser::{self, Serialize}; | |||||||
| use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder; | use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder; | ||||||
| use crate::database::blob::positive::PositiveBlob; | use crate::database::blob::positive::PositiveBlob; | ||||||
| use crate::database::schema::{Schema, SchemaAttr}; | use crate::database::schema::{Schema, SchemaAttr}; | ||||||
| use crate::tokenizer::TokenizerBuilder; | use crate::tokenizer::{TokenizerBuilder, Token}; | ||||||
| use crate::database::DocumentKeyAttr; | use crate::database::DocumentKeyAttr; | ||||||
| use crate::database::update::Update; | use crate::database::update::Update; | ||||||
| use crate::{DocumentId, DocIndex}; |  | ||||||
| use crate::database::DATA_INDEX; | use crate::database::DATA_INDEX; | ||||||
| use crate::database::blob::Blob; | use crate::database::blob::Blob; | ||||||
|  | use crate::{DocumentId, DocIndex, Attribute, WordArea}; | ||||||
|  |  | ||||||
| pub enum NewState { | pub enum NewState { | ||||||
|     Updated { value: Vec<u8> }, |     Updated { value: Vec<u8> }, | ||||||
| @@ -355,11 +355,11 @@ where B: TokenizerBuilder | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> { |     fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> { | ||||||
|         for (index, word) in self.tokenizer_builder.build(v) { |         for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) { | ||||||
|             let doc_index = DocIndex { |             let doc_index = DocIndex { | ||||||
|                 document_id: self.document_id, |                 document_id: self.document_id, | ||||||
|                 attribute: self.attribute.as_u32() as u8, |                 attribute: Attribute::new(self.attribute.0, word_index as u32), | ||||||
|                 attribute_index: index as u32, |                 word_area: WordArea::new(char_index as u32, word.len() as u16), | ||||||
|             }; |             }; | ||||||
|  |  | ||||||
|             // insert the exact representation |             // insert the exact representation | ||||||
|   | |||||||
							
								
								
									
										200
									
								
								src/lib.rs
									
									
									
									
									
								
							
							
						
						
									
										200
									
								
								src/lib.rs
									
									
									
									
									
								
							| @@ -6,6 +6,8 @@ pub mod tokenizer; | |||||||
| pub mod vec_read_only; | pub mod vec_read_only; | ||||||
| mod common_words; | mod common_words; | ||||||
|  |  | ||||||
|  | use std::fmt; | ||||||
|  |  | ||||||
| pub use rocksdb; | pub use rocksdb; | ||||||
|  |  | ||||||
| pub use self::tokenizer::Tokenizer; | pub use self::tokenizer::Tokenizer; | ||||||
| @@ -18,28 +20,110 @@ pub use self::common_words::CommonWords; | |||||||
| #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] | #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] | ||||||
| pub struct DocumentId(pub u64); | pub struct DocumentId(pub u64); | ||||||
|  |  | ||||||
|  | /// Represents an attribute number along with the word index | ||||||
|  | /// according to the tokenizer used. | ||||||
|  | /// | ||||||
|  | /// It can accept up to 1024 attributes and word positions | ||||||
|  | /// can be maximum 2^22. | ||||||
|  | #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||||
|  | pub struct Attribute(u32); | ||||||
|  |  | ||||||
|  | impl Attribute { | ||||||
|  |     /// Construct an `Attribute` from an attribute number and | ||||||
|  |     /// the word position of a match according to the tokenizer used. | ||||||
|  |     /// | ||||||
|  |     /// # Panics | ||||||
|  |     /// | ||||||
|  |     /// The attribute must be less than 1024 | ||||||
|  |     /// and the word index not greater than 2^22. | ||||||
|  |     fn new(attribute: u16, index: u32) -> Attribute { | ||||||
|  |         assert!(attribute & 0b1111_1100_0000_0000 == 0); | ||||||
|  |         assert!(index & 0b1111_1111_1100_0000_0000_0000_0000 == 0); | ||||||
|  |  | ||||||
|  |         let attribute = (attribute as u32) << 22; | ||||||
|  |         Attribute(attribute | index) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn attribute(&self) -> u16 { | ||||||
|  |         (self.0 >> 22) as u16 | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn word_index(&self) -> u32 { | ||||||
|  |         self.0 & 0b0000_0000_0011_1111_1111_1111_1111 | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl fmt::Debug for Attribute { | ||||||
|  |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||||
|  |         f.debug_struct("Attribute") | ||||||
|  |             .field("attribute", &self.attribute()) | ||||||
|  |             .field("word_index", &self.word_index()) | ||||||
|  |             .finish() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Represents a word position in bytes along with the length of it. | ||||||
|  | /// | ||||||
|  | /// It can represent word byte indexes up to a maximum of 2^22 and | ||||||
|  | /// words of length up to 1023. | ||||||
|  | #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||||
|  | pub struct WordArea(u32); | ||||||
|  |  | ||||||
|  | impl WordArea { | ||||||
|  |     /// Construct a `WordArea` from a word position in bytes | ||||||
|  |     /// and the length of it. | ||||||
|  |     /// | ||||||
|  |     /// # Panics | ||||||
|  |     /// | ||||||
|  |     /// The byte index must not be greater than 2^22 | ||||||
|  |     /// and the length must be less than 1024. | ||||||
|  |     fn new(byte_index: u32, length: u16) -> WordArea { | ||||||
|  |         assert!(byte_index & 0b1111_1111_1100_0000_0000_0000_0000 == 0); | ||||||
|  |         assert!(length & 0b1111_1100_0000_0000 == 0); | ||||||
|  |  | ||||||
|  |         let byte_index = byte_index << 10; | ||||||
|  |         WordArea(byte_index | (length as u32)) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn byte_index(&self) -> u32 { | ||||||
|  |         self.0 >> 10 | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn length(&self) -> u16 { | ||||||
|  |         (self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16 | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl fmt::Debug for WordArea { | ||||||
|  |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||||
|  |         f.debug_struct("WordArea") | ||||||
|  |             .field("byte_index", &self.byte_index()) | ||||||
|  |             .field("length", &self.length()) | ||||||
|  |             .finish() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| /// This structure represents the position of a word | /// This structure represents the position of a word | ||||||
| /// in a document and its attributes. | /// in a document and its attributes. | ||||||
| /// | /// | ||||||
| /// This is stored in the map, generated at index time, | /// This is stored in the map, generated at index time, | ||||||
| /// extracted and interpreted at search time. | /// extracted and interpreted at search time. | ||||||
| #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] | #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||||
| #[repr(C)] | #[repr(C)] | ||||||
| pub struct DocIndex { | pub struct DocIndex { | ||||||
|     /// The document identifier where the word was found. |     /// The document identifier where the word was found. | ||||||
|     pub document_id: DocumentId, |     pub document_id: DocumentId, | ||||||
|  |  | ||||||
|     /// The attribute identifier in the document |     /// The attribute in the document where the word was found | ||||||
|     /// where the word was found. |     /// along with the index in it. | ||||||
|     /// |     pub attribute: Attribute, | ||||||
|     /// This is an `u8` therefore a document |  | ||||||
|     /// can not have more than `2^8` attributes. |  | ||||||
|     pub attribute: u8, |  | ||||||
|  |  | ||||||
|     /// The index where the word was found in the attribute. |     /// The position in bytes where the word was found | ||||||
|  |     /// along with the length of it. | ||||||
|     /// |     /// | ||||||
|     /// Only the first 1000 words are indexed. |     /// It informs on the original word area in the text indexed | ||||||
|     pub attribute_index: u32, |     /// without needing to run the tokenizer again. | ||||||
|  |     pub word_area: WordArea, | ||||||
| } | } | ||||||
|  |  | ||||||
| /// This structure represents a matching word with information | /// This structure represents a matching word with information | ||||||
| @@ -50,7 +134,7 @@ pub struct DocIndex { | |||||||
| /// | /// | ||||||
| /// The word in itself is not important. | /// The word in itself is not important. | ||||||
| // TODO do data oriented programming ? very arrays ? | // TODO do data oriented programming ? very arrays ? | ||||||
| #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] | #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||||
| pub struct Match { | pub struct Match { | ||||||
|     /// The word index in the query sentence. |     /// The word index in the query sentence. | ||||||
|     /// Same as the `attribute_index` but for the query words. |     /// Same as the `attribute_index` but for the query words. | ||||||
| @@ -62,23 +146,19 @@ pub struct Match { | |||||||
|     /// (i.e. the Levenshtein distance). |     /// (i.e. the Levenshtein distance). | ||||||
|     pub distance: u8, |     pub distance: u8, | ||||||
|  |  | ||||||
|     /// The attribute in which the word is located |     /// The attribute in the document where the word was found | ||||||
|     /// (i.e. Title is 0, Description is 1). |     /// along with the index in it. | ||||||
|     /// |     pub attribute: Attribute, | ||||||
|     /// This is an `u8` therefore a document |  | ||||||
|     /// can not have more than `2^8` attributes. |  | ||||||
|     pub attribute: u8, |  | ||||||
|  |  | ||||||
|     /// Where does this word is located in the attribute string |  | ||||||
|     /// (i.e. at the start or the end of the attribute). |  | ||||||
|     /// |  | ||||||
|     /// The index in the attribute is limited to a maximum of `2^32` |  | ||||||
|     /// this is because we index only the first 1000 words |  | ||||||
|     /// in an attribute. |  | ||||||
|     pub attribute_index: u32, |  | ||||||
|  |  | ||||||
|     /// Whether the word that matches is an exact match or a prefix. |     /// Whether the word that matches is an exact match or a prefix. | ||||||
|     pub is_exact: bool, |     pub is_exact: bool, | ||||||
|  |  | ||||||
|  |     /// The position in bytes where the word was found | ||||||
|  |     /// along with the length of it. | ||||||
|  |     /// | ||||||
|  |     /// It informs on the original word area in the text indexed | ||||||
|  |     /// without needing to run the tokenizer again. | ||||||
|  |     pub word_area: WordArea, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl Match { | impl Match { | ||||||
| @@ -86,9 +166,9 @@ impl Match { | |||||||
|         Match { |         Match { | ||||||
|             query_index: 0, |             query_index: 0, | ||||||
|             distance: 0, |             distance: 0, | ||||||
|             attribute: 0, |             attribute: Attribute::new(0, 0), | ||||||
|             attribute_index: 0, |  | ||||||
|             is_exact: false, |             is_exact: false, | ||||||
|  |             word_area: WordArea::new(0, 0), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -96,9 +176,71 @@ impl Match { | |||||||
|         Match { |         Match { | ||||||
|             query_index: u32::max_value(), |             query_index: u32::max_value(), | ||||||
|             distance: u8::max_value(), |             distance: u8::max_value(), | ||||||
|             attribute: u8::max_value(), |             attribute: Attribute(u32::max_value()), | ||||||
|             attribute_index: u32::max_value(), |  | ||||||
|             is_exact: true, |             is_exact: true, | ||||||
|  |             word_area: WordArea(u32::max_value()), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[cfg(test)] | ||||||
|  | mod tests { | ||||||
|  |     use super::*; | ||||||
|  |     use quickcheck::{quickcheck, TestResult}; | ||||||
|  |     use std::mem; | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn docindex_mem_size() { | ||||||
|  |         assert_eq!(mem::size_of::<DocIndex>(), 16); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     quickcheck! { | ||||||
|  |         fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult { | ||||||
|  |             if gen_attr > 2_u16.pow(10) || gen_index > 2_u32.pow(22) { | ||||||
|  |                 return TestResult::discard() | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             let attribute = Attribute::new(gen_attr, gen_index); | ||||||
|  |  | ||||||
|  |             let valid_attribute = attribute.attribute() == gen_attr; | ||||||
|  |             let valid_index = attribute.word_index() == gen_index; | ||||||
|  |  | ||||||
|  |             TestResult::from_bool(valid_attribute && valid_index) | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult { | ||||||
|  |             if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) { | ||||||
|  |                 return TestResult::discard() | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             let a = Attribute::new(gen_attr, gen_index); | ||||||
|  |             let b = Attribute::new(gen_attr + 1, gen_index + 1); | ||||||
|  |  | ||||||
|  |             TestResult::from_bool(a < b) | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         fn qc_word_area(gen_byte_index: u32, gen_length: u16) -> TestResult { | ||||||
|  |             if gen_byte_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) { | ||||||
|  |                 return TestResult::discard() | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             let word_area = WordArea::new(gen_byte_index, gen_length); | ||||||
|  |  | ||||||
|  |             let valid_char_index = word_area.byte_index() == gen_byte_index; | ||||||
|  |             let valid_length = word_area.length() == gen_length; | ||||||
|  |  | ||||||
|  |             TestResult::from_bool(valid_char_index && valid_length) | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         fn qc_word_area_ord(gen_byte_index: u32, gen_length: u16) -> TestResult { | ||||||
|  |             if gen_byte_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) { | ||||||
|  |                 return TestResult::discard() | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             let a = WordArea::new(gen_byte_index, gen_length); | ||||||
|  |             let b = WordArea::new(gen_byte_index + 1, gen_length + 1); | ||||||
|  |  | ||||||
|  |             TestResult::from_bool(a < b) | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -44,7 +44,7 @@ where D: Deref<Target=DB> | |||||||
| mod tests { | mod tests { | ||||||
|     use super::*; |     use super::*; | ||||||
|  |  | ||||||
|     use crate::DocumentId; |     use crate::{DocumentId, Attribute, WordArea}; | ||||||
|  |  | ||||||
|     // typing: "Geox CEO" |     // typing: "Geox CEO" | ||||||
|     // |     // | ||||||
| @@ -54,8 +54,8 @@ mod tests { | |||||||
|     fn one_typo_reference() { |     fn one_typo_reference() { | ||||||
|         let doc0 = { |         let doc0 = { | ||||||
|             let matches = vec![ |             let matches = vec![ | ||||||
|                 Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, |                 Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) }, | ||||||
|                 Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false }, |                 Match { query_index: 1, distance: 0, attribute: Attribute::new(0, 2), is_exact: false, word_area: WordArea::new(0, 6) }, | ||||||
|             ]; |             ]; | ||||||
|             Document { |             Document { | ||||||
|                 id: DocumentId(0), |                 id: DocumentId(0), | ||||||
| @@ -65,8 +65,8 @@ mod tests { | |||||||
|  |  | ||||||
|         let doc1 = { |         let doc1 = { | ||||||
|             let matches = vec![ |             let matches = vec![ | ||||||
|                 Match { query_index: 0, distance: 1, attribute: 0, attribute_index: 0, is_exact: false }, |                 Match { query_index: 0, distance: 1, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) }, | ||||||
|                 Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false }, |                 Match { query_index: 1, distance: 0, attribute: Attribute::new(0, 2), is_exact: false, word_area: WordArea::new(0, 6) }, | ||||||
|             ]; |             ]; | ||||||
|             Document { |             Document { | ||||||
|                 id: DocumentId(1), |                 id: DocumentId(1), | ||||||
| @@ -87,8 +87,8 @@ mod tests { | |||||||
|     fn no_typo() { |     fn no_typo() { | ||||||
|         let doc0 = { |         let doc0 = { | ||||||
|             let matches = vec![ |             let matches = vec![ | ||||||
|                 Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, |                 Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) }, | ||||||
|                 Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 1, is_exact: false }, |                 Match { query_index: 1, distance: 0, attribute: Attribute::new(0, 1), is_exact: false, word_area: WordArea::new(0, 6) }, | ||||||
|             ]; |             ]; | ||||||
|             Document { |             Document { | ||||||
|                 id: DocumentId(0), |                 id: DocumentId(0), | ||||||
| @@ -98,7 +98,7 @@ mod tests { | |||||||
|  |  | ||||||
|         let doc1 = { |         let doc1 = { | ||||||
|             let matches = vec![ |             let matches = vec![ | ||||||
|                 Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, |                 Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) }, | ||||||
|             ]; |             ]; | ||||||
|             Document { |             Document { | ||||||
|                 id: DocumentId(1), |                 id: DocumentId(1), | ||||||
| @@ -119,8 +119,8 @@ mod tests { | |||||||
|     fn one_typo() { |     fn one_typo() { | ||||||
|         let doc0 = { |         let doc0 = { | ||||||
|             let matches = vec![ |             let matches = vec![ | ||||||
|                 Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, |                 Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) }, | ||||||
|                 Match { query_index: 1, distance: 1, attribute: 0, attribute_index: 1, is_exact: false }, |                 Match { query_index: 1, distance: 1, attribute: Attribute::new(0, 1), is_exact: false, word_area: WordArea::new(0, 6) }, | ||||||
|             ]; |             ]; | ||||||
|             Document { |             Document { | ||||||
|                 id: DocumentId(0), |                 id: DocumentId(0), | ||||||
| @@ -130,7 +130,7 @@ mod tests { | |||||||
|  |  | ||||||
|         let doc1 = { |         let doc1 = { | ||||||
|             let matches = vec![ |             let matches = vec![ | ||||||
|                 Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, |                 Match { query_index: 0, distance: 0, attribute: Attribute::new(0, 0), is_exact: false, word_area: WordArea::new(0, 6) }, | ||||||
|             ]; |             ]; | ||||||
|             Document { |             Document { | ||||||
|                 id: DocumentId(1), |                 id: DocumentId(1), | ||||||
|   | |||||||
| @@ -10,11 +10,11 @@ use crate::rank::criterion::Criterion; | |||||||
| use crate::Match; | use crate::Match; | ||||||
|  |  | ||||||
| #[inline] | #[inline] | ||||||
| fn sum_matches_attributes(matches: &[Match]) -> u8 { | fn sum_matches_attributes(matches: &[Match]) -> u16 { | ||||||
|     // note that GroupBy will never return an empty group |     // note that GroupBy will never return an empty group | ||||||
|     // so we can do this assumption safely |     // so we can do this assumption safely | ||||||
|     GroupBy::new(matches, match_query_index).map(|group| unsafe { |     GroupBy::new(matches, match_query_index).map(|group| unsafe { | ||||||
|         group.get_unchecked(0).attribute |         group.get_unchecked(0).attribute.attribute() | ||||||
|     }).sum() |     }).sum() | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -14,7 +14,7 @@ fn sum_matches_attribute_index(matches: &[Match]) -> u32 { | |||||||
|     // note that GroupBy will never return an empty group |     // note that GroupBy will never return an empty group | ||||||
|     // so we can do this assumption safely |     // so we can do this assumption safely | ||||||
|     GroupBy::new(matches, match_query_index).map(|group| unsafe { |     GroupBy::new(matches, match_query_index).map(|group| unsafe { | ||||||
|         group.get_unchecked(0).attribute_index |         group.get_unchecked(0).attribute.word_index() | ||||||
|     }).sum() |     }).sum() | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -20,8 +20,8 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 { | |||||||
| } | } | ||||||
|  |  | ||||||
| fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 { | fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 { | ||||||
|     if lhs.attribute != rhs.attribute { return MAX_DISTANCE } |     if lhs.attribute.attribute() != rhs.attribute.attribute() { return MAX_DISTANCE } | ||||||
|     index_proximity(lhs.attribute_index, rhs.attribute_index) |     index_proximity(lhs.attribute.word_index(), rhs.attribute.word_index()) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 { | fn min_proximity(lhs: &[Match], rhs: &[Match]) -> u32 { | ||||||
| @@ -67,6 +67,8 @@ where D: Deref<Target=DB> | |||||||
| mod tests { | mod tests { | ||||||
|     use super::*; |     use super::*; | ||||||
|  |  | ||||||
|  |     use crate::Attribute; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn three_different_attributes() { |     fn three_different_attributes() { | ||||||
|  |  | ||||||
| @@ -79,11 +81,11 @@ mod tests { | |||||||
|         // { id: 3, attr: 3, attr_index: 1 } |         // { id: 3, attr: 3, attr_index: 1 } | ||||||
|  |  | ||||||
|         let matches = &[ |         let matches = &[ | ||||||
|             Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() }, |             Match { query_index: 0, attribute: Attribute::new(0, 0), ..Match::zero() }, | ||||||
|             Match { query_index: 1, attribute: 1, attribute_index: 0, ..Match::zero() }, |             Match { query_index: 1, attribute: Attribute::new(1, 0), ..Match::zero() }, | ||||||
|             Match { query_index: 2, attribute: 1, attribute_index: 1, ..Match::zero() }, |             Match { query_index: 2, attribute: Attribute::new(1, 1), ..Match::zero() }, | ||||||
|             Match { query_index: 2, attribute: 2, attribute_index: 0, ..Match::zero() }, |             Match { query_index: 2, attribute: Attribute::new(2, 0), ..Match::zero() }, | ||||||
|             Match { query_index: 3, attribute: 3, attribute_index: 1, ..Match::zero() }, |             Match { query_index: 3, attribute: Attribute::new(3, 1), ..Match::zero() }, | ||||||
|         ]; |         ]; | ||||||
|  |  | ||||||
|         //   soup -> of = 8 |         //   soup -> of = 8 | ||||||
| @@ -105,12 +107,12 @@ mod tests { | |||||||
|         // { id: 3, attr: 1, attr_index: 3 } |         // { id: 3, attr: 1, attr_index: 3 } | ||||||
|  |  | ||||||
|         let matches = &[ |         let matches = &[ | ||||||
|             Match { query_index: 0, attribute: 0, attribute_index: 0, ..Match::zero() }, |             Match { query_index: 0, attribute: Attribute::new(0, 0), ..Match::zero() }, | ||||||
|             Match { query_index: 0, attribute: 1, attribute_index: 0, ..Match::zero() }, |             Match { query_index: 0, attribute: Attribute::new(1, 0), ..Match::zero() }, | ||||||
|             Match { query_index: 1, attribute: 1, attribute_index: 1, ..Match::zero() }, |             Match { query_index: 1, attribute: Attribute::new(1, 1), ..Match::zero() }, | ||||||
|             Match { query_index: 2, attribute: 1, attribute_index: 2, ..Match::zero() }, |             Match { query_index: 2, attribute: Attribute::new(1, 2), ..Match::zero() }, | ||||||
|             Match { query_index: 3, attribute: 0, attribute_index: 1, ..Match::zero() }, |             Match { query_index: 3, attribute: Attribute::new(0, 1), ..Match::zero() }, | ||||||
|             Match { query_index: 3, attribute: 1, attribute_index: 3, ..Match::zero() }, |             Match { query_index: 3, attribute: Attribute::new(1, 3), ..Match::zero() }, | ||||||
|         ]; |         ]; | ||||||
|  |  | ||||||
|         //   soup -> of = 1 |         //   soup -> of = 1 | ||||||
|   | |||||||
| @@ -97,8 +97,8 @@ where D: Deref<Target=DB> | |||||||
|                         query_index: iv.index as u32, |                         query_index: iv.index as u32, | ||||||
|                         distance: distance, |                         distance: distance, | ||||||
|                         attribute: doc_index.attribute, |                         attribute: doc_index.attribute, | ||||||
|                         attribute_index: doc_index.attribute_index, |  | ||||||
|                         is_exact: is_exact, |                         is_exact: is_exact, | ||||||
|  |                         word_area: doc_index.word_area, | ||||||
|                     }; |                     }; | ||||||
|                     matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_); |                     matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_); | ||||||
|                 } |                 } | ||||||
|   | |||||||
| @@ -2,7 +2,7 @@ use std::mem; | |||||||
| use self::Separator::*; | use self::Separator::*; | ||||||
|  |  | ||||||
| pub trait TokenizerBuilder { | pub trait TokenizerBuilder { | ||||||
|     fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a>; |     fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>; | ||||||
| } | } | ||||||
|  |  | ||||||
| pub struct DefaultBuilder; | pub struct DefaultBuilder; | ||||||
| @@ -13,22 +13,39 @@ impl DefaultBuilder { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #[derive(Debug, PartialEq, Eq)] | ||||||
|  | pub struct Token<'a> { | ||||||
|  |     pub word: &'a str, | ||||||
|  |     pub word_index: usize, | ||||||
|  |     pub char_index: usize, | ||||||
|  | } | ||||||
|  |  | ||||||
| impl TokenizerBuilder for DefaultBuilder { | impl TokenizerBuilder for DefaultBuilder { | ||||||
|     fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a> { |     fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a> { | ||||||
|         Box::new(Tokenizer::new(text)) |         Box::new(Tokenizer::new(text)) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| pub struct Tokenizer<'a> { | pub struct Tokenizer<'a> { | ||||||
|     index: usize, |     word_index: usize, | ||||||
|  |     char_index: usize, | ||||||
|     inner: &'a str, |     inner: &'a str, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a> Tokenizer<'a> { | impl<'a> Tokenizer<'a> { | ||||||
|     pub fn new(string: &str) -> Tokenizer { |     pub fn new(string: &str) -> Tokenizer { | ||||||
|  |         let mut char_advance = 0; | ||||||
|  |         let mut index_advance = 0; | ||||||
|  |         for (n, (i, c)) in string.char_indices().enumerate() { | ||||||
|  |             char_advance = n; | ||||||
|  |             index_advance = i; | ||||||
|  |             if detect_separator(c).is_none() { break } | ||||||
|  |         } | ||||||
|  |  | ||||||
|         Tokenizer { |         Tokenizer { | ||||||
|             index: 0, |             word_index: 0, | ||||||
|             inner: string.trim_matches(&[' ', '.', ';', ',', '!', '?', '-', '\'', '"'][..]), |             char_index: char_advance, | ||||||
|  |             inner: &string[index_advance..], | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -56,43 +73,58 @@ impl Separator { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | fn detect_separator(c: char) -> Option<Separator> { | ||||||
|  |     match c { | ||||||
|  |         '.' | ';' | ',' | '!' | '?' | '-' => Some(Long), | ||||||
|  |         ' ' | '\'' | '"'                  => Some(Short), | ||||||
|  |         _                                 => None, | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| impl<'a> Iterator for Tokenizer<'a> { | impl<'a> Iterator for Tokenizer<'a> { | ||||||
|     type Item = (usize, &'a str); |     type Item = Token<'a>; | ||||||
|  |  | ||||||
|     fn next(&mut self) -> Option<Self::Item> { |     fn next(&mut self) -> Option<Self::Item> { | ||||||
|         let mut start_word = None; |         let mut start_word = None; | ||||||
|         let mut distance = None; |         let mut distance = None; | ||||||
|  |  | ||||||
|         for (i, c) in self.inner.char_indices() { |         for (i, c) in self.inner.char_indices() { | ||||||
|             let separator = match c { |             match detect_separator(c) { | ||||||
|                 '.' | ';' | ',' | '!' | '?' | '-' => Some(Long), |                 Some(sep) => { | ||||||
|                 ' ' | '\'' | '"' => Some(Short), |  | ||||||
|                 _   => None, |  | ||||||
|             }; |  | ||||||
|  |  | ||||||
|             match separator { |  | ||||||
|                 Some(dist) => { |  | ||||||
|                     if let Some(start_word) = start_word { |                     if let Some(start_word) = start_word { | ||||||
|                         let (word, tail) = self.inner.split_at(i); |                         let (prefix, tail) = self.inner.split_at(i); | ||||||
|  |                         let (spaces, word) = prefix.split_at(start_word); | ||||||
|  |  | ||||||
|                         self.inner = tail; |                         self.inner = tail; | ||||||
|                         self.index += distance.map(Separator::to_usize).unwrap_or(0); |                         self.char_index += spaces.len(); | ||||||
|  |                         self.word_index += distance.map(Separator::to_usize).unwrap_or(0); | ||||||
|  |  | ||||||
|                         let word = &word[start_word..]; |                         let token = Token { | ||||||
|                         return Some((self.index, word)) |                             word: word, | ||||||
|  |                             word_index: self.word_index, | ||||||
|  |                             char_index: self.char_index, | ||||||
|  |                         }; | ||||||
|  |  | ||||||
|  |                         self.char_index += word.len(); | ||||||
|  |                         return Some(token) | ||||||
|                     } |                     } | ||||||
|                     distance = Some(distance.map(|s| s.add(dist)).unwrap_or(dist)); |  | ||||||
|  |                     distance.replace(distance.map_or(sep, |s| s.add(sep))); | ||||||
|                 }, |                 }, | ||||||
|                 None => { start_word.get_or_insert(i); }, |                 None => { start_word.get_or_insert(i); }, | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         if let Some(start_word) = start_word { |         if let Some(start_word) = start_word { | ||||||
|             let word = mem::replace(&mut self.inner, ""); |             let prefix = mem::replace(&mut self.inner, ""); | ||||||
|             self.index += distance.map(Separator::to_usize).unwrap_or(0); |             let (spaces, word) = prefix.split_at(start_word); | ||||||
|  |  | ||||||
|             let word = &word[start_word..]; |             let token = Token { | ||||||
|             return Some((self.index, word)) |                 word: word, | ||||||
|  |                 word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0), | ||||||
|  |                 char_index: self.char_index + spaces.len(), | ||||||
|  |             }; | ||||||
|  |             return Some(token) | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         None |         None | ||||||
| @@ -107,12 +139,12 @@ mod tests { | |||||||
|     fn easy() { |     fn easy() { | ||||||
|         let mut tokenizer = Tokenizer::new("salut"); |         let mut tokenizer = Tokenizer::new("salut"); | ||||||
|  |  | ||||||
|         assert_eq!(tokenizer.next(), Some((0, "salut"))); |         assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 })); | ||||||
|         assert_eq!(tokenizer.next(), None); |         assert_eq!(tokenizer.next(), None); | ||||||
|  |  | ||||||
|         let mut tokenizer = Tokenizer::new("yo    "); |         let mut tokenizer = Tokenizer::new("yo    "); | ||||||
|  |  | ||||||
|         assert_eq!(tokenizer.next(), Some((0, "yo"))); |         assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); | ||||||
|         assert_eq!(tokenizer.next(), None); |         assert_eq!(tokenizer.next(), None); | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -120,18 +152,37 @@ mod tests { | |||||||
|     fn hard() { |     fn hard() { | ||||||
|         let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe"); |         let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe"); | ||||||
|  |  | ||||||
|         assert_eq!(tokenizer.next(), Some((0, "yo"))); |         assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 })); | ||||||
|         assert_eq!(tokenizer.next(), Some((1, "lolo"))); |         assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 })); | ||||||
|         assert_eq!(tokenizer.next(), Some((9, "aïe"))); |         assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 })); | ||||||
|         assert_eq!(tokenizer.next(), None); |         assert_eq!(tokenizer.next(), None); | ||||||
|  |  | ||||||
|         let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,"); |         let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,"); | ||||||
|  |  | ||||||
|         assert_eq!(tokenizer.next(), Some((0, "yo"))); |         assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); | ||||||
|         assert_eq!(tokenizer.next(), Some((8, "lolo"))); |         assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 })); | ||||||
|         assert_eq!(tokenizer.next(), Some((16, "wtf"))); |         assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 })); | ||||||
|         assert_eq!(tokenizer.next(), Some((24, "lol"))); |         assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 })); | ||||||
|         assert_eq!(tokenizer.next(), Some((32, "aïe"))); |         assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 })); | ||||||
|  |         assert_eq!(tokenizer.next(), None); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn hard_long_chars() { | ||||||
|  |         let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe"); | ||||||
|  |  | ||||||
|  |         assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 })); | ||||||
|  |         assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 })); | ||||||
|  |         assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 })); | ||||||
|  |         assert_eq!(tokenizer.next(), None); | ||||||
|  |  | ||||||
|  |         let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,"); | ||||||
|  |  | ||||||
|  |         assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); | ||||||
|  |         assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 })); | ||||||
|  |         assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 })); | ||||||
|  |         assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 19 })); | ||||||
|  |         assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 25 })); | ||||||
|         assert_eq!(tokenizer.next(), None); |         assert_eq!(tokenizer.next(), None); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user