mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 05:26:27 +00:00 
			
		
		
		
	Fix multiple bugs
This commit is contained in:
		| @@ -3,6 +3,9 @@ use std::time::Instant; | ||||
|  | ||||
| use pathfinding::directed::dijkstra::dijkstra; | ||||
|  | ||||
| use smallvec::smallvec; // the macro | ||||
| use crate::SmallVec16; | ||||
|  | ||||
| const ONE_ATTRIBUTE: u32 = 1000; | ||||
| const MAX_DISTANCE: u32 = 8; | ||||
|  | ||||
| @@ -27,17 +30,17 @@ fn extract_position(position: u32) -> (u32, u32) { | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] | ||||
| struct Path(Vec<u32>); | ||||
| struct Path(SmallVec16<u32>); | ||||
|  | ||||
| impl Path { | ||||
|     fn new(positions: &[Vec<u32>]) -> Option<Path> { | ||||
|         let position = positions.first()?.first()?; | ||||
|         Some(Path(vec![*position])) | ||||
|         Some(Path(smallvec![*position])) | ||||
|     } | ||||
|  | ||||
|     // TODO we must skip the successors that have already been sent | ||||
|     fn successors(&self, positions: &[Vec<u32>]) -> Vec<(Path, u32)> { | ||||
|         let mut successors = Vec::new(); | ||||
|     fn successors(&self, positions: &[Vec<u32>]) -> SmallVec16<(Path, u32)> { | ||||
|         let mut successors = SmallVec16::new(); | ||||
|  | ||||
|         // If we can grow or shift the path | ||||
|         if self.0.len() < positions.len() { | ||||
| @@ -103,7 +106,12 @@ impl Iterator for BestProximity { | ||||
|             let result = dijkstra( | ||||
|                 &Path::new(&self.positions)?, | ||||
|                 |p| p.successors(&self.positions), | ||||
|                 |p| self.is_path_successful(p) && output.as_ref().map_or(true, |(_, paths)| !paths.contains(&p.0)), | ||||
|                 |p| { | ||||
|                     self.is_path_successful(p) && | ||||
|                     output.as_ref().map_or(true, |(_, paths)| { | ||||
|                         !paths.iter().position(|q| q.as_slice() == p.0.as_slice()).is_some() | ||||
|                     }) | ||||
|                 }, | ||||
|             ); | ||||
|  | ||||
|             match result { | ||||
| @@ -123,9 +131,9 @@ impl Iterator for BestProximity { | ||||
|  | ||||
|                             // We add the new path to the output list as this path is known | ||||
|                             // to be the requested distance. | ||||
|                             paths.push(positions.0); | ||||
|                             paths.push(positions.0.to_vec()); | ||||
|                         }, | ||||
|                         None => output = Some((positions.proximity(), vec![positions.0])), | ||||
|                         None => output = Some((positions.proximity(), vec![positions.0.to_vec()])), | ||||
|                     } | ||||
|                 }, | ||||
|                 None => break, | ||||
|   | ||||
| @@ -45,10 +45,10 @@ struct Opt { | ||||
|  | ||||
| struct Indexed { | ||||
|     fst: fst::Set<Vec<u8>>, | ||||
|     postings_attrs: FastMap4<SmallVec32, RoaringBitmap>, | ||||
|     prefix_postings_attrs: FastMap4<SmallVec32, RoaringBitmap>, | ||||
|     postings_ids: FastMap4<SmallVec32, FastMap4<AttributeId, RoaringBitmap>>, | ||||
|     prefix_postings_ids: FastMap4<SmallVec32, FastMap4<AttributeId, RoaringBitmap>>, | ||||
|     postings_attrs: FastMap4<SmallVec32<u8>, RoaringBitmap>, | ||||
|     prefix_postings_attrs: FastMap4<SmallVec32<u8>, RoaringBitmap>, | ||||
|     postings_ids: FastMap4<SmallVec32<u8>, FastMap4<AttributeId, RoaringBitmap>>, | ||||
|     prefix_postings_ids: FastMap4<SmallVec32<u8>, FastMap4<AttributeId, RoaringBitmap>>, | ||||
|     headers: Vec<u8>, | ||||
|     documents: Vec<(DocumentId, Vec<u8>)>, | ||||
| } | ||||
|   | ||||
| @@ -93,9 +93,9 @@ async fn main() -> anyhow::Result<()> { | ||||
|                 body.extend_from_slice(headers); | ||||
|  | ||||
|                 for id in documents_ids { | ||||
|                     if let Some(content) = index.documents.get(&rtxn, &BEU32::new(id)).unwrap() { | ||||
|                         body.extend_from_slice(&content); | ||||
|                     } | ||||
|                     let content = index.documents.get(&rtxn, &BEU32::new(id)).unwrap(); | ||||
|                     let content = content.expect(&format!("could not find document {}", id)); | ||||
|                     body.extend_from_slice(&content); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|   | ||||
							
								
								
									
										73
									
								
								src/lib.rs
									
									
									
									
									
								
							
							
						
						
									
										73
									
								
								src/lib.rs
									
									
									
									
									
								
							| @@ -25,7 +25,8 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true)); | ||||
|  | ||||
| pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>; | ||||
| pub type SmallString32 = smallstr::SmallString<[u8; 32]>; | ||||
| pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>; | ||||
| pub type SmallVec32<T> = smallvec::SmallVec<[T; 32]>; | ||||
| pub type SmallVec16<T> = smallvec::SmallVec<[T; 16]>; | ||||
| pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>; | ||||
| pub type DocumentId = u32; | ||||
| pub type AttributeId = u32; | ||||
| @@ -89,52 +90,41 @@ impl Index { | ||||
|             (word, is_prefix, dfa) | ||||
|         }); | ||||
|  | ||||
|         let mut words_positions = Vec::new(); | ||||
|         let mut words = Vec::new(); | ||||
|         let mut positions = Vec::new(); | ||||
|         let before = Instant::now(); | ||||
|  | ||||
|         for (word, is_prefix, dfa) in dfas { | ||||
|         for (word, _is_prefix, dfa) in dfas { | ||||
|             let mut count = 0; | ||||
|             let mut union_positions = RoaringBitmap::default(); | ||||
|             if false && word.len() <= 4 && is_prefix { | ||||
|                 if let Some(ids) = self.prefix_postings_attrs.get(rtxn, word.as_bytes())? { | ||||
|                     let right = RoaringBitmap::deserialize_from(ids)?; | ||||
|             let mut derived_words = Vec::new(); | ||||
|             // TODO re-enable the prefixes system | ||||
|             let mut stream = fst.search(&dfa).into_stream(); | ||||
|             while let Some(word) = stream.next() { | ||||
|                 derived_words.push(word.to_vec()); | ||||
|                 let word = std::str::from_utf8(word)?; | ||||
|                 if let Some(attrs) = self.postings_attrs.get(rtxn, word)? { | ||||
|                     let right = RoaringBitmap::deserialize_from(attrs)?; | ||||
|                     union_positions.union_with(&right); | ||||
|                     count = 1; | ||||
|                 } | ||||
|             } else { | ||||
|                 let mut stream = fst.search(&dfa).into_stream(); | ||||
|                 while let Some(word) = stream.next() { | ||||
|                     let word = std::str::from_utf8(word)?; | ||||
|                     if let Some(attrs) = self.postings_attrs.get(rtxn, word)? { | ||||
|                         let right = RoaringBitmap::deserialize_from(attrs)?; | ||||
|                         union_positions.union_with(&right); | ||||
|                         count += 1; | ||||
|                     } | ||||
|                     count += 1; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             eprintln!("{} words for {:?} we have found positions {:?}", count, word, union_positions); | ||||
|             words_positions.push((word, is_prefix, dfa)); | ||||
|             words.push(derived_words); | ||||
|             positions.push(union_positions.iter().collect()); | ||||
|         } | ||||
|  | ||||
|         eprintln!("Retrieving words positions took {:.02?}", before.elapsed()); | ||||
|  | ||||
|         // TODO re-enable the prefixes system | ||||
|         let mut words = Vec::new(); | ||||
|         for (_word, _is_prefix, dfa) in words_positions { | ||||
|             let mut stream = fst.search(dfa).into_stream(); | ||||
|             let mut derived_words = Vec::new(); | ||||
|             while let Some(word) = stream.next() { | ||||
|                 derived_words.push(word.to_vec()); | ||||
|             } | ||||
|             words.push(derived_words); | ||||
|         } | ||||
|  | ||||
|         let mut documents = Vec::new(); | ||||
|  | ||||
|         'outer: for (proximity, positions) in BestProximity::new(positions) { | ||||
|         for (proximity, mut positions) in BestProximity::new(positions) { | ||||
|             // TODO we must ignore position paths that give nothing | ||||
|             if words.len() > 1 && proximity == 0 { continue } | ||||
|  | ||||
|             positions.sort_unstable(); | ||||
|  | ||||
|             let same_prox_before = Instant::now(); | ||||
|             let mut same_proximity_union = RoaringBitmap::default(); | ||||
|  | ||||
| @@ -177,15 +167,30 @@ impl Index { | ||||
|                 if let Some(intersect_docids) = intersect_docids { | ||||
|                     same_proximity_union.union_with(&intersect_docids); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             eprintln!("proximity {} took a total of {:.02?}", proximity, same_prox_before.elapsed()); | ||||
|                 // We found enough documents we can stop here | ||||
|                 if documents.iter().map(RoaringBitmap::len).sum::<u64>() + same_proximity_union.len() >= 20 { | ||||
|                     eprintln!("proximity {} took a total of {:.02?}", proximity, same_prox_before.elapsed()); | ||||
|                     break; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             documents.push(same_proximity_union); | ||||
|  | ||||
|             // We found enough documents we can stop here | ||||
|             // We remove the double occurrences of documents. | ||||
|             for i in 0..documents.len() { | ||||
|                 if let Some((docs, others)) = documents[..=i].split_last_mut() { | ||||
|                     others.iter().for_each(|other| docs.difference_with(other)); | ||||
|                 } | ||||
|             } | ||||
|             documents.retain(|rb| !rb.is_empty()); | ||||
|  | ||||
|             eprintln!("documents: {:?}", documents); | ||||
|             eprintln!("proximity {} took a total of {:.02?}", proximity, same_prox_before.elapsed()); | ||||
|  | ||||
|             // We found enough documents we can stop here. | ||||
|             if documents.iter().map(RoaringBitmap::len).sum::<u64>() >= 20 { | ||||
|                 break 'outer; | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user