mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 13:06:27 +00:00 
			
		
		
		
	Adds support for aligned cropping with CJK text
This commit is contained in:
		
							
								
								
									
										1
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										1
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -1081,6 +1081,7 @@ dependencies = [ | |||||||
|  "main_error 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", |  "main_error 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  "meilisearch-core 0.9.0", |  "meilisearch-core 0.9.0", | ||||||
|  "meilisearch-schema 0.9.0", |  "meilisearch-schema 0.9.0", | ||||||
|  |  "meilisearch-tokenizer 0.9.0", | ||||||
|  "mime 0.3.16 (registry+https://github.com/rust-lang/crates.io-index)", |  "mime 0.3.16 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  "pretty-bytes 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", |  "pretty-bytes 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  "rand 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", |  "rand 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|   | |||||||
| @@ -27,6 +27,7 @@ log = "0.4.8" | |||||||
| main_error = "0.1.0" | main_error = "0.1.0" | ||||||
| meilisearch-core = { path = "../meilisearch-core", version = "0.9.0" } | meilisearch-core = { path = "../meilisearch-core", version = "0.9.0" } | ||||||
| meilisearch-schema = { path = "../meilisearch-schema", version = "0.9.0" } | meilisearch-schema = { path = "../meilisearch-schema", version = "0.9.0" } | ||||||
|  | meilisearch-tokenizer = {path = "../meilisearch-tokenizer", version = "0.9.0"} | ||||||
| mime = "0.3.16" | mime = "0.3.16" | ||||||
| pretty-bytes = "0.2.2" | pretty-bytes = "0.2.2" | ||||||
| rand = "0.7.2" | rand = "0.7.2" | ||||||
|   | |||||||
| @@ -11,6 +11,7 @@ use log::error; | |||||||
| use meilisearch_core::criterion::*; | use meilisearch_core::criterion::*; | ||||||
| use meilisearch_core::settings::RankingRule; | use meilisearch_core::settings::RankingRule; | ||||||
| use meilisearch_core::{Highlight, Index, MainT, RankedMap}; | use meilisearch_core::{Highlight, Index, MainT, RankedMap}; | ||||||
|  | use meilisearch_tokenizer::is_cjk; | ||||||
| use meilisearch_schema::{FieldId, Schema}; | use meilisearch_schema::{FieldId, Schema}; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
| @@ -372,19 +373,21 @@ pub struct SearchResult { | |||||||
|     pub query: String, |     pub query: String, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Returns the start index and the length of the crop.  | ||||||
| fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) { | fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) { | ||||||
|  |     let is_word_component = |c: &char| c.is_alphanumeric() && !is_cjk(*c); | ||||||
|     if context == 0 { |  | ||||||
|         return (match_index, text.chars().skip(match_index).take_while(|c| c.is_alphanumeric()).count()); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     let word_end_index = |mut index| { |     let word_end_index = |mut index| { | ||||||
|         if let Some(true) = text.chars().nth(index - 1).map(|c| c.is_alphanumeric()) { |         if text.chars().nth(index - 1).map_or(false, |c| is_word_component(&c)) { | ||||||
|             index += text.chars().skip(index).take_while(|c| c.is_alphanumeric()).count(); |             index += text.chars().skip(index).take_while(is_word_component).count(); | ||||||
|         } |         } | ||||||
|         index |         index | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|  |     if context == 0 { | ||||||
|  |         // count needs to be at least 1 for CJK queries to return something | ||||||
|  |         return (match_index, 1 + text.chars().skip(match_index).take_while(is_word_component).count()); | ||||||
|  |     } | ||||||
|     let start = match match_index.saturating_sub(context) { |     let start = match match_index.saturating_sub(context) { | ||||||
|         n if n == 0 => n, |         n if n == 0 => n, | ||||||
|         n => word_end_index(n) |         n => word_end_index(n) | ||||||
| @@ -404,8 +407,10 @@ fn crop_text( | |||||||
|     let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0); |     let char_index = matches.peek().map(|m| m.char_index as usize).unwrap_or(0); | ||||||
|     let (start, count) = aligned_crop(text, char_index, context); |     let (start, count) = aligned_crop(text, char_index, context); | ||||||
|  |  | ||||||
|     let text = text.chars().skip(start).take(count).collect::<String>().trim().into(); |     //TODO do something about the double allocation | ||||||
|  |     let text = text.chars().skip(start).take(count).collect::<String>().trim().to_string(); | ||||||
|  |  | ||||||
|  |     // update matches index to match the new cropped text | ||||||
|     let matches = matches |     let matches = matches | ||||||
|         .take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)) |         .take_while(|m| (m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)) | ||||||
|         .map(|match_| Highlight { |         .map(|match_| Highlight { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user