Come back to the old tokenizer

This commit is contained in:
Clément Renault
2020-08-30 21:50:30 +02:00
committed by Kerollmops
parent 220ba0785c
commit bad0663138
8 changed files with 45 additions and 101 deletions

View File

@ -21,7 +21,8 @@ use rayon::prelude::*;
use roaring::RoaringBitmap;
use structopt::StructOpt;
use milli::{lexer, SmallVec32, Index, DocumentId, Position, Attribute, BEU32};
use milli::{SmallVec32, Index, DocumentId, Position, Attribute, BEU32};
use milli::tokenizer::{simple_tokenizer, only_words};
const LMDB_MAX_KEY_LENGTH: usize = 511;
const ONE_MILLION: usize = 1_000_000;
@ -367,7 +368,7 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
WORDS_FST_KEY => {
let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect();
// Union of the two FSTs
// Union of the FSTs
let mut op = fst::set::OpBuilder::new();
fsts.iter().for_each(|fst| op.push(fst.into_stream()));
let op = op.r#union();
@ -387,15 +388,16 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
| WORD_FOUR_POSITIONS_DOCIDS_BYTE
| WORD_ATTRIBUTE_DOCIDS_BYTE =>
{
let mut first = RoaringBitmap::deserialize_from(values[0].as_slice()).unwrap();
let (head, tail) = values.split_first().unwrap();
for value in &values[1..] {
let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
for value in tail {
let bitmap = RoaringBitmap::deserialize_from(value.as_slice()).unwrap();
first.union_with(&bitmap);
head.union_with(&bitmap);
}
let mut vec = Vec::new();
first.serialize_into(&mut vec).unwrap();
let mut vec = Vec::with_capacity(head.serialized_size());
head.serialize_into(&mut vec).unwrap();
Ok(vec)
},
otherwise => panic!("wut {:?}", otherwise),
@ -505,8 +507,8 @@ fn index_csv(
let document_id = DocumentId::try_from(document_id).context("generated id is too big")?;
for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
for (pos, word) in lexer::break_string(&content).enumerate().take(MAX_POSITION) {
let word = word.cow_to_lowercase();
for (pos, (_, token)) in simple_tokenizer(&content).filter(only_words).enumerate().take(MAX_POSITION) {
let word = token.cow_to_lowercase();
let position = (attr * MAX_POSITION + pos) as u32;
store.insert_word_position_docid(&word, position, document_id)?;
}

View File

@ -9,10 +9,10 @@ use std::time::Instant;
use askama_warp::Template;
use heed::EnvOpenOptions;
use serde::Deserialize;
use slice_group_by::StrGroupBy;
use structopt::StructOpt;
use warp::{Filter, http::Response};
use milli::tokenizer::{simple_tokenizer, TokenType};
use milli::{Index, SearchResult};
#[cfg(target_os = "linux")]
@ -47,12 +47,16 @@ struct Opt {
fn highlight_string(string: &str, words: &HashSet<String>) -> String {
let mut output = String::new();
for token in string.linear_group_by_key(|c| c.is_alphanumeric()) {
let lowercase_token = token.to_lowercase();
let to_highlight = words.contains(&lowercase_token);
if to_highlight { output.push_str("<mark>") }
output.push_str(token);
if to_highlight { output.push_str("</mark>") }
for (token_type, token) in simple_tokenizer(string) {
if token_type == TokenType::Word {
let lowercase_token = token.to_lowercase();
let to_highlight = words.contains(&lowercase_token);
if to_highlight { output.push_str("<mark>") }
output.push_str(token);
if to_highlight { output.push_str("</mark>") }
} else {
output.push_str(token);
}
}
output
}