Make sure we index all kinds of JSON types

This commit is contained in:
Clément Renault
2020-11-06 16:15:07 +01:00
parent 640c7d748a
commit 4fb138c42e
3 changed files with 135 additions and 20 deletions

View File

@ -901,4 +901,41 @@ mod tests {
assert_eq!(count, 1);
drop(rtxn);
}
/// Indexes documents whose fields hold nested JSON values (an object, an
/// array, and an array of objects) and checks that words located inside
/// those nested values can be found by a search.
#[test]
fn complex_json_documents() {
    let dir = tempfile::tempdir().unwrap();
    let mut env_options = EnvOpenOptions::new();
    env_options.map_size(10 * 1024 * 1024); // 10 MB
    let index = Index::new(env_options, &dir).unwrap();

    // Three documents, each with its own id and one nested JSON field.
    let payload = &br#"[
{ "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
]"#[..];

    // Index the payload as JSON inside a single write transaction.
    let mut wtxn = index.write_txn().unwrap();
    let mut indexing = IndexDocuments::new(&mut wtxn, &index);
    indexing.update_format(UpdateFormat::Json);
    indexing.execute(payload, |_, _| ()).unwrap();
    wtxn.commit().unwrap();

    // Each quoted query must match exactly the one document that contains it.
    let expectations = [
        (r#""value2""#, 0), // value of a sub object
        (r#""fine""#, 1),   // element of a sub array
        (r#""wow""#, 2),    // key of an object nested in a sub array
    ];

    let rtxn = index.read_txn().unwrap();
    for &(query, expected_id) in expectations.iter() {
        let found = index.search(&rtxn).query(query).execute().unwrap();
        assert_eq!(found.documents_ids, vec![expected_id]);
    }
    drop(rtxn);
}
}

View File

@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::convert::{TryFrom, TryInto};
use std::fs::File;
@ -17,7 +16,7 @@ use tempfile::tempfile;
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::tokenizer::{simple_tokenizer, only_token};
use crate::{SmallVec32, Position, DocumentId};
use crate::{json_to_string, SmallVec32, Position, DocumentId};
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
use super::merge_function::{main_merge, word_docids_merge, words_pairs_proximities_docids_merge};
@ -317,25 +316,21 @@ impl Store {
}
for (attr, content) in document.iter() {
if self.searchable_fields.contains(&attr) {
use serde_json::Value;
let content: Cow<str> = match serde_json::from_slice(content) {
Ok(string) => string,
Err(_) => match serde_json::from_slice(content)? {
Value::Null => continue,
Value::Bool(boolean) => Cow::Owned(boolean.to_string()),
Value::Number(number) => Cow::Owned(number.to_string()),
Value::String(string) => Cow::Owned(string),
Value::Array(_array) => continue,
Value::Object(_object) => continue,
}
};
if !self.searchable_fields.contains(&attr) {
continue;
}
for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) {
let word = token.to_lowercase();
let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
}
let value = serde_json::from_slice(content)?;
let content = match json_to_string(value) {
Some(content) => content,
None => continue,
};
let tokens = simple_tokenizer(&content).filter_map(only_token);
for (pos, token) in tokens.enumerate().take(MAX_POSITION) {
let word = token.to_lowercase();
let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
}
}