mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-08-02 11:50:03 +00:00
Merge #322
322: Geosearch r=ManyTheFish a=irevoire This PR introduces [basic geo-search functionalities](https://github.com/meilisearch/specifications/pull/59), it makes the engine able to index, filter and, sort by geo-point. We decided to use [the rstar library](https://docs.rs/rstar) and to save the points in [an RTree](https://docs.rs/rstar/0.9.1/rstar/struct.RTree.html) that we de/serialize in the index database [by using serde](https://serde.rs/) with [bincode](https://docs.rs/bincode). This is not an efficient way to query this tree as it will consume a lot of CPU and memory when a search is made, but at least it is an easy first way to do so. ### What we will have to do on the indexing part: - [x] Index the `_geo` fields from the documents. - [x] Create a new module with an extractor in the `extract` module that takes the `obkv_documents` and retrieves the latitude and longitude coordinates, outputting them in a `grenad::Reader` for further process. - [x] Call the extractor in the `extract::extract_documents_data` function and send the result to the `TypedChunk` module. - [x] Get the `grenad::Reader` in the `typed_chunk::write_typed_chunk_into_index` function and store all the points in the `rtree` - [x] Delete the documents from the `RTree` when deleting documents from the database. All this can be done in the `delete_documents.rs` file by getting the data structure and removing the points from it, inserting it back after the modification. - [x] Clearing the `RTree` entirely when we clear the documents from the database, everything happens in the `clear_documents.rs` file. - [x] save a Roaring bitmap of all documents containing the `_geo` field ### What we will have to do on the query part: - [x] Filter the documents at a certain distance around a point, this is done by [collecting the documents from the searched point](https://docs.rs/rstar/0.9.1/rstar/struct.RTree.html#method.nearest_neighbor_iter) while they are in range. - [x] We must introduce new `geoLowerThan` and `geoGreaterThan` variants to the `Operator` filter enum. - [x] Implement the `negative` method on both variants where the `geoGreaterThan` variant is implemented by executing the `geoLowerThan` and removing the results found from the whole list of geo faceted documents. - [x] Add the `_geoRadius` function in the pest parser. - [x] Introduce a `_geo` ascending ranking function that takes a point in parameter, ~~this function must keep the iterator on the `RTree` and make it peekable~~ This was not possible for now, we had to collect the whole iterator. Only the documents that are part of the candidates must be sent too! - [x] This ascending ranking rule will only be active if the search is set up with the `_geoPoint` parameter that indicates the center point of the ascending ranking rule. ----------- - On Meilisearch part: We must introduce a new concept, returning the documents with a new `_geoDistance` field when it passed by the `_geo` ranking rule, this has never been done before. We could maybe just do it afterward when the documents have been retrieved from the database, computing the distance from the `_geoPoint` and all of the documents to be returned. Co-authored-by: Irevoire <tamo@meilisearch.com> Co-authored-by: cvermand <33010418+bidoubiwa@users.noreply.github.com> Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
@ -0,0 +1,44 @@
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
|
||||
use concat_arrays::concat_arrays;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::{FieldId, InternalError, Result, UserError};
|
||||
|
||||
/// Extracts the geographical coordinates contained in each document under the `_geo` field.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude)
|
||||
pub fn extract_geo_points<R: io::Read>(
|
||||
mut obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
primary_key_id: FieldId,
|
||||
geo_field_id: FieldId,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
let mut writer = tempfile::tempfile().and_then(|file| {
|
||||
create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file)
|
||||
})?;
|
||||
|
||||
while let Some((docid_bytes, value)) = obkv_documents.next()? {
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
let point: Value = match obkv.get(geo_field_id) {
|
||||
Some(point) => serde_json::from_slice(point).map_err(InternalError::SerdeJson)?,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
if let Some((lat, lng)) = point["lat"].as_f64().zip(point["lng"].as_f64()) {
|
||||
// this will create an array of 16 bytes (two 8 bytes floats)
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
writer.insert(docid_bytes, bytes)?;
|
||||
} else {
|
||||
// All document must have a primary key so we can unwrap safely here
|
||||
let primary_key = obkv.get(primary_key_id).unwrap();
|
||||
let primary_key =
|
||||
serde_json::from_slice(primary_key).map_err(InternalError::SerdeJson)?;
|
||||
Err(UserError::InvalidGeoField { document_id: primary_key, object: point })?
|
||||
}
|
||||
}
|
||||
|
||||
Ok(writer_into_reader(writer)?)
|
||||
}
|
@ -3,6 +3,7 @@ mod extract_facet_number_docids;
|
||||
mod extract_facet_string_docids;
|
||||
mod extract_fid_docid_facet_values;
|
||||
mod extract_fid_word_count_docids;
|
||||
mod extract_geo_points;
|
||||
mod extract_word_docids;
|
||||
mod extract_word_level_position_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
@ -19,6 +20,7 @@ use self::extract_facet_number_docids::extract_facet_number_docids;
|
||||
use self::extract_facet_string_docids::extract_facet_string_docids;
|
||||
use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
|
||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||
use self::extract_geo_points::extract_geo_points;
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_level_position_docids::extract_word_level_position_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
@ -37,6 +39,8 @@ pub(crate) fn data_from_obkv_documents(
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
searchable_fields: Option<HashSet<FieldId>>,
|
||||
faceted_fields: HashSet<FieldId>,
|
||||
primary_key_id: FieldId,
|
||||
geo_field_id: Option<FieldId>,
|
||||
stop_words: Option<fst::Set<&[u8]>>,
|
||||
) -> Result<()> {
|
||||
let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks
|
||||
@ -48,6 +52,8 @@ pub(crate) fn data_from_obkv_documents(
|
||||
lmdb_writer_sx.clone(),
|
||||
&searchable_fields,
|
||||
&faceted_fields,
|
||||
primary_key_id,
|
||||
geo_field_id,
|
||||
&stop_words,
|
||||
)
|
||||
})
|
||||
@ -168,6 +174,8 @@ fn extract_documents_data(
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
faceted_fields: &HashSet<FieldId>,
|
||||
primary_key_id: FieldId,
|
||||
geo_field_id: Option<FieldId>,
|
||||
stop_words: &Option<fst::Set<&[u8]>>,
|
||||
) -> Result<(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
@ -177,6 +185,19 @@ fn extract_documents_data(
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone())));
|
||||
|
||||
if let Some(geo_field_id) = geo_field_id {
|
||||
let documents_chunk_cloned = documents_chunk.clone();
|
||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
let result =
|
||||
extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_field_id);
|
||||
let _ = match result {
|
||||
Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
|
||||
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||
rayon::join(
|
||||
|| {
|
||||
|
@ -228,11 +228,27 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
Receiver<Result<TypedChunk>>,
|
||||
) = crossbeam_channel::unbounded();
|
||||
|
||||
// get the primary key field id
|
||||
let primary_key_id = fields_ids_map.id(&primary_key).unwrap();
|
||||
|
||||
// get searchable fields for word databases
|
||||
let searchable_fields =
|
||||
self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
|
||||
// get filterable fields for facet databases
|
||||
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
|
||||
// get the fid of the `_geo` field.
|
||||
let geo_field_id = match self.index.fields_ids_map(self.wtxn)?.id("_geo") {
|
||||
Some(gfid) => {
|
||||
let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid);
|
||||
let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid);
|
||||
if is_sortable || is_filterable {
|
||||
Some(gfid)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
let stop_words = self.index.stop_words(self.wtxn)?;
|
||||
// let stop_words = stop_words.as_ref();
|
||||
@ -261,6 +277,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
lmdb_writer_sx.clone(),
|
||||
searchable_fields,
|
||||
faceted_fields,
|
||||
primary_key_id,
|
||||
geo_field_id,
|
||||
stop_words,
|
||||
)
|
||||
});
|
||||
@ -876,12 +894,12 @@ mod tests {
|
||||
// First we send 3 documents with an id for only one of them.
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let documents = &r#"[
|
||||
{ "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5 },
|
||||
{ "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5, "_geo": { "lat": 12, "lng": 42 } },
|
||||
{ "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 },
|
||||
{ "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 },
|
||||
{ "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" },
|
||||
{ "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" },
|
||||
{ "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams" }
|
||||
{ "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
@ -917,7 +935,7 @@ mod tests {
|
||||
{ "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" },
|
||||
{ "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" },
|
||||
{ "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" },
|
||||
{ "objectId": 30, "title": "Hamlet" }
|
||||
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
@ -934,7 +952,7 @@ mod tests {
|
||||
assert!(external_documents_ids.get("30").is_none());
|
||||
|
||||
let content = &br#"[
|
||||
{ "objectId": 30, "title": "Hamlet" }
|
||||
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
@ -944,7 +962,7 @@ mod tests {
|
||||
assert!(external_documents_ids.get("30").is_some());
|
||||
|
||||
let content = &br#"[
|
||||
{ "objectId": 30, "title": "Hamlet" }
|
||||
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
||||
]"#[..];
|
||||
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||
builder.update_format(UpdateFormat::Json);
|
||||
|
@ -1,4 +1,5 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
|
||||
use heed::types::ByteSlice;
|
||||
@ -6,11 +7,12 @@ use heed::{BytesDecode, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
|
||||
self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key,
|
||||
CursorClonableMmap,
|
||||
};
|
||||
use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
|
||||
use crate::update::index_documents::helpers::into_clonable_grenad;
|
||||
use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Index, Result};
|
||||
use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result};
|
||||
|
||||
pub(crate) enum TypedChunk {
|
||||
DocidWordPositions(grenad::Reader<CursorClonableMmap>),
|
||||
@ -24,6 +26,7 @@ pub(crate) enum TypedChunk {
|
||||
WordPairProximityDocids(grenad::Reader<File>),
|
||||
FieldIdFacetStringDocids(grenad::Reader<File>),
|
||||
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
||||
GeoPoints(grenad::Reader<File>),
|
||||
}
|
||||
|
||||
/// Write typed chunk in the corresponding LMDB database of the provided index.
|
||||
@ -177,6 +180,24 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::GeoPoints(mut geo_points) => {
|
||||
let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
|
||||
let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
|
||||
|
||||
while let Some((key, value)) = geo_points.next()? {
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
|
||||
// convert the latitude and longitude back to a f64 (8 bytes)
|
||||
let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
|
||||
let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
|
||||
let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
|
||||
rtree.insert(GeoPoint::new(point, docid));
|
||||
geo_faceted_docids.insert(docid);
|
||||
}
|
||||
index.put_geo_rtree(wtxn, &rtree)?;
|
||||
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok((RoaringBitmap::new(), is_merged_database))
|
||||
|
Reference in New Issue
Block a user