mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-08-02 03:40:00 +00:00
Implements the geo-sort ranking rule
This commit is contained in:
261
milli/src/search/new/geo_sort.rs
Normal file
261
milli/src/search/new/geo_sort.rs
Normal file
@ -0,0 +1,261 @@
|
||||
use std::collections::VecDeque;
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use heed::types::{ByteSlice, Unit};
|
||||
use heed::{RoPrefix, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
use rstar::RTree;
|
||||
|
||||
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
|
||||
use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec};
|
||||
use crate::{
|
||||
distance_between_two_points, lat_lng_to_xyz, GeoPoint, Index, Result, SearchContext,
|
||||
SearchLogger,
|
||||
};
|
||||
|
||||
const FID_SIZE: usize = 2;
|
||||
const DOCID_SIZE: usize = 4;
|
||||
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
|
||||
concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
|
||||
}
|
||||
|
||||
/// Return an iterator over each number value in the given field of the given document.
|
||||
fn facet_number_values<'a>(
|
||||
docid: u32,
|
||||
field_id: u16,
|
||||
index: &Index,
|
||||
txn: &'a RoTxn,
|
||||
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<OrderedF64Codec>, Unit>> {
|
||||
let key = facet_values_prefix_key(field_id, docid);
|
||||
|
||||
let iter = index
|
||||
.field_id_docid_facet_f64s
|
||||
.remap_key_type::<ByteSlice>()
|
||||
.prefix_iter(txn, &key)?
|
||||
.remap_key_type();
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
||||
/// Define the strategy used by the geo sort.
|
||||
/// The paramater represents the cache size, and, in the case of the Dynamic strategy,
|
||||
/// the point where we move from using the iterative strategy to the rtree.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum Strategy {
|
||||
AlwaysIterative(usize),
|
||||
AlwaysRtree(usize),
|
||||
Dynamic(usize),
|
||||
}
|
||||
|
||||
impl Default for Strategy {
|
||||
fn default() -> Self {
|
||||
Strategy::Dynamic(1000)
|
||||
}
|
||||
}
|
||||
|
||||
impl Strategy {
|
||||
pub fn use_rtree(&self, candidates: usize) -> bool {
|
||||
match self {
|
||||
Strategy::AlwaysIterative(_) => false,
|
||||
Strategy::AlwaysRtree(_) => true,
|
||||
Strategy::Dynamic(i) => candidates >= *i,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn cache_size(&self) -> usize {
|
||||
match self {
|
||||
Strategy::AlwaysIterative(i) | Strategy::AlwaysRtree(i) | Strategy::Dynamic(i) => *i,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct GeoSort<Q: RankingRuleQueryTrait> {
|
||||
query: Option<Q>,
|
||||
|
||||
strategy: Strategy,
|
||||
ascending: bool,
|
||||
point: [f64; 2],
|
||||
field_ids: Option<[u16; 2]>,
|
||||
rtree: Option<RTree<GeoPoint>>,
|
||||
|
||||
cached_sorted_docids: VecDeque<u32>,
|
||||
geo_candidates: RoaringBitmap,
|
||||
}
|
||||
|
||||
impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
|
||||
pub fn new(
|
||||
strategy: Strategy,
|
||||
geo_faceted_docids: RoaringBitmap,
|
||||
point: [f64; 2],
|
||||
ascending: bool,
|
||||
) -> Result<Self> {
|
||||
Ok(Self {
|
||||
query: None,
|
||||
strategy,
|
||||
ascending,
|
||||
point,
|
||||
geo_candidates: geo_faceted_docids,
|
||||
field_ids: None,
|
||||
rtree: None,
|
||||
cached_sorted_docids: VecDeque::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Refill the internal buffer of cached docids based on the strategy.
|
||||
/// Drop the rtree if we don't need it anymore.
|
||||
fn fill_buffer<'ctx>(&mut self, ctx: &mut SearchContext<'ctx>) -> Result<()> {
|
||||
debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng");
|
||||
debug_assert!(self.cached_sorted_docids.is_empty());
|
||||
|
||||
// if we had an rtree and the strategy doesn't require one anymore we can drop it
|
||||
let use_rtree = self.strategy.use_rtree(self.geo_candidates.len() as usize);
|
||||
if !use_rtree && self.rtree.is_some() {
|
||||
self.rtree = None;
|
||||
}
|
||||
|
||||
let cache_size = self.strategy.cache_size();
|
||||
if let Some(ref mut rtree) = self.rtree {
|
||||
let point = lat_lng_to_xyz(&self.point);
|
||||
|
||||
if self.ascending {
|
||||
for point in rtree.nearest_neighbor_iter(&point) {
|
||||
if self.geo_candidates.contains(point.data.0) {
|
||||
self.cached_sorted_docids.push_back(point.data.0);
|
||||
if self.cached_sorted_docids.len() >= cache_size {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// in the case of the desc geo sort we have to scan the whole database
|
||||
// and only keep the latest candidates.
|
||||
for point in rtree.nearest_neighbor_iter(&point) {
|
||||
if self.geo_candidates.contains(point.data.0) {
|
||||
self.cached_sorted_docids.pop_front();
|
||||
self.cached_sorted_docids.push_back(point.data.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// the iterative version
|
||||
let [lat, lng] = self.field_ids.unwrap();
|
||||
|
||||
let mut documents = self
|
||||
.geo_candidates
|
||||
.iter()
|
||||
.map(|id| -> Result<_> {
|
||||
Ok((
|
||||
id,
|
||||
[
|
||||
facet_number_values(id, lat, ctx.index, ctx.txn)?
|
||||
.next()
|
||||
.expect("A geo faceted document doesn't contain any lat")?
|
||||
.0
|
||||
.2,
|
||||
facet_number_values(id, lng, ctx.index, ctx.txn)?
|
||||
.next()
|
||||
.expect("A geo faceted document doesn't contain any lng")?
|
||||
.0
|
||||
.2,
|
||||
],
|
||||
))
|
||||
})
|
||||
.collect::<Result<Vec<(u32, [f64; 2])>>>()?;
|
||||
documents.sort_by_key(|(_, p)| distance_between_two_points(&self.point, &p) as usize);
|
||||
self.cached_sorted_docids.extend(documents.into_iter().map(|(doc_id, _)| doc_id));
|
||||
};
|
||||
|
||||
if self.cached_sorted_docids.is_empty() && matches!(self.strategy, Strategy::AlwaysRtree(_))
|
||||
{
|
||||
// this shouldn't be possible
|
||||
self.rtree = None;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
|
||||
fn id(&self) -> String {
|
||||
"geo_sort".to_owned()
|
||||
}
|
||||
|
||||
fn start_iteration(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<Q>,
|
||||
universe: &RoaringBitmap,
|
||||
query: &Q,
|
||||
) -> Result<()> {
|
||||
assert!(self.query.is_none());
|
||||
|
||||
self.query = Some(query.clone());
|
||||
self.geo_candidates &= universe;
|
||||
|
||||
if self.geo_candidates.len() == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let fid_map = ctx.index.fields_ids_map(ctx.txn)?;
|
||||
let lat = fid_map.id("_geo.lat").expect("geo candidates but no fid for lat");
|
||||
let lng = fid_map.id("_geo.lng").expect("geo candidates but no fid for lng");
|
||||
self.field_ids = Some([lat, lng]);
|
||||
|
||||
if self.strategy.use_rtree(self.geo_candidates.len() as usize) {
|
||||
self.rtree = Some(ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree"));
|
||||
}
|
||||
|
||||
self.fill_buffer(ctx)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn next_bucket(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
logger: &mut dyn SearchLogger<Q>,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<Option<RankingRuleOutput<Q>>> {
|
||||
assert!(universe.len() > 1);
|
||||
let query = self.query.as_ref().unwrap().clone();
|
||||
self.geo_candidates &= universe;
|
||||
|
||||
if self.geo_candidates.is_empty() {
|
||||
return Ok(Some(RankingRuleOutput { query, candidates: universe.clone() }));
|
||||
}
|
||||
|
||||
let ascending = self.ascending;
|
||||
let next = |cache: &mut VecDeque<_>| {
|
||||
if ascending {
|
||||
cache.pop_front()
|
||||
} else {
|
||||
cache.pop_back()
|
||||
}
|
||||
};
|
||||
while let Some(id) = next(&mut self.cached_sorted_docids) {
|
||||
if self.geo_candidates.contains(id) {
|
||||
return Ok(Some(RankingRuleOutput {
|
||||
query,
|
||||
candidates: RoaringBitmap::from_iter([id]),
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
// if we got out of this loop it means we've exhausted our cache.
|
||||
|
||||
if self.rtree.is_none() {
|
||||
// with no rtree it means all geo candidates have been returned. We can return all the non geo-faceted documents
|
||||
Ok(Some(RankingRuleOutput { query, candidates: universe.clone() }))
|
||||
} else {
|
||||
// else, we need to refill our bucket and run the function again
|
||||
self.fill_buffer(ctx)?;
|
||||
self.next_bucket(ctx, logger, universe)
|
||||
}
|
||||
}
|
||||
|
||||
fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) {
|
||||
self.query = None;
|
||||
self.rtree = None;
|
||||
self.cached_sorted_docids.clear();
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user