Compare commits


1 commit

Author         SHA1        Message                             Date
Louis Dureuil  40215bfec3  TMP: check windows free disk space  2024-05-29 11:27:27 +02:00
22 changed files with 133 additions and 526 deletions

View File

@@ -56,6 +56,12 @@ jobs:
       matrix:
         os: [macos-12, windows-2022]
     steps:
+      - name: Check free disk space on C
+        run: |
+          fsutil volume diskfree c:
+      - name: Check free disk space on D
+        run: |
+          fsutil volume diskfree d:
       - uses: actions/checkout@v3
       - name: Cache dependencies
        uses: Swatinem/rust-cache@v2.7.1
@@ -63,11 +69,23 @@ jobs:
        with:
          toolchain: stable
          override: true
+      - name: Check free disk space on C
+        run: |
+          fsutil volume diskfree c:
+      - name: Check free disk space on D
+        run: |
+          fsutil volume diskfree d:
       - name: Run cargo check without any default features
        uses: actions-rs/cargo@v1
        with:
          command: build
          args: --locked --release --no-default-features --all
+      - name: Check free disk space on C
+        run: |
+          fsutil volume diskfree c:
+      - name: Check free disk space on D
+        run: |
+          fsutil volume diskfree d:
       - name: Run cargo test
        uses: actions-rs/cargo@v1
        with:

Cargo.lock (generated)
View File

@@ -500,7 +500,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"

 [[package]]
 name = "benchmarks"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "anyhow",
  "bytes",
@@ -645,7 +645,7 @@ dependencies = [

 [[package]]
 name = "build-info"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "anyhow",
  "time",
@@ -1545,7 +1545,7 @@ dependencies = [

 [[package]]
 name = "dump"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "anyhow",
  "big_s",
@@ -1793,7 +1793,7 @@ dependencies = [

 [[package]]
 name = "file-store"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "faux",
  "tempfile",
@@ -1816,7 +1816,7 @@ dependencies = [

 [[package]]
 name = "filter-parser"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "insta",
  "nom",
@@ -1836,7 +1836,7 @@ dependencies = [

 [[package]]
 name = "flatten-serde-json"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "criterion",
  "serde_json",
@@ -1954,7 +1954,7 @@ dependencies = [

 [[package]]
 name = "fuzzers"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "arbitrary",
  "clap",
@@ -2447,7 +2447,7 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d"

 [[package]]
 name = "index-scheduler"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "anyhow",
  "big_s",
@@ -2642,7 +2642,7 @@ dependencies = [

 [[package]]
 name = "json-depth-checker"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "criterion",
  "serde_json",
@@ -3272,7 +3272,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "meili-snap"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "insta",
  "md5",
@@ -3281,7 +3281,7 @@ dependencies = [

 [[package]]
 name = "meilisearch"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "actix-cors",
  "actix-http",
@@ -3373,7 +3373,7 @@ dependencies = [

 [[package]]
 name = "meilisearch-auth"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "base64 0.21.7",
  "enum-iterator",
@@ -3392,7 +3392,7 @@ dependencies = [

 [[package]]
 name = "meilisearch-types"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "actix-web",
  "anyhow",
@@ -3422,7 +3422,7 @@ dependencies = [

 [[package]]
 name = "meilitool"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "anyhow",
  "clap",
@@ -3461,7 +3461,7 @@ dependencies = [

 [[package]]
 name = "milli"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "arroy",
  "big_s",
@@ -3901,7 +3901,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"

 [[package]]
 name = "permissive-json-pointer"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "big_s",
  "serde_json",
@@ -6052,7 +6052,7 @@ dependencies = [

 [[package]]
 name = "xtask"
-version = "1.9.0"
+version = "1.8.0"
 dependencies = [
  "anyhow",
  "build-info",

View File

@@ -22,7 +22,7 @@ members = [
 ]

 [workspace.package]
-version = "1.9.0"
+version = "1.8.0"
 authors = [
   "Quentin de Quelen <quentin@dequelen.me>",
   "Clément Renault <clement@meilisearch.com>",

View File

@@ -25,7 +25,7 @@

 <p align="center">⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍</p>

-[Meilisearch](https://www.meilisearch.com) helps you shape a delightful search experience in a snap, offering features that work out of the box to speed up your workflow.
+Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.

 <p align="center" name="demo">
   <a href="https://where2watch.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=demo-gif#gh-light-mode-only" target="_blank">
@@ -39,8 +39,8 @@
 🔥 [**Try it!**](https://where2watch.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=demo-link) 🔥

 ## ✨ Features
 - **Hybrid search:** Combine the best of both [semantic](https://www.meilisearch.com/docs/learn/experimental/vector_search) & full-text search to get the most relevant results
-- **Search-as-you-type:** find & display results in less than 50 milliseconds to provide an intuitive experience
+- **Search-as-you-type:** find search results in less than 50 milliseconds
 - **[Typo tolerance](https://www.meilisearch.com/docs/learn/configuration/typo_tolerance?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** get relevant matches even when queries contain typos and misspellings
 - **[Filtering](https://www.meilisearch.com/docs/learn/fine_tuning_results/filtering?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features) and [faceted search](https://www.meilisearch.com/docs/learn/fine_tuning_results/faceted_search?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** enhance your users' search experience with custom filters and build a faceted search interface in a few lines of code
 - **[Sorting](https://www.meilisearch.com/docs/learn/fine_tuning_results/sorting?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** sort results based on price, date, or pretty much anything else your users need
@@ -55,15 +55,15 @@
 ## 📖 Documentation

-You can consult Meilisearch's documentation at [meilisearch.com/docs](https://www.meilisearch.com/docs/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=docs).
+You can consult Meilisearch's documentation at [https://www.meilisearch.com/docs](https://www.meilisearch.com/docs/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=docs).

 ## 🚀 Getting started

 For basic instructions on how to set up Meilisearch, add documents to an index, and search for documents, take a look at our [Quick Start](https://www.meilisearch.com/docs/learn/getting_started/quick_start?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=get-started) guide.

-## 🌍 Supercharge your Meilisearch experience
+## Supercharge your Meilisearch experience

-Say goodbye to server deployment and manual updates with [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Additional features include analytics & monitoring in many regions around the world. No credit card is required.
+Say goodbye to server deployment and manual updates with [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). No credit card required.

 ## 🧰 SDKs & integration tools
@@ -85,13 +85,13 @@ Finally, for more in-depth information, refer to our articles explaining fundame
 Meilisearch collects **anonymized** data from users to help us improve our product. You can [deactivate this](https://www.meilisearch.com/docs/learn/what_is_meilisearch/telemetry?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=telemetry#how-to-disable-data-collection) whenever you want.

-To request deletion of collected data, please write to us at [privacy@meilisearch.com](mailto:privacy@meilisearch.com). Remember to include your `Instance UID` in the message, as this helps us quickly find and delete your data.
+To request deletion of collected data, please write to us at [privacy@meilisearch.com](mailto:privacy@meilisearch.com). Don't forget to include your `Instance UID` in the message, as this helps us quickly find and delete your data.

 If you want to know more about the kind of data we collect and what we use it for, check the [telemetry section](https://www.meilisearch.com/docs/learn/what_is_meilisearch/telemetry?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=telemetry#how-to-disable-data-collection) of our documentation.

 ## 📫 Get in touch!

-Meilisearch is a search engine created by [Meili]([https://www.welcometothejungle.com/en/companies/meilisearch](https://www.meilisearch.com/careers)), a software development company headquartered in France and with team members all over the world. Want to know more about us? [Check out our blog!](https://blog.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=contact)
+Meilisearch is a search engine created by [Meili](https://www.welcometothejungle.com/en/companies/meilisearch), a software development company based in France and with team members all over the world. Want to know more about us? [Check out our blog!](https://blog.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=contact)

 🗞 [Subscribe to our newsletter](https://meilisearch.us2.list-manage.com/subscribe?u=27870f7b71c908a8b359599fb&id=79582d828e) if you don't want to miss any updates! We promise we won't clutter your mailbox: we only send one edition every two months.

View File

@@ -46,10 +46,6 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
                 .route(web::delete().to(SeqHandler(delete_index))),
         )
         .service(web::resource("/stats").route(web::get().to(SeqHandler(get_index_stats))))
-        .service(
-            web::resource("/advanced-stats")
-                .route(web::get().to(SeqHandler(get_advanced_index_stats))),
-        )
         .service(web::scope("/documents").configure(documents::configure))
         .service(web::scope("/search").configure(search::configure))
         .service(web::scope("/facet-search").configure(facet_search::configure))
@@ -282,16 +278,3 @@ pub async fn get_index_stats(
     debug!(returns = ?stats, "Get index stats");
     Ok(HttpResponse::Ok().json(stats))
 }
-
-pub async fn get_advanced_index_stats(
-    index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>,
-    index_uid: web::Path<String>,
-) -> Result<HttpResponse, ResponseError> {
-    let index_uid = IndexUid::try_from(index_uid.into_inner())?;
-    let index = index_scheduler.index(&index_uid)?;
-    let rtxn = index.read_txn()?;
-    let advanced_stats = index.advanced_stats(&rtxn)?;
-    debug!(returns = ?advanced_stats, "Get advanced index stats");
-    Ok(HttpResponse::Ok().json(advanced_stats))
-}
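
The deleted route above follows standard actix-web wiring: a GET resource mounted inside an index scope. A rough, self-contained sketch of the same shape (the handler body and JSON payload here are invented for illustration; the real handler goes through `GuardedData` and the `IndexScheduler` rather than returning a canned document):

```rust
use actix_web::{web, App, HttpResponse, HttpServer, Responder};
use serde_json::json;

// GET /indexes/{index_uid}/advanced-stats
async fn get_advanced_index_stats(index_uid: web::Path<String>) -> impl Responder {
    // A real handler would open the index and gather stats;
    // this sketch only demonstrates the route wiring.
    HttpResponse::Ok().json(json!({
        "indexUid": index_uid.into_inner(),
        "databases": {},
    }))
}

#[actix_web::main]
async fn main() -> std::io::Result<()> {
    HttpServer::new(|| {
        App::new().service(
            web::scope("/indexes/{index_uid}").service(
                web::resource("/advanced-stats")
                    .route(web::get().to(get_advanced_index_stats)),
            ),
        )
    })
    .bind(("127.0.0.1", 8080))?
    .run()
    .await
}
```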

View File

@@ -477,8 +477,6 @@ pub enum MatchingStrategy {
     Last,
     /// All query words are mandatory
     All,
-    /// Remove query words from the most frequent to the least
-    Frequency,
 }

 impl Default for MatchingStrategy {
@@ -492,7 +490,6 @@ impl From<MatchingStrategy> for TermsMatchingStrategy {
         match other {
             MatchingStrategy::Last => Self::Last,
             MatchingStrategy::All => Self::All,
-            MatchingStrategy::Frequency => Self::Frequency,
         }
     }
 }

View File

@@ -505,7 +505,7 @@ async fn search_bad_matching_strategy() {
     snapshot!(code, @"400 Bad Request");
     snapshot!(json_string!(response), @r###"
     {
-      "message": "Unknown value `doggo` at `.matchingStrategy`: expected one of `last`, `all`, `frequency`",
+      "message": "Unknown value `doggo` at `.matchingStrategy`: expected one of `last`, `all`",
       "code": "invalid_search_matching_strategy",
       "type": "invalid_request",
       "link": "https://docs.meilisearch.com/errors#invalid_search_matching_strategy"
@@ -527,7 +527,7 @@ async fn search_bad_matching_strategy() {
     snapshot!(code, @"400 Bad Request");
     snapshot!(json_string!(response), @r###"
     {
-      "message": "Unknown value `doggo` for parameter `matchingStrategy`: expected one of `last`, `all`, `frequency`",
+      "message": "Unknown value `doggo` for parameter `matchingStrategy`: expected one of `last`, `all`",
       "code": "invalid_search_matching_strategy",
       "type": "invalid_request",
       "link": "https://docs.meilisearch.com/errors#invalid_search_matching_strategy"

View File

@@ -117,69 +117,3 @@ async fn geo_bounding_box_with_string_and_number() {
         )
         .await;
 }
-
-#[actix_rt::test]
-async fn bug_4640() {
-    // https://github.com/meilisearch/meilisearch/issues/4640
-    let server = Server::new().await;
-    let index = server.index("test");
-    let documents = DOCUMENTS.clone();
-    index.add_documents(documents, None).await;
-    index.update_settings_filterable_attributes(json!(["_geo"])).await;
-    let (ret, _code) = index.update_settings_sortable_attributes(json!(["_geo"])).await;
-    index.wait_task(ret.uid()).await;
-
-    // Sort the document with the second one first
-    index
-        .search(
-            json!({
-                "sort": ["_geoPoint(45.4777599, 9.1967508):asc"],
-            }),
-            |response, code| {
-                assert_eq!(code, 200, "{}", response);
-                snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###"
-                {
-                  "hits": [
-                    {
-                      "id": 2,
-                      "name": "La Bella Italia",
-                      "address": "456 Elm Street, Townsville",
-                      "type": "Italian",
-                      "rating": 9,
-                      "_geo": {
-                        "lat": "45.4777599",
-                        "lng": "9.1967508"
-                      }
-                    },
-                    {
-                      "id": 1,
-                      "name": "Taco Truck",
-                      "address": "444 Salsa Street, Burritoville",
-                      "type": "Mexican",
-                      "rating": 9,
-                      "_geo": {
-                        "lat": 34.0522,
-                        "lng": -118.2437
-                      },
-                      "_geoDistance": 9714063
-                    },
-                    {
-                      "id": 3,
-                      "name": "Crêpe Truck",
-                      "address": "2 Billig Avenue, Rouenville",
-                      "type": "French",
-                      "rating": 10
-                    }
-                  ],
-                  "query": "",
-                  "processingTimeMs": "[time]",
-                  "limit": 20,
-                  "offset": 0,
-                  "estimatedTotalHits": 3
-                }
-                "###);
-            },
-        )
-        .await;
-}

View File

@@ -1,128 +0,0 @@
-use meili_snap::snapshot;
-use once_cell::sync::Lazy;
-
-use crate::common::index::Index;
-use crate::common::{Server, Value};
-use crate::json;
-
-async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> {
-    let index = server.index("test");
-
-    index.add_documents(documents.clone(), None).await;
-    index.wait_task(0).await;
-    index
-}
-
-static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
-    json!([
-        {
-            "title": "Shazam!",
-            "id": "1",
-        },
-        {
-            "title": "Captain Planet",
-            "id": "2",
-        },
-        {
-            "title": "Captain Marvel",
-            "id": "3",
-        },
-        {
-            "title": "a Captain Marvel ersatz",
-            "id": "4"
-        },
-        {
-            "title": "He's not part of the Marvel Cinematic Universe",
-            "id": "5"
-        },
-        {
-            "title": "a Shazam ersatz, but better than Captain Planet",
-            "id": "6"
-        },
-        {
-            "title": "Capitain CAAAAAVEEERNE!!!!",
-            "id": "7"
-        }
-    ])
-});
-
-#[actix_rt::test]
-async fn simple_search() {
-    let server = Server::new().await;
-    let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
-
-    index
-        .search(json!({"q": "Captain Marvel", "matchingStrategy": "last", "attributesToRetrieve": ["id"]}), |response, code| {
-            snapshot!(code, @"200 OK");
-            snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"2"},{"id":"6"},{"id":"7"}]"###);
-        })
-        .await;
-
-    index
-        .search(json!({"q": "Captain Marvel", "matchingStrategy": "all", "attributesToRetrieve": ["id"]}), |response, code| {
-            snapshot!(code, @"200 OK");
-            snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"}]"###);
-        })
-        .await;
-
-    index
-        .search(json!({"q": "Captain Marvel", "matchingStrategy": "frequency", "attributesToRetrieve": ["id"]}), |response, code| {
-            snapshot!(code, @"200 OK");
-            snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"5"}]"###);
-        })
-        .await;
-}
-
-#[actix_rt::test]
-async fn search_with_typo() {
-    let server = Server::new().await;
-    let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
-
-    index
-        .search(json!({"q": "Capitain Marvel", "matchingStrategy": "last", "attributesToRetrieve": ["id"]}), |response, code| {
-            snapshot!(code, @"200 OK");
-            snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"7"},{"id":"2"},{"id":"6"}]"###);
-        })
-        .await;
-
-    index
-        .search(json!({"q": "Capitain Marvel", "matchingStrategy": "all", "attributesToRetrieve": ["id"]}), |response, code| {
-            snapshot!(code, @"200 OK");
-            snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"}]"###);
-        })
-        .await;
-
-    index
-        .search(json!({"q": "Capitain Marvel", "matchingStrategy": "frequency", "attributesToRetrieve": ["id"]}), |response, code| {
-            snapshot!(code, @"200 OK");
-            snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"5"}]"###);
-        })
-        .await;
-}
-
-#[actix_rt::test]
-async fn search_with_unknown_word() {
-    let server = Server::new().await;
-    let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
-
-    index
-        .search(json!({"q": "Captain Supercopter Marvel", "matchingStrategy": "last", "attributesToRetrieve": ["id"]}), |response, code| {
-            snapshot!(code, @"200 OK");
-            snapshot!(response["hits"], @r###"[{"id":"2"},{"id":"3"},{"id":"4"},{"id":"6"},{"id":"7"}]"###);
-        })
-        .await;
-
-    index
-        .search(json!({"q": "Captain Supercopter Marvel", "matchingStrategy": "all", "attributesToRetrieve": ["id"]}), |response, code| {
-            snapshot!(code, @"200 OK");
-            snapshot!(response["hits"], @"[]");
-        })
-        .await;
-
-    index
-        .search(json!({"q": "Captain Supercopter Marvel", "matchingStrategy": "frequency", "attributesToRetrieve": ["id"]}), |response, code| {
-            snapshot!(code, @"200 OK");
-            snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"5"}]"###);
-        })
-        .await;
-}

View File

@@ -7,7 +7,6 @@ mod facet_search;
 mod formatted;
 mod geo;
 mod hybrid;
-mod matching_strategy;
 mod multi;
 mod pagination;
 mod restrict_searchable;

View File

@@ -31,7 +31,6 @@ macro_rules! verify_snapshot {
 }

 #[actix_rt::test]
-#[cfg_attr(target_os = "windows", ignore)]
 async fn perform_snapshot() {
     let temp = tempfile::tempdir().unwrap();
     let snapshot_dir = tempfile::tempdir().unwrap();

View File

@@ -9,7 +9,6 @@ use heed::types::*;
 use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified};
 use roaring::RoaringBitmap;
 use rstar::RTree;
-use serde::Serialize;
 use time::OffsetDateTime;

 use crate::documents::PrimaryKey;
@@ -325,87 +324,6 @@ impl Index {
         self.env.info().map_size
     }

-    pub fn advanced_stats(&self, rtxn: &heed::RoTxn) -> Result<AdvancedStats> {
-        use db_name::*;
-        let mut dbs = BTreeMap::new();
-        dbs.insert(WORD_DOCIDS, advanced_database_stats(rtxn, self.word_docids)?);
-        dbs.insert(
-            WORD_PAIR_PROXIMITY_DOCIDS,
-            advanced_database_stats(rtxn, self.word_pair_proximity_docids)?,
-        );
-        dbs.insert(WORD_PREFIX_DOCIDS, advanced_database_stats(rtxn, self.word_prefix_docids)?);
-        dbs.insert(WORD_FIELD_ID_DOCIDS, advanced_database_stats(rtxn, self.word_fid_docids)?);
-        dbs.insert(WORD_POSITION_DOCIDS, advanced_database_stats(rtxn, self.word_position_docids)?);
-        dbs.insert(DOCUMENTS, advanced_database_stats_no_bitmap(rtxn, self.documents)?);
-
-        fn advanced_database_stats<KC>(
-            rtxn: &heed::RoTxn,
-            db: Database<KC, CboRoaringBitmapCodec>,
-        ) -> Result<AdvancedDatabaseStats> {
-            let db = db.remap_key_type::<Bytes>().lazily_decode_data();
-            let mut entries_count = 0;
-            let mut total_bitmap_size = 0;
-            let mut total_bitmap_len = 0;
-            let mut total_key_size = 0;
-            for result in db.iter(rtxn)? {
-                let (bytes_key, lazy_value) = result?;
-                entries_count += 1;
-                total_bitmap_size += lazy_value.remap::<Bytes>().decode().unwrap().len();
-                let bitmap = lazy_value.decode().map_err(heed::Error::Decoding)?;
-                total_bitmap_len += bitmap.len();
-                total_key_size += bytes_key.len();
-            }
-            Ok(AdvancedDatabaseStats {
-                entries_count,
-                average_bitmap_len: Some(total_bitmap_len as f64 / entries_count as f64),
-                median_bitmap_len: None,
-                average_value_size: Some(total_bitmap_size as f64 / entries_count as f64),
-                median_value_size: None,
-                average_key_size: Some(total_key_size as f64 / entries_count as f64),
-                median_key_size: None,
-            })
-        }
-
-        fn advanced_database_stats_no_bitmap<KC, DC>(
-            rtxn: &heed::RoTxn,
-            db: Database<KC, DC>,
-        ) -> Result<AdvancedDatabaseStats> {
-            let db = db.remap_types::<Bytes, Bytes>();
-            let mut entries_count = 0;
-            let mut total_value_size = 0;
-            let mut total_key_size = 0;
-            for result in db.iter(rtxn)? {
-                let (bytes_key, bytes_value) = result?;
-                entries_count += 1;
-                total_value_size += bytes_value.len();
-                total_key_size += bytes_key.len();
-            }
-            Ok(AdvancedDatabaseStats {
-                entries_count,
-                average_bitmap_len: None,
-                median_bitmap_len: None,
-                average_value_size: Some(total_value_size as f64 / entries_count as f64),
-                median_value_size: None,
-                average_key_size: Some(total_key_size as f64 / entries_count as f64),
-                median_key_size: None,
-            })
-        }
-
-        Ok(AdvancedStats {
-            map_size: self.map_size(),
-            non_free_pages_size: self.on_disk_size()?,
-            on_disk_size: self.on_disk_size()?,
-            databases: dbs,
-        })
-    }
-
     pub fn copy_to_file<P: AsRef<Path>>(&self, path: P, option: CompactionOption) -> Result<File> {
         self.env.copy_to_file(path, option).map_err(Into::into)
     }
@@ -1744,36 +1662,6 @@ impl Index {
     }
 }

-#[derive(Clone, Debug, Serialize)]
-pub struct AdvancedStats {
-    /// Size of the data memory map.
-    map_size: usize,
-    /// Returns the size used by all the databases in the environment without the free pages.
-    non_free_pages_size: u64,
-    /// The size of the data file on disk.
-    on_disk_size: u64,
-    /// Databases advanced stats.
-    databases: BTreeMap<&'static str, AdvancedDatabaseStats>,
-}
-
-#[derive(Clone, Debug, Serialize)]
-pub struct AdvancedDatabaseStats {
-    /// The number of entries in this database.
-    entries_count: usize,
-    /// The average number of entries in the bitmaps of this database.
-    average_bitmap_len: Option<f64>,
-    /// The median number of entries in the bitmaps of this database.
-    median_bitmap_len: Option<f64>,
-    /// The average size of values of this database.
-    average_value_size: Option<f64>,
-    /// The median size of values of this database.
-    median_value_size: Option<f64>,
-    /// The average size of keys of this database.
-    average_key_size: Option<f64>,
-    /// The mediane size of keys of this database.
-    median_key_size: Option<f64>,
-}
-
 #[cfg(test)]
 pub(crate) mod tests {
     use std::collections::HashSet;
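
Each removed stats helper makes a single pass over one LMDB database, counting entries and summing key and value sizes to report averages. A self-contained sketch of that aggregation, with the heed database swapped for a plain `BTreeMap` (the `Stats` struct and `stats` function below are illustrative, not milli's API):

```rust
use std::collections::BTreeMap;

/// Per-database aggregates, mirroring the removed `AdvancedDatabaseStats`.
#[derive(Debug)]
struct Stats {
    entries_count: usize,
    average_key_size: f64,
    average_value_size: f64,
}

/// One pass over (key, value) byte pairs, like the removed
/// `advanced_database_stats_no_bitmap` helper.
fn stats<'a>(entries: impl Iterator<Item = (&'a [u8], &'a [u8])>) -> Stats {
    let (mut count, mut key_bytes, mut value_bytes) = (0usize, 0usize, 0usize);
    for (k, v) in entries {
        count += 1;
        key_bytes += k.len();
        value_bytes += v.len();
    }
    Stats {
        entries_count: count,
        average_key_size: key_bytes as f64 / count as f64,
        average_value_size: value_bytes as f64 / count as f64,
    }
}

fn main() {
    let mut db = BTreeMap::new();
    db.insert(b"dog".to_vec(), vec![0u8; 12]);
    db.insert(b"doggo".to_vec(), vec![0u8; 20]);
    let s = stats(db.iter().map(|(k, v)| (k.as_slice(), v.as_slice())));
    // entries_count: 2, average_key_size: 4.0, average_value_size: 16.0
    println!("{s:?}");
}
```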

View File

@@ -277,8 +277,6 @@ pub enum TermsMatchingStrategy {
     Last,
     // all words are mandatory
     All,
-    // remove more frequent word first
-    Frequency,
 }

 impl Default for TermsMatchingStrategy {
impl Default for TermsMatchingStrategy {

View File

@@ -164,21 +164,6 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
                     }
                     costs
                 }
-                TermsMatchingStrategy::Frequency => {
-                    let removal_order =
-                        query_graph.removal_order_for_terms_matching_strategy_frequency(ctx)?;
-                    let mut forbidden_nodes =
-                        SmallBitmap::for_interned_values_in(&query_graph.nodes);
-                    let mut costs = query_graph.nodes.map(|_| None);
-                    // FIXME: this works because only words uses termsmatchingstrategy at the moment.
-                    for ns in removal_order {
-                        for n in ns.iter() {
-                            *costs.get_mut(n) = Some((1, forbidden_nodes.clone()));
-                        }
-                        forbidden_nodes.union(&ns);
-                    }
-                    costs
-                }
                 TermsMatchingStrategy::All => query_graph.nodes.map(|_| None),
             }
         } else {

View File

@@ -197,11 +197,6 @@ fn resolve_maximally_reduced_query_graph(
             .iter()
             .flat_map(|x| x.iter())
             .collect(),
-        TermsMatchingStrategy::Frequency => query_graph
-            .removal_order_for_terms_matching_strategy_frequency(ctx)?
-            .iter()
-            .flat_map(|x| x.iter())
-            .collect(),
         TermsMatchingStrategy::All => vec![],
     };
     graph.remove_nodes_keep_edges(&nodes_to_remove);

View File

@@ -1,9 +1,8 @@
-use std::cmp::{Ordering, Reverse};
+use std::cmp::Ordering;
 use std::collections::BTreeMap;
 use std::hash::{Hash, Hasher};

 use fxhash::{FxHashMap, FxHasher};
-use roaring::RoaringBitmap;

 use super::interner::{FixedSizeInterner, Interned};
 use super::query_term::{
@@ -12,7 +11,6 @@ use super::query_term::{
 use super::small_bitmap::SmallBitmap;
 use super::SearchContext;
 use crate::search::new::interner::Interner;
-use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
 use crate::Result;

 /// A node of the [`QueryGraph`].
@@ -292,49 +290,6 @@ impl QueryGraph {
         }
     }

-    pub fn removal_order_for_terms_matching_strategy_frequency(
-        &self,
-        ctx: &mut SearchContext,
-    ) -> Result<Vec<SmallBitmap<QueryNode>>> {
-        // lookup frequency for each term
-        let mut term_with_frequency: Vec<(u8, u64)> = {
-            let mut term_docids: BTreeMap<u8, RoaringBitmap> = Default::default();
-            for (_, node) in self.nodes.iter() {
-                match &node.data {
-                    QueryNodeData::Term(t) => {
-                        let docids = compute_query_term_subset_docids(ctx, &t.term_subset)?;
-                        for id in t.term_ids.clone() {
-                            term_docids
-                                .entry(id)
-                                .and_modify(|curr| *curr |= &docids)
-                                .or_insert_with(|| docids.clone());
-                        }
-                    }
-                    QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
-                }
-            }
-            term_docids
-                .into_iter()
-                .map(|(idx, docids)| match docids.len() {
-                    0 => (idx, u64::max_value()),
-                    frequency => (idx, frequency),
-                })
-                .collect()
-        };
-        term_with_frequency.sort_by_key(|(_, frequency)| Reverse(*frequency));
-        let mut term_weight = BTreeMap::new();
-        let mut weight: u16 = 1;
-        let mut peekable = term_with_frequency.into_iter().peekable();
-        while let Some((idx, frequency)) = peekable.next() {
-            term_weight.insert(idx, weight);
-            if peekable.peek().map_or(false, |(_, f)| frequency != *f) {
-                weight += 1;
-            }
-        }
-        let cost_of_term_idx = move |term_idx: u8| *term_weight.get(&term_idx).unwrap();
-        Ok(self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx))
-    }
-
     pub fn removal_order_for_terms_matching_strategy_last(
         &self,
         ctx: &SearchContext,
@@ -360,19 +315,10 @@ impl QueryGraph {
         if first_term_idx >= last_term_idx {
             return vec![];
         }
         let cost_of_term_idx = |term_idx: u8| {
             let rank = 1 + last_term_idx - term_idx;
             rank as u16
         };
-        self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx)
-    }
-
-    pub fn removal_order_for_terms_matching_strategy(
-        &self,
-        ctx: &SearchContext,
-        order: impl Fn(u8) -> u16,
-    ) -> Vec<SmallBitmap<QueryNode>> {
         let mut nodes_to_remove = BTreeMap::<u16, SmallBitmap<QueryNode>>::new();
         let mut at_least_one_mandatory_term = false;
         for (node_id, node) in self.nodes.iter() {
@@ -383,7 +329,7 @@ impl QueryGraph {
             }
             let mut cost = 0;
             for id in t.term_ids.clone() {
-                cost = std::cmp::max(cost, order(id));
+                cost = std::cmp::max(cost, cost_of_term_idx(id));
             }
             nodes_to_remove
                 .entry(cost)
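
The deleted `removal_order_for_terms_matching_strategy_frequency` ranks terms by descending document frequency and hands out removal weights so the most frequent terms are removed first, with equally frequent terms sharing a weight. A standalone sketch of just that weight assignment (function name and test values invented for illustration):

```rust
use std::cmp::Reverse;
use std::collections::BTreeMap;

// More frequent terms get a lower weight (removed first); ties share a weight,
// exactly like the peekable loop in the removed function.
fn weights_by_frequency(mut term_freq: Vec<(u8, u64)>) -> BTreeMap<u8, u16> {
    term_freq.sort_by_key(|(_, f)| Reverse(*f));
    let mut weights = BTreeMap::new();
    let mut weight: u16 = 1;
    let mut it = term_freq.into_iter().peekable();
    while let Some((idx, f)) = it.next() {
        weights.insert(idx, weight);
        // bump the weight only when the next term is strictly less frequent
        if it.peek().map_or(false, |(_, next)| f != *next) {
            weight += 1;
        }
    }
    weights
}

fn main() {
    // term 0 appears 40 times, terms 1 and 2 twice, term 3 once
    let w = weights_by_frequency(vec![(0, 40), (1, 2), (2, 2), (3, 1)]);
    assert_eq!(w[&0], 1); // most frequent: cheapest, dropped first
    assert_eq!(w[&1], 2);
    assert_eq!(w[&2], 2); // tie shares the weight
    assert_eq!(w[&3], 3); // rarest term is kept the longest
}
```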

View File

@@ -45,6 +45,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
     settings_diff: &InnerIndexSettingsDiff,
+    geo_fields_ids: Option<(FieldId, FieldId)>,
 ) -> Result<ExtractedFacetValues> {
     let max_memory = indexer.max_memory_by_thread();

@@ -124,18 +125,12 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
                     add_exists.insert(document);
                 }

-                let del_geo_support = settings_diff
-                    .old
-                    .geo_fields_ids
-                    .map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
-                let add_geo_support = settings_diff
-                    .new
-                    .geo_fields_ids
-                    .map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
+                let geo_support =
+                    geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
                 let del_filterable_values =
-                    del_value.map(|value| extract_facet_values(&value, del_geo_support));
+                    del_value.map(|value| extract_facet_values(&value, geo_support));
                 let add_filterable_values =
-                    add_value.map(|value| extract_facet_values(&value, add_geo_support));
+                    add_value.map(|value| extract_facet_values(&value, geo_support));

                 // Those closures are just here to simplify things a bit.
                 let mut insert_numbers_diff = |del_numbers, add_numbers| {

View File

@@ -8,7 +8,6 @@ use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::error::GeoError;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::extract_finite_float_from_value;
-use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
 use crate::{FieldId, InternalError, Result};

 /// Extracts the geographical coordinates contained in each document under the `_geo` field.
@@ -19,7 +18,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
     primary_key_id: FieldId,
-    settings_diff: &InnerIndexSettingsDiff,
+    (lat_fid, lng_fid): (FieldId, FieldId),
 ) -> Result<grenad::Reader<BufReader<File>>> {
     let mut writer = create_writer(
         indexer.chunk_compression_type,
@@ -39,27 +38,47 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
             serde_json::from_slice(document_id).unwrap()
         };

-        // extract old version
-        let del_lat_lng =
-            extract_lat_lng(&obkv, &settings_diff.old, DelAdd::Deletion, document_id)?;
-        // extract new version
-        let add_lat_lng =
-            extract_lat_lng(&obkv, &settings_diff.new, DelAdd::Addition, document_id)?;
-
-        if del_lat_lng != add_lat_lng {
-            let mut obkv = KvWriterDelAdd::memory();
-            if let Some([lat, lng]) = del_lat_lng {
-                #[allow(clippy::drop_non_drop)]
-                let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
-                obkv.insert(DelAdd::Deletion, bytes)?;
-            }
-            if let Some([lat, lng]) = add_lat_lng {
-                #[allow(clippy::drop_non_drop)]
-                let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
-                obkv.insert(DelAdd::Addition, bytes)?;
-            }
-            let bytes = obkv.into_inner()?;
-            writer.insert(docid_bytes, bytes)?;
-        }
+        // first we get the two fields
+        match (obkv.get(lat_fid), obkv.get(lng_fid)) {
+            (Some(lat), Some(lng)) => {
+                let deladd_lat_obkv = KvReaderDelAdd::new(lat);
+                let deladd_lng_obkv = KvReaderDelAdd::new(lng);
+
+                // then we extract the values
+                let del_lat_lng = deladd_lat_obkv
+                    .get(DelAdd::Deletion)
+                    .zip(deladd_lng_obkv.get(DelAdd::Deletion))
+                    .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
+                    .transpose()?;
+                let add_lat_lng = deladd_lat_obkv
+                    .get(DelAdd::Addition)
+                    .zip(deladd_lng_obkv.get(DelAdd::Addition))
+                    .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
+                    .transpose()?;
+
+                if del_lat_lng != add_lat_lng {
+                    let mut obkv = KvWriterDelAdd::memory();
+                    if let Some([lat, lng]) = del_lat_lng {
+                        #[allow(clippy::drop_non_drop)]
+                        let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
+                        obkv.insert(DelAdd::Deletion, bytes)?;
+                    }
+                    if let Some([lat, lng]) = add_lat_lng {
+                        #[allow(clippy::drop_non_drop)]
+                        let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
+                        obkv.insert(DelAdd::Addition, bytes)?;
+                    }
+                    let bytes = obkv.into_inner()?;
+                    writer.insert(docid_bytes, bytes)?;
+                }
+            }
+            (None, Some(_)) => {
+                return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
+            }
+            (Some(_), None) => {
+                return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
+            }
+            (None, None) => (),
+        }
     }
@@ -67,37 +86,16 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
 }

 /// Extract the finite floats lat and lng from two bytes slices.
-fn extract_lat_lng(
-    document: &obkv::KvReader<FieldId>,
-    settings: &InnerIndexSettings,
-    deladd: DelAdd,
-    document_id: impl Fn() -> Value,
-) -> Result<Option<[f64; 2]>> {
-    match settings.geo_fields_ids {
-        Some((lat_fid, lng_fid)) => {
-            let lat = document.get(lat_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
-            let lng = document.get(lng_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
-            let (lat, lng) = match (lat, lng) {
-                (Some(lat), Some(lng)) => (lat, lng),
-                (Some(_), None) => {
-                    return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
-                }
-                (None, Some(_)) => {
-                    return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
-                }
-                (None, None) => return Ok(None),
-            };
-            let lat = extract_finite_float_from_value(
-                serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
-            )
-            .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
-            let lng = extract_finite_float_from_value(
-                serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
-            )
-            .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
-            Ok(Some([lat, lng]))
-        }
-        None => Ok(None),
-    }
-}
+fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> {
+    let lat = extract_finite_float_from_value(
+        serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
+    )
+    .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
+    let lng = extract_finite_float_from_value(
+        serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
+    )
+    .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
+    Ok([lat, lng])
+}
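
The restored `extract_lat_lng` receives each coordinate as raw JSON bytes and fails unless it parses to a finite float; note that `_geo` values may arrive as JSON strings, as in the bug_4640 snapshot above. A rough standalone approximation of that parsing (milli's `extract_finite_float_from_value` is internal, so this sketch inlines a similar check using only `serde_json`):

```rust
// Parse one coordinate from raw JSON bytes into a finite f64,
// accepting both plain numbers and string-encoded numbers.
fn parse_finite(bytes: &[u8]) -> Result<f64, String> {
    let value: serde_json::Value =
        serde_json::from_slice(bytes).map_err(|e| e.to_string())?;
    match value {
        serde_json::Value::Number(n) => n.as_f64(),
        // mirrors the string-number case seen in the `_geo` test documents
        serde_json::Value::String(s) => s.parse::<f64>().ok(),
        _ => None,
    }
    .filter(|f| f.is_finite())
    .ok_or_else(|| "not a finite float".to_string())
}

fn main() {
    let lat = parse_finite(br#""45.4777599""#).unwrap(); // string-encoded
    let lng = parse_finite(b"9.1967508").unwrap(); // plain number
    assert_eq!([lat, lng], [45.4777599, 9.1967508]);
}
```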

View File

@@ -43,6 +43,7 @@ pub(crate) fn data_from_obkv_documents(
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
     primary_key_id: FieldId,
+    geo_fields_ids: Option<(FieldId, FieldId)>,
     settings_diff: Arc<InnerIndexSettingsDiff>,
     max_positions_per_attributes: Option<u32>,
 ) -> Result<()> {
@@ -69,6 +70,7 @@ pub(crate) fn data_from_obkv_documents(
                     indexer,
                     lmdb_writer_sx.clone(),
                     primary_key_id,
+                    geo_fields_ids,
                     settings_diff.clone(),
                     max_positions_per_attributes,
                 )
@@ -291,6 +293,7 @@ fn send_and_extract_flattened_documents_data(
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
     primary_key_id: FieldId,
+    geo_fields_ids: Option<(FieldId, FieldId)>,
     settings_diff: Arc<InnerIndexSettingsDiff>,
     max_positions_per_attributes: Option<u32>,
 ) -> Result<(
@@ -300,13 +303,12 @@ fn send_and_extract_flattened_documents_data(
     let flattened_documents_chunk =
         flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;

-    if settings_diff.run_geo_indexing() {
+    if let Some(geo_fields_ids) = geo_fields_ids {
         let documents_chunk_cloned = flattened_documents_chunk.clone();
         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
-        let settings_diff = settings_diff.clone();
         rayon::spawn(move || {
             let result =
-                extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, &settings_diff);
+                extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_fields_ids);
             let _ = match result {
                 Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
                 Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
@@ -345,6 +347,7 @@ fn send_and_extract_flattened_documents_data(
                     flattened_documents_chunk.clone(),
                     indexer,
                     &settings_diff,
+                    geo_fields_ids,
                 )?;

                 // send fid_docid_facet_numbers_chunk to DB writer
// send fid_docid_facet_numbers_chunk to DB writer

View File

@@ -315,6 +315,28 @@ where
         // get the primary key field id
         let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();

+        // get the fid of the `_geo.lat` and `_geo.lng` fields.
+        let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
+
+        // self.index.fields_ids_map($a)? ==>> field_id_map
+        let geo_fields_ids = match field_id_map.id("_geo") {
+            Some(gfid) => {
+                let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid);
+                let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid);
+                // if `_geo` is faceted then we get the `lat` and `lng`
+                if is_sortable || is_filterable {
+                    let field_ids = field_id_map
+                        .insert("_geo.lat")
+                        .zip(field_id_map.insert("_geo.lng"))
+                        .ok_or(UserError::AttributeLimitReached)?;
+                    Some(field_ids)
+                } else {
+                    None
+                }
+            }
+            None => None,
+        };
+
         let pool_params = GrenadParameters {
             chunk_compression_type: self.indexer_config.chunk_compression_type,
             chunk_compression_level: self.indexer_config.chunk_compression_level,
@@ -398,6 +420,7 @@ where
                     pool_params,
                     lmdb_writer_sx.clone(),
                     primary_key_id,
+                    geo_fields_ids,
                     settings_diff.clone(),
                     max_positions_per_attributes,
                 )
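
The block added at `+315` resolves the `_geo.lat`/`_geo.lng` field ids only when `_geo` exists in the fields map and is sortable or filterable. A toy reproduction of that decision (the `FieldsIdsMap` stand-in below is a simplified invention, not milli's type, which among other things caps how many field ids can be allocated):

```rust
use std::collections::BTreeMap;

// Toy stand-in for milli's `FieldsIdsMap`: `id` looks a field up and
// `insert` returns the existing id or allocates the next one.
#[derive(Default)]
struct FieldsIdsMap {
    ids: BTreeMap<String, u16>,
}

impl FieldsIdsMap {
    fn id(&self, name: &str) -> Option<u16> {
        self.ids.get(name).copied()
    }
    fn insert(&mut self, name: &str) -> Option<u16> {
        if let Some(id) = self.id(name) {
            return Some(id);
        }
        let next = self.ids.len() as u16;
        self.ids.insert(name.to_string(), next);
        Some(next)
    }
}

// Mirrors the moved logic: only resolve lat/lng ids when `_geo` exists
// and is faceted (sortable or filterable).
fn geo_fields_ids(
    map: &mut FieldsIdsMap,
    is_sortable: bool,
    is_filterable: bool,
) -> Option<(u16, u16)> {
    let _gfid = map.id("_geo")?;
    if is_sortable || is_filterable {
        map.insert("_geo.lat").zip(map.insert("_geo.lng"))
    } else {
        None
    }
}

fn main() {
    let mut map = FieldsIdsMap::default();
    let _ = map.insert("_geo");
    // `_geo` is sortable: lat/lng ids get allocated
    assert_eq!(geo_fields_ids(&mut map, true, false), Some((1, 2)));

    // `_geo` present but not faceted: nothing to extract
    let mut map2 = FieldsIdsMap::default();
    let _ = map2.insert("_geo");
    assert_eq!(geo_fields_ids(&mut map2, false, false), None);
}
```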

View File

@@ -1142,11 +1142,6 @@ impl InnerIndexSettingsDiff {
         self.settings_update_only
     }

-    pub fn run_geo_indexing(&self) -> bool {
-        self.old.geo_fields_ids != self.new.geo_fields_ids
-            || (!self.settings_update_only && self.new.geo_fields_ids.is_some())
-    }
-
     pub fn modified_faceted_fields(&self) -> HashSet<String> {
         &self.old.user_defined_faceted_fields ^ &self.new.user_defined_faceted_fields
     }
@@ -1166,7 +1161,6 @@ pub(crate) struct InnerIndexSettings {
     pub proximity_precision: ProximityPrecision,
     pub embedding_configs: EmbeddingConfigs,
     pub existing_fields: HashSet<String>,
-    pub geo_fields_ids: Option<(FieldId, FieldId)>,
 }

 impl InnerIndexSettings {
@@ -1175,7 +1169,7 @@ impl InnerIndexSettings {
         let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
         let allowed_separators = index.allowed_separators(rtxn)?;
         let dictionary = index.dictionary(rtxn)?;
-        let mut fields_ids_map = index.fields_ids_map(rtxn)?;
+        let fields_ids_map = index.fields_ids_map(rtxn)?;
         let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?;
         let user_defined_searchable_fields =
             user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
@@ -1190,24 +1184,6 @@ impl InnerIndexSettings {
             .into_iter()
             .filter_map(|(field, count)| (count != 0).then_some(field))
             .collect();
-        // index.fields_ids_map($a)? ==>> fields_ids_map
-        let geo_fields_ids = match fields_ids_map.id("_geo") {
-            Some(gfid) => {
-                let is_sortable = index.sortable_fields_ids(rtxn)?.contains(&gfid);
-                let is_filterable = index.filterable_fields_ids(rtxn)?.contains(&gfid);
-                // if `_geo` is faceted then we get the `lat` and `lng`
-                if is_sortable || is_filterable {
-                    let field_ids = fields_ids_map
-                        .insert("_geo.lat")
-                        .zip(fields_ids_map.insert("_geo.lng"))
-                        .ok_or(UserError::AttributeLimitReached)?;
-                    Some(field_ids)
-                } else {
-                    None
-                }
-            }
-            None => None,
-        };

         Ok(Self {
             stop_words,
@@ -1222,7 +1198,6 @@ impl InnerIndexSettings {
             proximity_precision,
             embedding_configs,
             existing_fields,
-            geo_fields_ids,
         })
     }

View File

@@ -159,7 +159,6 @@ pub fn expected_order(
     match optional_words {
         TermsMatchingStrategy::Last => groups.into_iter().flatten().collect(),
-        TermsMatchingStrategy::Frequency => groups.into_iter().flatten().collect(),
         TermsMatchingStrategy::All => {
             groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect()
         }