Compare commits

..

12 Commits

Author SHA1 Message Date
Tamo
c8d8522279 update segment and reqwest to the latest version 2024-05-13 10:25:29 +02:00
meili-bors[bot]
95fcd17373 Merge #4622
4622: Bump Rustls to non-vulnerable versions r=Kerollmops a=Kerollmops

This PR fixes #4599 by bumping the Rustls dependency to v0.21.12 and [ureq to v2.9.7](https://github.com/algesten/ureq/blob/main/CHANGELOG.md#297) (which itself bumps rustls to v0.22.4).

Co-authored-by: Clément Renault <clement@meilisearch.com>
2024-05-07 09:47:30 +00:00
Clément Renault
ac4bc143c4 Bump ureq to v2.9.7 2024-05-07 10:39:38 +02:00
Clément Renault
f33a1282f8 Bump Rustls to v0.21.12 2024-05-07 10:31:39 +02:00
meili-bors[bot]
4d5971f343 Merge #4621
4621: Bring back changes from v1.8.0 into main r=curquiza a=curquiza



Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Tamo <tamo@meilisearch.com>
Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com>
Co-authored-by: Clément Renault <clement@meilisearch.com>
2024-05-06 13:46:39 +00:00
meili-bors[bot]
ecb5c506b3 Merge #4619
4619: Use http path pattern instead of full path in metrics r=irevoire a=gh2k

# Pull Request

## Related issue

Fixes #3983 

## What does this PR do?

- This records only the HTTP pattern in metrics instead of the full path

An alternative solution was proposed in #4145, but it doesn't really fix the root cause of the issue. The problem I'm experiencing on my end is that, by using the full path, the number of labels is far too high to be useful. It is normal practice to use the path with variable placeholders instead of the fully-expanded path.

The example given in the ticket was endpoints under `/tasks`, but this can also be a very significant problem under `/indexes/{index-uid}/documents`, e.g.:
<img width="1510" alt="Screenshot 2024-05-03 at 12 14 36" src="https://github.com/meilisearch/meilisearch/assets/6530014/1df2ec19-5f69-4164-90d2-f65c59f9b544">

This patch replaces the fully-expanded path with the matched pattern.

The linked PR also mentions paths under other routes, e.g. `/static`, but this feels like a separate concern and these can be stripped out at the Prometheus end by filters if they are unwanted. The most important thing is to make the paths usable so that we can still get stats on e.g. the number of document deletes we see.
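
As an illustration (not the exact code from this PR), here is a minimal sketch of the idea using actix-web's `ServiceRequest::match_pattern()`, which the middleware change further down relies on; the `metric_path` helper name is hypothetical:

```rust
use actix_web::dev::ServiceRequest;

/// Prefer the matched route pattern (e.g. "/indexes/{index-uid}/documents")
/// over the raw request path, so that metric label cardinality stays bounded.
/// Falls back to the raw path when no pattern matched (hypothetical helper).
fn metric_path(req: &ServiceRequest) -> String {
    req.match_pattern().unwrap_or_else(|| req.path().to_owned())
}
```

The actual middleware change in this PR (see the diff further down) uses the same fallback: the raw path is only recorded when no route pattern is available.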

## PR checklist

Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Simon Detheridge <s@sd.ai>
Co-authored-by: Tamo <tamo@meilisearch.com>
2024-05-06 09:37:32 +00:00
Tamo
3698aef66b fix warning 2024-05-06 11:36:37 +02:00
Simon Detheridge
7f5ab3cef5 Use http path pattern instead of full path in metrics 2024-05-03 12:29:31 +01:00
meili-bors[bot]
248e22005a Merge #4582
4582: Fix some typos in comments r=curquiza a=writegr

# Pull Request

## Related issue

No

## What does this PR do?

 fix some typos in comments

## PR checklist
Please check if your PR fulfills the following requirements:
- [ ] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [ ] Have you read the contributing guidelines?
- [ ] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: writegr <wellweek@outlook.com>
2024-04-18 07:07:33 +00:00
writegr
ab43a8a949 chore: fix some typos in comments
Signed-off-by: writegr <wellweek@outlook.com>
2024-04-18 14:12:52 +08:00
meili-bors[bot]
4089dd04a5 Merge #4568
4568: Fix some typos in comments r=curquiza a=yudrywet

# Pull Request

## Related issue
No

## What does this PR do?
fix some typos in comments

## PR checklist
Please check if your PR fulfills the following requirements:
- [ ] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [ ] Have you read the contributing guidelines?
- [ ] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: yudrywet <yudeyao@yeah.net>
2024-04-15 08:12:43 +00:00
yudrywet
cf864a1c2e chore: fix some typos in comments
Signed-off-by: yudrywet <yudeyao@yeah.net>
2024-04-14 20:11:34 +08:00
36 changed files with 455 additions and 458 deletions


@@ -1,6 +1,4 @@
name: Look for flaky tests
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
on:
workflow_dispatch:
schedule:


@@ -1,6 +1,5 @@
name: Run the indexing fuzzer
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
on:
push:
branches:


@@ -15,8 +15,6 @@ jobs:
debian:
name: Publish debian packagge
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
runs-on: ubuntu-latest
needs: check-version
container:


@@ -35,8 +35,6 @@ jobs:
publish-linux:
name: Publish binary for Linux
runs-on: ubuntu-latest
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
needs: check-version
container:
# Use ubuntu-18.04 to compile with glibc 2.27
@@ -134,8 +132,6 @@ jobs:
name: Publish binary for aarch64
runs-on: ubuntu-latest
needs: check-version
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
container:
# Use ubuntu-18.04 to compile with glibc 2.27
image: ubuntu:18.04


@@ -21,8 +21,6 @@ jobs:
test-linux:
name: Tests on ubuntu-18.04
runs-on: ubuntu-latest
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
container:
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
image: ubuntu:18.04
@@ -79,8 +77,6 @@ jobs:
test-all-features:
name: Tests almost all features
runs-on: ubuntu-latest
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
container:
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
image: ubuntu:18.04
@@ -104,8 +100,6 @@ jobs:
test-disabled-tokenization:
name: Test disabled tokenization
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
runs-on: ubuntu-latest
container:
image: ubuntu:18.04
@@ -133,8 +127,6 @@ jobs:
# We run tests in debug also, to make sure that the debug_assertions are hit
test-debug:
name: Run tests in debug
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
runs-on: ubuntu-latest
container:
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations

Cargo.lock (generated), 458 changed lines: file diff suppressed because it is too large.


@@ -22,7 +22,7 @@ members = [
]
[workspace.package]
version = "1.8.4"
version = "1.8.0"
authors = [
"Quentin de Quelen <quentin@dequelen.me>",
"Clément Renault <clement@meilisearch.com>",


@@ -17,7 +17,7 @@ RUN set -eux; \
if [ "$apkArch" = "aarch64" ]; then \
export JEMALLOC_SYS_WITH_LG_PAGE=16; \
fi && \
cargo build --release -p meilisearch -p meilitool --features "swedish-recomposition"
cargo build --release -p meilisearch -p meilitool
# Run
FROM alpine:3.16


@@ -152,7 +152,6 @@ impl Settings<Unchecked> {
}
#[derive(Debug, Clone, Deserialize)]
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
#[cfg_attr(test, derive(serde::Serialize))]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]


@@ -182,7 +182,6 @@ impl Settings<Unchecked> {
}
}
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
#[derive(Debug, Clone, Deserialize)]
#[cfg_attr(test, derive(serde::Serialize))]
#[serde(deny_unknown_fields)]


@@ -200,7 +200,6 @@ impl std::ops::Deref for IndexUid {
}
}
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
#[derive(Debug)]
#[cfg_attr(test, derive(serde::Serialize))]
#[cfg_attr(test, serde(rename_all = "camelCase"))]


@@ -568,7 +568,7 @@ pub mod tests {
insta::assert_display_snapshot!(p(r"title = 'foo\\\\'"), @r#"{title} = {foo\\}"#);
insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\'"), @r#"{title} = {foo\\\}"#);
insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\\\'"), @r#"{title} = {foo\\\\}"#);
// but it also works with other sequencies
// but it also works with other sequences
insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
}


@@ -37,7 +37,7 @@ time = { version = "0.3.31", features = [
"macros",
] }
tracing = "0.1.40"
ureq = "2.9.1"
ureq = "2.9.7"
uuid = { version = "1.6.1", features = ["serde", "v4"] }
[dev-dependencies]


@@ -13,7 +13,7 @@ We can combine the two tasks in a single batch:
1. import documents X and Y
Processing this batch is functionally equivalent to processing the two
tasks individally, but should be much faster since we are only performing
tasks individually, but should be much faster since we are only performing
one indexing operation.
*/
@@ -914,34 +914,8 @@ impl IndexScheduler {
if self.must_stop_processing.get() {
return Err(Error::AbortedTask);
}
let (id, doc) = ret?;
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
'inject_vectors: {
let embeddings = index.embeddings(&rtxn, id)?;
if embeddings.is_empty() {
break 'inject_vectors;
}
let vectors = document
.entry("_vectors".to_owned())
.or_insert(serde_json::Value::Object(Default::default()));
let serde_json::Value::Object(vectors) = vectors else {
break 'inject_vectors;
};
for (embedder_name, embeddings) in embeddings {
vectors.entry(embedder_name).or_insert_with(|| {
serde_json::json!({
"embeddings": embeddings,
"regenerate": true
})
});
}
}
let (_id, doc) = ret?;
let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
index_dumper.push_document(&document)?;
}


@@ -26,7 +26,7 @@ pub type DeserrQueryParamError<C = BadRequest> = DeserrError<DeserrQueryParam, C
/// A request deserialization error.
///
/// The first generic paramater is a marker type describing the format of the request: either json (e.g. [`DeserrJson`] or [`DeserrQueryParam`]).
/// The first generic parameter is a marker type describing the format of the request: either json (e.g. [`DeserrJson`] or [`DeserrQueryParam`]).
/// The second generic parameter is the default error code for the deserialization error, in case it is not given.
pub struct DeserrError<Format, C: Default + ErrorCode> {
pub msg: String,


@@ -71,13 +71,13 @@ puffin = { version = "0.16.0", features = ["serialization"] }
rand = "0.8.5"
rayon = "1.8.0"
regex = "1.10.2"
reqwest = { version = "0.11.23", features = [
reqwest = { version = "0.12.4", features = [
"rustls-tls",
"json",
], default-features = false }
rustls = "0.21.6"
rustls = "0.21.12"
rustls-pemfile = "1.0.2"
segment = { version = "0.2.3", optional = true }
segment = { version = "0.2.4", optional = true }
serde = { version = "1.0.195", features = ["derive"] }
serde_json = { version = "1.0.111", features = ["preserve_order"] }
sha2 = "0.10.8"


@@ -419,41 +419,7 @@ fn import_dump(
let file = tempfile::tempfile()?;
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file));
for document in index_reader.documents()? {
let mut document = document?;
'remove_injected_vectors: {
let Some(vectors) = document.get_mut("_vectors") else {
break 'remove_injected_vectors;
};
let Some(vectors) = vectors.as_object_mut() else { break 'remove_injected_vectors };
vectors.retain(|_embedder, embedding_object| {
// don't touch values that aren't objects
let Some(embedding_object) = embedding_object.as_object() else {
return true;
};
let mut has_regenerate_true = false;
for (field, value) in embedding_object {
match (field.as_str(), value) {
// detected regenerate : true
// if we don't have any superfluous field, we'll remove the entire entry
("regenerate", serde_json::Value::Bool(true)) => {
has_regenerate_true = true;
}
// ignore embeddings
("embeddings", _) => continue,
// any other field: immediately retain the entry
_ => return true,
}
}
// retain the entry unless it has regenerate: true
!has_regenerate_true
})
}
builder.append_json_object(&document)?;
builder.append_json_object(&document?)?;
}
// This flush the content of the batch builder.


@@ -59,10 +59,12 @@ where
let request_path = req.path();
let is_registered_resource = req.resource_map().has_resource(request_path);
if is_registered_resource {
let request_pattern = req.match_pattern();
let metric_path = request_pattern.as_ref().map_or(request_path, String::as_str);
let request_method = req.method().to_string();
histogram_timer = Some(
crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
.with_label_values(&[&request_method, request_path])
.with_label_values(&[&request_method, metric_path])
.start_timer(),
);
}


@@ -40,9 +40,8 @@ pub struct Permit {
impl Drop for Permit {
fn drop(&mut self) {
let sender = self.sender.clone();
// if the channel is closed then the whole instance is down
std::mem::drop(tokio::spawn(async move { sender.send(()).await }));
let _ = futures::executor::block_on(self.sender.send(()));
}
}


@@ -117,69 +117,3 @@ async fn geo_bounding_box_with_string_and_number() {
)
.await;
}
#[actix_rt::test]
async fn bug_4640() {
// https://github.com/meilisearch/meilisearch/issues/4640
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.add_documents(documents, None).await;
index.update_settings_filterable_attributes(json!(["_geo"])).await;
let (ret, _code) = index.update_settings_sortable_attributes(json!(["_geo"])).await;
index.wait_task(ret.uid()).await;
// Sort the document with the second one first
index
.search(
json!({
"sort": ["_geoPoint(45.4777599, 9.1967508):asc"],
}),
|response, code| {
assert_eq!(code, 200, "{}", response);
snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###"
{
"hits": [
{
"id": 2,
"name": "La Bella Italia",
"address": "456 Elm Street, Townsville",
"type": "Italian",
"rating": 9,
"_geo": {
"lat": "45.4777599",
"lng": "9.1967508"
}
},
{
"id": 1,
"name": "Taco Truck",
"address": "444 Salsa Street, Burritoville",
"type": "Mexican",
"rating": 9,
"_geo": {
"lat": 34.0522,
"lng": -118.2437
},
"_geoDistance": 9714063
},
{
"id": 3,
"name": "Crêpe Truck",
"address": "2 Billig Avenue, Rouenville",
"type": "French",
"rating": 10
}
],
"query": "",
"processingTimeMs": "[time]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 3
}
"###);
},
)
.await;
}


@@ -129,7 +129,7 @@ fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
}
}
eprintln!("Sucessfully deleted {count} content files from disk!");
eprintln!("Successfully deleted {count} content files from disk!");
Ok(())
}


@@ -74,10 +74,10 @@ csv = "1.3.0"
candle-core = { version = "0.4.1" }
candle-transformers = { version = "0.4.1" }
candle-nn = { version = "0.4.1" }
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default_features = false, features = [
"onig",
] }
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [
"online",
] }
tiktoken-rs = "0.5.8"
@@ -85,7 +85,7 @@ liquid = "0.26.4"
arroy = "0.2.0"
rand = "0.8.5"
tracing = "0.1.40"
ureq = { version = "2.9.6", features = ["json"] }
ureq = { version = "2.9.7", features = ["json"] }
url = "2.5.0"
[dev-dependencies]


@@ -203,7 +203,7 @@ fn parse_csv_header(header: &str) -> (&str, AllowedType) {
"string" => (field_name, AllowedType::String),
"boolean" => (field_name, AllowedType::Boolean),
"number" => (field_name, AllowedType::Number),
// if the pattern isn't reconized, we keep the whole field.
// if the pattern isn't recognized, we keep the whole field.
_otherwise => (header, AllowedType::String),
},
None => (header, AllowedType::String),


@@ -22,7 +22,7 @@ use crate::heed_codec::{
};
use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision;
use crate::vector::{Embedding, EmbeddingConfig};
use crate::vector::EmbeddingConfig;
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
@@ -1516,42 +1516,6 @@ impl Index {
.unwrap_or_default())
}
pub fn embeddings(
&self,
rtxn: &RoTxn<'_>,
docid: DocumentId,
) -> Result<BTreeMap<String, Vec<Embedding>>> {
let mut res = BTreeMap::new();
for row in self.embedder_category_id.iter(rtxn)? {
let (embedder_name, embedder_id) = row?;
let embedder_id = (embedder_id as u16) << 8;
let mut embeddings = Vec::new();
'vectors: for i in 0..=u8::MAX {
let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy)
.map(Some)
.or_else(|e| match e {
arroy::Error::MissingMetadata => Ok(None),
e => Err(e),
})
.transpose();
let Some(reader) = reader else {
break 'vectors;
};
let embedding = reader?.item_vector(rtxn, docid)?;
if let Some(embedding) = embedding {
embeddings.push(embedding)
} else {
break 'vectors;
}
}
res.insert(embedder_name.to_owned(), embeddings);
}
Ok(res)
}
pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
}


@@ -42,7 +42,7 @@ fn facet_number_values<'a>(
}
/// Define the strategy used by the geo sort.
/// The paramater represents the cache size, and, in the case of the Dynamic strategy,
/// The parameter represents the cache size, and, in the case of the Dynamic strategy,
/// the point where we move from using the iterative strategy to the rtree.
#[derive(Debug, Clone, Copy)]
pub enum Strategy {


@@ -22,7 +22,7 @@ pub enum SearchEvents {
RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 },
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
RankingRuleEndIteration { ranking_rule_idx: usize },
RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 },
ExtendResults { new: Vec<u32> },
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
@@ -123,9 +123,12 @@ impl SearchLogger<QueryGraph> for VisualSearchLogger {
&mut self,
ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<QueryGraph>,
_universe: &RoaringBitmap,
universe: &RoaringBitmap,
) {
self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx });
self.events.push(SearchEvents::RankingRuleEndIteration {
ranking_rule_idx,
universe_len: universe.len(),
});
self.location.pop();
}
fn add_to_results(&mut self, docids: &[u32]) {
@@ -323,7 +326,7 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
self.write_skip_bucket(bucket_len)?;
}
SearchEvents::RankingRuleEndIteration { ranking_rule_idx } => {
SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe_len: _ } => {
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
self.write_end_iteration()?;
}


@@ -134,7 +134,7 @@ impl<'t> Matcher<'t, '_> {
for (token_position, word_position, word) in words_positions {
partial = match partial.match_token(word) {
// token matches the partial match, but the match is not full,
// we temporarly save the current token then we try to match the next one.
// we temporarily save the current token then we try to match the next one.
Some(MatchType::Partial(partial)) => {
potential_matches.push((token_position, word_position, partial.char_len()));
partial
@@ -722,7 +722,7 @@ mod tests {
@"…void void void void void split the world void void"
);
// Text containing matches with diferent density.
// Text containing matches with different density.
let text = "split void the void void world void void void void void void void void void void split the world void void";
let mut matcher = builder.build(text);
// crop should return 10 last words with a marker at the start.


@@ -119,7 +119,7 @@ pub fn located_query_terms_from_tokens(
if let Some(located_query_term) = phrase.build(ctx) {
// as we are evaluating a negative operator we put the phrase
// in the negative one *but* we don't reset the negative operator
// as we are immediatly starting a new negative phrase.
// as we are immediately starting a new negative phrase.
if negative_phrase {
negative_phrases.push(located_query_term);
} else {


@@ -499,7 +499,7 @@ impl FacetsUpdateIncrementalInner {
ModificationResult::Expand | ModificationResult::Reduce { .. }
)
{
// if any modification occured, insert it in the database.
// if any modification occurred, insert it in the database.
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
Ok(insertion_key_modification)
} else {


@@ -37,7 +37,7 @@ pub struct ExtractedFacetValues {
/// Extracts the facet values of each faceted field of each document.
///
/// Returns the generated grenad reader containing the docid the fid and the orginal value as key
/// Returns the generated grenad reader containing the docid the fid and the original value as key
/// and the normalized value as value extracted from the given chunk of documents.
/// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
@@ -45,6 +45,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
geo_fields_ids: Option<(FieldId, FieldId)>,
) -> Result<ExtractedFacetValues> {
puffin::profile_function!();
@@ -126,18 +127,12 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
add_exists.insert(document);
}
let del_geo_support = settings_diff
.old
.geo_fields_ids
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let add_geo_support = settings_diff
.new
.geo_fields_ids
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let geo_support =
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let del_filterable_values =
del_value.map(|value| extract_facet_values(&value, del_geo_support));
del_value.map(|value| extract_facet_values(&value, geo_support));
let add_filterable_values =
add_value.map(|value| extract_facet_values(&value, add_geo_support));
add_value.map(|value| extract_facet_values(&value, geo_support));
// Those closures are just here to simplify things a bit.
let mut insert_numbers_diff = |del_numbers, add_numbers| {


@@ -8,7 +8,6 @@ use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::error::GeoError;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::extract_finite_float_from_value;
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::{FieldId, InternalError, Result};
/// Extracts the geographical coordinates contained in each document under the `_geo` field.
@@ -19,7 +18,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
primary_key_id: FieldId,
settings_diff: &InnerIndexSettingsDiff,
(lat_fid, lng_fid): (FieldId, FieldId),
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
@@ -41,27 +40,47 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
serde_json::from_slice(document_id).unwrap()
};
// extract old version
let del_lat_lng =
extract_lat_lng(&obkv, &settings_diff.old, DelAdd::Deletion, document_id)?;
// extract new version
let add_lat_lng =
extract_lat_lng(&obkv, &settings_diff.new, DelAdd::Addition, document_id)?;
// first we get the two fields
match (obkv.get(lat_fid), obkv.get(lng_fid)) {
(Some(lat), Some(lng)) => {
let deladd_lat_obkv = KvReaderDelAdd::new(lat);
let deladd_lng_obkv = KvReaderDelAdd::new(lng);
if del_lat_lng != add_lat_lng {
let mut obkv = KvWriterDelAdd::memory();
if let Some([lat, lng]) = del_lat_lng {
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
obkv.insert(DelAdd::Deletion, bytes)?;
// then we extract the values
let del_lat_lng = deladd_lat_obkv
.get(DelAdd::Deletion)
.zip(deladd_lng_obkv.get(DelAdd::Deletion))
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
.transpose()?;
let add_lat_lng = deladd_lat_obkv
.get(DelAdd::Addition)
.zip(deladd_lng_obkv.get(DelAdd::Addition))
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
.transpose()?;
if del_lat_lng != add_lat_lng {
let mut obkv = KvWriterDelAdd::memory();
if let Some([lat, lng]) = del_lat_lng {
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
obkv.insert(DelAdd::Deletion, bytes)?;
}
if let Some([lat, lng]) = add_lat_lng {
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
obkv.insert(DelAdd::Addition, bytes)?;
}
let bytes = obkv.into_inner()?;
writer.insert(docid_bytes, bytes)?;
}
}
if let Some([lat, lng]) = add_lat_lng {
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
obkv.insert(DelAdd::Addition, bytes)?;
(None, Some(_)) => {
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
}
let bytes = obkv.into_inner()?;
writer.insert(docid_bytes, bytes)?;
(Some(_), None) => {
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
}
(None, None) => (),
}
}
@@ -69,37 +88,16 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
}
/// Extract the finite floats lat and lng from two bytes slices.
fn extract_lat_lng(
document: &obkv::KvReader<FieldId>,
settings: &InnerIndexSettings,
deladd: DelAdd,
document_id: impl Fn() -> Value,
) -> Result<Option<[f64; 2]>> {
match settings.geo_fields_ids {
Some((lat_fid, lng_fid)) => {
let lat = document.get(lat_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
let lng = document.get(lng_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
let (lat, lng) = match (lat, lng) {
(Some(lat), Some(lng)) => (lat, lng),
(Some(_), None) => {
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
}
(None, Some(_)) => {
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
}
(None, None) => return Ok(None),
};
let lat = extract_finite_float_from_value(
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
)
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> {
let lat = extract_finite_float_from_value(
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
)
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
let lng = extract_finite_float_from_value(
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
)
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
Ok(Some([lat, lng]))
}
None => Ok(None),
}
let lng = extract_finite_float_from_value(
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
)
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
Ok([lat, lng])
}


@@ -11,7 +11,7 @@ mod extract_word_position_docids;
use std::fs::File;
use std::io::BufReader;
use std::sync::{Arc, OnceLock};
use std::sync::Arc;
use crossbeam_channel::Sender;
use rayon::prelude::*;
@@ -31,7 +31,7 @@ use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};
/// Extract data for each databases from obkv documents in parallel.
/// Send data in grenad file over provided Sender.
@@ -43,6 +43,7 @@ pub(crate) fn data_from_obkv_documents(
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>,
) -> Result<()> {
@@ -71,6 +72,7 @@ pub(crate) fn data_from_obkv_documents(
indexer,
lmdb_writer_sx.clone(),
primary_key_id,
geo_fields_ids,
settings_diff.clone(),
max_positions_per_attributes,
)
@@ -213,18 +215,6 @@ fn run_extraction_task<FE, FS, M>(
})
}
fn request_threads() -> &'static ThreadPoolNoAbort {
static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
REQUEST_THREADS.get_or_init(|| {
ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()
.unwrap()
})
}
/// Extract chunked data and send it into lmdb_writer_sx sender:
/// - documents
fn send_original_documents_data(
@@ -239,6 +229,11 @@ fn send_original_documents_data(
let documents_chunk_cloned = original_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
let request_threads = ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()?;
if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
let settings_diff = settings_diff.clone();
rayon::spawn(move || {
@@ -256,7 +251,7 @@ fn send_original_documents_data(
prompts,
indexer,
embedder.clone(),
request_threads(),
&request_threads,
) {
Ok(results) => Some(results),
Err(error) => {
@@ -305,6 +300,7 @@ fn send_and_extract_flattened_documents_data(
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>,
) -> Result<(
@@ -314,13 +310,12 @@ fn send_and_extract_flattened_documents_data(
let flattened_documents_chunk =
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
if settings_diff.run_geo_indexing() {
if let Some(geo_fields_ids) = geo_fields_ids {
let documents_chunk_cloned = flattened_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
let settings_diff = settings_diff.clone();
rayon::spawn(move || {
let result =
extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, &settings_diff);
extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_fields_ids);
let _ = match result {
Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
@@ -359,6 +354,7 @@ fn send_and_extract_flattened_documents_data(
flattened_documents_chunk.clone(),
indexer,
&settings_diff,
geo_fields_ids,
)?;
// send fid_docid_facet_numbers_chunk to DB writer


@@ -324,6 +324,28 @@ where
// get the primary key field id
let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
// get the fid of the `_geo.lat` and `_geo.lng` fields.
let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
// self.index.fields_ids_map($a)? ==>> field_id_map
let geo_fields_ids = match field_id_map.id("_geo") {
Some(gfid) => {
let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid);
let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid);
// if `_geo` is faceted then we get the `lat` and `lng`
if is_sortable || is_filterable {
let field_ids = field_id_map
.insert("_geo.lat")
.zip(field_id_map.insert("_geo.lng"))
.ok_or(UserError::AttributeLimitReached)?;
Some(field_ids)
} else {
None
}
}
None => None,
};
let pool_params = GrenadParameters {
chunk_compression_type: self.indexer_config.chunk_compression_type,
chunk_compression_level: self.indexer_config.chunk_compression_level,
@@ -390,6 +412,7 @@ where
pool_params,
lmdb_writer_sx.clone(),
primary_key_id,
geo_fields_ids,
settings_diff.clone(),
max_positions_per_attributes,
)


@@ -48,6 +48,7 @@ pub struct Transform<'a, 'i> {
fields_ids_map: FieldsIdsMap,
indexer_settings: &'a IndexerConfig,
pub autogenerate_docids: bool,
pub index_documents_method: IndexDocumentsMethod,
available_documents_ids: AvailableDocumentsIds,
@@ -101,7 +102,7 @@ impl<'a, 'i> Transform<'a, 'i> {
index: &'i Index,
indexer_settings: &'a IndexerConfig,
index_documents_method: IndexDocumentsMethod,
_autogenerate_docids: bool,
autogenerate_docids: bool,
) -> Result<Self> {
// We must choose the appropriate merge function for when two or more documents
// with the same user id must be merged or fully replaced in the same batch.
@@ -135,6 +136,7 @@ impl<'a, 'i> Transform<'a, 'i> {
index,
fields_ids_map: index.fields_ids_map(wtxn)?,
indexer_settings,
autogenerate_docids,
available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
original_sorter,
flattened_sorter,


@@ -1161,11 +1161,6 @@ impl InnerIndexSettingsDiff {
pub fn settings_update_only(&self) -> bool {
self.settings_update_only
}
pub fn run_geo_indexing(&self) -> bool {
self.old.geo_fields_ids != self.new.geo_fields_ids
|| (!self.settings_update_only && self.new.geo_fields_ids.is_some())
}
}
#[derive(Clone)]
@@ -1182,7 +1177,6 @@ pub(crate) struct InnerIndexSettings {
pub proximity_precision: ProximityPrecision,
pub embedding_configs: EmbeddingConfigs,
pub existing_fields: HashSet<String>,
pub geo_fields_ids: Option<(FieldId, FieldId)>,
}
impl InnerIndexSettings {
@@ -1191,7 +1185,7 @@ impl InnerIndexSettings {
let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
let allowed_separators = index.allowed_separators(rtxn)?;
let dictionary = index.dictionary(rtxn)?;
let mut fields_ids_map = index.fields_ids_map(rtxn)?;
let fields_ids_map = index.fields_ids_map(rtxn)?;
let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?;
let user_defined_searchable_fields =
user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
@@ -1206,24 +1200,6 @@ impl InnerIndexSettings {
.into_iter()
.filter_map(|(field, count)| (count != 0).then_some(field))
.collect();
// index.fields_ids_map($a)? ==>> fields_ids_map
let geo_fields_ids = match fields_ids_map.id("_geo") {
Some(gfid) => {
let is_sortable = index.sortable_fields_ids(rtxn)?.contains(&gfid);
let is_filterable = index.filterable_fields_ids(rtxn)?.contains(&gfid);
// if `_geo` is faceted then we get the `lat` and `lng`
if is_sortable || is_filterable {
let field_ids = fields_ids_map
.insert("_geo.lat")
.zip(fields_ids_map.insert("_geo.lng"))
.ok_or(UserError::AttributeLimitReached)?;
Some(field_ids)
} else {
None
}
}
None => None,
};
Ok(Self {
stop_words,
@@ -1238,7 +1214,6 @@ impl InnerIndexSettings {
proximity_precision,
embedding_configs,
existing_fields,
geo_fields_ids,
})
}


@@ -21,7 +21,7 @@ reqwest = { version = "0.11.23", features = [
"stream",
"json",
"rustls-tls",
], default-features = false }
], default_features = false }
serde = { version = "1.0.195", features = ["derive"] }
serde_json = "1.0.111"
sha2 = "0.10.8"