mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-19 21:10:34 +00:00
Compare commits
13 Commits
embedding-
...
v1.8.4
Author | SHA1 | Date | |
---|---|---|---|
50c6854964 | |||
f0f02e6412 | |||
43bf3ff4e0 | |||
8fe6d31e01 | |||
9ec209bbf4 | |||
9375b7bba5 | |||
363a5cc590 | |||
7d69953267 | |||
7bd1b7ac43 | |||
1ff860e0a8 | |||
5d2b172e79 | |||
e64d0a206e | |||
6254c7cee1 |
2
.github/workflows/flaky-tests.yml
vendored
2
.github/workflows/flaky-tests.yml
vendored
@ -1,4 +1,6 @@
|
|||||||
name: Look for flaky tests
|
name: Look for flaky tests
|
||||||
|
env:
|
||||||
|
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||||
on:
|
on:
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
schedule:
|
schedule:
|
||||||
|
3
.github/workflows/fuzzer-indexing.yml
vendored
3
.github/workflows/fuzzer-indexing.yml
vendored
@ -1,5 +1,6 @@
|
|||||||
name: Run the indexing fuzzer
|
name: Run the indexing fuzzer
|
||||||
|
env:
|
||||||
|
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
|
2
.github/workflows/publish-apt-brew-pkg.yml
vendored
2
.github/workflows/publish-apt-brew-pkg.yml
vendored
@ -15,6 +15,8 @@ jobs:
|
|||||||
|
|
||||||
debian:
|
debian:
|
||||||
name: Publish debian packagge
|
name: Publish debian packagge
|
||||||
|
env:
|
||||||
|
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: check-version
|
needs: check-version
|
||||||
container:
|
container:
|
||||||
|
4
.github/workflows/publish-binaries.yml
vendored
4
.github/workflows/publish-binaries.yml
vendored
@ -35,6 +35,8 @@ jobs:
|
|||||||
publish-linux:
|
publish-linux:
|
||||||
name: Publish binary for Linux
|
name: Publish binary for Linux
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
env:
|
||||||
|
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||||
needs: check-version
|
needs: check-version
|
||||||
container:
|
container:
|
||||||
# Use ubuntu-18.04 to compile with glibc 2.27
|
# Use ubuntu-18.04 to compile with glibc 2.27
|
||||||
@ -132,6 +134,8 @@ jobs:
|
|||||||
name: Publish binary for aarch64
|
name: Publish binary for aarch64
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: check-version
|
needs: check-version
|
||||||
|
env:
|
||||||
|
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||||
container:
|
container:
|
||||||
# Use ubuntu-18.04 to compile with glibc 2.27
|
# Use ubuntu-18.04 to compile with glibc 2.27
|
||||||
image: ubuntu:18.04
|
image: ubuntu:18.04
|
||||||
|
8
.github/workflows/test-suite.yml
vendored
8
.github/workflows/test-suite.yml
vendored
@ -21,6 +21,8 @@ jobs:
|
|||||||
test-linux:
|
test-linux:
|
||||||
name: Tests on ubuntu-18.04
|
name: Tests on ubuntu-18.04
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
env:
|
||||||
|
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||||
container:
|
container:
|
||||||
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
|
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
|
||||||
image: ubuntu:18.04
|
image: ubuntu:18.04
|
||||||
@ -77,6 +79,8 @@ jobs:
|
|||||||
test-all-features:
|
test-all-features:
|
||||||
name: Tests almost all features
|
name: Tests almost all features
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
env:
|
||||||
|
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||||
container:
|
container:
|
||||||
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
|
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
|
||||||
image: ubuntu:18.04
|
image: ubuntu:18.04
|
||||||
@ -100,6 +104,8 @@ jobs:
|
|||||||
|
|
||||||
test-disabled-tokenization:
|
test-disabled-tokenization:
|
||||||
name: Test disabled tokenization
|
name: Test disabled tokenization
|
||||||
|
env:
|
||||||
|
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
container:
|
container:
|
||||||
image: ubuntu:18.04
|
image: ubuntu:18.04
|
||||||
@ -127,6 +133,8 @@ jobs:
|
|||||||
# We run tests in debug also, to make sure that the debug_assertions are hit
|
# We run tests in debug also, to make sure that the debug_assertions are hit
|
||||||
test-debug:
|
test-debug:
|
||||||
name: Run tests in debug
|
name: Run tests in debug
|
||||||
|
env:
|
||||||
|
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
container:
|
container:
|
||||||
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
|
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
|
||||||
|
34
Cargo.lock
generated
34
Cargo.lock
generated
@ -494,7 +494,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "benchmarks"
|
name = "benchmarks"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"bytes",
|
"bytes",
|
||||||
@ -639,7 +639,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "build-info"
|
name = "build-info"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"time",
|
"time",
|
||||||
@ -1539,7 +1539,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "dump"
|
name = "dump"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"big_s",
|
"big_s",
|
||||||
@ -1787,7 +1787,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "file-store"
|
name = "file-store"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"faux",
|
"faux",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
@ -1810,7 +1810,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "filter-parser"
|
name = "filter-parser"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"insta",
|
"insta",
|
||||||
"nom",
|
"nom",
|
||||||
@ -1830,7 +1830,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "flatten-serde-json"
|
name = "flatten-serde-json"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"criterion",
|
"criterion",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
@ -1948,7 +1948,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fuzzers"
|
name = "fuzzers"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arbitrary",
|
"arbitrary",
|
||||||
"clap",
|
"clap",
|
||||||
@ -2442,7 +2442,7 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "index-scheduler"
|
name = "index-scheduler"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"big_s",
|
"big_s",
|
||||||
@ -2638,7 +2638,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "json-depth-checker"
|
name = "json-depth-checker"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"criterion",
|
"criterion",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
@ -3275,7 +3275,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "meili-snap"
|
name = "meili-snap"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"insta",
|
"insta",
|
||||||
"md5",
|
"md5",
|
||||||
@ -3284,7 +3284,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "meilisearch"
|
name = "meilisearch"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"actix-cors",
|
"actix-cors",
|
||||||
"actix-http",
|
"actix-http",
|
||||||
@ -3377,7 +3377,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "meilisearch-auth"
|
name = "meilisearch-auth"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"base64 0.21.7",
|
"base64 0.21.7",
|
||||||
"enum-iterator",
|
"enum-iterator",
|
||||||
@ -3396,7 +3396,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "meilisearch-types"
|
name = "meilisearch-types"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"actix-web",
|
"actix-web",
|
||||||
"anyhow",
|
"anyhow",
|
||||||
@ -3426,7 +3426,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "meilitool"
|
name = "meilitool"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"clap",
|
"clap",
|
||||||
@ -3465,7 +3465,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "milli"
|
name = "milli"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"arroy",
|
"arroy",
|
||||||
"big_s",
|
"big_s",
|
||||||
@ -3906,7 +3906,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "permissive-json-pointer"
|
name = "permissive-json-pointer"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"big_s",
|
"big_s",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
@ -6074,7 +6074,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "xtask"
|
name = "xtask"
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"build-info",
|
"build-info",
|
||||||
|
@ -22,7 +22,7 @@ members = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[workspace.package]
|
[workspace.package]
|
||||||
version = "1.8.2"
|
version = "1.8.4"
|
||||||
authors = [
|
authors = [
|
||||||
"Quentin de Quelen <quentin@dequelen.me>",
|
"Quentin de Quelen <quentin@dequelen.me>",
|
||||||
"Clément Renault <clement@meilisearch.com>",
|
"Clément Renault <clement@meilisearch.com>",
|
||||||
|
@ -152,6 +152,7 @@ impl Settings<Unchecked> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Deserialize)]
|
#[derive(Debug, Clone, Deserialize)]
|
||||||
|
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
|
||||||
#[cfg_attr(test, derive(serde::Serialize))]
|
#[cfg_attr(test, derive(serde::Serialize))]
|
||||||
#[serde(deny_unknown_fields)]
|
#[serde(deny_unknown_fields)]
|
||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase")]
|
||||||
|
@ -182,6 +182,7 @@ impl Settings<Unchecked> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
|
||||||
#[derive(Debug, Clone, Deserialize)]
|
#[derive(Debug, Clone, Deserialize)]
|
||||||
#[cfg_attr(test, derive(serde::Serialize))]
|
#[cfg_attr(test, derive(serde::Serialize))]
|
||||||
#[serde(deny_unknown_fields)]
|
#[serde(deny_unknown_fields)]
|
||||||
|
@ -200,6 +200,7 @@ impl std::ops::Deref for IndexUid {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
#[cfg_attr(test, derive(serde::Serialize))]
|
#[cfg_attr(test, derive(serde::Serialize))]
|
||||||
#[cfg_attr(test, serde(rename_all = "camelCase"))]
|
#[cfg_attr(test, serde(rename_all = "camelCase"))]
|
||||||
|
@ -914,8 +914,34 @@ impl IndexScheduler {
|
|||||||
if self.must_stop_processing.get() {
|
if self.must_stop_processing.get() {
|
||||||
return Err(Error::AbortedTask);
|
return Err(Error::AbortedTask);
|
||||||
}
|
}
|
||||||
let (_id, doc) = ret?;
|
let (id, doc) = ret?;
|
||||||
let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
|
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
|
||||||
|
|
||||||
|
'inject_vectors: {
|
||||||
|
let embeddings = index.embeddings(&rtxn, id)?;
|
||||||
|
|
||||||
|
if embeddings.is_empty() {
|
||||||
|
break 'inject_vectors;
|
||||||
|
}
|
||||||
|
|
||||||
|
let vectors = document
|
||||||
|
.entry("_vectors".to_owned())
|
||||||
|
.or_insert(serde_json::Value::Object(Default::default()));
|
||||||
|
|
||||||
|
let serde_json::Value::Object(vectors) = vectors else {
|
||||||
|
break 'inject_vectors;
|
||||||
|
};
|
||||||
|
|
||||||
|
for (embedder_name, embeddings) in embeddings {
|
||||||
|
vectors.entry(embedder_name).or_insert_with(|| {
|
||||||
|
serde_json::json!({
|
||||||
|
"embeddings": embeddings,
|
||||||
|
"regenerate": true
|
||||||
|
})
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
index_dumper.push_document(&document)?;
|
index_dumper.push_document(&document)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -419,7 +419,41 @@ fn import_dump(
|
|||||||
let file = tempfile::tempfile()?;
|
let file = tempfile::tempfile()?;
|
||||||
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file));
|
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file));
|
||||||
for document in index_reader.documents()? {
|
for document in index_reader.documents()? {
|
||||||
builder.append_json_object(&document?)?;
|
let mut document = document?;
|
||||||
|
|
||||||
|
'remove_injected_vectors: {
|
||||||
|
let Some(vectors) = document.get_mut("_vectors") else {
|
||||||
|
break 'remove_injected_vectors;
|
||||||
|
};
|
||||||
|
|
||||||
|
let Some(vectors) = vectors.as_object_mut() else { break 'remove_injected_vectors };
|
||||||
|
|
||||||
|
vectors.retain(|_embedder, embedding_object| {
|
||||||
|
// don't touch values that aren't objects
|
||||||
|
let Some(embedding_object) = embedding_object.as_object() else {
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut has_regenerate_true = false;
|
||||||
|
for (field, value) in embedding_object {
|
||||||
|
match (field.as_str(), value) {
|
||||||
|
// detected regenerate : true
|
||||||
|
// if we don't have any superfluous field, we'll remove the entire entry
|
||||||
|
("regenerate", serde_json::Value::Bool(true)) => {
|
||||||
|
has_regenerate_true = true;
|
||||||
|
}
|
||||||
|
// ignore embeddings
|
||||||
|
("embeddings", _) => continue,
|
||||||
|
// any other field: immediately retain the entry
|
||||||
|
_ => return true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// retain the entry unless it has regenerate: true
|
||||||
|
!has_regenerate_true
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
builder.append_json_object(&document)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This flush the content of the batch builder.
|
// This flush the content of the batch builder.
|
||||||
|
@ -74,10 +74,10 @@ csv = "1.3.0"
|
|||||||
candle-core = { version = "0.4.1" }
|
candle-core = { version = "0.4.1" }
|
||||||
candle-transformers = { version = "0.4.1" }
|
candle-transformers = { version = "0.4.1" }
|
||||||
candle-nn = { version = "0.4.1" }
|
candle-nn = { version = "0.4.1" }
|
||||||
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default_features = false, features = [
|
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [
|
||||||
"onig",
|
"onig",
|
||||||
] }
|
] }
|
||||||
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [
|
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [
|
||||||
"online",
|
"online",
|
||||||
] }
|
] }
|
||||||
tiktoken-rs = "0.5.8"
|
tiktoken-rs = "0.5.8"
|
||||||
|
@ -22,7 +22,7 @@ use crate::heed_codec::{
|
|||||||
};
|
};
|
||||||
use crate::order_by_map::OrderByMap;
|
use crate::order_by_map::OrderByMap;
|
||||||
use crate::proximity::ProximityPrecision;
|
use crate::proximity::ProximityPrecision;
|
||||||
use crate::vector::EmbeddingConfig;
|
use crate::vector::{Embedding, EmbeddingConfig};
|
||||||
use crate::{
|
use crate::{
|
||||||
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
||||||
FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
|
FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
|
||||||
@ -1516,6 +1516,42 @@ impl Index {
|
|||||||
.unwrap_or_default())
|
.unwrap_or_default())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn embeddings(
|
||||||
|
&self,
|
||||||
|
rtxn: &RoTxn<'_>,
|
||||||
|
docid: DocumentId,
|
||||||
|
) -> Result<BTreeMap<String, Vec<Embedding>>> {
|
||||||
|
let mut res = BTreeMap::new();
|
||||||
|
for row in self.embedder_category_id.iter(rtxn)? {
|
||||||
|
let (embedder_name, embedder_id) = row?;
|
||||||
|
let embedder_id = (embedder_id as u16) << 8;
|
||||||
|
let mut embeddings = Vec::new();
|
||||||
|
'vectors: for i in 0..=u8::MAX {
|
||||||
|
let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy)
|
||||||
|
.map(Some)
|
||||||
|
.or_else(|e| match e {
|
||||||
|
arroy::Error::MissingMetadata => Ok(None),
|
||||||
|
e => Err(e),
|
||||||
|
})
|
||||||
|
.transpose();
|
||||||
|
|
||||||
|
let Some(reader) = reader else {
|
||||||
|
break 'vectors;
|
||||||
|
};
|
||||||
|
|
||||||
|
let embedding = reader?.item_vector(rtxn, docid)?;
|
||||||
|
if let Some(embedding) = embedding {
|
||||||
|
embeddings.push(embedding)
|
||||||
|
} else {
|
||||||
|
break 'vectors;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
res.insert(embedder_name.to_owned(), embeddings);
|
||||||
|
}
|
||||||
|
Ok(res)
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
|
pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
|
||||||
self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
|
self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
|
||||||
}
|
}
|
||||||
|
@ -22,7 +22,7 @@ pub enum SearchEvents {
|
|||||||
RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
|
RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
|
||||||
RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 },
|
RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 },
|
||||||
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
|
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
|
||||||
RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 },
|
RankingRuleEndIteration { ranking_rule_idx: usize },
|
||||||
ExtendResults { new: Vec<u32> },
|
ExtendResults { new: Vec<u32> },
|
||||||
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
|
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
|
||||||
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
|
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
|
||||||
@ -123,12 +123,9 @@ impl SearchLogger<QueryGraph> for VisualSearchLogger {
|
|||||||
&mut self,
|
&mut self,
|
||||||
ranking_rule_idx: usize,
|
ranking_rule_idx: usize,
|
||||||
_ranking_rule: &dyn RankingRule<QueryGraph>,
|
_ranking_rule: &dyn RankingRule<QueryGraph>,
|
||||||
universe: &RoaringBitmap,
|
_universe: &RoaringBitmap,
|
||||||
) {
|
) {
|
||||||
self.events.push(SearchEvents::RankingRuleEndIteration {
|
self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx });
|
||||||
ranking_rule_idx,
|
|
||||||
universe_len: universe.len(),
|
|
||||||
});
|
|
||||||
self.location.pop();
|
self.location.pop();
|
||||||
}
|
}
|
||||||
fn add_to_results(&mut self, docids: &[u32]) {
|
fn add_to_results(&mut self, docids: &[u32]) {
|
||||||
@ -326,7 +323,7 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
|
|||||||
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
||||||
self.write_skip_bucket(bucket_len)?;
|
self.write_skip_bucket(bucket_len)?;
|
||||||
}
|
}
|
||||||
SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe_len: _ } => {
|
SearchEvents::RankingRuleEndIteration { ranking_rule_idx } => {
|
||||||
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
||||||
self.write_end_iteration()?;
|
self.write_end_iteration()?;
|
||||||
}
|
}
|
||||||
|
@ -11,7 +11,7 @@ mod extract_word_position_docids;
|
|||||||
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::BufReader;
|
use std::io::BufReader;
|
||||||
use std::sync::Arc;
|
use std::sync::{Arc, OnceLock};
|
||||||
|
|
||||||
use crossbeam_channel::Sender;
|
use crossbeam_channel::Sender;
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
@ -31,7 +31,7 @@ use self::extract_word_position_docids::extract_word_position_docids;
|
|||||||
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
||||||
use super::{helpers, TypedChunk};
|
use super::{helpers, TypedChunk};
|
||||||
use crate::update::settings::InnerIndexSettingsDiff;
|
use crate::update::settings::InnerIndexSettingsDiff;
|
||||||
use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};
|
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||||
|
|
||||||
/// Extract data for each databases from obkv documents in parallel.
|
/// Extract data for each databases from obkv documents in parallel.
|
||||||
/// Send data in grenad file over provided Sender.
|
/// Send data in grenad file over provided Sender.
|
||||||
@ -213,6 +213,18 @@ fn run_extraction_task<FE, FS, M>(
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn request_threads() -> &'static ThreadPoolNoAbort {
|
||||||
|
static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
|
||||||
|
|
||||||
|
REQUEST_THREADS.get_or_init(|| {
|
||||||
|
ThreadPoolNoAbortBuilder::new()
|
||||||
|
.num_threads(crate::vector::REQUEST_PARALLELISM)
|
||||||
|
.thread_name(|index| format!("embedding-request-{index}"))
|
||||||
|
.build()
|
||||||
|
.unwrap()
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||||
/// - documents
|
/// - documents
|
||||||
fn send_original_documents_data(
|
fn send_original_documents_data(
|
||||||
@ -227,11 +239,6 @@ fn send_original_documents_data(
|
|||||||
let documents_chunk_cloned = original_documents_chunk.clone();
|
let documents_chunk_cloned = original_documents_chunk.clone();
|
||||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||||
|
|
||||||
let request_threads = ThreadPoolNoAbortBuilder::new()
|
|
||||||
.num_threads(crate::vector::REQUEST_PARALLELISM)
|
|
||||||
.thread_name(|index| format!("embedding-request-{index}"))
|
|
||||||
.build()?;
|
|
||||||
|
|
||||||
if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
|
if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
|
||||||
let settings_diff = settings_diff.clone();
|
let settings_diff = settings_diff.clone();
|
||||||
rayon::spawn(move || {
|
rayon::spawn(move || {
|
||||||
@ -249,7 +256,7 @@ fn send_original_documents_data(
|
|||||||
prompts,
|
prompts,
|
||||||
indexer,
|
indexer,
|
||||||
embedder.clone(),
|
embedder.clone(),
|
||||||
&request_threads,
|
request_threads(),
|
||||||
) {
|
) {
|
||||||
Ok(results) => Some(results),
|
Ok(results) => Some(results),
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
|
@ -48,7 +48,6 @@ pub struct Transform<'a, 'i> {
|
|||||||
fields_ids_map: FieldsIdsMap,
|
fields_ids_map: FieldsIdsMap,
|
||||||
|
|
||||||
indexer_settings: &'a IndexerConfig,
|
indexer_settings: &'a IndexerConfig,
|
||||||
pub autogenerate_docids: bool,
|
|
||||||
pub index_documents_method: IndexDocumentsMethod,
|
pub index_documents_method: IndexDocumentsMethod,
|
||||||
available_documents_ids: AvailableDocumentsIds,
|
available_documents_ids: AvailableDocumentsIds,
|
||||||
|
|
||||||
@ -102,7 +101,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
index: &'i Index,
|
index: &'i Index,
|
||||||
indexer_settings: &'a IndexerConfig,
|
indexer_settings: &'a IndexerConfig,
|
||||||
index_documents_method: IndexDocumentsMethod,
|
index_documents_method: IndexDocumentsMethod,
|
||||||
autogenerate_docids: bool,
|
_autogenerate_docids: bool,
|
||||||
) -> Result<Self> {
|
) -> Result<Self> {
|
||||||
// We must choose the appropriate merge function for when two or more documents
|
// We must choose the appropriate merge function for when two or more documents
|
||||||
// with the same user id must be merged or fully replaced in the same batch.
|
// with the same user id must be merged or fully replaced in the same batch.
|
||||||
@ -136,7 +135,6 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
index,
|
index,
|
||||||
fields_ids_map: index.fields_ids_map(wtxn)?,
|
fields_ids_map: index.fields_ids_map(wtxn)?,
|
||||||
indexer_settings,
|
indexer_settings,
|
||||||
autogenerate_docids,
|
|
||||||
available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
|
available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
|
||||||
original_sorter,
|
original_sorter,
|
||||||
flattened_sorter,
|
flattened_sorter,
|
||||||
|
@ -21,7 +21,7 @@ reqwest = { version = "0.11.23", features = [
|
|||||||
"stream",
|
"stream",
|
||||||
"json",
|
"json",
|
||||||
"rustls-tls",
|
"rustls-tls",
|
||||||
], default_features = false }
|
], default-features = false }
|
||||||
serde = { version = "1.0.195", features = ["derive"] }
|
serde = { version = "1.0.195", features = ["derive"] }
|
||||||
serde_json = "1.0.111"
|
serde_json = "1.0.111"
|
||||||
sha2 = "0.10.8"
|
sha2 = "0.10.8"
|
||||||
|
Reference in New Issue
Block a user