mirror of https://github.com/meilisearch/meilisearch.git
synced 2025-12-14 08:27:18 +00:00

Compare commits: comparing improve-in ... hackaton-r (2 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 42bbfebf70 | |
| | 5637978fe4 | |
2 changes: .github/workflows/benchmarks-manual.yml (vendored)

```diff
@@ -74,4 +74,4 @@ jobs:
           echo "${{ steps.file.outputs.basename }}.json has just been pushed."
           echo 'How to compare this benchmark with another one?'
           echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
-          echo " - Run the following command: ./benchmaks/scripts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
+          echo " - Run the following command: ./benchmaks/scipts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
```
8 changes: .github/workflows/publish-docker-images.yml (vendored)

```diff
@@ -57,10 +57,10 @@ jobs:
          echo "date=$commit_date" >> $GITHUB_OUTPUT

      - name: Set up QEMU
-       uses: docker/setup-qemu-action@v3
+       uses: docker/setup-qemu-action@v2

      - name: Set up Docker Buildx
-       uses: docker/setup-buildx-action@v3
+       uses: docker/setup-buildx-action@v2

      - name: Login to Docker Hub
        uses: docker/login-action@v2
@@ -70,7 +70,7 @@ jobs:

      - name: Docker meta
        id: meta
-       uses: docker/metadata-action@v5
+       uses: docker/metadata-action@v4
        with:
          images: getmeili/meilisearch
          # Prevent `latest` to be updated for each new tag pushed.
@@ -83,7 +83,7 @@ jobs:
            type=raw,value=latest,enable=${{ steps.check-tag-format.outputs.stable == 'true' && steps.check-tag-format.outputs.latest == 'true' }}

      - name: Build and push
-       uses: docker/build-push-action@v5
+       uses: docker/build-push-action@v4
        with:
          push: true
          platforms: linux/amd64,linux/arm64
```
Deleted file: the "Benchmarks (PR)" workflow.

````diff
@@ -1,81 +0,0 @@
-name: Benchmarks (PR)
-on: issue_comment
-permissions:
-  issues: write
-
-env:
-  GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }}
-
-jobs:
-  run-benchmarks-on-comment:
-    name: Run and upload benchmarks
-    runs-on: benchmarks
-    timeout-minutes: 4320 # 72h
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions-rs/toolchain@v1
-        with:
-          profile: minimal
-          toolchain: stable
-          override: true
-
-      - name: Check for Command
-        id: command
-        uses: xt0rted/slash-command-action@v2
-        with:
-          command: benchmark
-          reaction-type: "eyes"
-
-      # Set variables
-      - name: Set current branch name
-        shell: bash
-        run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT
-        id: current_branch
-      - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
-        shell: bash
-        run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT
-        id: normalized_current_branch
-      - name: Set shorter commit SHA
-        shell: bash
-        run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT
-        id: commit_sha
-      - name: Set file basename with format "dataset_branch_commitSHA"
-        shell: bash
-        run: echo "basename=$(echo ${{ steps.command.outputs.command-arguments }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> $GITHUB_OUTPUT
-        id: file
-
-      # Run benchmarks
-      - name: Run benchmarks - Dataset ${{ steps.command.outputs.command-arguments }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
-        run: |
-          cd benchmarks
-          cargo bench --bench ${{ steps.command.outputs.command-arguments }} -- --save-baseline ${{ steps.file.outputs.basename }}
-
-      # Generate critcmp files
-      - name: Install critcmp
-        uses: taiki-e/install-action@v2
-        with:
-          tool: critcmp
-      - name: Export cripcmp file
-        run: |
-          critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
-
-      # Upload benchmarks
-      - name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
-        uses: BetaHuhn/do-spaces-action@v2
-        with:
-          access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
-          secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
-          space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
-          space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
-          source: ${{ steps.file.outputs.basename }}.json
-          out_dir: critcmp_results
-
-      # Compute the diff of the benchmarks and send a message on the GitHub PR
-      - name: Compute and send a message in the PR
-        run: |
-          export base=git rev-parse $(git cherry main | head -n 1 | cut -c 3-)~ | cut -c -8
-          echo 'Here are your benchmarks diff 👊' >> body.txt
-          echo '```' >> body.txt
-          ./benchmaks/scipts/compare.sh $base ${{ steps.file.outputs.basename }}.json >> body.txt
-          echo '```' >> body.txt
-          gh pr comment ${GITHUB_REF#refs/heads/} --body-file body.txt
````
898 changes: Cargo.lock (generated). File diff suppressed because it is too large.
```diff
@@ -21,7 +21,7 @@ serde_json = { version = "1.0.95", features = ["preserve_order"] }
 criterion = { version = "0.5.1", features = ["html_reports"] }
 rand = "0.8.5"
 rand_chacha = "0.3.1"
-roaring = { path = "../../roaring-rs" }
+roaring = "0.10.1"

 [build-dependencies]
 anyhow = "1.0.70"
@@ -19,7 +19,7 @@ meilisearch-auth = { path = "../meilisearch-auth" }
 meilisearch-types = { path = "../meilisearch-types" }
 once_cell = "1.17.1"
 regex = "1.7.3"
-roaring = { path = "../../roaring-rs", features = ["serde"] }
+roaring = { version = "0.10.1", features = ["serde"] }
 serde = { version = "1.0.160", features = ["derive"] }
 serde_json = { version = "1.0.95", features = ["preserve_order"] }
 tar = "0.4.38"
@@ -23,7 +23,7 @@ meilisearch-auth = { path = "../meilisearch-auth" }
 meilisearch-types = { path = "../meilisearch-types" }
 page_size = "0.5.0"
 puffin = "0.16.0"
-roaring = { path = "../../roaring-rs", features = ["serde"] }
+roaring = { version = "0.10.1", features = ["serde"] }
 serde = { version = "1.0.160", features = ["derive"] }
 serde_json = { version = "1.0.95", features = ["preserve_order"] }
 synchronoise = "1.0.1"
@@ -17,7 +17,7 @@ hmac = "0.12.1"
 maplit = "1.0.2"
 meilisearch-types = { path = "../meilisearch-types" }
 rand = "0.8.5"
-roaring = { path = "../../roaring-rs", features = ["serde"] }
+roaring = { version = "0.10.1", features = ["serde"] }
 serde = { version = "1.0.160", features = ["derive"] }
 serde_json = { version = "1.0.95", features = ["preserve_order"] }
 sha2 = "0.10.6"
@@ -23,7 +23,7 @@ flate2 = "1.0.25"
 fst = "0.4.7"
 memmap2 = "0.7.1"
 milli = { path = "../milli" }
-roaring = { path = "../../roaring-rs", features = ["serde"] }
+roaring = { version = "0.10.1", features = ["serde"] }
 serde = { version = "1.0.160", features = ["derive"] }
 serde-cs = "0.2.4"
 serde_json = "1.0.95"
@@ -42,7 +42,7 @@ once_cell = "1.17.1"
 ordered-float = "3.6.0"
 rand_pcg = { version = "0.3.1", features = ["serde1"] }
 rayon = "1.7.0"
-roaring = { path = "../../roaring-rs" }
+roaring = "0.10.1"
 rstar = { version = "0.11.0", features = ["serde"] }
 serde = { version = "1.0.160", features = ["derive"] }
 serde_json = { version = "1.0.95", features = ["preserve_order"] }
```
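All six manifests drop the local `../../roaring-rs` checkout in favor of the published `roaring = "0.10.1"` crate. For orientation, a minimal round-trip against that published API looks like this (a standalone sketch, not code from the diff):

```rust
use roaring::RoaringBitmap;

fn main() -> std::io::Result<()> {
    // Build a bitmap, serialize it into the portable roaring format, read it back.
    let bitmap: RoaringBitmap = (0..1_000).collect();
    let mut buffer = Vec::with_capacity(bitmap.serialized_size());
    bitmap.serialize_into(&mut buffer)?;
    let decoded = RoaringBitmap::deserialize_from(&buffer[..])?;
    assert_eq!(bitmap, decoded);
    Ok(())
}
```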
```diff
@@ -1,5 +1,4 @@
 use std::borrow::Cow;
-use std::convert::TryInto;
 use std::io;
 use std::mem::size_of;

```
```diff
@@ -57,30 +56,22 @@ impl CboRoaringBitmapCodec {
     }

     /// Merge serialized CboRoaringBitmaps in a buffer.
-    /// The buffer must be empty before calling the function.
     ///
     /// if the merged values length is under the threshold, values are directly
     /// serialized in the buffer else a RoaringBitmap is created from the
     /// values and is serialized in the buffer.
     pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
-        debug_assert!(buffer.is_empty());

         let mut roaring = RoaringBitmap::new();
         let mut vec = Vec::new();

         for bytes in slices {
             if bytes.len() <= THRESHOLD * size_of::<u32>() {
-                debug_assert!(bytes.len() % size_of::<u32>() == 0);
-                vec.reserve(bytes.len() / size_of::<u32>());
-
-                for bytes in bytes.chunks_exact(size_of::<u32>()) {
-                    // unwrap can't happens since we ensured that everything
-                    // was a multiple of size_of<u32>.
-                    let v = u32::from_ne_bytes(bytes.try_into().unwrap());
-                    vec.push(v);
+                let mut reader = bytes.as_ref();
+                while let Ok(integer) = reader.read_u32::<NativeEndian>() {
+                    vec.push(integer);
                 }
             } else {
-                roaring.union_with_serialized_unchecked(bytes.as_ref())?;
+                roaring |= RoaringBitmap::deserialize_unchecked_from(bytes.as_ref())?;
             }
         }

```
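The rewritten small-value branch reads raw native-endian `u32`s through `byteorder` instead of hand-chunking with `chunks_exact`/`try_into`, and the large-value branch now unions via `RoaringBitmap::deserialize_unchecked_from`, which the published roaring 0.10 API provides, replacing the fork-only `union_with_serialized_unchecked`. A self-contained sketch of the same decoding scheme, where `threshold_bytes` and the function name are illustrative stand-ins for `THRESHOLD * size_of::<u32>()` and `merge_into`:

```rust
use byteorder::{NativeEndian, ReadBytesExt};
use roaring::RoaringBitmap;

// Union a batch of encoded values that are either raw native-endian u32 lists
// (small) or fully serialized RoaringBitmaps (large), decided by a size threshold.
fn union_serialized(slices: &[&[u8]], threshold_bytes: usize) -> std::io::Result<RoaringBitmap> {
    let mut union = RoaringBitmap::new();
    for bytes in slices {
        if bytes.len() <= threshold_bytes {
            // &[u8] implements Read, so the loop stops cleanly at the end of the slice.
            let mut reader = *bytes;
            while let Ok(integer) = reader.read_u32::<NativeEndian>() {
                union.insert(integer);
            }
        } else {
            union |= RoaringBitmap::deserialize_unchecked_from(*bytes)?;
        }
    }
    Ok(union)
}
```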
```diff
@@ -94,7 +85,7 @@ impl CboRoaringBitmapCodec {
             }
         } else {
             // We can unwrap safely because the vector is sorted upper.
-            let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
+            let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap();
             roaring.serialize_into(buffer)?;
         }
     } else {
```
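In the published crate, `from_sorted_iter` accepts any `IntoIterator<Item = u32>` and returns a `Result` instead of panicking on unsorted input, which is why the `.unwrap()` stays. A quick illustration:

```rust
use roaring::RoaringBitmap;

fn main() {
    // Values must be ascending; otherwise an error is returned instead of a panic.
    let bitmap = RoaringBitmap::from_sorted_iter(0..23).unwrap();
    assert_eq!(bitmap.len(), 23);
    assert!(RoaringBitmap::from_sorted_iter([3u32, 1, 2]).is_err());
}
```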
```diff
@@ -195,11 +186,8 @@ mod tests {

         let medium_data: Vec<_> =
             medium_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect();

-        // TODO: used for profiling purpose, get rids of it once the function is optimized
-        for _ in 0..100000 {
-            buffer.clear();
-            CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap();
-        }
+        buffer.clear();
+        CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap();

         let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap();
         let expected = RoaringBitmap::from_sorted_iter(0..23).unwrap();
```
```diff
@@ -1,6 +1,5 @@
 #![cfg_attr(all(test, fuzzing), feature(no_coverage))]
 #![allow(clippy::type_complexity)]
-#![feature(test)]

 #[cfg(test)]
 #[global_allocator]
```
```diff
@@ -12,7 +12,8 @@ use super::Word;
 use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
 use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
 use crate::{
-    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
+    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec,
+    RoaringBitmapLenCodec, SearchContext,
 };

 /// A cache storing pointers to values in the LMDB databases.
@@ -259,6 +260,7 @@ impl<'ctx> SearchContext<'ctx> {
         word2: Interned<String>,
         proximity: u8,
     ) -> Result<Option<RoaringBitmap>> {
+        unreachable!();
         DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
             self.txn,
             (proximity, word1, word2),
@@ -278,6 +280,7 @@ impl<'ctx> SearchContext<'ctx> {
         word2: Interned<String>,
         proximity: u8,
     ) -> Result<Option<u64>> {
+        unreachable!();
         DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>(
             self.txn,
             (proximity, word1, word2),
@@ -291,12 +294,23 @@ impl<'ctx> SearchContext<'ctx> {
         )
     }

+    pub fn get_db_word_docids_len(&mut self, word: Interned<String>) -> Result<Option<u64>> {
+        DatabaseCache::get_value::<_, _, RoaringBitmapLenCodec>(
+            self.txn,
+            word,
+            self.word_interner.get(word).as_str(),
+            &mut self.db_cache.word_docids,
+            self.index.word_docids.remap_data_type::<ByteSlice>(),
+        )
+    }
+
     pub fn get_db_word_prefix_pair_proximity_docids(
         &mut self,
         word1: Interned<String>,
         prefix2: Interned<String>,
         proximity: u8,
     ) -> Result<Option<RoaringBitmap>> {
+        unreachable!();
         DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
             self.txn,
             (proximity, word1, prefix2),
@@ -315,6 +329,7 @@ impl<'ctx> SearchContext<'ctx> {
         right: Interned<String>,
         proximity: u8,
     ) -> Result<Option<RoaringBitmap>> {
+        unreachable!();
         DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
             self.txn,
             (proximity, left_prefix, right),
```
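The new `get_db_word_docids_len` follows the same `DatabaseCache::get_value` pattern as its neighbors: fetch the raw LMDB bytes once, memoize them (hits and misses alike), and decode through a codec, here `RoaringBitmapLenCodec`, which decodes just the cardinality from the serialized form instead of materializing the whole bitmap. A simplified sketch of that memoization idea, with hypothetical types (the real signatures live in `db_cache.rs`):

```rust
use std::collections::HashMap;
use std::hash::Hash;

// Remember the raw bytes of every key already looked up, including misses,
// so each underlying database read happens at most once per search.
struct DatabaseCacheSketch<K> {
    entries: HashMap<K, Option<Vec<u8>>>,
}

impl<K: Hash + Eq + Clone> DatabaseCacheSketch<K> {
    fn get_or_fetch(
        &mut self,
        key: &K,
        fetch: impl FnOnce(&K) -> Option<Vec<u8>>,
    ) -> Option<&[u8]> {
        self.entries.entry(key.clone()).or_insert_with(|| fetch(key)).as_deref()
    }
}
```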
```diff
@@ -295,11 +295,11 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
                 ranking_rules.push(Box::new(Typo::new(None)));
             }
             crate::Criterion::Proximity => {
-                if proximity {
+                // if proximity {
                 continue;
-                }
-                proximity = true;
-                ranking_rules.push(Box::new(Proximity::new(None)));
+                // }
+                // proximity = true;
+                // ranking_rules.push(Box::new(Proximity::new(None)));
             }
             crate::Criterion::Attribute => {
                 if attribute {
```
```diff
@@ -265,11 +265,11 @@ pub fn partially_initialized_term_from_word(
 }

 fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Interned<Phrase>>> {
-    if let Some((l, r)) = split_best_frequency(ctx, word)? {
-        Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
-    } else {
+    // if let Some((l, r)) = split_best_frequency(ctx, word)? {
+    //     Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
+    // } else {
     Ok(None)
-    }
+    // }
 }

 impl Interned<QueryTerm> {
@@ -416,11 +416,20 @@ fn split_best_frequency(
         let left = ctx.word_interner.insert(left.to_owned());
         let right = ctx.word_interner.insert(right.to_owned());

-        if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? {
+        if let (Some(l_freq), Some(r_freq)) =
+            (ctx.get_db_word_docids_len(left)?, ctx.get_db_word_docids_len(right)?)
+        {
+            let frequency = l_freq.min(r_freq);
             if best.map_or(true, |(old, _, _)| frequency > old) {
                 best = Some((frequency, left, right));
             }
         }
+
+        // if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? {
+        //     if best.map_or(true, |(old, _, _)| frequency > old) {
+        //         best = Some((frequency, left, right));
+        //     }
+        // }
     }

     Ok(best.map(|(_, left, right)| (left, right)))
```
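With the word-pair-proximity database no longer consulted, the old exact pair frequency is replaced by an upper bound: a two-word split can occur in at most as many documents as its rarer word, so `min(l_freq, r_freq)` stands in as the score. A tiny sketch of the heuristic (the function name is illustrative, not from the diff):

```rust
// Approximate how often the pair (left, right) can co-occur: never more than
// the document frequency of the rarer of the two words.
fn estimated_pair_frequency(l_freq: Option<u64>, r_freq: Option<u64>) -> Option<u64> {
    match (l_freq, r_freq) {
        (Some(l), Some(r)) => Some(l.min(r)),
        _ => None,
    }
}

fn main() {
    assert_eq!(estimated_pair_frequency(Some(1_000), Some(42)), Some(42));
    assert_eq!(estimated_pair_frequency(None, Some(42)), None);
}
```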
```diff
@@ -82,41 +82,41 @@ pub fn located_query_terms_from_tokens(
                     position = position.wrapping_add(7);
                 }

-                phrase = 'phrase: {
-                    let phrase = phrase.take();
+                // phrase = 'phrase: {
+                //     let phrase = phrase.take();

-                    // If we have a hard separator inside a phrase, we immediately start a new phrase
-                    let phrase = if separator_kind == SeparatorKind::Hard {
-                        if let Some(phrase) = phrase {
-                            if let Some(located_query_term) = phrase.build(ctx) {
-                                located_terms.push(located_query_term)
-                            }
-                            Some(PhraseBuilder::empty())
-                        } else {
-                            None
-                        }
-                    } else {
-                        phrase
-                    };
+                //     // If we have a hard separator inside a phrase, we immediately start a new phrase
+                //     let phrase = if separator_kind == SeparatorKind::Hard {
+                //         if let Some(phrase) = phrase {
+                //             if let Some(located_query_term) = phrase.build(ctx) {
+                //                 located_terms.push(located_query_term)
+                //             }
+                //             Some(PhraseBuilder::empty())
+                //         } else {
+                //             None
+                //         }
+                //     } else {
+                //         phrase
+                //     };

-                    // We close and start a new phrase depending on the number of double quotes
-                    let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
-                    if quote_count == 0 {
-                        break 'phrase phrase;
-                    }
+                //     // We close and start a new phrase depending on the number of double quotes
+                //     let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
+                //     if quote_count == 0 {
+                //         break 'phrase phrase;
+                //     }

-                    // Consume the closing quote and the phrase
-                    if let Some(phrase) = phrase {
-                        // Per the check above, quote_count > 0
-                        quote_count -= 1;
-                        if let Some(located_query_term) = phrase.build(ctx) {
-                            located_terms.push(located_query_term)
-                        }
-                    }
+                //     // Consume the closing quote and the phrase
+                //     if let Some(phrase) = phrase {
+                //         // Per the check above, quote_count > 0
+                //         quote_count -= 1;
+                //         if let Some(located_query_term) = phrase.build(ctx) {
+                //             located_terms.push(located_query_term)
+                //         }
+                //     }

-                    // Start new phrase if the token ends with an opening quote
-                    (quote_count % 2 == 1).then_some(PhraseBuilder::empty())
-                };
+                //     // Start new phrase if the token ends with an opening quote
+                //     (quote_count % 2 == 1).then_some(PhraseBuilder::empty())
+                // };
             }
             _ => (),
         }
```
```diff
@@ -351,5 +351,5 @@ fn test_redacted() {
         .map(|scores| score_details::ScoreDetails::to_json_map(scores.iter()))
         .collect();
     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 4, 5, 22, 23, 13, 1, 3, 12, 21, 11, 20, 6, 7, 8, 9, 10, 14, 15]");
-    // insta::assert_json_snapshot!(document_scores_json);
+    insta::assert_json_snapshot!(document_scores_json);
 }
```
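For context on the snapshot macros used here: `insta::assert_snapshot!` with an `@"..."` literal is an inline snapshot (the expected value lives in the source and can be refreshed with insta's review tooling), while `assert_json_snapshot!` serializes the value to JSON first. A minimal usage sketch, assuming `insta = "1"` as a dev-dependency:

```rust
#[test]
fn inline_snapshot_example() {
    let documents_ids = vec![0, 2, 4];
    // The string after `@` is the stored snapshot; insta diffs the actual value against it.
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 4]");
}
```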
```diff
@@ -152,15 +152,15 @@ pub(crate) fn data_from_obkv_documents(
         });
     }

-    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
-        docid_word_positions_chunks.clone(),
-        indexer,
-        lmdb_writer_sx.clone(),
-        extract_word_pair_proximity_docids,
-        merge_cbo_roaring_bitmaps,
-        TypedChunk::WordPairProximityDocids,
-        "word-pair-proximity-docids",
-    );
+    // spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
+    //     docid_word_positions_chunks.clone(),
+    //     indexer,
+    //     lmdb_writer_sx.clone(),
+    //     extract_word_pair_proximity_docids,
+    //     merge_cbo_roaring_bitmaps,
+    //     TypedChunk::WordPairProximityDocids,
+    //     "word-pair-proximity-docids",
+    // );

     spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
         docid_word_positions_chunks.clone(),
```