From 161cb736ea501b1913c62917b622424c75a1f4a9 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 3 Jun 2025 10:37:29 +0200 Subject: [PATCH 1/2] Adapt tests to the Chinese word segmenter changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new Chinese segmenter is splitting words into smaller parts. The word `小化妆包` was previously segmented as `小 / 化妆包` and is now segmented as `小 / 化妆 / 包`, which changes the test results. --- crates/milli/src/update/index_documents/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 4acb78b9a..e0f85ca2d 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -1580,12 +1580,12 @@ mod tests { let rtxn = index.read_txn().unwrap(); // Only the first document should match. - let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len(); + let count = index.word_docids.get(&rtxn, "huàzhuāng").unwrap().unwrap().len(); assert_eq!(count, 1); // Only the second document should match. 
let count = index.word_docids.get(&rtxn, "bāo").unwrap().unwrap().len(); - assert_eq!(count, 1); + assert_eq!(count, 2); let mut search = crate::Search::new(&rtxn, &index); search.query("化妆包"); From cb7bb36080ba1216a724baaf0683107d5f82461e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 3 Jun 2025 10:48:41 +0200 Subject: [PATCH 2/2] update charabia v0.9.6 --- Cargo.lock | 224 +++++++++++++++------------------------- crates/milli/Cargo.toml | 2 +- 2 files changed, 84 insertions(+), 142 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 84e13a78c..0e9a072d3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -203,7 +203,7 @@ dependencies = [ "serde_json", "serde_urlencoded", "smallvec", - "socket2 0.5.5", + "socket2 0.5.10", "time", "url", ] @@ -982,9 +982,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.9.5" +version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4da3b398d57d5526189869b32ac0b4f7fb436f490f47a2a19685cee634df72d2" +checksum = "3b01abfd2db0eb8c4e7a47ccab5d1f67993736f4e76923ed9ae281c49070645d" dependencies = [ "aho-corasick", "csv", @@ -2538,9 +2538,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.8.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" [[package]] name = "httpdate" @@ -2550,9 +2550,9 @@ checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" [[package]] name = "hyper" -version = "1.4.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" dependencies = [ "bytes", "futures-channel", @@ -2584,23 +2584,28 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", - 
"webpki-roots", + "webpki-roots 0.26.1", ] [[package]] name = "hyper-util" -version = "0.1.10" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" +checksum = "b1c293b6b3d21eca78250dc7dbebd6b9210ec5530e038cbfe0661b5c47ab06e8" dependencies = [ + "base64 0.22.1", "bytes", "futures-channel", + "futures-core", "futures-util", "http 1.2.0", "http-body", "hyper", + "ipnet", + "libc", + "percent-encoding", "pin-project-lite", - "socket2 0.5.5", + "socket2 0.5.10", "tokio", "tower-service", "tracing", @@ -2878,9 +2883,9 @@ dependencies = [ [[package]] name = "ipnet" -version = "2.8.0" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "irg-kvariants" @@ -2893,6 +2898,16 @@ dependencies = [ "serde", ] +[[package]] +name = "iri-string" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "is-terminal" version = "0.4.13" @@ -3154,9 +3169,9 @@ dependencies = [ [[package]] name = "lindera" -version = "0.42.3" +version = "0.43.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fa3936dbcfc54b90a53da68ec8fe209656cfa691147f951944f48c61dcde317" +checksum = "f20720cb4206e87b6844b05c66b23301e7bb532718f200ff55bbbdfbce9b7f2b" dependencies = [ "anyhow", "bincode", @@ -3184,9 +3199,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict" -version = "0.42.3" +version = "0.43.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a4720c69e32b278614eefb8181e0ef78907fa115d947edaeaedb1150785b902" +checksum = 
"0f6ddd4aeaeaf1ce47ea5785bd6a273179d32df4af4b306d9b65a7a7f81a0e61" dependencies = [ "bincode", "byteorder", @@ -3197,9 +3212,9 @@ dependencies = [ [[package]] name = "lindera-dictionary" -version = "0.42.3" +version = "0.43.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b123ac54a74c9418616c96d0d7cf5eb8fbf372211c07032d1e174c94e40ff030" +checksum = "f9b5e417c4c6e001459e019b178f65f759be9c2cbf2d9bd803ec5d8ed0e62124" dependencies = [ "anyhow", "bincode", @@ -3225,9 +3240,9 @@ dependencies = [ [[package]] name = "lindera-ipadic" -version = "0.42.3" +version = "0.43.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71c3786e6cf65dd1e8537c3c35637f887289bf83687f6fbcac3a6679bfa33265" +checksum = "c2867975f1b92d1093ccbb52c5c1664a56dfbd27a2fece0166c765ad1f043f31" dependencies = [ "bincode", "byteorder", @@ -3238,9 +3253,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-neologd" -version = "0.42.3" +version = "0.43.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42646cc30bf8ceabf3db1154358329e1031f2af25ca1721ddba8ee3666881a08" +checksum = "c54c4c2d3fb8b380d0ace5ae97111ca444bcfa7721966f552117d57f07d8b3b1" dependencies = [ "bincode", "byteorder", @@ -3251,9 +3266,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic" -version = "0.42.3" +version = "0.43.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f94a00fc5931636c10d2e6af4cfa43fbf95f8a529caa45d10600f3cb2853c9" +checksum = "7f495e64f62deee60d9b71dbe3fd39b69b8688c9d591842f81f94e200eb4d81f" dependencies = [ "bincode", "byteorder", @@ -3264,9 +3279,9 @@ dependencies = [ [[package]] name = "lindera-unidic" -version = "0.42.3" +version = "0.43.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5933014ca145351d59bb50a6e509a53af1f89ceda687fe9efd6d534e6b59a27" +checksum = "e85ff97ce04c519fbca0f05504ea028761ccc456b1e84cf1e75fac57f9b3caf1" dependencies = [ "bincode", 
"byteorder", @@ -4549,7 +4564,7 @@ checksum = "9096629c45860fc7fb143e125eb826b5e721e10be3263160c7d60ca832cf8c46" dependencies = [ "libc", "once_cell", - "socket2 0.5.5", + "socket2 0.5.10", "tracing", "windows-sys 0.52.0", ] @@ -4754,9 +4769,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.15" +version = "0.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d19c46a6fdd48bc4dab94b6103fccc55d34c67cc0ad04653aad4ea2a07cd7bbb" +checksum = "a2f8e5513d63f2e5b386eb5106dc67eaf3f84e95258e210489136b8b92ad6119" dependencies = [ "base64 0.22.1", "bytes", @@ -4778,7 +4793,6 @@ dependencies = [ "pin-project-lite", "quinn", "rustls", - "rustls-pemfile", "rustls-pki-types", "serde", "serde_json", @@ -4788,14 +4802,14 @@ dependencies = [ "tokio-rustls", "tokio-util", "tower", + "tower-http", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots", - "windows-registry", + "webpki-roots 1.0.0", ] [[package]] @@ -5308,12 +5322,12 @@ dependencies = [ [[package]] name = "socket2" -version = "0.5.5" +version = "0.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9" +checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" dependencies = [ "libc", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -5735,9 +5749,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.45.0" +version = "1.45.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2513ca694ef9ede0fb23fe71a4ee4107cb102b9dc1930f6d0fd77aae068ae165" +checksum = "75ef51a33ef1da925cea3e4eb122833cb377c61439ca401b770f54902b806779" dependencies = [ "backtrace", "bytes", @@ -5746,7 +5760,7 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.5.5", + "socket2 0.5.10", "tokio-macros", "windows-sys 0.52.0", ] @@ -5846,6 +5860,24 @@ 
dependencies = [ "tower-service", ] +[[package]] +name = "tower-http" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc2d9e086a412a451384326f521c8123a99a466b329941a9403696bff9b0da2" +dependencies = [ + "bitflags 2.9.0", + "bytes", + "futures-util", + "http 1.2.0", + "http-body", + "iri-string", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + [[package]] name = "tower-layer" version = "0.3.3" @@ -6116,7 +6148,7 @@ dependencies = [ "serde_json", "socks", "url", - "webpki-roots", + "webpki-roots 0.26.1", ] [[package]] @@ -6411,6 +6443,15 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "webpki-roots" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2853738d1cc4f2da3a225c18ec6c3721abb31961096e9dbf5ab35fa88b19cfdb" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "whatlang" version = "0.16.4" @@ -6470,7 +6511,7 @@ checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" dependencies = [ "windows-implement", "windows-interface", - "windows-result 0.1.2", + "windows-result", "windows-targets 0.52.6", ] @@ -6496,23 +6537,6 @@ dependencies = [ "syn 2.0.87", ] -[[package]] -name = "windows-link" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38" - -[[package]] -name = "windows-registry" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4286ad90ddb45071efd1a66dfa43eb02dd0dfbae1545ad6cc3c51cf34d7e8ba3" -dependencies = [ - "windows-result 0.3.2", - "windows-strings", - "windows-targets 0.53.0", -] - [[package]] name = "windows-result" version = "0.1.2" @@ -6522,24 +6546,6 @@ dependencies = [ "windows-targets 0.52.6", ] -[[package]] -name = "windows-result" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "c64fd11a4fd95df68efcfee5f44a294fe71b8bc6a91993e2791938abcc712252" -dependencies = [ - "windows-link", -] - -[[package]] -name = "windows-strings" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87fa48cc5d406560701792be122a10132491cff9d0aeb23583cc2dcafc847319" -dependencies = [ - "windows-link", -] - [[package]] name = "windows-sys" version = "0.45.0" @@ -6615,29 +6621,13 @@ dependencies = [ "windows_aarch64_gnullvm 0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm 0.52.6", + "windows_i686_gnullvm", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] -[[package]] -name = "windows-targets" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1e4c7e8ceaaf9cb7d7507c974735728ab453b67ef8f18febdd7c11fe59dca8b" -dependencies = [ - "windows_aarch64_gnullvm 0.53.0", - "windows_aarch64_msvc 0.53.0", - "windows_i686_gnu 0.53.0", - "windows_i686_gnullvm 0.53.0", - "windows_i686_msvc 0.53.0", - "windows_x86_64_gnu 0.53.0", - "windows_x86_64_gnullvm 0.53.0", - "windows_x86_64_msvc 0.53.0", -] - [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -6656,12 +6646,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" - [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -6680,12 +6664,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" -[[package]] -name = "windows_aarch64_msvc" -version = "0.53.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" - [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -6704,24 +6682,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" -[[package]] -name = "windows_i686_gnu" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" - [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" -[[package]] -name = "windows_i686_gnullvm" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" - [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -6740,12 +6706,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" -[[package]] -name = "windows_i686_msvc" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" - [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -6764,12 +6724,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" -[[package]] -name = "windows_x86_64_gnu" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" - [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -6788,12 +6742,6 @@ version 
= "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" - [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -6812,12 +6760,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "windows_x86_64_msvc" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" - [[package]] name = "winnow" version = "0.5.40" diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 3ce02b444..08e0c4728 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -18,7 +18,7 @@ bincode = "1.3.3" bstr = "1.11.3" bytemuck = { version = "1.21.0", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -charabia = { version = "0.9.5", default-features = false } +charabia = { version = "0.9.6", default-features = false } concat-arrays = "0.1.2" convert_case = "0.6.0" crossbeam-channel = "0.5.15"