Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-10-31 07:56:28 +00:00)
	Merge #439
439: Optimize typo criterion r=Kerollmops a=MarinPostma

This PR implements a couple of optimizations for the typo criterion:

- Clamp the maximum number of typos on concatenated query words to 1: since a concatenated query word already counts as a typo, we clamp the number of typos allowed on it to 1. This is useful because we noticed that concatenated query words often introduced 2-typo words into queries that otherwise didn't allow any 2-typo words.
- Make a typo on the first letter count as 2 typos. This change is a big performance gain: by counting a typo on the first letter as 2, we drastically restrict the search space for 1 typo, and when we do reach 2 typos the search space is reduced as well, because we only consider (2 typos ∩ correct first letter) ∪ (wrong first letter ∩ 1 typo) instead of 2 typos anywhere in the word.

## benches

```
group    main    typo
-----    ----    ----
smol-songs.csv: asc + default/Notstandskomitee    2.51    5.8±0.01ms ? ?/sec    1.00    2.3±0.01ms ? ?/sec
smol-songs.csv: asc + default/charles    2.48    3.0±0.01ms ? ?/sec    1.00    1190.9±1.29µs ? ?/sec
smol-songs.csv: asc + default/charles mingus    5.56    10.8±0.01ms ? ?/sec    1.00    1935.3±1.00µs ? ?/sec
smol-songs.csv: asc + default/david    1.65    3.9±0.00ms ? ?/sec    1.00    2.4±0.01ms ? ?/sec
smol-songs.csv: asc + default/david bowie    3.34    12.5±0.02ms ? ?/sec    1.00    3.7±0.00ms ? ?/sec
smol-songs.csv: asc + default/john    1.00    1849.7±3.74µs ? ?/sec    1.01    1875.1±4.65µs ? ?/sec
smol-songs.csv: asc + default/marcus miller    4.32    15.7±0.01ms ? ?/sec    1.00    3.6±0.01ms ? ?/sec
smol-songs.csv: asc + default/michael jackson    3.31    12.5±0.01ms ? ?/sec    1.00    3.8±0.00ms ? ?/sec
smol-songs.csv: asc + default/tamo    1.05    565.4±0.86µs ? ?/sec    1.00    539.3±1.22µs ? ?/sec
smol-songs.csv: asc + default/thelonious monk    3.49    11.5±0.01ms ? ?/sec    1.00    3.3±0.00ms ? ?/sec
smol-songs.csv: asc/Notstandskomitee    2.59    5.6±0.02ms ? ?/sec    1.00    2.2±0.01ms ? ?/sec
smol-songs.csv: asc/charles    6.05    2.1±0.00ms ? ?/sec    1.00    347.8±0.60µs ? ?/sec
smol-songs.csv: asc/charles mingus    14.46    9.4±0.01ms ? ?/sec    1.00    649.2±0.97µs ? ?/sec
smol-songs.csv: asc/david    3.87    2.4±0.00ms ? ?/sec    1.00    618.2±0.69µs ? ?/sec
smol-songs.csv: asc/david bowie    10.14    9.8±0.01ms ? ?/sec    1.00    970.8±1.55µs ? ?/sec
smol-songs.csv: asc/john    1.00    546.5±1.10µs ? ?/sec    1.00    547.1±2.11µs ? ?/sec
smol-songs.csv: asc/marcus miller    11.45    10.4±0.06ms ? ?/sec    1.00    907.9±1.37µs ? ?/sec
smol-songs.csv: asc/michael jackson    10.56    9.7±0.01ms ? ?/sec    1.00    919.6±1.03µs ? ?/sec
smol-songs.csv: asc/tamo    1.03    43.3±0.18µs ? ?/sec    1.00    42.2±0.23µs ? ?/sec
smol-songs.csv: asc/thelonious monk    4.16    10.7±0.02ms ? ?/sec    1.00    2.6±0.00ms ? ?/sec
smol-songs.csv: basic filter: <=/Notstandskomitee    1.00    95.7±0.20µs ? ?/sec    1.15    109.6±10.40µs ? ?/sec
smol-songs.csv: basic filter: <=/charles    1.00    27.8±0.15µs ? ?/sec    1.01    27.9±0.18µs ? ?/sec
smol-songs.csv: basic filter: <=/charles mingus    1.72    119.2±0.67µs ? ?/sec    1.00    69.1±0.13µs ? ?/sec
smol-songs.csv: basic filter: <=/david    1.00    22.3±0.33µs ? ?/sec    1.05    23.4±0.19µs ? ?/sec
smol-songs.csv: basic filter: <=/david bowie    1.59    86.9±0.79µs ? ?/sec    1.00    54.5±0.31µs ? ?/sec
smol-songs.csv: basic filter: <=/john    1.00    17.9±0.06µs ? ?/sec    1.06    18.9±0.15µs ? ?/sec
smol-songs.csv: basic filter: <=/marcus miller    1.65    102.7±1.63µs ? ?/sec    1.00    62.3±0.18µs ? ?/sec
smol-songs.csv: basic filter: <=/michael jackson    1.76    128.2±1.85µs ? ?/sec    1.00    72.9±0.19µs ? ?/sec
smol-songs.csv: basic filter: <=/tamo    1.00    17.9±0.13µs ? ?/sec    1.05    18.7±0.20µs ? ?/sec
smol-songs.csv: basic filter: <=/thelonious monk    1.53    157.5±2.38µs ? ?/sec    1.00    102.8±0.88µs ? ?/sec
smol-songs.csv: basic filter: TO/Notstandskomitee    1.00    100.9±4.36µs ? ?/sec    1.04    105.0±8.25µs ? ?/sec
smol-songs.csv: basic filter: TO/charles    1.00    28.4±0.36µs ? ?/sec    1.03    29.4±0.33µs ? ?/sec
smol-songs.csv: basic filter: TO/charles mingus    1.71    118.1±1.08µs ? ?/sec    1.00    68.9±0.26µs ? ?/sec
smol-songs.csv: basic filter: TO/david    1.00    24.0±0.26µs ? ?/sec    1.03    24.6±0.43µs ? ?/sec
smol-songs.csv: basic filter: TO/david bowie    1.72    95.2±0.30µs ? ?/sec    1.00    55.2±0.14µs ? ?/sec
smol-songs.csv: basic filter: TO/john    1.00    18.8±0.09µs ? ?/sec    1.06    19.8±0.17µs ? ?/sec
smol-songs.csv: basic filter: TO/marcus miller    1.61    102.4±1.65µs ? ?/sec    1.00    63.4±0.24µs ? ?/sec
smol-songs.csv: basic filter: TO/michael jackson    1.77    132.1±1.41µs ? ?/sec    1.00    74.5±0.59µs ? ?/sec
smol-songs.csv: basic filter: TO/tamo    1.00    18.2±0.14µs ? ?/sec    1.05    19.2±0.46µs ? ?/sec
smol-songs.csv: basic filter: TO/thelonious monk    1.49    150.8±1.92µs ? ?/sec    1.00    101.3±0.44µs ? ?/sec
smol-songs.csv: basic placeholder/    1.00    27.3±0.07µs ? ?/sec    1.03    28.0±0.05µs ? ?/sec
smol-songs.csv: basic with quote/"Notstandskomitee"    1.00    122.4±0.17µs ? ?/sec    1.03    125.6±0.16µs ? ?/sec
smol-songs.csv: basic with quote/"charles"    1.00    88.8±0.30µs ? ?/sec    1.00    88.4±0.15µs ? ?/sec
smol-songs.csv: basic with quote/"charles" "mingus"    1.00    685.2±0.74µs ? ?/sec    1.01    689.4±6.07µs ? ?/sec
smol-songs.csv: basic with quote/"david"    1.00    161.6±0.42µs ? ?/sec    1.01    162.6±0.17µs ? ?/sec
smol-songs.csv: basic with quote/"david" "bowie"    1.00    731.7±0.73µs ? ?/sec    1.02    743.1±0.77µs ? ?/sec
smol-songs.csv: basic with quote/"john"    1.00    267.1±0.33µs ? ?/sec    1.01    270.9±0.33µs ? ?/sec
smol-songs.csv: basic with quote/"marcus" "miller"    1.00    138.7±0.31µs ? ?/sec    1.02    140.9±0.13µs ? ?/sec
smol-songs.csv: basic with quote/"michael" "jackson"    1.01    841.4±0.72µs ? ?/sec    1.00    833.8±0.92µs ? ?/sec
smol-songs.csv: basic with quote/"tamo"    1.01    189.2±0.26µs ? ?/sec    1.00    188.2±0.71µs ? ?/sec
smol-songs.csv: basic with quote/"thelonious" "monk"    1.00    1100.5±1.36µs ? ?/sec    1.01    1111.7±2.17µs ? ?/sec
smol-songs.csv: basic without quote/Notstandskomitee    3.40    7.9±0.02ms ? ?/sec    1.00    2.3±0.02ms ? ?/sec
smol-songs.csv: basic without quote/charles    2.57    494.4±0.89µs ? ?/sec    1.00    192.5±0.18µs ? ?/sec
smol-songs.csv: basic without quote/charles mingus    1.29    2.8±0.02ms ? ?/sec    1.00    2.1±0.01ms ? ?/sec
smol-songs.csv: basic without quote/david    1.95    623.8±0.90µs ? ?/sec    1.00    319.2±1.22µs ? ?/sec
smol-songs.csv: basic without quote/david bowie    1.12    5.9±0.00ms ? ?/sec    1.00    5.2±0.00ms ? ?/sec
smol-songs.csv: basic without quote/john    1.24    1340.9±2.25µs ? ?/sec    1.00    1084.7±7.76µs ? ?/sec
smol-songs.csv: basic without quote/marcus miller    7.97    14.6±0.01ms ? ?/sec    1.00    1826.0±6.84µs ? ?/sec
smol-songs.csv: basic without quote/michael jackson    1.19    3.9±0.00ms ? ?/sec    1.00    3.3±0.00ms ? ?/sec
smol-songs.csv: basic without quote/tamo    1.65    737.7±3.58µs ? ?/sec    1.00    446.7±0.51µs ? ?/sec
smol-songs.csv: basic without quote/thelonious monk    1.16    4.5±0.02ms ? ?/sec    1.00    3.9±0.04ms ? ?/sec
smol-songs.csv: big filter/Notstandskomitee    3.27    7.6±0.02ms ? ?/sec    1.00    2.3±0.01ms ? ?/sec
smol-songs.csv: big filter/charles    8.26    1957.5±1.37µs ? ?/sec    1.00    236.8±0.34µs ? ?/sec
smol-songs.csv: big filter/charles mingus    18.49    11.2±0.06ms ? ?/sec    1.00    607.7±3.03µs ? ?/sec
smol-songs.csv: big filter/david    3.78    2.4±0.00ms ? ?/sec    1.00    622.8±0.80µs ? ?/sec
smol-songs.csv: big filter/david bowie    9.00    12.0±0.01ms ? ?/sec    1.00    1336.0±3.17µs ? ?/sec
smol-songs.csv: big filter/john    1.00    554.2±0.95µs ? ?/sec    1.01    560.4±0.79µs ? ?/sec
smol-songs.csv: big filter/marcus miller    18.09    12.0±0.01ms ? ?/sec    1.00    664.7±0.60µs ? ?/sec
smol-songs.csv: big filter/michael jackson    8.43    12.0±0.01ms ? ?/sec    1.00    1421.6±1.37µs ? ?/sec
smol-songs.csv: big filter/tamo    1.00    86.3±0.14µs ? ?/sec    1.01    87.3±0.21µs ? ?/sec
smol-songs.csv: big filter/thelonious monk    5.55    14.3±0.02ms ? ?/sec    1.00    2.6±0.01ms ? ?/sec
smol-songs.csv: desc + default/Notstandskomitee    2.52    5.8±0.01ms ? ?/sec    1.00    2.3±0.01ms ? ?/sec
smol-songs.csv: desc + default/charles    3.04    2.7±0.01ms ? ?/sec    1.00    893.4±1.08µs ? ?/sec
smol-songs.csv: desc + default/charles mingus    6.77    10.3±0.01ms ? ?/sec    1.00    1520.8±1.90µs ? ?/sec
smol-songs.csv: desc + default/david    1.39    5.7±0.00ms ? ?/sec    1.00    4.1±0.00ms ? ?/sec
smol-songs.csv: desc + default/david bowie    2.34    15.8±0.02ms ? ?/sec    1.00    6.7±0.01ms ? ?/sec
smol-songs.csv: desc + default/john    1.00    2.5±0.00ms ? ?/sec    1.02    2.6±0.01ms ? ?/sec
smol-songs.csv: desc + default/marcus miller    5.06    14.5±0.02ms ? ?/sec    1.00    2.9±0.01ms ? ?/sec
smol-songs.csv: desc + default/michael jackson    2.64    14.1±0.05ms ? ?/sec    1.00    5.4±0.00ms ? ?/sec
smol-songs.csv: desc + default/tamo    1.00    567.0±0.65µs ? ?/sec    1.00    565.7±0.97µs ? ?/sec
smol-songs.csv: desc + default/thelonious monk    3.55    11.6±0.02ms ? ?/sec    1.00    3.3±0.00ms ? ?/sec
smol-songs.csv: desc/Notstandskomitee    2.58    5.6±0.02ms ? ?/sec    1.00    2.2±0.02ms ? ?/sec
smol-songs.csv: desc/charles    6.04    2.1±0.00ms ? ?/sec    1.00    348.1±0.57µs ? ?/sec
smol-songs.csv: desc/charles mingus    14.51    9.4±0.01ms ? ?/sec    1.00    646.7±0.99µs ? ?/sec
smol-songs.csv: desc/david    3.86    2.4±0.00ms ? ?/sec    1.00    620.7±2.46µs ? ?/sec
smol-songs.csv: desc/david bowie    10.10    9.8±0.01ms ? ?/sec    1.00    973.9±3.31µs ? ?/sec
smol-songs.csv: desc/john    1.00    545.5±0.78µs ? ?/sec    1.00    547.2±0.48µs ? ?/sec
smol-songs.csv: desc/marcus miller    11.39    10.3±0.01ms ? ?/sec    1.00    903.7±0.95µs ? ?/sec
smol-songs.csv: desc/michael jackson    10.51    9.7±0.01ms ? ?/sec    1.00    924.7±2.02µs ? ?/sec
smol-songs.csv: desc/tamo    1.01    43.2±0.33µs ? ?/sec    1.00    42.6±0.35µs ? ?/sec
smol-songs.csv: desc/thelonious monk    4.19    10.8±0.03ms ? ?/sec    1.00    2.6±0.00ms ? ?/sec
smol-songs.csv: prefix search/a    1.00    1008.7±1.00µs ? ?/sec    1.00    1005.5±0.91µs ? ?/sec
smol-songs.csv: prefix search/b    1.00    885.0±0.70µs ? ?/sec    1.01    890.6±1.11µs ? ?/sec
smol-songs.csv: prefix search/i    1.00    1051.8±1.25µs ? ?/sec    1.00    1056.6±4.12µs ? ?/sec
smol-songs.csv: prefix search/s    1.00    724.7±1.77µs ? ?/sec    1.00    721.6±0.59µs ? ?/sec
smol-songs.csv: prefix search/x    1.01    212.4±0.21µs ? ?/sec    1.00    210.9±0.38µs ? ?/sec
smol-songs.csv: proximity/7000 Danses Un Jour Dans Notre Vie    18.55    48.5±0.09ms ? ?/sec    1.00    2.6±0.03ms ? ?/sec
smol-songs.csv: proximity/The Disneyland Sing-Along Chorus    8.41    56.7±0.45ms ? ?/sec    1.00    6.7±0.05ms ? ?/sec
smol-songs.csv: proximity/Under Great Northern Lights    15.74    38.9±0.14ms ? ?/sec    1.00    2.5±0.00ms ? ?/sec
smol-songs.csv: proximity/black saint sinner lady    11.82    40.1±0.13ms ? ?/sec    1.00    3.4±0.02ms ? ?/sec
smol-songs.csv: proximity/les dangeureuses 1960    6.90    26.1±0.13ms ? ?/sec    1.00    3.8±0.04ms ? ?/sec
smol-songs.csv: typo/Arethla Franklin    14.93    5.8±0.01ms ? ?/sec    1.00    390.1±1.89µs ? ?/sec
smol-songs.csv: typo/Disnaylande    3.18    7.3±0.01ms ? ?/sec    1.00    2.3±0.00ms ? ?/sec
smol-songs.csv: typo/dire straights    5.55    15.2±0.02ms ? ?/sec    1.00    2.7±0.00ms ? ?/sec
smol-songs.csv: typo/fear of the duck    28.03    20.0±0.03ms ? ?/sec    1.00    713.3±1.54µs ? ?/sec
smol-songs.csv: typo/indochie    19.25    1851.4±2.38µs ? ?/sec    1.00    96.2±0.13µs ? ?/sec
smol-songs.csv: typo/indochien    14.66    1887.7±3.18µs ? ?/sec    1.00    128.8±0.18µs ? ?/sec
smol-songs.csv: typo/klub des loopers    37.73    18.0±0.02ms ? ?/sec    1.00    476.7±0.73µs ? ?/sec
smol-songs.csv: typo/michel depech    10.17    5.8±0.01ms ? ?/sec    1.00    565.8±1.16µs ? ?/sec
smol-songs.csv: typo/mongus    15.33    1897.4±3.44µs ? ?/sec    1.00    123.8±0.13µs ? ?/sec
smol-songs.csv: typo/stromal    14.63    1859.3±2.40µs ? ?/sec    1.00    127.1±0.29µs ? ?/sec
smol-songs.csv: typo/the white striper    10.83    9.4±0.01ms ? ?/sec    1.00    866.0±0.98µs ? ?/sec
smol-songs.csv: typo/thelonius monk    14.40    3.8±0.00ms ? ?/sec    1.00    261.5±1.30µs ? ?/sec
smol-songs.csv: words/7000 Danses / Le Baiser / je me trompe de mots    5.54    70.8±0.09ms ? ?/sec    1.00    12.8±0.03ms ? ?/sec
smol-songs.csv: words/Bring Your Daughter To The Slaughter but now this is not part of the title    3.48    119.8±0.14ms ? ?/sec    1.00    34.4±0.04ms ? ?/sec
smol-songs.csv: words/The Disneyland Children's Sing-Alone song    8.98    71.9±0.12ms ? ?/sec    1.00    8.0±0.01ms ? ?/sec
smol-songs.csv: words/les liaisons dangeureuses 1793    11.88    37.4±0.07ms ? ?/sec    1.00    3.1±0.01ms ? ?/sec
smol-songs.csv: words/seven nation mummy    22.86    23.4±0.04ms ? ?/sec    1.00    1024.8±1.57µs ? ?/sec
smol-songs.csv: words/the black saint and the sinner lady and the good doggo    2.76    124.4±0.15ms ? ?/sec    1.00    45.1±0.09ms ? ?/sec
smol-songs.csv: words/whathavenotnsuchforth and a good amount of words to pop to match the first one    2.52    107.0±0.23ms ? ?/sec    1.00    42.4±0.66ms ? ?/sec

group    main-wiki    typo-wiki
-----    ---------    ---------
smol-wiki-articles.csv: basic placeholder/    1.02    13.7±0.02µs ? ?/sec    1.00    13.4±0.03µs ? ?/sec
smol-wiki-articles.csv: basic with quote/"film"    1.02    409.8±0.67µs ? ?/sec    1.00    402.6±0.48µs ? ?/sec
smol-wiki-articles.csv: basic with quote/"france"    1.00    325.9±0.91µs ? ?/sec    1.00    326.4±0.49µs ? ?/sec
smol-wiki-articles.csv: basic with quote/"japan"    1.00    218.4±0.26µs ? ?/sec    1.01    220.5±0.20µs ? ?/sec
smol-wiki-articles.csv: basic with quote/"machine"    1.00    143.0±0.12µs ? ?/sec    1.04    148.8±0.21µs ? ?/sec
smol-wiki-articles.csv: basic with quote/"miles" "davis"    1.00    11.7±0.06ms ? ?/sec    1.00    11.8±0.01ms ? ?/sec
smol-wiki-articles.csv: basic with quote/"mingus"    1.00    4.4±0.03ms ? ?/sec    1.00    4.4±0.00ms ? ?/sec
smol-wiki-articles.csv: basic with quote/"rock" "and" "roll"    1.00    43.5±0.08ms ? ?/sec    1.01    43.8±0.06ms ? ?/sec
smol-wiki-articles.csv: basic with quote/"spain"    1.00    137.3±0.35µs ? ?/sec    1.05    144.4±0.23µs ? ?/sec
smol-wiki-articles.csv: basic without quote/film    1.00    125.3±0.30µs ? ?/sec    1.06    133.1±0.37µs ? ?/sec
smol-wiki-articles.csv: basic without quote/france    1.21    1782.6±1.65µs ? ?/sec    1.00    1477.0±1.39µs ? ?/sec
smol-wiki-articles.csv: basic without quote/japan    1.28    1363.9±0.80µs ? ?/sec    1.00    1064.3±1.79µs ? ?/sec
smol-wiki-articles.csv: basic without quote/machine    1.73    760.3±0.81µs ? ?/sec    1.00    439.6±0.75µs ? ?/sec
smol-wiki-articles.csv: basic without quote/miles davis    1.03    17.0±0.03ms ? ?/sec    1.00    16.5±0.02ms ? ?/sec
smol-wiki-articles.csv: basic without quote/mingus    1.07    5.3±0.01ms ? ?/sec    1.00    5.0±0.00ms ? ?/sec
smol-wiki-articles.csv: basic without quote/rock and roll    1.01    63.9±0.18ms ? ?/sec    1.00    63.0±0.07ms ? ?/sec
smol-wiki-articles.csv: basic without quote/spain    2.07    667.4±0.93µs ? ?/sec    1.00    322.8±0.29µs ? ?/sec
smol-wiki-articles.csv: prefix search/c    1.00    343.1±0.47µs ? ?/sec    1.00    344.0±0.34µs ? ?/sec
smol-wiki-articles.csv: prefix search/g    1.00    374.4±3.42µs ? ?/sec    1.00    374.1±0.44µs ? ?/sec
smol-wiki-articles.csv: prefix search/j    1.00    359.9±0.31µs ? ?/sec    1.00    361.2±0.79µs ? ?/sec
smol-wiki-articles.csv: prefix search/q    1.01    102.0±0.12µs ? ?/sec    1.00    101.4±0.32µs ? ?/sec
smol-wiki-articles.csv: prefix search/t    1.00    536.7±1.39µs ? ?/sec    1.00    534.3±0.84µs ? ?/sec
smol-wiki-articles.csv: prefix search/x    1.00    400.9±1.00µs ? ?/sec    1.00    399.5±0.45µs ? ?/sec
smol-wiki-articles.csv: proximity/april paris    3.86    14.4±0.01ms ? ?/sec    1.00    3.7±0.01ms ? ?/sec
smol-wiki-articles.csv: proximity/diesel engine    12.98    10.4±0.01ms ? ?/sec    1.00    803.5±1.13µs ? ?/sec
smol-wiki-articles.csv: proximity/herald sings    1.00    12.7±0.06ms ? ?/sec    5.29    67.1±0.09ms ? ?/sec
smol-wiki-articles.csv: proximity/tea two    6.48    1452.1±2.78µs ? ?/sec    1.00    224.1±0.38µs ? ?/sec
smol-wiki-articles.csv: typo/Disnaylande    3.89    8.5±0.01ms ? ?/sec    1.00    2.2±0.01ms ? ?/sec
smol-wiki-articles.csv: typo/aritmetric    3.78    10.3±0.01ms ? ?/sec    1.00    2.7±0.00ms ? ?/sec
smol-wiki-articles.csv: typo/linax    8.91    1426.7±0.97µs ? ?/sec    1.00    160.1±0.18µs ? ?/sec
smol-wiki-articles.csv: typo/migrosoft    7.48    1417.3±5.84µs ? ?/sec    1.00    189.5±0.88µs ? ?/sec
smol-wiki-articles.csv: typo/nympalidea    3.96    7.2±0.01ms ? ?/sec    1.00    1810.1±2.03µs ? ?/sec
smol-wiki-articles.csv: typo/phytogropher    3.71    7.2±0.01ms ? ?/sec    1.00    1934.3±6.51µs ? ?/sec
smol-wiki-articles.csv: typo/sisan    6.44    1497.2±1.38µs ? ?/sec    1.00    232.7±0.94µs ? ?/sec
smol-wiki-articles.csv: typo/the fronce    6.92    2.9±0.00ms ? ?/sec    1.00    418.0±1.76µs ? ?/sec
smol-wiki-articles.csv: words/Abraham machin    16.63    10.8±0.01ms ? ?/sec    1.00    649.7±1.08µs ? ?/sec
smol-wiki-articles.csv: words/Idaho Bellevue pizza    27.15    25.6±0.03ms ? ?/sec    1.00    944.2±5.07µs ? ?/sec
smol-wiki-articles.csv: words/Kameya Tokujirō mingus monk    26.87    40.7±0.05ms ? ?/sec    1.00    1515.3±2.73µs ? ?/sec
smol-wiki-articles.csv: words/Ulrich Hensel meilisearch milli    11.99    48.8±0.10ms ? ?/sec    1.00    4.1±0.02ms ? ?/sec
smol-wiki-articles.csv: words/the black saint and the sinner lady and the good doggo    4.90    110.0±0.15ms ? ?/sec    1.00    22.4±0.03ms ? ?/sec
```

Co-authored-by: mpostma <postma.marin@protonmail.com>
Co-authored-by: ad hoc <postma.marin@protonmail.com>
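To make the first-letter rule concrete, here is a small self-contained sketch of how candidate words are charged under it. This is illustrative only: `levenshtein` and `charged_typos` are hypothetical helpers written for this example, not milli functions; the actual implementation composes fst automata, as the diff below shows.

```rust
// Plain edit-distance code standing in for the Levenshtein DFAs used by milli.
fn levenshtein(a: &str, b: &str) -> usize {
    let (a, b): (Vec<char>, Vec<char>) = (a.chars().collect(), b.chars().collect());
    let mut prev: Vec<usize> = (0..=b.len()).collect();
    for (i, ca) in a.iter().enumerate() {
        let mut cur = vec![i + 1];
        for (j, cb) in b.iter().enumerate() {
            let cost = if ca == cb { 0 } else { 1 };
            cur.push((prev[j] + cost).min(prev[j + 1] + 1).min(cur[j] + 1));
        }
        prev = cur;
    }
    prev[b.len()]
}

/// Typos charged to `candidate` for the query word `query` under the new rule,
/// or `None` if the candidate is pruned from the search space.
fn charged_typos(query: &str, candidate: &str) -> Option<u8> {
    let d = levenshtein(query, candidate);
    let same_first_letter = query.chars().next() == candidate.chars().next();
    match (same_first_letter, d) {
        // correct first letter: up to 2 typos, charged at their real cost
        (true, d) if d <= 2 => Some(d as u8),
        // wrong first letter: only 1 edit is explored, and it is charged as 2 typos
        (false, d) if d <= 1 => Some(2),
        // wrong first letter with 2 edits is no longer reachable at all
        _ => None,
    }
}

fn main() {
    assert_eq!(charged_typos("charles", "charles"), Some(0));
    assert_eq!(charged_typos("charles", "chsrles"), Some(1)); // one typo inside the word
    assert_eq!(charged_typos("charles", "harles"), Some(2));  // first letter wrong: counted as 2
    assert_eq!(charged_typos("charles", "sharlez"), None);    // wrong first letter + another edit: pruned
}
```

The pruned case is where the speed-up comes from: the 2-typo budget is only ever explored for words that still share the query's first letter.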
milli/src/search/fst_utils.rs (new file, 187 lines)
@@ -0,0 +1,187 @@
/// This mod is necessary until https://github.com/BurntSushi/fst/pull/137 gets merged.
/// All credits for this code go to BurntSushi.
use fst::Automaton;

pub struct StartsWith<A>(pub A);

/// The `Automaton` state for `StartsWith<A>`.
pub struct StartsWithState<A: Automaton>(pub StartsWithStateKind<A>);

impl<A: Automaton> Clone for StartsWithState<A>
where
    A::State: Clone,
{
    fn clone(&self) -> Self {
        Self(self.0.clone())
    }
}

/// The inner state of a `StartsWithState<A>`.
pub enum StartsWithStateKind<A: Automaton> {
    /// Sink state that is reached when the automaton has matched the prefix.
    Done,
    /// State in which the automaton is while it hasn't matched the prefix.
    Running(A::State),
}

impl<A: Automaton> Clone for StartsWithStateKind<A>
where
    A::State: Clone,
{
    fn clone(&self) -> Self {
        match self {
            StartsWithStateKind::Done => StartsWithStateKind::Done,
            StartsWithStateKind::Running(inner) => StartsWithStateKind::Running(inner.clone()),
        }
    }
}

impl<A: Automaton> Automaton for StartsWith<A> {
    type State = StartsWithState<A>;

    fn start(&self) -> StartsWithState<A> {
        StartsWithState({
            let inner = self.0.start();
            if self.0.is_match(&inner) {
                StartsWithStateKind::Done
            } else {
                StartsWithStateKind::Running(inner)
            }
        })
    }
    fn is_match(&self, state: &StartsWithState<A>) -> bool {
        match state.0 {
            StartsWithStateKind::Done => true,
            StartsWithStateKind::Running(_) => false,
        }
    }
    fn can_match(&self, state: &StartsWithState<A>) -> bool {
        match state.0 {
            StartsWithStateKind::Done => true,
            StartsWithStateKind::Running(ref inner) => self.0.can_match(inner),
        }
    }
    fn will_always_match(&self, state: &StartsWithState<A>) -> bool {
        match state.0 {
            StartsWithStateKind::Done => true,
            StartsWithStateKind::Running(_) => false,
        }
    }
    fn accept(&self, state: &StartsWithState<A>, byte: u8) -> StartsWithState<A> {
        StartsWithState(match state.0 {
            StartsWithStateKind::Done => StartsWithStateKind::Done,
            StartsWithStateKind::Running(ref inner) => {
                let next_inner = self.0.accept(inner, byte);
                if self.0.is_match(&next_inner) {
                    StartsWithStateKind::Done
                } else {
                    StartsWithStateKind::Running(next_inner)
                }
            }
        })
    }
}
/// An automaton that matches when one of its component automata match.
#[derive(Clone, Debug)]
pub struct Union<A, B>(pub A, pub B);

/// The `Automaton` state for `Union<A, B>`.
pub struct UnionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);

impl<A: Automaton, B: Automaton> Clone for UnionState<A, B>
where
    A::State: Clone,
    B::State: Clone,
{
    fn clone(&self) -> Self {
        Self(self.0.clone(), self.1.clone())
    }
}

impl<A: Automaton, B: Automaton> Automaton for Union<A, B> {
    type State = UnionState<A, B>;
    fn start(&self) -> UnionState<A, B> {
        UnionState(self.0.start(), self.1.start())
    }
    fn is_match(&self, state: &UnionState<A, B>) -> bool {
        self.0.is_match(&state.0) || self.1.is_match(&state.1)
    }
    fn can_match(&self, state: &UnionState<A, B>) -> bool {
        self.0.can_match(&state.0) || self.1.can_match(&state.1)
    }
    fn will_always_match(&self, state: &UnionState<A, B>) -> bool {
        self.0.will_always_match(&state.0) || self.1.will_always_match(&state.1)
    }
    fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> {
        UnionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
    }
}
/// An automaton that matches when both of its component automata match.
#[derive(Clone, Debug)]
pub struct Intersection<A, B>(pub A, pub B);

/// The `Automaton` state for `Intersection<A, B>`.
pub struct IntersectionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);

impl<A: Automaton, B: Automaton> Clone for IntersectionState<A, B>
where
    A::State: Clone,
    B::State: Clone,
{
    fn clone(&self) -> Self {
        Self(self.0.clone(), self.1.clone())
    }
}

impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> {
    type State = IntersectionState<A, B>;
    fn start(&self) -> IntersectionState<A, B> {
        IntersectionState(self.0.start(), self.1.start())
    }
    fn is_match(&self, state: &IntersectionState<A, B>) -> bool {
        self.0.is_match(&state.0) && self.1.is_match(&state.1)
    }
    fn can_match(&self, state: &IntersectionState<A, B>) -> bool {
        self.0.can_match(&state.0) && self.1.can_match(&state.1)
    }
    fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool {
        self.0.will_always_match(&state.0) && self.1.will_always_match(&state.1)
    }
    fn accept(&self, state: &IntersectionState<A, B>, byte: u8) -> IntersectionState<A, B> {
        IntersectionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
    }
}
/// An automaton that matches exactly when the automaton it wraps does not.
#[derive(Clone, Debug)]
pub struct Complement<A>(pub A);

/// The `Automaton` state for `Complement<A>`.
pub struct ComplementState<A: Automaton>(pub A::State);

impl<A: Automaton> Clone for ComplementState<A>
where
    A::State: Clone,
{
    fn clone(&self) -> Self {
        Self(self.0.clone())
    }
}

impl<A: Automaton> Automaton for Complement<A> {
    type State = ComplementState<A>;
    fn start(&self) -> ComplementState<A> {
        ComplementState(self.0.start())
    }
    fn is_match(&self, state: &ComplementState<A>) -> bool {
        !self.0.is_match(&state.0)
    }
    fn can_match(&self, state: &ComplementState<A>) -> bool {
        !self.0.will_always_match(&state.0)
    }
    fn will_always_match(&self, state: &ComplementState<A>) -> bool {
        !self.0.can_match(&state.0)
    }
    fn accept(&self, state: &ComplementState<A>, byte: u8) -> ComplementState<A> {
        ComplementState(self.0.accept(&state.0, byte))
    }
}
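As a usage sketch (not part of the commit), the following test-style module shows how these combinators compose over an `fst::Set`. It assumes it sits inside `fst_utils.rs` itself, for example in a `#[cfg(test)]` module, since the module is private in this commit.

```rust
#[cfg(test)]
mod sketch {
    use fst::automaton::Str;
    use fst::{IntoStreamer, Set, Streamer};

    use super::{Complement, Intersection, StartsWith, Union};

    #[test]
    fn compose_automatons() -> Result<(), Box<dyn std::error::Error>> {
        // Keys must be inserted in lexicographic order.
        let set = Set::from_iter(vec!["chan", "charles", "david", "marcus"])?;

        // Everything that starts with "cha", or is exactly "david".
        let automaton = Union(StartsWith(Str::new("cha")), Str::new("david"));
        let mut stream = set.search(automaton).into_stream();
        let mut hits = Vec::new();
        while let Some(key) = stream.next() {
            hits.push(std::str::from_utf8(key)?.to_string());
        }
        assert_eq!(hits, ["chan", "charles", "david"]);

        // Everything that starts with "cha" but is not exactly "charles".
        let automaton =
            Intersection(StartsWith(Str::new("cha")), Complement(Str::new("charles")));
        let mut stream = set.search(automaton).into_stream();
        let mut hits = Vec::new();
        while let Some(key) = stream.next() {
            hits.push(std::str::from_utf8(key)?.to_string());
        }
        assert_eq!(hits, ["chan"]);

        Ok(())
    }
}
```

The same `Union(Intersection(dfa1, Complement(starts)), Intersection(dfa2, starts))` shape, with Levenshtein DFAs in place of the `Str` automata, is what `word_derivations` builds in the next file.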
@@ -7,7 +7,8 @@ use std::str::Utf8Error;
use std::time::Instant;

use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
use fst::{IntoStreamer, Streamer};
use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Streamer};
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use log::debug;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
@@ -15,6 +16,7 @@ use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;

pub use self::facet::{FacetDistribution, FacetNumberIter, Filter};
use self::fst_utils::{Complement, Intersection, StartsWith, Union};
pub use self::matching_words::MatchingWords;
use self::query_tree::QueryTreeBuilder;
use crate::error::UserError;
@@ -29,6 +31,7 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
mod criteria;
mod distinct;
mod facet;
mod fst_utils;
mod matching_words;
mod query_tree;

@@ -284,20 +287,66 @@ pub fn word_derivations<'c>(
        Entry::Occupied(entry) => Ok(entry.into_mut()),
        Entry::Vacant(entry) => {
            let mut derived_words = Vec::new();
            let dfa = build_dfa(word, max_typo, is_prefix);
            let mut stream = fst.search_with_state(&dfa).into_stream();
            if max_typo == 0 {
                if is_prefix {
                    let prefix = Str::new(word).starts_with();
                    let mut stream = fst.search(prefix).into_stream();

            while let Some((word, state)) = stream.next() {
                let word = std::str::from_utf8(word)?;
                let distance = dfa.distance(state);
                derived_words.push((word.to_string(), distance.to_u8()));
                    while let Some(word) = stream.next() {
                        let word = std::str::from_utf8(word)?;
                        derived_words.push((word.to_string(), 0));
                    }
                } else if fst.contains(word) {
                    derived_words.push((word.to_string(), 0));
                }
            } else {
                if max_typo == 1 {
                    let dfa = build_dfa(word, 1, is_prefix);
                    let starts = StartsWith(Str::new(get_first(word)));
                    let mut stream =
                        fst.search_with_state(Intersection(starts, &dfa)).into_stream();

                    while let Some((word, state)) = stream.next() {
                        let word = std::str::from_utf8(word)?;
                        let d = dfa.distance(state.1);
                        derived_words.push((word.to_string(), d.to_u8()));
                    }
                } else {
                    let starts = StartsWith(Str::new(get_first(word)));
                    let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
                    let second_dfa = build_dfa(word, 2, is_prefix);
                    let second = Intersection(&second_dfa, &starts);
                    let automaton = Union(first, &second);

                    let mut stream = fst.search_with_state(automaton).into_stream();

                    while let Some((found_word, state)) = stream.next() {
                        let found_word = std::str::from_utf8(found_word)?;
                        // in the case the typo is on the first letter, we know the number of typo
                        // is two
                        if get_first(found_word) != get_first(word) {
                            derived_words.push((word.to_string(), 2));
                        } else {
                            // Else, we know that it is the second dfa that matched and compute the
                            // correct distance
                            let d = second_dfa.distance((state.1).0);
                            derived_words.push((word.to_string(), d.to_u8()));
                        }
                    }
                }
            }

            Ok(entry.insert(derived_words))
        }
    }
}

fn get_first(s: &str) -> &str {
    match s.chars().next() {
        Some(c) => &s[..c.len_utf8()],
        None => panic!("unexpected empty query"),
    }
}

pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
    let lev = match typos {
        0 => &LEVDIST0,

@@ -260,12 +260,12 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O

/// Return the `QueryKind` of a word depending on `authorize_typos`
/// and the provided word length.
fn typos(word: String, authorize_typos: bool) -> QueryKind {
fn typos(word: String, authorize_typos: bool, max_typos: u8) -> QueryKind {
    if authorize_typos {
        match word.chars().count() {
            0..=4 => QueryKind::exact(word),
            5..=8 => QueryKind::tolerant(1, word),
            _ => QueryKind::tolerant(2, word),
            5..=8 => QueryKind::tolerant(1.min(max_typos), word),
            _ => QueryKind::tolerant(2.min(max_typos), word),
        }
    } else {
        QueryKind::exact(word)
@@ -316,8 +316,10 @@ fn create_query_tree(
                if let Some(child) = split_best_frequency(ctx, &word)? {
                    children.push(child);
                }
                children
                    .push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) }));
                children.push(Operation::Query(Query {
                    prefix,
                    kind: typos(word, authorize_typos, 2),
                }));
                Ok(Operation::or(false, children))
            }
            // create a CONSECUTIVE operation wrapping all word in the phrase
@@ -363,8 +365,10 @@ fn create_query_tree(
                                .collect();
                            let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
                            let concat = words.concat();
                            let query =
                                Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
                            let query = Query {
                                prefix: is_prefix,
                                kind: typos(concat, authorize_typos, 1),
                            };
                            operations.push(Operation::Query(query));
                            and_op_children.push(Operation::or(false, operations));
                        }
@@ -655,7 +659,7 @@ mod test {
                ]),
                Operation::Query(Query {
                    prefix: true,
                    kind: QueryKind::tolerant(2, "heyfriends".to_string()),
                    kind: QueryKind::tolerant(1, "heyfriends".to_string()),
                }),
            ],
        );
@@ -688,7 +692,7 @@ mod test {
                ]),
                Operation::Query(Query {
                    prefix: false,
                    kind: QueryKind::tolerant(2, "heyfriends".to_string()),
                    kind: QueryKind::tolerant(1, "heyfriends".to_string()),
                }),
            ],
        );
@@ -753,7 +757,7 @@ mod test {
                ]),
                Operation::Query(Query {
                    prefix: false,
                    kind: QueryKind::tolerant(2, "helloworld".to_string()),
                    kind: QueryKind::tolerant(1, "helloworld".to_string()),
                }),
            ],
        );
@@ -851,7 +855,7 @@ mod test {
                        ]),
                        Operation::Query(Query {
                            prefix: false,
                            kind: QueryKind::tolerant(2, "newyorkcity".to_string()),
                            kind: QueryKind::tolerant(1, "newyorkcity".to_string()),
                        }),
                    ],
                ),
@@ -925,7 +929,7 @@ mod test {
                ]),
                Operation::Query(Query {
                    prefix: false,
                    kind: QueryKind::tolerant(2, "wordsplitfish".to_string()),
                    kind: QueryKind::tolerant(1, "wordsplitfish".to_string()),
                }),
            ],
        );
@@ -1045,7 +1049,7 @@ mod test {
                        ]),
                        Operation::Query(Query {
                            prefix: false,
                            kind: QueryKind::tolerant(2, "heymyfriend".to_string()),
                            kind: QueryKind::tolerant(1, "heymyfriend".to_string()),
                        }),
                    ],
                ),

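For reference, a short test-style sketch of what the new `max_typos` argument changes for a concatenated query word. This is a hypothetical test, not part of the commit; it assumes it lives next to the existing `query_tree.rs` tests and that `QueryKind` keeps the `PartialEq`/`Debug` derives those tests rely on.

```rust
#[test]
fn concatenated_words_are_clamped_to_one_typo() {
    // "heyfriends" has 10 characters, so a regular query word gets the full 2-typo budget.
    assert_eq!(
        typos("heyfriends".to_string(), true, 2),
        QueryKind::tolerant(2, "heyfriends".to_string())
    );
    // At the concatenation call site the word is built with `max_typos = 1`,
    // which clamps the budget down to a single typo.
    assert_eq!(
        typos("heyfriends".to_string(), true, 1),
        QueryKind::tolerant(1, "heyfriends".to_string())
    );
    // Short words are unaffected: they were already exact-only.
    assert_eq!(
        typos("hey".to_string(), true, 1),
        QueryKind::exact("hey".to_string())
    );
}
```

This is exactly why the expected trees in the updated tests above switch from `tolerant(2, "heyfriends")` to `tolerant(1, "heyfriends")`.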
@@ -8,7 +8,7 @@
{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":""}
{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":""}
{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":""}
{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":""}
{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":""}
{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":""}
{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":""}
{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":""}

@@ -61,6 +61,7 @@ test_criterion!(
    vec![Attribute],
    vec![]
);
test_criterion!(typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Typo], vec![]);
test_criterion!(
    attribute_disallow_typo,
    DISALLOW_OPTIONAL_WORDS,
    ALLOW_TYPOS,
