Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-10-31 07:56:28 +00:00)
	Merge #439
439: Optimize typo criterion r=Kerollmops a=MarinPostma

This PR implements a couple of optimizations for the typo criterion:

- Clamp the maximum number of typos on concatenated query words to 1: since a concatenated query word already counts as a typo, we clamp the number of typos allowed on it to 1. This is useful because we noticed that concatenated query words often introduced 2-typo words into queries that otherwise didn't allow any 2-typo words.
- Make a typo on the first letter count as 2 typos. This change is a big performance gain: by counting a typo on the first letter as 2, we drastically restrict the search space for 1 typo, and when we do reach 2 typos the search space is reduced as well, because we only consider (2 typos ∩ correct first letter) ∪ (wrong first letter ∩ 1 typo) instead of 2 typos anywhere in the word.

## benches

```
group    main    typo
-----    ----    ----
smol-songs.csv: asc + default/Notstandskomitee    2.51    5.8±0.01ms ? ?/sec    1.00    2.3±0.01ms ? ?/sec
smol-songs.csv: asc + default/charles    2.48    3.0±0.01ms ? ?/sec    1.00    1190.9±1.29µs ? ?/sec
smol-songs.csv: asc + default/charles mingus    5.56    10.8±0.01ms ? ?/sec    1.00    1935.3±1.00µs ? ?/sec
smol-songs.csv: asc + default/david    1.65    3.9±0.00ms ? ?/sec    1.00    2.4±0.01ms ? ?/sec
smol-songs.csv: asc + default/david bowie    3.34    12.5±0.02ms ? ?/sec    1.00    3.7±0.00ms ? ?/sec
smol-songs.csv: asc + default/john    1.00    1849.7±3.74µs ? ?/sec    1.01    1875.1±4.65µs ? ?/sec
smol-songs.csv: asc + default/marcus miller    4.32    15.7±0.01ms ? ?/sec    1.00    3.6±0.01ms ? ?/sec
smol-songs.csv: asc + default/michael jackson    3.31    12.5±0.01ms ? ?/sec    1.00    3.8±0.00ms ? ?/sec
smol-songs.csv: asc + default/tamo    1.05    565.4±0.86µs ? ?/sec    1.00    539.3±1.22µs ? ?/sec
smol-songs.csv: asc + default/thelonious monk    3.49    11.5±0.01ms ? ?/sec    1.00    3.3±0.00ms ? ?/sec
smol-songs.csv: asc/Notstandskomitee    2.59    5.6±0.02ms ? ?/sec    1.00    2.2±0.01ms ? ?/sec
smol-songs.csv: asc/charles    6.05    2.1±0.00ms ? ?/sec    1.00    347.8±0.60µs ? ?/sec
smol-songs.csv: asc/charles mingus    14.46    9.4±0.01ms ? ?/sec    1.00    649.2±0.97µs ? ?/sec
smol-songs.csv: asc/david    3.87    2.4±0.00ms ? ?/sec    1.00    618.2±0.69µs ? ?/sec
smol-songs.csv: asc/david bowie    10.14    9.8±0.01ms ? ?/sec    1.00    970.8±1.55µs ? ?/sec
smol-songs.csv: asc/john    1.00    546.5±1.10µs ? ?/sec    1.00    547.1±2.11µs ? ?/sec
smol-songs.csv: asc/marcus miller    11.45    10.4±0.06ms ? ?/sec    1.00    907.9±1.37µs ? ?/sec
smol-songs.csv: asc/michael jackson    10.56    9.7±0.01ms ? ?/sec    1.00    919.6±1.03µs ? ?/sec
smol-songs.csv: asc/tamo    1.03    43.3±0.18µs ? ?/sec    1.00    42.2±0.23µs ? ?/sec
smol-songs.csv: asc/thelonious monk    4.16    10.7±0.02ms ? ?/sec    1.00    2.6±0.00ms ? ?/sec
smol-songs.csv: basic filter: <=/Notstandskomitee    1.00    95.7±0.20µs ? ?/sec    1.15    109.6±10.40µs ? ?/sec
smol-songs.csv: basic filter: <=/charles    1.00    27.8±0.15µs ? ?/sec    1.01    27.9±0.18µs ? ?/sec
smol-songs.csv: basic filter: <=/charles mingus    1.72    119.2±0.67µs ? ?/sec    1.00    69.1±0.13µs ? ?/sec
smol-songs.csv: basic filter: <=/david    1.00    22.3±0.33µs ? ?/sec    1.05    23.4±0.19µs ? ?/sec
smol-songs.csv: basic filter: <=/david bowie    1.59    86.9±0.79µs ? ?/sec    1.00    54.5±0.31µs ? ?/sec
smol-songs.csv: basic filter: <=/john    1.00    17.9±0.06µs ? ?/sec    1.06    18.9±0.15µs ? ?/sec
smol-songs.csv: basic filter: <=/marcus miller    1.65    102.7±1.63µs ? ?/sec    1.00    62.3±0.18µs ? ?/sec
smol-songs.csv: basic filter: <=/michael jackson    1.76    128.2±1.85µs ? ?/sec    1.00    72.9±0.19µs ? ?/sec
smol-songs.csv: basic filter: <=/tamo    1.00    17.9±0.13µs ? ?/sec    1.05    18.7±0.20µs ? ?/sec
smol-songs.csv: basic filter: <=/thelonious monk    1.53    157.5±2.38µs ? ?/sec    1.00    102.8±0.88µs ? ?/sec
smol-songs.csv: basic filter: TO/Notstandskomitee    1.00    100.9±4.36µs ? ?/sec    1.04    105.0±8.25µs ? ?/sec
smol-songs.csv: basic filter: TO/charles    1.00    28.4±0.36µs ? ?/sec    1.03    29.4±0.33µs ? ?/sec
smol-songs.csv: basic filter: TO/charles mingus    1.71    118.1±1.08µs ? ?/sec    1.00    68.9±0.26µs ? ?/sec
smol-songs.csv: basic filter: TO/david    1.00    24.0±0.26µs ? ?/sec    1.03    24.6±0.43µs ? ?/sec
smol-songs.csv: basic filter: TO/david bowie    1.72    95.2±0.30µs ? ?/sec    1.00    55.2±0.14µs ? ?/sec
smol-songs.csv: basic filter: TO/john    1.00    18.8±0.09µs ? ?/sec    1.06    19.8±0.17µs ? ?/sec
smol-songs.csv: basic filter: TO/marcus miller    1.61    102.4±1.65µs ? ?/sec    1.00    63.4±0.24µs ? ?/sec
smol-songs.csv: basic filter: TO/michael jackson    1.77    132.1±1.41µs ? ?/sec    1.00    74.5±0.59µs ? ?/sec
smol-songs.csv: basic filter: TO/tamo    1.00    18.2±0.14µs ? ?/sec    1.05    19.2±0.46µs ? ?/sec
smol-songs.csv: basic filter: TO/thelonious monk    1.49    150.8±1.92µs ? ?/sec    1.00    101.3±0.44µs ? ?/sec
smol-songs.csv: basic placeholder/    1.00    27.3±0.07µs ? ?/sec    1.03    28.0±0.05µs ? ?/sec
smol-songs.csv: basic with quote/"Notstandskomitee"    1.00    122.4±0.17µs ? ?/sec    1.03    125.6±0.16µs ? ?/sec
smol-songs.csv: basic with quote/"charles"    1.00    88.8±0.30µs ? ?/sec    1.00    88.4±0.15µs ? ?/sec
smol-songs.csv: basic with quote/"charles" "mingus"    1.00    685.2±0.74µs ? ?/sec    1.01    689.4±6.07µs ? ?/sec
smol-songs.csv: basic with quote/"david"    1.00    161.6±0.42µs ? ?/sec    1.01    162.6±0.17µs ? ?/sec
smol-songs.csv: basic with quote/"david" "bowie"    1.00    731.7±0.73µs ? ?/sec    1.02    743.1±0.77µs ? ?/sec
smol-songs.csv: basic with quote/"john"    1.00    267.1±0.33µs ? ?/sec    1.01    270.9±0.33µs ? ?/sec
smol-songs.csv: basic with quote/"marcus" "miller"    1.00    138.7±0.31µs ? ?/sec    1.02    140.9±0.13µs ? ?/sec
smol-songs.csv: basic with quote/"michael" "jackson"    1.01    841.4±0.72µs ? ?/sec    1.00    833.8±0.92µs ? ?/sec
smol-songs.csv: basic with quote/"tamo"    1.01    189.2±0.26µs ? ?/sec    1.00    188.2±0.71µs ? ?/sec
smol-songs.csv: basic with quote/"thelonious" "monk"    1.00    1100.5±1.36µs ? ?/sec    1.01    1111.7±2.17µs ? ?/sec
smol-songs.csv: basic without quote/Notstandskomitee    3.40    7.9±0.02ms ? ?/sec    1.00    2.3±0.02ms ? ?/sec
smol-songs.csv: basic without quote/charles    2.57    494.4±0.89µs ? ?/sec    1.00    192.5±0.18µs ? ?/sec
smol-songs.csv: basic without quote/charles mingus    1.29    2.8±0.02ms ? ?/sec    1.00    2.1±0.01ms ? ?/sec
smol-songs.csv: basic without quote/david    1.95    623.8±0.90µs ? ?/sec    1.00    319.2±1.22µs ? ?/sec
smol-songs.csv: basic without quote/david bowie    1.12    5.9±0.00ms ? ?/sec    1.00    5.2±0.00ms ? ?/sec
smol-songs.csv: basic without quote/john    1.24    1340.9±2.25µs ? ?/sec    1.00    1084.7±7.76µs ? ?/sec
smol-songs.csv: basic without quote/marcus miller    7.97    14.6±0.01ms ? ?/sec    1.00    1826.0±6.84µs ? ?/sec
smol-songs.csv: basic without quote/michael jackson    1.19    3.9±0.00ms ? ?/sec    1.00    3.3±0.00ms ? ?/sec
smol-songs.csv: basic without quote/tamo    1.65    737.7±3.58µs ? ?/sec    1.00    446.7±0.51µs ? ?/sec
smol-songs.csv: basic without quote/thelonious monk    1.16    4.5±0.02ms ? ?/sec    1.00    3.9±0.04ms ? ?/sec
smol-songs.csv: big filter/Notstandskomitee    3.27    7.6±0.02ms ? ?/sec    1.00    2.3±0.01ms ? ?/sec
smol-songs.csv: big filter/charles    8.26    1957.5±1.37µs ? ?/sec    1.00    236.8±0.34µs ? ?/sec
smol-songs.csv: big filter/charles mingus    18.49    11.2±0.06ms ? ?/sec    1.00    607.7±3.03µs ? ?/sec
smol-songs.csv: big filter/david    3.78    2.4±0.00ms ? ?/sec    1.00    622.8±0.80µs ? ?/sec
smol-songs.csv: big filter/david bowie    9.00    12.0±0.01ms ? ?/sec    1.00    1336.0±3.17µs ? ?/sec
smol-songs.csv: big filter/john    1.00    554.2±0.95µs ? ?/sec    1.01    560.4±0.79µs ? ?/sec
smol-songs.csv: big filter/marcus miller    18.09    12.0±0.01ms ? ?/sec    1.00    664.7±0.60µs ? ?/sec
smol-songs.csv: big filter/michael jackson    8.43    12.0±0.01ms ? ?/sec    1.00    1421.6±1.37µs ? ?/sec
smol-songs.csv: big filter/tamo    1.00    86.3±0.14µs ? ?/sec    1.01    87.3±0.21µs ? ?/sec
smol-songs.csv: big filter/thelonious monk    5.55    14.3±0.02ms ? ?/sec    1.00    2.6±0.01ms ? ?/sec
smol-songs.csv: desc + default/Notstandskomitee    2.52    5.8±0.01ms ? ?/sec    1.00    2.3±0.01ms ? ?/sec
smol-songs.csv: desc + default/charles    3.04    2.7±0.01ms ? ?/sec    1.00    893.4±1.08µs ? ?/sec
smol-songs.csv: desc + default/charles mingus    6.77    10.3±0.01ms ? ?/sec    1.00    1520.8±1.90µs ? ?/sec
smol-songs.csv: desc + default/david    1.39    5.7±0.00ms ? ?/sec    1.00    4.1±0.00ms ? ?/sec
smol-songs.csv: desc + default/david bowie    2.34    15.8±0.02ms ? ?/sec    1.00    6.7±0.01ms ? ?/sec
smol-songs.csv: desc + default/john    1.00    2.5±0.00ms ? ?/sec    1.02    2.6±0.01ms ? ?/sec
smol-songs.csv: desc + default/marcus miller    5.06    14.5±0.02ms ? ?/sec    1.00    2.9±0.01ms ? ?/sec
smol-songs.csv: desc + default/michael jackson    2.64    14.1±0.05ms ? ?/sec    1.00    5.4±0.00ms ? ?/sec
smol-songs.csv: desc + default/tamo    1.00    567.0±0.65µs ? ?/sec    1.00    565.7±0.97µs ? ?/sec
smol-songs.csv: desc + default/thelonious monk    3.55    11.6±0.02ms ? ?/sec    1.00    3.3±0.00ms ? ?/sec
smol-songs.csv: desc/Notstandskomitee    2.58    5.6±0.02ms ? ?/sec    1.00    2.2±0.02ms ? ?/sec
smol-songs.csv: desc/charles    6.04    2.1±0.00ms ? ?/sec    1.00    348.1±0.57µs ? ?/sec
smol-songs.csv: desc/charles mingus    14.51    9.4±0.01ms ? ?/sec    1.00    646.7±0.99µs ? ?/sec
smol-songs.csv: desc/david    3.86    2.4±0.00ms ? ?/sec    1.00    620.7±2.46µs ? ?/sec
smol-songs.csv: desc/david bowie    10.10    9.8±0.01ms ? ?/sec    1.00    973.9±3.31µs ? ?/sec
smol-songs.csv: desc/john    1.00    545.5±0.78µs ? ?/sec    1.00    547.2±0.48µs ? ?/sec
smol-songs.csv: desc/marcus miller    11.39    10.3±0.01ms ? ?/sec    1.00    903.7±0.95µs ? ?/sec
smol-songs.csv: desc/michael jackson    10.51    9.7±0.01ms ? ?/sec    1.00    924.7±2.02µs ? ?/sec
smol-songs.csv: desc/tamo    1.01    43.2±0.33µs ? ?/sec    1.00    42.6±0.35µs ? ?/sec
smol-songs.csv: desc/thelonious monk    4.19    10.8±0.03ms ? ?/sec    1.00    2.6±0.00ms ? ?/sec
smol-songs.csv: prefix search/a    1.00    1008.7±1.00µs ? ?/sec    1.00    1005.5±0.91µs ? ?/sec
smol-songs.csv: prefix search/b    1.00    885.0±0.70µs ? ?/sec    1.01    890.6±1.11µs ? ?/sec
smol-songs.csv: prefix search/i    1.00    1051.8±1.25µs ? ?/sec    1.00    1056.6±4.12µs ? ?/sec
smol-songs.csv: prefix search/s    1.00    724.7±1.77µs ? ?/sec    1.00    721.6±0.59µs ? ?/sec
smol-songs.csv: prefix search/x    1.01    212.4±0.21µs ? ?/sec    1.00    210.9±0.38µs ? ?/sec
smol-songs.csv: proximity/7000 Danses Un Jour Dans Notre Vie    18.55    48.5±0.09ms ? ?/sec    1.00    2.6±0.03ms ? ?/sec
smol-songs.csv: proximity/The Disneyland Sing-Along Chorus    8.41    56.7±0.45ms ? ?/sec    1.00    6.7±0.05ms ? ?/sec
smol-songs.csv: proximity/Under Great Northern Lights    15.74    38.9±0.14ms ? ?/sec    1.00    2.5±0.00ms ? ?/sec
smol-songs.csv: proximity/black saint sinner lady    11.82    40.1±0.13ms ? ?/sec    1.00    3.4±0.02ms ? ?/sec
smol-songs.csv: proximity/les dangeureuses 1960    6.90    26.1±0.13ms ? ?/sec    1.00    3.8±0.04ms ? ?/sec
smol-songs.csv: typo/Arethla Franklin    14.93    5.8±0.01ms ? ?/sec    1.00    390.1±1.89µs ? ?/sec
smol-songs.csv: typo/Disnaylande    3.18    7.3±0.01ms ? ?/sec    1.00    2.3±0.00ms ? ?/sec
smol-songs.csv: typo/dire straights    5.55    15.2±0.02ms ? ?/sec    1.00    2.7±0.00ms ? ?/sec
smol-songs.csv: typo/fear of the duck    28.03    20.0±0.03ms ? ?/sec    1.00    713.3±1.54µs ? ?/sec
smol-songs.csv: typo/indochie    19.25    1851.4±2.38µs ? ?/sec    1.00    96.2±0.13µs ? ?/sec
smol-songs.csv: typo/indochien    14.66    1887.7±3.18µs ? ?/sec    1.00    128.8±0.18µs ? ?/sec
smol-songs.csv: typo/klub des loopers    37.73    18.0±0.02ms ? ?/sec    1.00    476.7±0.73µs ? ?/sec
smol-songs.csv: typo/michel depech    10.17    5.8±0.01ms ? ?/sec    1.00    565.8±1.16µs ? ?/sec
smol-songs.csv: typo/mongus    15.33    1897.4±3.44µs ? ?/sec    1.00    123.8±0.13µs ? ?/sec
smol-songs.csv: typo/stromal    14.63    1859.3±2.40µs ? ?/sec    1.00    127.1±0.29µs ? ?/sec
smol-songs.csv: typo/the white striper    10.83    9.4±0.01ms ? ?/sec    1.00    866.0±0.98µs ? ?/sec
smol-songs.csv: typo/thelonius monk    14.40    3.8±0.00ms ? ?/sec    1.00    261.5±1.30µs ? ?/sec
smol-songs.csv: words/7000 Danses / Le Baiser / je me trompe de mots    5.54    70.8±0.09ms ? ?/sec    1.00    12.8±0.03ms ? ?/sec
smol-songs.csv: words/Bring Your Daughter To The Slaughter but now this is not part of the title    3.48    119.8±0.14ms ? ?/sec    1.00    34.4±0.04ms ? ?/sec
smol-songs.csv: words/The Disneyland Children's Sing-Alone song    8.98    71.9±0.12ms ? ?/sec    1.00    8.0±0.01ms ? ?/sec
smol-songs.csv: words/les liaisons dangeureuses 1793    11.88    37.4±0.07ms ? ?/sec    1.00    3.1±0.01ms ? ?/sec
smol-songs.csv: words/seven nation mummy    22.86    23.4±0.04ms ? ?/sec    1.00    1024.8±1.57µs ? ?/sec
smol-songs.csv: words/the black saint and the sinner lady and the good doggo    2.76    124.4±0.15ms ? ?/sec    1.00    45.1±0.09ms ? ?/sec
smol-songs.csv: words/whathavenotnsuchforth and a good amount of words to pop to match the first one    2.52    107.0±0.23ms ? ?/sec    1.00    42.4±0.66ms ? ?/sec

group    main-wiki    typo-wiki
-----    ---------    ---------
smol-wiki-articles.csv: basic placeholder/    1.02    13.7±0.02µs ? ?/sec    1.00    13.4±0.03µs ? ?/sec
smol-wiki-articles.csv: basic with quote/"film"    1.02    409.8±0.67µs ? ?/sec    1.00    402.6±0.48µs ? ?/sec
smol-wiki-articles.csv: basic with quote/"france"    1.00    325.9±0.91µs ? ?/sec    1.00    326.4±0.49µs ? ?/sec
smol-wiki-articles.csv: basic with quote/"japan"    1.00    218.4±0.26µs ? ?/sec    1.01    220.5±0.20µs ? ?/sec
smol-wiki-articles.csv: basic with quote/"machine"    1.00    143.0±0.12µs ? ?/sec    1.04    148.8±0.21µs ? ?/sec
smol-wiki-articles.csv: basic with quote/"miles" "davis"    1.00    11.7±0.06ms ? ?/sec    1.00    11.8±0.01ms ? ?/sec
smol-wiki-articles.csv: basic with quote/"mingus"    1.00    4.4±0.03ms ? ?/sec    1.00    4.4±0.00ms ? ?/sec
smol-wiki-articles.csv: basic with quote/"rock" "and" "roll"    1.00    43.5±0.08ms ? ?/sec    1.01    43.8±0.06ms ? ?/sec
smol-wiki-articles.csv: basic with quote/"spain"    1.00    137.3±0.35µs ? ?/sec    1.05    144.4±0.23µs ? ?/sec
smol-wiki-articles.csv: basic without quote/film    1.00    125.3±0.30µs ? ?/sec    1.06    133.1±0.37µs ? ?/sec
smol-wiki-articles.csv: basic without quote/france    1.21    1782.6±1.65µs ? ?/sec    1.00    1477.0±1.39µs ? ?/sec
smol-wiki-articles.csv: basic without quote/japan    1.28    1363.9±0.80µs ? ?/sec    1.00    1064.3±1.79µs ? ?/sec
smol-wiki-articles.csv: basic without quote/machine    1.73    760.3±0.81µs ? ?/sec    1.00    439.6±0.75µs ? ?/sec
smol-wiki-articles.csv: basic without quote/miles davis    1.03    17.0±0.03ms ? ?/sec    1.00    16.5±0.02ms ? ?/sec
smol-wiki-articles.csv: basic without quote/mingus    1.07    5.3±0.01ms ? ?/sec    1.00    5.0±0.00ms ? ?/sec
smol-wiki-articles.csv: basic without quote/rock and roll    1.01    63.9±0.18ms ? ?/sec    1.00    63.0±0.07ms ? ?/sec
smol-wiki-articles.csv: basic without quote/spain    2.07    667.4±0.93µs ? ?/sec    1.00    322.8±0.29µs ? ?/sec
smol-wiki-articles.csv: prefix search/c    1.00    343.1±0.47µs ? ?/sec    1.00    344.0±0.34µs ? ?/sec
smol-wiki-articles.csv: prefix search/g    1.00    374.4±3.42µs ? ?/sec    1.00    374.1±0.44µs ? ?/sec
smol-wiki-articles.csv: prefix search/j    1.00    359.9±0.31µs ? ?/sec    1.00    361.2±0.79µs ? ?/sec
smol-wiki-articles.csv: prefix search/q    1.01    102.0±0.12µs ? ?/sec    1.00    101.4±0.32µs ? ?/sec
smol-wiki-articles.csv: prefix search/t    1.00    536.7±1.39µs ? ?/sec    1.00    534.3±0.84µs ? ?/sec
smol-wiki-articles.csv: prefix search/x    1.00    400.9±1.00µs ? ?/sec    1.00    399.5±0.45µs ? ?/sec
smol-wiki-articles.csv: proximity/april paris    3.86    14.4±0.01ms ? ?/sec    1.00    3.7±0.01ms ? ?/sec
smol-wiki-articles.csv: proximity/diesel engine    12.98    10.4±0.01ms ? ?/sec    1.00    803.5±1.13µs ? ?/sec
smol-wiki-articles.csv: proximity/herald sings    1.00    12.7±0.06ms ? ?/sec    5.29    67.1±0.09ms ? ?/sec
smol-wiki-articles.csv: proximity/tea two    6.48    1452.1±2.78µs ? ?/sec    1.00    224.1±0.38µs ? ?/sec
smol-wiki-articles.csv: typo/Disnaylande    3.89    8.5±0.01ms ? ?/sec    1.00    2.2±0.01ms ? ?/sec
smol-wiki-articles.csv: typo/aritmetric    3.78    10.3±0.01ms ? ?/sec    1.00    2.7±0.00ms ? ?/sec
smol-wiki-articles.csv: typo/linax    8.91    1426.7±0.97µs ? ?/sec    1.00    160.1±0.18µs ? ?/sec
smol-wiki-articles.csv: typo/migrosoft    7.48    1417.3±5.84µs ? ?/sec    1.00    189.5±0.88µs ? ?/sec
smol-wiki-articles.csv: typo/nympalidea    3.96    7.2±0.01ms ? ?/sec    1.00    1810.1±2.03µs ? ?/sec
smol-wiki-articles.csv: typo/phytogropher    3.71    7.2±0.01ms ? ?/sec    1.00    1934.3±6.51µs ? ?/sec
smol-wiki-articles.csv: typo/sisan    6.44    1497.2±1.38µs ? ?/sec    1.00    232.7±0.94µs ? ?/sec
smol-wiki-articles.csv: typo/the fronce    6.92    2.9±0.00ms ? ?/sec    1.00    418.0±1.76µs ? ?/sec
smol-wiki-articles.csv: words/Abraham machin    16.63    10.8±0.01ms ? ?/sec    1.00    649.7±1.08µs ? ?/sec
smol-wiki-articles.csv: words/Idaho Bellevue pizza    27.15    25.6±0.03ms ? ?/sec    1.00    944.2±5.07µs ? ?/sec
smol-wiki-articles.csv: words/Kameya Tokujirō mingus monk    26.87    40.7±0.05ms ? ?/sec    1.00    1515.3±2.73µs ? ?/sec
smol-wiki-articles.csv: words/Ulrich Hensel meilisearch milli    11.99    48.8±0.10ms ? ?/sec    1.00    4.1±0.02ms ? ?/sec
smol-wiki-articles.csv: words/the black saint and the sinner lady and the good doggo    4.90    110.0±0.15ms ? ?/sec    1.00    22.4±0.03ms ? ?/sec
```

Co-authored-by: mpostma <postma.marin@protonmail.com>
Co-authored-by: ad hoc <postma.marin@protonmail.com>
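To make the first-letter rule concrete, here is a small self-contained sketch of how candidate words are charged under it. This is illustrative only: `levenshtein` and `charged_typos` are hypothetical helpers written for this example, not milli functions; the actual implementation composes fst automata, as the diff below shows.

```rust
// Plain edit-distance code standing in for the Levenshtein DFAs used by milli.
fn levenshtein(a: &str, b: &str) -> usize {
    let (a, b): (Vec<char>, Vec<char>) = (a.chars().collect(), b.chars().collect());
    let mut prev: Vec<usize> = (0..=b.len()).collect();
    for (i, ca) in a.iter().enumerate() {
        let mut cur = vec![i + 1];
        for (j, cb) in b.iter().enumerate() {
            let cost = if ca == cb { 0 } else { 1 };
            cur.push((prev[j] + cost).min(prev[j + 1] + 1).min(cur[j] + 1));
        }
        prev = cur;
    }
    prev[b.len()]
}

/// Typos charged to `candidate` for the query word `query` under the new rule,
/// or `None` if the candidate is pruned from the search space.
fn charged_typos(query: &str, candidate: &str) -> Option<u8> {
    let d = levenshtein(query, candidate);
    let same_first_letter = query.chars().next() == candidate.chars().next();
    match (same_first_letter, d) {
        // correct first letter: up to 2 typos, charged at their real cost
        (true, d) if d <= 2 => Some(d as u8),
        // wrong first letter: only 1 edit is explored, and it is charged as 2 typos
        (false, d) if d <= 1 => Some(2),
        // wrong first letter with 2 edits is no longer reachable at all
        _ => None,
    }
}

fn main() {
    assert_eq!(charged_typos("charles", "charles"), Some(0));
    assert_eq!(charged_typos("charles", "chsrles"), Some(1)); // one typo inside the word
    assert_eq!(charged_typos("charles", "harles"), Some(2));  // first letter wrong: counted as 2
    assert_eq!(charged_typos("charles", "sharlez"), None);    // wrong first letter + another edit: pruned
}
```

The pruned case is where the speed-up comes from: the 2-typo budget is only ever explored for words that still share the query's first letter.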
milli/src/search/fst_utils.rs (new file, 187 lines)
@@ -0,0 +1,187 @@
/// This mod is necessary until https://github.com/BurntSushi/fst/pull/137 gets merged.
/// All credits for this code go to BurntSushi.
use fst::Automaton;

pub struct StartsWith<A>(pub A);

/// The `Automaton` state for `StartsWith<A>`.
pub struct StartsWithState<A: Automaton>(pub StartsWithStateKind<A>);

impl<A: Automaton> Clone for StartsWithState<A>
where
    A::State: Clone,
{
    fn clone(&self) -> Self {
        Self(self.0.clone())
    }
}

/// The inner state of a `StartsWithState<A>`.
pub enum StartsWithStateKind<A: Automaton> {
    /// Sink state that is reached when the automaton has matched the prefix.
    Done,
    /// State in which the automaton is while it hasn't matched the prefix.
    Running(A::State),
}

impl<A: Automaton> Clone for StartsWithStateKind<A>
where
    A::State: Clone,
{
    fn clone(&self) -> Self {
        match self {
            StartsWithStateKind::Done => StartsWithStateKind::Done,
            StartsWithStateKind::Running(inner) => StartsWithStateKind::Running(inner.clone()),
        }
    }
}

impl<A: Automaton> Automaton for StartsWith<A> {
    type State = StartsWithState<A>;

    fn start(&self) -> StartsWithState<A> {
        StartsWithState({
            let inner = self.0.start();
            if self.0.is_match(&inner) {
                StartsWithStateKind::Done
            } else {
                StartsWithStateKind::Running(inner)
            }
        })
    }
    fn is_match(&self, state: &StartsWithState<A>) -> bool {
        match state.0 {
            StartsWithStateKind::Done => true,
            StartsWithStateKind::Running(_) => false,
        }
    }
    fn can_match(&self, state: &StartsWithState<A>) -> bool {
        match state.0 {
            StartsWithStateKind::Done => true,
            StartsWithStateKind::Running(ref inner) => self.0.can_match(inner),
        }
    }
    fn will_always_match(&self, state: &StartsWithState<A>) -> bool {
        match state.0 {
            StartsWithStateKind::Done => true,
            StartsWithStateKind::Running(_) => false,
        }
    }
    fn accept(&self, state: &StartsWithState<A>, byte: u8) -> StartsWithState<A> {
        StartsWithState(match state.0 {
            StartsWithStateKind::Done => StartsWithStateKind::Done,
            StartsWithStateKind::Running(ref inner) => {
                let next_inner = self.0.accept(inner, byte);
                if self.0.is_match(&next_inner) {
                    StartsWithStateKind::Done
                } else {
                    StartsWithStateKind::Running(next_inner)
                }
            }
        })
    }
}
/// An automaton that matches when one of its component automata match.
#[derive(Clone, Debug)]
pub struct Union<A, B>(pub A, pub B);

/// The `Automaton` state for `Union<A, B>`.
pub struct UnionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);

impl<A: Automaton, B: Automaton> Clone for UnionState<A, B>
where
    A::State: Clone,
    B::State: Clone,
{
    fn clone(&self) -> Self {
        Self(self.0.clone(), self.1.clone())
    }
}

impl<A: Automaton, B: Automaton> Automaton for Union<A, B> {
    type State = UnionState<A, B>;
    fn start(&self) -> UnionState<A, B> {
        UnionState(self.0.start(), self.1.start())
    }
    fn is_match(&self, state: &UnionState<A, B>) -> bool {
        self.0.is_match(&state.0) || self.1.is_match(&state.1)
    }
    fn can_match(&self, state: &UnionState<A, B>) -> bool {
        self.0.can_match(&state.0) || self.1.can_match(&state.1)
    }
    fn will_always_match(&self, state: &UnionState<A, B>) -> bool {
        self.0.will_always_match(&state.0) || self.1.will_always_match(&state.1)
    }
    fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> {
        UnionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
    }
}
/// An automaton that matches when both of its component automata match.
#[derive(Clone, Debug)]
pub struct Intersection<A, B>(pub A, pub B);

/// The `Automaton` state for `Intersection<A, B>`.
pub struct IntersectionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);

impl<A: Automaton, B: Automaton> Clone for IntersectionState<A, B>
where
    A::State: Clone,
    B::State: Clone,
{
    fn clone(&self) -> Self {
        Self(self.0.clone(), self.1.clone())
    }
}

impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> {
    type State = IntersectionState<A, B>;
    fn start(&self) -> IntersectionState<A, B> {
        IntersectionState(self.0.start(), self.1.start())
    }
    fn is_match(&self, state: &IntersectionState<A, B>) -> bool {
        self.0.is_match(&state.0) && self.1.is_match(&state.1)
    }
    fn can_match(&self, state: &IntersectionState<A, B>) -> bool {
        self.0.can_match(&state.0) && self.1.can_match(&state.1)
    }
    fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool {
        self.0.will_always_match(&state.0) && self.1.will_always_match(&state.1)
    }
    fn accept(&self, state: &IntersectionState<A, B>, byte: u8) -> IntersectionState<A, B> {
        IntersectionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
    }
}
/// An automaton that matches exactly when the automaton it wraps does not.
#[derive(Clone, Debug)]
pub struct Complement<A>(pub A);

/// The `Automaton` state for `Complement<A>`.
pub struct ComplementState<A: Automaton>(pub A::State);

impl<A: Automaton> Clone for ComplementState<A>
where
    A::State: Clone,
{
    fn clone(&self) -> Self {
        Self(self.0.clone())
    }
}

impl<A: Automaton> Automaton for Complement<A> {
    type State = ComplementState<A>;
    fn start(&self) -> ComplementState<A> {
        ComplementState(self.0.start())
    }
    fn is_match(&self, state: &ComplementState<A>) -> bool {
        !self.0.is_match(&state.0)
    }
    fn can_match(&self, state: &ComplementState<A>) -> bool {
        !self.0.will_always_match(&state.0)
    }
    fn will_always_match(&self, state: &ComplementState<A>) -> bool {
        !self.0.can_match(&state.0)
    }
    fn accept(&self, state: &ComplementState<A>, byte: u8) -> ComplementState<A> {
        ComplementState(self.0.accept(&state.0, byte))
    }
}
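As a usage sketch (not part of the commit), the following test-style module shows how these combinators compose over an `fst::Set`. It assumes it sits inside `fst_utils.rs` itself, for example in a `#[cfg(test)]` module, since the module is private in this commit.

```rust
#[cfg(test)]
mod sketch {
    use fst::automaton::Str;
    use fst::{IntoStreamer, Set, Streamer};

    use super::{Complement, Intersection, StartsWith, Union};

    #[test]
    fn compose_automatons() -> Result<(), Box<dyn std::error::Error>> {
        // Keys must be inserted in lexicographic order.
        let set = Set::from_iter(vec!["chan", "charles", "david", "marcus"])?;

        // Everything that starts with "cha", or is exactly "david".
        let automaton = Union(StartsWith(Str::new("cha")), Str::new("david"));
        let mut stream = set.search(automaton).into_stream();
        let mut hits = Vec::new();
        while let Some(key) = stream.next() {
            hits.push(std::str::from_utf8(key)?.to_string());
        }
        assert_eq!(hits, ["chan", "charles", "david"]);

        // Everything that starts with "cha" but is not exactly "charles".
        let automaton =
            Intersection(StartsWith(Str::new("cha")), Complement(Str::new("charles")));
        let mut stream = set.search(automaton).into_stream();
        let mut hits = Vec::new();
        while let Some(key) = stream.next() {
            hits.push(std::str::from_utf8(key)?.to_string());
        }
        assert_eq!(hits, ["chan"]);

        Ok(())
    }
}
```

The same `Union(Intersection(dfa1, Complement(starts)), Intersection(dfa2, starts))` shape, with Levenshtein DFAs in place of the `Str` automata, is what `word_derivations` builds in the next file.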
@@ -7,7 +7,8 @@ use std::str::Utf8Error;
use std::time::Instant;

use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
use fst::{IntoStreamer, Streamer};
use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Streamer};
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use log::debug;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
@@ -15,6 +16,7 @@ use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;

pub use self::facet::{FacetDistribution, FacetNumberIter, Filter};
use self::fst_utils::{Complement, Intersection, StartsWith, Union};
pub use self::matching_words::MatchingWords;
use self::query_tree::QueryTreeBuilder;
use crate::error::UserError;
@@ -29,6 +31,7 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
mod criteria;
mod distinct;
mod facet;
mod fst_utils;
mod matching_words;
mod query_tree;

@@ -284,20 +287,66 @@ pub fn word_derivations<'c>(
        Entry::Occupied(entry) => Ok(entry.into_mut()),
        Entry::Vacant(entry) => {
            let mut derived_words = Vec::new();
            let dfa = build_dfa(word, max_typo, is_prefix);
            let mut stream = fst.search_with_state(&dfa).into_stream();
            if max_typo == 0 {
                if is_prefix {
                    let prefix = Str::new(word).starts_with();
                    let mut stream = fst.search(prefix).into_stream();

            while let Some((word, state)) = stream.next() {
                let word = std::str::from_utf8(word)?;
                let distance = dfa.distance(state);
                derived_words.push((word.to_string(), distance.to_u8()));
                    while let Some(word) = stream.next() {
                        let word = std::str::from_utf8(word)?;
                        derived_words.push((word.to_string(), 0));
                    }
                } else if fst.contains(word) {
                    derived_words.push((word.to_string(), 0));
                }
            } else {
                if max_typo == 1 {
                    let dfa = build_dfa(word, 1, is_prefix);
                    let starts = StartsWith(Str::new(get_first(word)));
                    let mut stream =
                        fst.search_with_state(Intersection(starts, &dfa)).into_stream();

                    while let Some((word, state)) = stream.next() {
                        let word = std::str::from_utf8(word)?;
                        let d = dfa.distance(state.1);
                        derived_words.push((word.to_string(), d.to_u8()));
                    }
                } else {
                    let starts = StartsWith(Str::new(get_first(word)));
                    let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
                    let second_dfa = build_dfa(word, 2, is_prefix);
                    let second = Intersection(&second_dfa, &starts);
                    let automaton = Union(first, &second);

                    let mut stream = fst.search_with_state(automaton).into_stream();

                    while let Some((found_word, state)) = stream.next() {
                        let found_word = std::str::from_utf8(found_word)?;
                        // in the case the typo is on the first letter, we know the number of typo
                        // is two
                        if get_first(found_word) != get_first(word) {
                            derived_words.push((word.to_string(), 2));
                        } else {
                            // Else, we know that it is the second dfa that matched and compute the
                            // correct distance
                            let d = second_dfa.distance((state.1).0);
                            derived_words.push((word.to_string(), d.to_u8()));
                        }
                    }
                }
            }

            Ok(entry.insert(derived_words))
        }
    }
}

fn get_first(s: &str) -> &str {
    match s.chars().next() {
        Some(c) => &s[..c.len_utf8()],
        None => panic!("unexpected empty query"),
    }
}

pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
    let lev = match typos {
        0 => &LEVDIST0,

@@ -260,12 +260,12 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O

/// Return the `QueryKind` of a word depending on `authorize_typos`
/// and the provided word length.
fn typos(word: String, authorize_typos: bool) -> QueryKind {
fn typos(word: String, authorize_typos: bool, max_typos: u8) -> QueryKind {
    if authorize_typos {
        match word.chars().count() {
            0..=4 => QueryKind::exact(word),
            5..=8 => QueryKind::tolerant(1, word),
            _ => QueryKind::tolerant(2, word),
            5..=8 => QueryKind::tolerant(1.min(max_typos), word),
            _ => QueryKind::tolerant(2.min(max_typos), word),
        }
    } else {
        QueryKind::exact(word)
@@ -316,8 +316,10 @@ fn create_query_tree(
                if let Some(child) = split_best_frequency(ctx, &word)? {
                    children.push(child);
                }
                children
                    .push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) }));
                children.push(Operation::Query(Query {
                    prefix,
                    kind: typos(word, authorize_typos, 2),
                }));
                Ok(Operation::or(false, children))
            }
            // create a CONSECUTIVE operation wrapping all word in the phrase
@@ -363,8 +365,10 @@ fn create_query_tree(
                                .collect();
                            let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
                            let concat = words.concat();
                            let query =
                                Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
                            let query = Query {
                                prefix: is_prefix,
                                kind: typos(concat, authorize_typos, 1),
                            };
                            operations.push(Operation::Query(query));
                            and_op_children.push(Operation::or(false, operations));
                        }
@@ -655,7 +659,7 @@ mod test {
                ]),
                Operation::Query(Query {
                    prefix: true,
                    kind: QueryKind::tolerant(2, "heyfriends".to_string()),
                    kind: QueryKind::tolerant(1, "heyfriends".to_string()),
                }),
            ],
        );
@@ -688,7 +692,7 @@ mod test {
                ]),
                Operation::Query(Query {
                    prefix: false,
                    kind: QueryKind::tolerant(2, "heyfriends".to_string()),
                    kind: QueryKind::tolerant(1, "heyfriends".to_string()),
                }),
            ],
        );
@@ -753,7 +757,7 @@ mod test {
                ]),
                Operation::Query(Query {
                    prefix: false,
                    kind: QueryKind::tolerant(2, "helloworld".to_string()),
                    kind: QueryKind::tolerant(1, "helloworld".to_string()),
                }),
            ],
        );
@@ -851,7 +855,7 @@ mod test {
                        ]),
                        Operation::Query(Query {
                            prefix: false,
                            kind: QueryKind::tolerant(2, "newyorkcity".to_string()),
                            kind: QueryKind::tolerant(1, "newyorkcity".to_string()),
                        }),
                    ],
                ),
@@ -925,7 +929,7 @@ mod test {
                ]),
                Operation::Query(Query {
                    prefix: false,
                    kind: QueryKind::tolerant(2, "wordsplitfish".to_string()),
                    kind: QueryKind::tolerant(1, "wordsplitfish".to_string()),
                }),
            ],
        );
@@ -1045,7 +1049,7 @@ mod test {
                        ]),
                        Operation::Query(Query {
                            prefix: false,
                            kind: QueryKind::tolerant(2, "heymyfriend".to_string()),
                            kind: QueryKind::tolerant(1, "heymyfriend".to_string()),
                        }),
                    ],
                ),

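For reference, a short test-style sketch of what the new `max_typos` argument changes for a concatenated query word. This is a hypothetical test, not part of the commit; it assumes it lives next to the existing `query_tree.rs` tests and that `QueryKind` keeps the `PartialEq`/`Debug` derives those tests rely on.

```rust
#[test]
fn concatenated_words_are_clamped_to_one_typo() {
    // "heyfriends" has 10 characters, so a regular query word gets the full 2-typo budget.
    assert_eq!(
        typos("heyfriends".to_string(), true, 2),
        QueryKind::tolerant(2, "heyfriends".to_string())
    );
    // At the concatenation call site the word is built with `max_typos = 1`,
    // which clamps the budget down to a single typo.
    assert_eq!(
        typos("heyfriends".to_string(), true, 1),
        QueryKind::tolerant(1, "heyfriends".to_string())
    );
    // Short words are unaffected: they were already exact-only.
    assert_eq!(
        typos("hey".to_string(), true, 1),
        QueryKind::exact("hey".to_string())
    );
}
```

This is exactly why the expected trees in the updated tests above switch from `tolerant(2, "heyfriends")` to `tolerant(1, "heyfriends")`.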
@@ -8,7 +8,7 @@
{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":""}
{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":""}
{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":""}
{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":""}
{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":""}
{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":""}
{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":""}
{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":""}

@@ -61,6 +61,7 @@ test_criterion!(
    vec![Attribute],
    vec![]
);
test_criterion!(typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Typo], vec![]);
test_criterion!(
    attribute_disallow_typo,
    DISALLOW_OPTIONAL_WORDS,
    ALLOW_TYPOS,
