[package]
name = "milli"
edition = "2021"
publish = false

version.workspace = true
authors.workspace = true
description.workspace = true
homepage.workspace = true
readme.workspace = true
# edition.workspace = true
license.workspace = true

[dependencies]
big_s.workspace = true
bimap = { workspace = true, features = ["serde"] }
bincode.workspace = true
bstr.workspace = true
bytemuck = { workspace = true, features = ["extern_crate_alloc"] }
byteorder.workspace = true
charabia = { workspace = true, default-features = false }
concat-arrays.workspace = true
convert_case.workspace = true
crossbeam-channel.workspace = true
deserr.workspace = true
either = { workspace = true, features = ["serde"] }
flatten-serde-json.path = "../flatten-serde-json"
fst.workspace = true
fxhash.workspace = true
geoutils.workspace = true
grenad = { workspace = true, default-features = false, features = [
    "rayon",
    "tempfile",
] }
heed = { workspace = true, default-features = false, features = [
    "serde-json",
    "serde-bincode",
] }
indexmap = { workspace = true, features = ["serde"] }
json-depth-checker.path = "../json-depth-checker"
levenshtein_automata = { workspace = true, features = ["fst_automaton"] }
memchr.workspace = true
memmap2.workspace = true
obkv.workspace = true
once_cell.workspace = true
ordered-float.workspace = true
rayon.workspace = true
roaring = { workspace = true, features = ["serde"] }
rstar = { workspace = true, features = ["serde"] }
serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true, features = ["preserve_order", "raw_value"] }
slice-group-by.workspace = true
smallstr = { workspace = true, features = ["serde"] }
smallvec.workspace = true
smartstring.workspace = true
tempfile.workspace = true
thiserror.workspace = true
time = { workspace = true, features = [
    "serde-well-known",
    "formatting",
    "parsing",
    "macros",
] }
uuid = { workspace = true, features = ["v4"] }

filter-parser.path = "../filter-parser"

# documents words self-join
itertools.workspace = true

csv.workspace = true
candle-core = { workspace = true }
candle-transformers = { workspace = true }
candle-nn = { workspace = true }
tokenizers = { workspace = true, default-features = false, features = [
    "onig",
] }
hf-hub = { workspace = true, default-features = false, features = [
    "online",
] }
tiktoken-rs.workspace = true
liquid.workspace = true
rhai = { workspace = true, features = [
    "serde",
    "no_module",
    "no_custom_syntax",
    "no_time",
    "sync",
] }
arroy.workspace = true
rand.workspace = true
tracing.workspace = true
ureq = { workspace = true, features = ["json"] }
url.workspace = true
hashbrown.workspace = true
bumpalo.workspace = true
bumparaw-collections.workspace = true
thread_local.workspace = true
rustc-hash.workspace = true
enum-iterator.workspace = true
bbqueue.workspace = true
flume.workspace = true
utoipa = { workspace = true, features = [
    "non_strict_integers",
    "preserve_order",
    "uuid",
    "time",
    "openapi_extensions",
] }
lru.workspace = true

[dev-dependencies]
mimalloc.workspace = true
insta.workspace = true
maplit.workspace = true
md5.workspace = true
meili-snap.path = "../meili-snap"
rand = { workspace = true, features = ["small_rng"] }

[features]
all-tokenizations = ["charabia/default"]

# Use POSIX semaphores instead of SysV semaphores in LMDB
# For more information on this feature, see heed's Cargo.toml
lmdb-posix-sem = ["heed/posix-sem"]

# allow chinese specialized tokenization
chinese = ["charabia/chinese"]
chinese-pinyin = ["chinese", "charabia/chinese-normalization-pinyin"]

# allow hebrew specialized tokenization
hebrew = ["charabia/hebrew"]

# allow japanese specialized tokenization
japanese = ["charabia/japanese"]
japanese-transliteration = ["charabia/japanese-transliteration"]

# allow korean specialized tokenization
korean = ["charabia/korean"]

# allow thai specialized tokenization
thai = ["charabia/thai"]

# allow greek specialized tokenization
greek = ["charabia/greek"]

# allow khmer specialized tokenization
khmer = ["charabia/khmer"]

# allow vietnamese specialized tokenization
vietnamese = ["charabia/vietnamese"]

# allow german specialized tokenization
german = ["charabia/german-segmentation"]

# force swedish character recomposition
swedish-recomposition = ["charabia/swedish-recomposition"]

# allow turkish specialized tokenization
turkish = ["charabia/turkish"]

# allow CUDA support, see
cuda = ["candle-core/cuda"]
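
# Illustrative only (not part of this manifest): a sketch of how another crate in the
# workspace might depend on milli and enable some of the optional tokenization features
# declared in the [features] table above. The dependency path is a hypothetical example;
# the feature names ("chinese", "japanese", "korean") are taken from this file.
#
# [dependencies]
# milli = { path = "../milli", features = ["chinese", "japanese", "korean"] }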