Add German tokenization pipeline

This commit is contained in:
ManyTheFish
2024-09-19 13:30:07 +02:00
parent f77661ec44
commit 7d6768e4c4
3 changed files with 7 additions and 0 deletions

View File

@@ -107,6 +107,7 @@ all-tokenizations = [
"charabia/khmer",
"charabia/vietnamese",
"charabia/swedish-recomposition",
"charabia/german-segmentation",
]
# Use POSIX semaphores instead of SysV semaphores in LMDB
@@ -139,6 +140,9 @@ khmer = ["charabia/khmer"]
# Allow Vietnamese-specialized tokenization (enables `charabia/vietnamese`).
vietnamese = ["charabia/vietnamese"]
# Allow German-specialized tokenization (enables `charabia/german-segmentation`).
german = ["charabia/german-segmentation"]
# Force Swedish character recomposition (enables `charabia/swedish-recomposition`).
swedish-recomposition = ["charabia/swedish-recomposition"]