unicode: introduce UTF-8 character database
The decomposition and casefolding of UTF-8 characters are described in a prefix tree in utf8data.h, which is generated from the Unicode Character Database (UCD), published by the Unicode Consortium, and should not be edited by hand. The structures in utf8data.h are meant to be used for lookup operations by the unicode subsystem, when decoding a utf-8 string. mkutf8data.c is the source for a program that generates utf8data.h. It was written by Olaf Weber from SGI and originally proposed to be merged into Linux in 2014. The original proposal performed the compatibility decomposition, NFKD, but the current version was modified by me to do canonical decomposition, NFD, as suggested by the community. The changes from the original submission are: * Rebase to mainline. * Fix out-of-tree-build. * Update makefile to build 11.0.0 ucd files. * drop references to xfs. * Convert NFKD to NFD. * Merge back robustness fixes from original patch. Requested by Dave Chinner. The original submission is archived at: <https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs> The utf8data.h file can be regenerated using the instructions in fs/unicode/README.utf8data. - Notes on the update from 8.0.0 to 11.0.0: The structure of the ucd files and special cases have not experienced any changes between versions 8.0.0 and 11.0.0. 8.0.0 saw the addition of Cherokee LC characters, which is an interesting case for case-folding. The update is accompanied by new tests on the test_ucd module to catch specific cases. No changes to the mkutf8data script were required for the updates. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
This commit is contained in:
parent
45b1509e24
commit
371c600af8
8 changed files with 17106 additions and 0 deletions
|
@ -313,5 +313,6 @@ endif # NETWORK_FILESYSTEMS
|
||||||
|
|
||||||
source "fs/nls/Kconfig"
|
source "fs/nls/Kconfig"
|
||||||
source "fs/dlm/Kconfig"
|
source "fs/dlm/Kconfig"
|
||||||
|
source "fs/unicode/Kconfig"
|
||||||
|
|
||||||
endmenu
|
endmenu
|
||||||
|
|
|
@ -90,6 +90,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs/
|
||||||
obj-$(CONFIG_NFSD) += nfsd/
|
obj-$(CONFIG_NFSD) += nfsd/
|
||||||
obj-$(CONFIG_LOCKD) += lockd/
|
obj-$(CONFIG_LOCKD) += lockd/
|
||||||
obj-$(CONFIG_NLS) += nls/
|
obj-$(CONFIG_NLS) += nls/
|
||||||
|
obj-$(CONFIG_UNICODE) += unicode/
|
||||||
obj-$(CONFIG_SYSV_FS) += sysv/
|
obj-$(CONFIG_SYSV_FS) += sysv/
|
||||||
obj-$(CONFIG_CIFS) += cifs/
|
obj-$(CONFIG_CIFS) += cifs/
|
||||||
obj-$(CONFIG_HPFS_FS) += hpfs/
|
obj-$(CONFIG_HPFS_FS) += hpfs/
|
||||||
|
|
8
fs/unicode/Kconfig
Normal file
8
fs/unicode/Kconfig
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
#
|
||||||
|
# UTF-8 normalization
|
||||||
|
#
|
||||||
|
config UNICODE
|
||||||
|
bool "UTF-8 normalization and casefolding support"
|
||||||
|
help
|
||||||
|
Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
|
||||||
|
support.
|
14
fs/unicode/Makefile
Normal file
14
fs/unicode/Makefile
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
# This rule is not invoked during the kernel compilation. It is used to
|
||||||
|
# regenerate the utf8data.h header file.
|
||||||
|
utf8data.h.new: *.txt $(objdir)/scripts/mkutf8data
|
||||||
|
$(objdir)/scripts/mkutf8data \
|
||||||
|
-a DerivedAge.txt \
|
||||||
|
-c DerivedCombiningClass.txt \
|
||||||
|
-p DerivedCoreProperties.txt \
|
||||||
|
-d UnicodeData.txt \
|
||||||
|
-f CaseFolding.txt \
|
||||||
|
-n NormalizationCorrections.txt \
|
||||||
|
-t NormalizationTest.txt \
|
||||||
|
-o $@
|
57
fs/unicode/README.utf8data
Normal file
57
fs/unicode/README.utf8data
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
The utf8data.h file in this directory is generated from the Unicode
|
||||||
|
Character Database for version 11.0.0 of the Unicode standard.
|
||||||
|
|
||||||
|
The full set of files can be found here:
|
||||||
|
|
||||||
|
http://www.unicode.org/Public/11.0.0/ucd/
|
||||||
|
|
||||||
|
Individual source links:
|
||||||
|
|
||||||
|
http://www.unicode.org/Public/11.0.0/ucd/CaseFolding.txt
|
||||||
|
http://www.unicode.org/Public/11.0.0/ucd/DerivedAge.txt
|
||||||
|
http://www.unicode.org/Public/11.0.0/ucd/extracted/DerivedCombiningClass.txt
|
||||||
|
http://www.unicode.org/Public/11.0.0/ucd/DerivedCoreProperties.txt
|
||||||
|
http://www.unicode.org/Public/11.0.0/ucd/NormalizationCorrections.txt
|
||||||
|
http://www.unicode.org/Public/11.0.0/ucd/NormalizationTest.txt
|
||||||
|
http://www.unicode.org/Public/11.0.0/ucd/UnicodeData.txt
|
||||||
|
|
||||||
|
md5sums (verify by running "md5sum -c README.utf8data"):
|
||||||
|
|
||||||
|
414436796cf097df55f798e1585448ee CaseFolding.txt
|
||||||
|
6032a595fbb782694456491d86eecfac DerivedAge.txt
|
||||||
|
3240997d671297ac754ab0d27577acf7 DerivedCombiningClass.txt
|
||||||
|
2a4fe257d9d8184518e036194d2248ec DerivedCoreProperties.txt
|
||||||
|
4e7d383fa0dd3cd9d49d64e5b7b7c9e0 NormalizationCorrections.txt
|
||||||
|
c9500c5b8b88e584469f056023ecc3f2 NormalizationTest.txt
|
||||||
|
acc291106c3758d2025f8d7bd5518bee UnicodeData.txt
|
||||||
|
|
||||||
|
sha1sums (verify by running "sha1sum -c README.utf8data"):
|
||||||
|
|
||||||
|
9184727adf7bd20e36312a68581d12ba3ffb9854 CaseFolding.txt
|
||||||
|
86c55b3eb89de61704da16af9c3f22854f61b57d DerivedAge.txt
|
||||||
|
b615703f62b1dbc5110e91acc3ff8b3789a067cf DerivedCombiningClass.txt
|
||||||
|
f8b07ef116d7dc21a94f26e70178ed2acf8713e9 DerivedCoreProperties.txt
|
||||||
|
a5fafb8998c0b8153a2a58430b8a35c811db0abc NormalizationCorrections.txt
|
||||||
|
070cdcb00cd4f0860e476750e404c59c2ebe9b25 NormalizationTest.txt
|
||||||
|
0e060fafb08d6722fbec56d9f9ebe8509f01d0ee UnicodeData.txt
|
||||||
|
|
||||||
|
To update to a newer version of the Unicode standard, the latest
|
||||||
|
released version of the UCD can be found here:
|
||||||
|
|
||||||
|
http://www.unicode.org/Public/UCD/latest/
|
||||||
|
|
||||||
|
To build the utf8data.h file, from a kernel tree that has been built,
|
||||||
|
cd to this directory (fs/unicode) and run this command:
|
||||||
|
|
||||||
|
make C=../.. objdir=../.. utf8data.h.new
|
||||||
|
|
||||||
|
After sanity checking the newly generated utf8data.h.new file (the
|
||||||
|
version generated from the 11.0.0 UCD should be 13,834 lines long, and
|
||||||
|
have a total size of 1104k) and/or comparing it with the older version
|
||||||
|
of utf8data.h, rename it to utf8data.h.
|
||||||
|
|
||||||
|
If you are a kernel developer updating to a newer version of the
|
||||||
|
Unicode Character Database, please update this README.utf8data file
|
||||||
|
with the version of the UCD that was used, the md5sums and sha1sums of
|
||||||
|
the *.txt files, before checking in the new versions of the utf8data.h
|
||||||
|
and README.utf8data files.
|
13834
fs/unicode/utf8data.h
Normal file
13834
fs/unicode/utf8data.h
Normal file
File diff suppressed because it is too large
Load diff
|
@ -20,6 +20,7 @@ hostprogs-$(CONFIG_ASN1) += asn1_compiler
|
||||||
hostprogs-$(CONFIG_MODULE_SIG) += sign-file
|
hostprogs-$(CONFIG_MODULE_SIG) += sign-file
|
||||||
hostprogs-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += extract-cert
|
hostprogs-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += extract-cert
|
||||||
hostprogs-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert
|
hostprogs-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert
|
||||||
|
hostprogs-$(CONFIG_UNICODE) += mkutf8data
|
||||||
|
|
||||||
HOSTCFLAGS_sortextable.o = -I$(srctree)/tools/include
|
HOSTCFLAGS_sortextable.o = -I$(srctree)/tools/include
|
||||||
HOSTCFLAGS_asn1_compiler.o = -I$(srctree)/include
|
HOSTCFLAGS_asn1_compiler.o = -I$(srctree)/include
|
||||||
|
|
3190
scripts/mkutf8data.c
Normal file
3190
scripts/mkutf8data.c
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue