Blame - fs/unicode/utf8-norm.c - SHIFTPHONES/mainline/linux

blob: 1d2d2e5b906aeeb92e44d2264ce7bf0f6bb4b490 [file] [log] [blame]

Thomas Gleixner	9f80685	2019-05-29 07:18:08 -0700	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	2	/*
				3	* Copyright (c) 2014 SGI.
				4	* All rights reserved.
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	5	*/
				6
				7	#include "utf8n.h"
				8
				9	struct utf8data {
				10	unsigned int maxage;
				11	unsigned int offset;
				12	};
				13
				14	#define __INCLUDED_FROM_UTF8NORM_C__
				15	#include "utf8data.h"
				16	#undef __INCLUDED_FROM_UTF8NORM_C__
				17
				18	int utf8version_is_supported(u8 maj, u8 min, u8 rev)
				19	{
				20	int i = ARRAY_SIZE(utf8agetab) - 1;
				21	unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
				22
				23	while (i >= 0 && utf8agetab[i] != 0) {
				24	if (sb_utf8version == utf8agetab[i])
				25	return 1;
				26	i--;
				27	}
				28	return 0;
				29	}
				30	EXPORT_SYMBOL(utf8version_is_supported);
				31
Gabriel Krisman Bertazi	9d53690	2019-04-25 13:51:22 -0400	[diff] [blame]	32	int utf8version_latest(void)
				33	{
				34	return utf8vers;
				35	}
				36	EXPORT_SYMBOL(utf8version_latest);
				37
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	38	/*
				39	* UTF-8 valid ranges.
				40	*
				41	* The UTF-8 encoding spreads the bits of a 32bit word over several
				42	* bytes. This table gives the ranges that can be held and how they'd
				43	* be represented.
				44	*
				45	* 0x00000000 0x0000007F: 0xxxxxxx
				46	* 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
				47	* 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
				48	* 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				49	* 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				50	* 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				51	*
				52	* There is an additional requirement on UTF-8, in that only the
				53	* shortest representation of a 32bit value is to be used. A decoder
				54	* must not decode sequences that do not satisfy this requirement.
				55	* Thus the allowed ranges have a lower bound.
				56	*
				57	* 0x00000000 0x0000007F: 0xxxxxxx
				58	* 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
				59	* 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
				60	* 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				61	* 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				62	* 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				63	*
				64	* Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
				65	* 17 planes of 65536 values. This limits the sequences actually seen
				66	* even more, to just the following.
				67	*
				68	* 0 - 0x7F: 0 - 0x7F
				69	* 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF
				70	* 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF
				71	* 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
				72	*
				73	* Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
				74	*
				75	* Note that the longest sequence seen with valid usage is 4 bytes,
				76	* the same a single UTF-32 character. This makes the UTF-8
				77	* representation of Unicode strictly smaller than UTF-32.
				78	*
				79	* The shortest sequence requirement was introduced by:
				80	* Corrigendum #1: UTF-8 Shortest Form
				81	* It can be found here:
				82	* http://www.unicode.org/versions/corrigendum1.html
				83	*
				84	*/
				85
				86	/*
				87	* Return the number of bytes used by the current UTF-8 sequence.
				88	* Assumes the input points to the first byte of a valid UTF-8
				89	* sequence.
				90	*/
				91	static inline int utf8clen(const char *s)
				92	{
				93	unsigned char c = *s;
				94
				95	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
				96	}
				97
				98	/*
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	99	* Decode a 3-byte UTF-8 sequence.
				100	*/
				101	static unsigned int
				102	utf8decode3(const char *str)
				103	{
				104	unsigned int uc;
				105
				106	uc = *str++ & 0x0F;
				107	uc <<= 6;
				108	uc \|= *str++ & 0x3F;
				109	uc <<= 6;
				110	uc \|= *str++ & 0x3F;
				111
				112	return uc;
				113	}
				114
				115	/*
				116	* Encode a 3-byte UTF-8 sequence.
				117	*/
				118	static int
				119	utf8encode3(char *str, unsigned int val)
				120	{
				121	str[2] = (val & 0x3F) \| 0x80;
				122	val >>= 6;
				123	str[1] = (val & 0x3F) \| 0x80;
				124	val >>= 6;
				125	str[0] = val \| 0xE0;
				126
				127	return 3;
				128	}
				129
				130	/*
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	131	* utf8trie_t
				132	*
				133	* A compact binary tree, used to decode UTF-8 characters.
				134	*
				135	* Internal nodes are one byte for the node itself, and up to three
				136	* bytes for an offset into the tree. The first byte contains the
				137	* following information:
				138	* NEXTBYTE - flag - advance to next byte if set
				139	* BITNUM - 3 bit field - the bit number to tested
				140	* OFFLEN - 2 bit field - number of bytes in the offset
				141	* if offlen == 0 (non-branching node)
				142	* RIGHTPATH - 1 bit field - set if the following node is for the
				143	* right-hand path (tested bit is set)
				144	* TRIENODE - 1 bit field - set if the following node is an internal
				145	* node, otherwise it is a leaf node
				146	* if offlen != 0 (branching node)
				147	* LEFTNODE - 1 bit field - set if the left-hand node is internal
				148	* RIGHTNODE - 1 bit field - set if the right-hand node is internal
				149	*
				150	* Due to the way utf8 works, there cannot be branching nodes with
				151	* NEXTBYTE set, and moreover those nodes always have a righthand
				152	* descendant.
				153	*/
				154	typedef const unsigned char utf8trie_t;
				155	#define BITNUM 0x07
				156	#define NEXTBYTE 0x08
				157	#define OFFLEN 0x30
				158	#define OFFLEN_SHIFT 4
				159	#define RIGHTPATH 0x40
				160	#define TRIENODE 0x80
				161	#define RIGHTNODE 0x40
				162	#define LEFTNODE 0x80
				163
				164	/*
				165	* utf8leaf_t
				166	*
				167	* The leaves of the trie are embedded in the trie, and so the same
				168	* underlying datatype: unsigned char.
				169	*
				170	* leaf[0]: The unicode version, stored as a generation number that is
				171	* an index into utf8agetab[]. With this we can filter code
				172	* points based on the unicode version in which they were
				173	* defined. The CCC of a non-defined code point is 0.
				174	* leaf[1]: Canonical Combining Class. During normalization, we need
				175	* to do a stable sort into ascending order of all characters
				176	* with a non-zero CCC that occur between two characters with
				177	* a CCC of 0, or at the begin or end of a string.
				178	* The unicode standard guarantees that all CCC values are
				179	* between 0 and 254 inclusive, which leaves 255 available as
				180	* a special value.
				181	* Code points with CCC 0 are known as stoppers.
				182	* leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
				183	* start of a NUL-terminated string that is the decomposition
				184	* of the character.
				185	* The CCC of a decomposable character is the same as the CCC
				186	* of the first character of its decomposition.
				187	* Some characters decompose as the empty string: these are
				188	* characters with the Default_Ignorable_Code_Point property.
				189	* These do affect normalization, as they all have CCC 0.
				190	*
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	191	* The decompositions in the trie have been fully expanded, with the
				192	* exception of Hangul syllables, which are decomposed algorithmically.
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	193	*
				194	* Casefolding, if applicable, is also done using decompositions.
				195	*
				196	* The trie is constructed in such a way that leaves exist for all
				197	* UTF-8 sequences that match the criteria from the "UTF-8 valid
				198	* ranges" comment above, and only for those sequences. Therefore a
				199	* lookup in the trie can be used to validate the UTF-8 input.
				200	*/
				201	typedef const unsigned char utf8leaf_t;
				202
				203	#define LEAF_GEN(LEAF) ((LEAF)[0])
				204	#define LEAF_CCC(LEAF) ((LEAF)[1])
				205	#define LEAF_STR(LEAF) ((const char *)((LEAF) + 2))
				206
				207	#define MINCCC (0)
				208	#define MAXCCC (254)
				209	#define STOPPER (0)
				210	#define DECOMPOSE (255)
				211
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	212	/* Marker for hangul syllable decomposition. */
				213	#define HANGUL ((char)(255))
				214	/* Size of the synthesized leaf used for Hangul syllable decomposition. */
				215	#define UTF8HANGULLEAF (12)
				216
				217	/*
				218	* Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
				219	*
				220	* AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
				221	* D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
				222	*
				223	* SBase = 0xAC00
				224	* LBase = 0x1100
				225	* VBase = 0x1161
				226	* TBase = 0x11A7
				227	* LCount = 19
				228	* VCount = 21
				229	* TCount = 28
				230	* NCount = 588 (VCount * TCount)
				231	* SCount = 11172 (LCount * NCount)
				232	*
				233	* Decomposition:
				234	* SIndex = s - SBase
				235	*
				236	* LV (Canonical/Full)
				237	* LIndex = SIndex / NCount
				238	* VIndex = (Sindex % NCount) / TCount
				239	* LPart = LBase + LIndex
				240	* VPart = VBase + VIndex
				241	*
				242	* LVT (Canonical)
				243	* LVIndex = (SIndex / TCount) * TCount
				244	* TIndex = (Sindex % TCount)
				245	* LVPart = SBase + LVIndex
				246	* TPart = TBase + TIndex
				247	*
				248	* LVT (Full)
				249	* LIndex = SIndex / NCount
				250	* VIndex = (Sindex % NCount) / TCount
				251	* TIndex = (Sindex % TCount)
				252	* LPart = LBase + LIndex
				253	* VPart = VBase + VIndex
				254	* if (TIndex == 0) {
				255	* d = <LPart, VPart>
				256	* } else {
				257	* TPart = TBase + TIndex
				258	* d = <LPart, TPart, VPart>
				259	* }
				260	*/
				261
				262	/* Constants */
				263	#define SB (0xAC00)
				264	#define LB (0x1100)
				265	#define VB (0x1161)
				266	#define TB (0x11A7)
				267	#define LC (19)
				268	#define VC (21)
				269	#define TC (28)
				270	#define NC (VC * TC)
				271	#define SC (LC * NC)
				272
				273	/* Algorithmic decomposition of hangul syllable. */
				274	static utf8leaf_t *
				275	utf8hangul(const char str, unsigned char hangul)
				276	{
				277	unsigned int si;
				278	unsigned int li;
				279	unsigned int vi;
				280	unsigned int ti;
				281	unsigned char *h;
				282
				283	/* Calculate the SI, LI, VI, and TI values. */
				284	si = utf8decode3(str) - SB;
				285	li = si / NC;
				286	vi = (si % NC) / TC;
				287	ti = si % TC;
				288
				289	/* Fill in base of leaf. */
				290	h = hangul;
				291	LEAF_GEN(h) = 2;
				292	LEAF_CCC(h) = DECOMPOSE;
				293	h += 2;
				294
				295	/* Add LPart, a 3-byte UTF-8 sequence. */
				296	h += utf8encode3((char *)h, li + LB);
				297
				298	/* Add VPart, a 3-byte UTF-8 sequence. */
				299	h += utf8encode3((char *)h, vi + VB);
				300
				301	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
				302	if (ti)
				303	h += utf8encode3((char *)h, ti + TB);
				304
				305	/* Terminate string. */
				306	h[0] = '\0';
				307
				308	return hangul;
				309	}
				310
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	311	/*
				312	* Use trie to scan s, touching at most len bytes.
				313	* Returns the leaf if one exists, NULL otherwise.
				314	*
				315	* A non-NULL return guarantees that the UTF-8 sequence starting at s
				316	* is well-formed and corresponds to a known unicode code point. The
				317	* shorthand for this will be "is valid UTF-8 unicode".
				318	*/
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	319	static utf8leaf_t utf8nlookup(const struct utf8data data,
				320	unsigned char hangul, const char s, size_t len)
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	321	{
				322	utf8trie_t *trie = NULL;
				323	int offlen;
				324	int offset;
				325	int mask;
				326	int node;
				327
				328	if (!data)
				329	return NULL;
				330	if (len == 0)
				331	return NULL;
				332
				333	trie = utf8data + data->offset;
				334	node = 1;
				335	while (node) {
				336	offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
				337	if (*trie & NEXTBYTE) {
				338	if (--len == 0)
				339	return NULL;
				340	s++;
				341	}
				342	mask = 1 << (*trie & BITNUM);
				343	if (*s & mask) {
				344	/* Right leg */
				345	if (offlen) {
				346	/* Right node at offset of trie */
				347	node = (*trie & RIGHTNODE);
				348	offset = trie[offlen];
				349	while (--offlen) {
				350	offset <<= 8;
				351	offset \|= trie[offlen];
				352	}
				353	trie += offset;
				354	} else if (*trie & RIGHTPATH) {
				355	/* Right node after this node */
				356	node = (*trie & TRIENODE);
				357	trie++;
				358	} else {
				359	/* No right node. */
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	360	return NULL;
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	361	}
				362	} else {
				363	/* Left leg */
				364	if (offlen) {
				365	/* Left node after this node. */
				366	node = (*trie & LEFTNODE);
				367	trie += offlen + 1;
				368	} else if (*trie & RIGHTPATH) {
				369	/* No left node. */
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	370	return NULL;
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	371	} else {
				372	/* Left node after this node */
				373	node = (*trie & TRIENODE);
				374	trie++;
				375	}
				376	}
				377	}
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	378	/*
				379	* Hangul decomposition is done algorithmically. These are the
				380	* codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
				381	* always 3 bytes long, so s has been advanced twice, and the
				382	* start of the sequence is at s-2.
				383	*/
				384	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
				385	trie = utf8hangul(s - 2, hangul);
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	386	return trie;
				387	}
				388
				389	/*
				390	* Use trie to scan s.
				391	* Returns the leaf if one exists, NULL otherwise.
				392	*
				393	* Forwards to utf8nlookup().
				394	*/
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	395	static utf8leaf_t utf8lookup(const struct utf8data data,
				396	unsigned char hangul, const char s)
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	397	{
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	398	return utf8nlookup(data, hangul, s, (size_t)-1);
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	399	}
				400
				401	/*
				402	* Maximum age of any character in s.
				403	* Return -1 if s is not valid UTF-8 unicode.
				404	* Return 0 if only non-assigned code points are used.
				405	*/
				406	int utf8agemax(const struct utf8data data, const char s)
				407	{
				408	utf8leaf_t *leaf;
				409	int age = 0;
				410	int leaf_age;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	411	unsigned char hangul[UTF8HANGULLEAF];
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	412
				413	if (!data)
				414	return -1;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	415
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	416	while (*s) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	417	leaf = utf8lookup(data, hangul, s);
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	418	if (!leaf)
				419	return -1;
				420
				421	leaf_age = utf8agetab[LEAF_GEN(leaf)];
				422	if (leaf_age <= data->maxage && leaf_age > age)
				423	age = leaf_age;
				424	s += utf8clen(s);
				425	}
				426	return age;
				427	}
				428	EXPORT_SYMBOL(utf8agemax);
				429
				430	/*
				431	* Minimum age of any character in s.
				432	* Return -1 if s is not valid UTF-8 unicode.
				433	* Return 0 if non-assigned code points are used.
				434	*/
				435	int utf8agemin(const struct utf8data data, const char s)
				436	{
				437	utf8leaf_t *leaf;
				438	int age;
				439	int leaf_age;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	440	unsigned char hangul[UTF8HANGULLEAF];
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	441
				442	if (!data)
				443	return -1;
				444	age = data->maxage;
				445	while (*s) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	446	leaf = utf8lookup(data, hangul, s);
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	447	if (!leaf)
				448	return -1;
				449	leaf_age = utf8agetab[LEAF_GEN(leaf)];
				450	if (leaf_age <= data->maxage && leaf_age < age)
				451	age = leaf_age;
				452	s += utf8clen(s);
				453	}
				454	return age;
				455	}
				456	EXPORT_SYMBOL(utf8agemin);
				457
				458	/*
				459	* Maximum age of any character in s, touch at most len bytes.
				460	* Return -1 if s is not valid UTF-8 unicode.
				461	*/
				462	int utf8nagemax(const struct utf8data data, const char s, size_t len)
				463	{
				464	utf8leaf_t *leaf;
				465	int age = 0;
				466	int leaf_age;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	467	unsigned char hangul[UTF8HANGULLEAF];
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	468
				469	if (!data)
				470	return -1;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	471
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	472	while (len && *s) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	473	leaf = utf8nlookup(data, hangul, s, len);
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	474	if (!leaf)
				475	return -1;
				476	leaf_age = utf8agetab[LEAF_GEN(leaf)];
				477	if (leaf_age <= data->maxage && leaf_age > age)
				478	age = leaf_age;
				479	len -= utf8clen(s);
				480	s += utf8clen(s);
				481	}
				482	return age;
				483	}
				484	EXPORT_SYMBOL(utf8nagemax);
				485
				486	/*
				487	* Maximum age of any character in s, touch at most len bytes.
				488	* Return -1 if s is not valid UTF-8 unicode.
				489	*/
				490	int utf8nagemin(const struct utf8data data, const char s, size_t len)
				491	{
				492	utf8leaf_t *leaf;
				493	int leaf_age;
				494	int age;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	495	unsigned char hangul[UTF8HANGULLEAF];
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	496
				497	if (!data)
				498	return -1;
				499	age = data->maxage;
				500	while (len && *s) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	501	leaf = utf8nlookup(data, hangul, s, len);
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	502	if (!leaf)
				503	return -1;
				504	leaf_age = utf8agetab[LEAF_GEN(leaf)];
				505	if (leaf_age <= data->maxage && leaf_age < age)
				506	age = leaf_age;
				507	len -= utf8clen(s);
				508	s += utf8clen(s);
				509	}
				510	return age;
				511	}
				512	EXPORT_SYMBOL(utf8nagemin);
				513
				514	/*
				515	* Length of the normalization of s.
				516	* Return -1 if s is not valid UTF-8 unicode.
				517	*
				518	* A string of Default_Ignorable_Code_Point has length 0.
				519	*/
				520	ssize_t utf8len(const struct utf8data data, const char s)
				521	{
				522	utf8leaf_t *leaf;
				523	size_t ret = 0;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	524	unsigned char hangul[UTF8HANGULLEAF];
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	525
				526	if (!data)
				527	return -1;
				528	while (*s) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	529	leaf = utf8lookup(data, hangul, s);
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	530	if (!leaf)
				531	return -1;
				532	if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
				533	ret += utf8clen(s);
				534	else if (LEAF_CCC(leaf) == DECOMPOSE)
				535	ret += strlen(LEAF_STR(leaf));
				536	else
				537	ret += utf8clen(s);
				538	s += utf8clen(s);
				539	}
				540	return ret;
				541	}
				542	EXPORT_SYMBOL(utf8len);
				543
				544	/*
				545	* Length of the normalization of s, touch at most len bytes.
				546	* Return -1 if s is not valid UTF-8 unicode.
				547	*/
				548	ssize_t utf8nlen(const struct utf8data data, const char s, size_t len)
				549	{
				550	utf8leaf_t *leaf;
				551	size_t ret = 0;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	552	unsigned char hangul[UTF8HANGULLEAF];
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	553
				554	if (!data)
				555	return -1;
				556	while (len && *s) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	557	leaf = utf8nlookup(data, hangul, s, len);
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	558	if (!leaf)
				559	return -1;
				560	if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
				561	ret += utf8clen(s);
				562	else if (LEAF_CCC(leaf) == DECOMPOSE)
				563	ret += strlen(LEAF_STR(leaf));
				564	else
				565	ret += utf8clen(s);
				566	len -= utf8clen(s);
				567	s += utf8clen(s);
				568	}
				569	return ret;
				570	}
				571	EXPORT_SYMBOL(utf8nlen);
				572
				573	/*
				574	* Set up an utf8cursor for use by utf8byte().
				575	*
				576	* u8c : pointer to cursor.
				577	* data : const struct utf8data to use for normalization.
				578	* s : string.
				579	* len : length of s.
				580	*
				581	* Returns -1 on error, 0 on success.
				582	*/
				583	int utf8ncursor(struct utf8cursor u8c, const struct utf8data data,
				584	const char *s, size_t len)
				585	{
				586	if (!data)
				587	return -1;
				588	if (!s)
				589	return -1;
				590	u8c->data = data;
				591	u8c->s = s;
				592	u8c->p = NULL;
				593	u8c->ss = NULL;
				594	u8c->sp = NULL;
				595	u8c->len = len;
				596	u8c->slen = 0;
				597	u8c->ccc = STOPPER;
				598	u8c->nccc = STOPPER;
				599	/* Check we didn't clobber the maximum length. */
				600	if (u8c->len != len)
				601	return -1;
				602	/* The first byte of s may not be an utf8 continuation. */
				603	if (len > 0 && (*s & 0xC0) == 0x80)
				604	return -1;
				605	return 0;
				606	}
				607	EXPORT_SYMBOL(utf8ncursor);
				608
				609	/*
				610	* Set up an utf8cursor for use by utf8byte().
				611	*
				612	* u8c : pointer to cursor.
				613	* data : const struct utf8data to use for normalization.
				614	* s : NUL-terminated string.
				615	*
				616	* Returns -1 on error, 0 on success.
				617	*/
				618	int utf8cursor(struct utf8cursor u8c, const struct utf8data data,
				619	const char *s)
				620	{
				621	return utf8ncursor(u8c, data, s, (unsigned int)-1);
				622	}
				623	EXPORT_SYMBOL(utf8cursor);
				624
				625	/*
				626	* Get one byte from the normalized form of the string described by u8c.
				627	*
				628	* Returns the byte cast to an unsigned char on succes, and -1 on failure.
				629	*
				630	* The cursor keeps track of the location in the string in u8c->s.
				631	* When a character is decomposed, the current location is stored in
				632	* u8c->p, and u8c->s is set to the start of the decomposition. Note
				633	* that bytes from a decomposition do not count against u8c->len.
				634	*
				635	* Characters are emitted if they match the current CCC in u8c->ccc.
				636	* Hitting end-of-string while u8c->ccc == STOPPER means we're done,
				637	* and the function returns 0 in that case.
				638	*
				639	* Sorting by CCC is done by repeatedly scanning the string. The
				640	* values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
				641	* the start of the scan. The first pass finds the lowest CCC to be
				642	* emitted and stores it in u8c->nccc, the second pass emits the
				643	* characters with this CCC and finds the next lowest CCC. This limits
				644	* the number of passes to 1 + the number of different CCCs in the
				645	* sequence being scanned.
				646	*
				647	* Therefore:
				648	* u8c->p != NULL -> a decomposition is being scanned.
				649	* u8c->ss != NULL -> this is a repeating scan.
				650	* u8c->ccc == -1 -> this is the first scan of a repeating scan.
				651	*/
				652	int utf8byte(struct utf8cursor *u8c)
				653	{
				654	utf8leaf_t *leaf;
				655	int ccc;
				656
				657	for (;;) {
				658	/* Check for the end of a decomposed character. */
				659	if (u8c->p && *u8c->s == '\0') {
				660	u8c->s = u8c->p;
				661	u8c->p = NULL;
				662	}
				663
				664	/* Check for end-of-string. */
				665	if (!u8c->p && (u8c->len == 0 \|\| *u8c->s == '\0')) {
				666	/* There is no next byte. */
				667	if (u8c->ccc == STOPPER)
				668	return 0;
				669	/* End-of-string during a scan counts as a stopper. */
				670	ccc = STOPPER;
				671	goto ccc_mismatch;
				672	} else if ((*u8c->s & 0xC0) == 0x80) {
				673	/* This is a continuation of the current character. */
				674	if (!u8c->p)
				675	u8c->len--;
				676	return (unsigned char)*u8c->s++;
				677	}
				678
				679	/* Look up the data for the current character. */
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	680	if (u8c->p) {
				681	leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
				682	} else {
				683	leaf = utf8nlookup(u8c->data, u8c->hangul,
				684	u8c->s, u8c->len);
				685	}
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	686
				687	/* No leaf found implies that the input is a binary blob. */
				688	if (!leaf)
				689	return -1;
				690
				691	ccc = LEAF_CCC(leaf);
				692	/* Characters that are too new have CCC 0. */
				693	if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
				694	ccc = STOPPER;
				695	} else if (ccc == DECOMPOSE) {
				696	u8c->len -= utf8clen(u8c->s);
				697	u8c->p = u8c->s + utf8clen(u8c->s);
				698	u8c->s = LEAF_STR(leaf);
				699	/* Empty decomposition implies CCC 0. */
				700	if (*u8c->s == '\0') {
				701	if (u8c->ccc == STOPPER)
				702	continue;
				703	ccc = STOPPER;
				704	goto ccc_mismatch;
				705	}
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	706
				707	leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
Theodore Ts'o	15f0d8d	2019-05-12 04:56:51 -0400	[diff] [blame]	708	if (!leaf)
				709	return -1;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	710	ccc = LEAF_CCC(leaf);
Olaf Weber	44594c2	2019-04-25 13:45:46 -0400	[diff] [blame]	711	}
				712
				713	/*
				714	* If this is not a stopper, then see if it updates
				715	* the next canonical class to be emitted.
				716	*/
				717	if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
				718	u8c->nccc = ccc;
				719
				720	/*
				721	* Return the current byte if this is the current
				722	* combining class.
				723	*/
				724	if (ccc == u8c->ccc) {
				725	if (!u8c->p)
				726	u8c->len--;
				727	return (unsigned char)*u8c->s++;
				728	}
				729
				730	/* Current combining class mismatch. */
				731	ccc_mismatch:
				732	if (u8c->nccc == STOPPER) {
				733	/*
				734	* Scan forward for the first canonical class
				735	* to be emitted. Save the position from
				736	* which to restart.
				737	*/
				738	u8c->ccc = MINCCC - 1;
				739	u8c->nccc = ccc;
				740	u8c->sp = u8c->p;
				741	u8c->ss = u8c->s;
				742	u8c->slen = u8c->len;
				743	if (!u8c->p)
				744	u8c->len -= utf8clen(u8c->s);
				745	u8c->s += utf8clen(u8c->s);
				746	} else if (ccc != STOPPER) {
				747	/* Not a stopper, and not the ccc we're emitting. */
				748	if (!u8c->p)
				749	u8c->len -= utf8clen(u8c->s);
				750	u8c->s += utf8clen(u8c->s);
				751	} else if (u8c->nccc != MAXCCC + 1) {
				752	/* At a stopper, restart for next ccc. */
				753	u8c->ccc = u8c->nccc;
				754	u8c->nccc = MAXCCC + 1;
				755	u8c->s = u8c->ss;
				756	u8c->p = u8c->sp;
				757	u8c->len = u8c->slen;
				758	} else {
				759	/* All done, proceed from here. */
				760	u8c->ccc = STOPPER;
				761	u8c->nccc = STOPPER;
				762	u8c->sp = NULL;
				763	u8c->ss = NULL;
				764	u8c->slen = 0;
				765	}
				766	}
				767	}
				768	EXPORT_SYMBOL(utf8byte);
				769
				770	const struct utf8data *utf8nfdi(unsigned int maxage)
				771	{
				772	int i = ARRAY_SIZE(utf8nfdidata) - 1;
				773
				774	while (maxage < utf8nfdidata[i].maxage)
				775	i--;
				776	if (maxage > utf8nfdidata[i].maxage)
				777	return NULL;
				778	return &utf8nfdidata[i];
				779	}
				780	EXPORT_SYMBOL(utf8nfdi);
				781
				782	const struct utf8data *utf8nfdicf(unsigned int maxage)
				783	{
				784	int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
				785
				786	while (maxage < utf8nfdicfdata[i].maxage)
				787	i--;
				788	if (maxage > utf8nfdicfdata[i].maxage)
				789	return NULL;
				790	return &utf8nfdicfdata[i];
				791	}
				792	EXPORT_SYMBOL(utf8nfdicf);