Blame - fs/unicode/mkutf8data.c - SHIFTPHONES/mainline/linux

blob: bc1a7c8b5c8dfc10b723fd7fa696c8a287aada38 [file] [log] [blame]

Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1	/*
				2	* Copyright (c) 2014 SGI.
				3	* All rights reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public License as
				7	* published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it would be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				12	* GNU General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public License
				15	* along with this program; if not, write the Free Software Foundation,
				16	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
				17	*/
				18
				19	/* Generator for a compact trie for unicode normalization */
				20
				21	#include <sys/types.h>
				22	#include <stddef.h>
				23	#include <stdlib.h>
				24	#include <stdio.h>
				25	#include <assert.h>
				26	#include <string.h>
				27	#include <unistd.h>
				28	#include <errno.h>
				29
				30	/* Default names of the in- and output files. */
				31
				32	#define AGE_NAME "DerivedAge.txt"
				33	#define CCC_NAME "DerivedCombiningClass.txt"
				34	#define PROP_NAME "DerivedCoreProperties.txt"
				35	#define DATA_NAME "UnicodeData.txt"
				36	#define FOLD_NAME "CaseFolding.txt"
				37	#define NORM_NAME "NormalizationCorrections.txt"
				38	#define TEST_NAME "NormalizationTest.txt"
				39	#define UTF8_NAME "utf8data.h"
				40
				41	const char *age_name = AGE_NAME;
				42	const char *ccc_name = CCC_NAME;
				43	const char *prop_name = PROP_NAME;
				44	const char *data_name = DATA_NAME;
				45	const char *fold_name = FOLD_NAME;
				46	const char *norm_name = NORM_NAME;
				47	const char *test_name = TEST_NAME;
				48	const char *utf8_name = UTF8_NAME;
				49
				50	int verbose = 0;
				51
				52	/* An arbitrary line size limit on input lines. */
				53
				54	#define LINESIZE 1024
				55	char line[LINESIZE];
				56	char buf0[LINESIZE];
				57	char buf1[LINESIZE];
				58	char buf2[LINESIZE];
				59	char buf3[LINESIZE];
				60
				61	const char *argv0;
				62
				63	#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
				64
				65	/* ------------------------------------------------------------------ */
				66
				67	/*
				68	* Unicode version numbers consist of three parts: major, minor, and a
				69	* revision. These numbers are packed into an unsigned int to obtain
				70	* a single version number.
				71	*
				72	* To save space in the generated trie, the unicode version is not
				73	* stored directly, instead we calculate a generation number from the
				74	* unicode versions seen in the DerivedAge file, and use that as an
				75	* index into a table of unicode versions.
				76	*/
				77	#define UNICODE_MAJ_SHIFT (16)
				78	#define UNICODE_MIN_SHIFT (8)
				79
				80	#define UNICODE_MAJ_MAX ((unsigned short)-1)
				81	#define UNICODE_MIN_MAX ((unsigned char)-1)
				82	#define UNICODE_REV_MAX ((unsigned char)-1)
				83
				84	#define UNICODE_AGE(MAJ,MIN,REV) \
				85	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) \| \
				86	((unsigned int)(MIN) << UNICODE_MIN_SHIFT) \| \
				87	((unsigned int)(REV)))
				88
				89	unsigned int *ages;
				90	int ages_count;
				91
				92	unsigned int unicode_maxage;
				93
				94	static int age_valid(unsigned int major, unsigned int minor,
				95	unsigned int revision)
				96	{
				97	if (major > UNICODE_MAJ_MAX)
				98	return 0;
				99	if (minor > UNICODE_MIN_MAX)
				100	return 0;
				101	if (revision > UNICODE_REV_MAX)
				102	return 0;
				103	return 1;
				104	}
				105
				106	/* ------------------------------------------------------------------ */
				107
				108	/*
				109	* utf8trie_t
				110	*
				111	* A compact binary tree, used to decode UTF-8 characters.
				112	*
				113	* Internal nodes are one byte for the node itself, and up to three
				114	* bytes for an offset into the tree. The first byte contains the
				115	* following information:
				116	* NEXTBYTE - flag - advance to next byte if set
				117	* BITNUM - 3 bit field - the bit number to tested
				118	* OFFLEN - 2 bit field - number of bytes in the offset
				119	* if offlen == 0 (non-branching node)
				120	* RIGHTPATH - 1 bit field - set if the following node is for the
				121	* right-hand path (tested bit is set)
				122	* TRIENODE - 1 bit field - set if the following node is an internal
				123	* node, otherwise it is a leaf node
				124	* if offlen != 0 (branching node)
				125	* LEFTNODE - 1 bit field - set if the left-hand node is internal
				126	* RIGHTNODE - 1 bit field - set if the right-hand node is internal
				127	*
				128	* Due to the way utf8 works, there cannot be branching nodes with
				129	* NEXTBYTE set, and moreover those nodes always have a righthand
				130	* descendant.
				131	*/
				132	typedef unsigned char utf8trie_t;
				133	#define BITNUM 0x07
				134	#define NEXTBYTE 0x08
				135	#define OFFLEN 0x30
				136	#define OFFLEN_SHIFT 4
				137	#define RIGHTPATH 0x40
				138	#define TRIENODE 0x80
				139	#define RIGHTNODE 0x40
				140	#define LEFTNODE 0x80
				141
				142	/*
				143	* utf8leaf_t
				144	*
				145	* The leaves of the trie are embedded in the trie, and so the same
				146	* underlying datatype, unsigned char.
				147	*
				148	* leaf[0]: The unicode version, stored as a generation number that is
				149	* an index into utf8agetab[]. With this we can filter code
				150	* points based on the unicode version in which they were
				151	* defined. The CCC of a non-defined code point is 0.
				152	* leaf[1]: Canonical Combining Class. During normalization, we need
				153	* to do a stable sort into ascending order of all characters
				154	* with a non-zero CCC that occur between two characters with
				155	* a CCC of 0, or at the begin or end of a string.
				156	* The unicode standard guarantees that all CCC values are
				157	* between 0 and 254 inclusive, which leaves 255 available as
				158	* a special value.
				159	* Code points with CCC 0 are known as stoppers.
				160	* leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
				161	* start of a NUL-terminated string that is the decomposition
				162	* of the character.
				163	* The CCC of a decomposable character is the same as the CCC
				164	* of the first character of its decomposition.
				165	* Some characters decompose as the empty string: these are
				166	* characters with the Default_Ignorable_Code_Point property.
				167	* These do affect normalization, as they all have CCC 0.
				168	*
				169	* The decompositions in the trie have been fully expanded.
				170	*
				171	* Casefolding, if applicable, is also done using decompositions.
				172	*/
				173	typedef unsigned char utf8leaf_t;
				174
				175	#define LEAF_GEN(LEAF) ((LEAF)[0])
				176	#define LEAF_CCC(LEAF) ((LEAF)[1])
				177	#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
				178
				179	#define MAXGEN (255)
				180
				181	#define MINCCC (0)
				182	#define MAXCCC (254)
				183	#define STOPPER (0)
				184	#define DECOMPOSE (255)
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	185	#define HANGUL ((char)(255))
				186
				187	#define UTF8HANGULLEAF (12)
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	188
				189	struct tree;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	190	static utf8leaf_t utf8nlookup(struct tree , unsigned char *,
				191	const char *, size_t);
				192	static utf8leaf_t utf8lookup(struct tree , unsigned char , const char );
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	193
				194	unsigned char *utf8data;
				195	size_t utf8data_size;
				196
				197	utf8trie_t *nfdi;
				198	utf8trie_t *nfdicf;
				199
				200	/* ------------------------------------------------------------------ */
				201
				202	/*
				203	* UTF8 valid ranges.
				204	*
				205	* The UTF-8 encoding spreads the bits of a 32bit word over several
				206	* bytes. This table gives the ranges that can be held and how they'd
				207	* be represented.
				208	*
				209	* 0x00000000 0x0000007F: 0xxxxxxx
				210	* 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
				211	* 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
				212	* 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				213	* 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				214	* 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				215	*
				216	* There is an additional requirement on UTF-8, in that only the
				217	* shortest representation of a 32bit value is to be used. A decoder
				218	* must not decode sequences that do not satisfy this requirement.
				219	* Thus the allowed ranges have a lower bound.
				220	*
				221	* 0x00000000 0x0000007F: 0xxxxxxx
				222	* 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
				223	* 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
				224	* 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				225	* 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				226	* 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				227	*
				228	* Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
				229	* 17 planes of 65536 values. This limits the sequences actually seen
				230	* even more, to just the following.
				231	*
				232	* 0 - 0x7f: 0 0x7f
				233	* 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
				234	* 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
				235	* 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
				236	*
				237	* Even within those ranges not all values are allowed: the surrogates
				238	* 0xd800 - 0xdfff should never be seen.
				239	*
				240	* Note that the longest sequence seen with valid usage is 4 bytes,
				241	* the same a single UTF-32 character. This makes the UTF-8
				242	* representation of Unicode strictly smaller than UTF-32.
				243	*
				244	* The shortest sequence requirement was introduced by:
				245	* Corrigendum #1: UTF-8 Shortest Form
				246	* It can be found here:
				247	* http://www.unicode.org/versions/corrigendum1.html
				248	*
				249	*/
				250
				251	#define UTF8_2_BITS 0xC0
				252	#define UTF8_3_BITS 0xE0
				253	#define UTF8_4_BITS 0xF0
				254	#define UTF8_N_BITS 0x80
				255	#define UTF8_2_MASK 0xE0
				256	#define UTF8_3_MASK 0xF0
				257	#define UTF8_4_MASK 0xF8
				258	#define UTF8_N_MASK 0xC0
				259	#define UTF8_V_MASK 0x3F
				260	#define UTF8_V_SHIFT 6
				261
				262	static int utf8encode(char *str, unsigned int val)
				263	{
				264	int len;
				265
				266	if (val < 0x80) {
				267	str[0] = val;
				268	len = 1;
				269	} else if (val < 0x800) {
				270	str[1] = val & UTF8_V_MASK;
				271	str[1] \|= UTF8_N_BITS;
				272	val >>= UTF8_V_SHIFT;
				273	str[0] = val;
				274	str[0] \|= UTF8_2_BITS;
				275	len = 2;
				276	} else if (val < 0x10000) {
				277	str[2] = val & UTF8_V_MASK;
				278	str[2] \|= UTF8_N_BITS;
				279	val >>= UTF8_V_SHIFT;
				280	str[1] = val & UTF8_V_MASK;
				281	str[1] \|= UTF8_N_BITS;
				282	val >>= UTF8_V_SHIFT;
				283	str[0] = val;
				284	str[0] \|= UTF8_3_BITS;
				285	len = 3;
				286	} else if (val < 0x110000) {
				287	str[3] = val & UTF8_V_MASK;
				288	str[3] \|= UTF8_N_BITS;
				289	val >>= UTF8_V_SHIFT;
				290	str[2] = val & UTF8_V_MASK;
				291	str[2] \|= UTF8_N_BITS;
				292	val >>= UTF8_V_SHIFT;
				293	str[1] = val & UTF8_V_MASK;
				294	str[1] \|= UTF8_N_BITS;
				295	val >>= UTF8_V_SHIFT;
				296	str[0] = val;
				297	str[0] \|= UTF8_4_BITS;
				298	len = 4;
				299	} else {
				300	printf("%#x: illegal val\n", val);
				301	len = 0;
				302	}
				303	return len;
				304	}
				305
				306	static unsigned int utf8decode(const char *str)
				307	{
				308	const unsigned char s = (const unsigned char)str;
				309	unsigned int unichar = 0;
				310
				311	if (*s < 0x80) {
				312	unichar = *s;
				313	} else if (*s < UTF8_3_BITS) {
				314	unichar = *s++ & 0x1F;
				315	unichar <<= UTF8_V_SHIFT;
				316	unichar \|= *s & 0x3F;
				317	} else if (*s < UTF8_4_BITS) {
				318	unichar = *s++ & 0x0F;
				319	unichar <<= UTF8_V_SHIFT;
				320	unichar \|= *s++ & 0x3F;
				321	unichar <<= UTF8_V_SHIFT;
				322	unichar \|= *s & 0x3F;
				323	} else {
				324	unichar = *s++ & 0x0F;
				325	unichar <<= UTF8_V_SHIFT;
				326	unichar \|= *s++ & 0x3F;
				327	unichar <<= UTF8_V_SHIFT;
				328	unichar \|= *s++ & 0x3F;
				329	unichar <<= UTF8_V_SHIFT;
				330	unichar \|= *s & 0x3F;
				331	}
				332	return unichar;
				333	}
				334
				335	static int utf32valid(unsigned int unichar)
				336	{
				337	return unichar < 0x110000;
				338	}
				339
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	340	#define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
				341
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	342	#define NODE 1
				343	#define LEAF 0
				344
				345	struct tree {
				346	void *root;
				347	int childnode;
				348	const char *type;
				349	unsigned int maxage;
				350	struct tree *next;
				351	int (leaf_equal)(void , void *);
				352	void (leaf_print)(void , int);
				353	int (leaf_mark)(void );
				354	int (leaf_size)(void );
				355	int (leaf_index)(struct tree , void );
				356	unsigned char (leaf_emit)(void , unsigned char );
				357	int leafindex[0x110000];
				358	int index;
				359	};
				360
				361	struct node {
				362	int index;
				363	int offset;
				364	int mark;
				365	int size;
				366	struct node *parent;
				367	void *left;
				368	void *right;
				369	unsigned char bitnum;
				370	unsigned char nextbyte;
				371	unsigned char leftnode;
				372	unsigned char rightnode;
				373	unsigned int keybits;
				374	unsigned int keymask;
				375	};
				376
				377	/*
				378	* Example lookup function for a tree.
				379	*/
				380	static void lookup(struct tree tree, const char *key)
				381	{
				382	struct node *node;
				383	void *leaf = NULL;
				384
				385	node = tree->root;
				386	while (!leaf && node) {
				387	if (node->nextbyte)
				388	key++;
				389	if (*key & (1 << (node->bitnum & 7))) {
				390	/* Right leg */
				391	if (node->rightnode == NODE) {
				392	node = node->right;
				393	} else if (node->rightnode == LEAF) {
				394	leaf = node->right;
				395	} else {
				396	node = NULL;
				397	}
				398	} else {
				399	/* Left leg */
				400	if (node->leftnode == NODE) {
				401	node = node->left;
				402	} else if (node->leftnode == LEAF) {
				403	leaf = node->left;
				404	} else {
				405	node = NULL;
				406	}
				407	}
				408	}
				409
				410	return leaf;
				411	}
				412
				413	/*
				414	* A simple non-recursive tree walker: keep track of visits to the
				415	* left and right branches in the leftmask and rightmask.
				416	*/
				417	static void tree_walk(struct tree *tree)
				418	{
				419	struct node *node;
				420	unsigned int leftmask;
				421	unsigned int rightmask;
				422	unsigned int bitmask;
				423	int indent = 1;
				424	int nodes, singletons, leaves;
				425
				426	nodes = singletons = leaves = 0;
				427
				428	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
				429	if (tree->childnode == LEAF) {
				430	assert(tree->root);
				431	tree->leaf_print(tree->root, indent);
				432	leaves = 1;
				433	} else {
				434	assert(tree->childnode == NODE);
				435	node = tree->root;
				436	leftmask = rightmask = 0;
				437	while (node) {
				438	printf("%*snode @ %p bitnum %d nextbyte %d"
				439	" left %p right %p mask %x bits %x\n",
				440	indent, "", node,
				441	node->bitnum, node->nextbyte,
				442	node->left, node->right,
				443	node->keymask, node->keybits);
				444	nodes += 1;
				445	if (!(node->left && node->right))
				446	singletons += 1;
				447
				448	while (node) {
				449	bitmask = 1 << node->bitnum;
				450	if ((leftmask & bitmask) == 0) {
				451	leftmask \|= bitmask;
				452	if (node->leftnode == LEAF) {
				453	assert(node->left);
				454	tree->leaf_print(node->left,
				455	indent+1);
				456	leaves += 1;
				457	} else if (node->left) {
				458	assert(node->leftnode == NODE);
				459	indent += 1;
				460	node = node->left;
				461	break;
				462	}
				463	}
				464	if ((rightmask & bitmask) == 0) {
				465	rightmask \|= bitmask;
				466	if (node->rightnode == LEAF) {
				467	assert(node->right);
				468	tree->leaf_print(node->right,
				469	indent+1);
				470	leaves += 1;
				471	} else if (node->right) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	472	assert(node->rightnode == NODE);
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	473	indent += 1;
				474	node = node->right;
				475	break;
				476	}
				477	}
				478	leftmask &= ~bitmask;
				479	rightmask &= ~bitmask;
				480	node = node->parent;
				481	indent -= 1;
				482	}
				483	}
				484	}
				485	printf("nodes %d leaves %d singletons %d\n",
				486	nodes, leaves, singletons);
				487	}
				488
				489	/*
				490	* Allocate an initialize a new internal node.
				491	*/
				492	static struct node alloc_node(struct node parent)
				493	{
				494	struct node *node;
				495	int bitnum;
				496
				497	node = malloc(sizeof(*node));
				498	node->left = node->right = NULL;
				499	node->parent = parent;
				500	node->leftnode = NODE;
				501	node->rightnode = NODE;
				502	node->keybits = 0;
				503	node->keymask = 0;
				504	node->mark = 0;
				505	node->index = 0;
				506	node->offset = -1;
				507	node->size = 4;
				508
				509	if (node->parent) {
				510	bitnum = parent->bitnum;
				511	if ((bitnum & 7) == 0) {
				512	node->bitnum = bitnum + 7 + 8;
				513	node->nextbyte = 1;
				514	} else {
				515	node->bitnum = bitnum - 1;
				516	node->nextbyte = 0;
				517	}
				518	} else {
				519	node->bitnum = 7;
				520	node->nextbyte = 0;
				521	}
				522
				523	return node;
				524	}
				525
				526	/*
				527	* Insert a new leaf into the tree, and collapse any subtrees that are
				528	* fully populated and end in identical leaves. A nextbyte tagged
				529	* internal node will not be removed to preserve the tree's integrity.
				530	* Note that due to the structure of utf8, no nextbyte tagged node
				531	* will be a candidate for removal.
				532	*/
				533	static int insert(struct tree tree, char key, int keylen, void *leaf)
				534	{
				535	struct node *node;
				536	struct node *parent;
				537	void **cursor;
				538	int keybits;
				539
				540	assert(keylen >= 1 && keylen <= 4);
				541
				542	node = NULL;
				543	cursor = &tree->root;
				544	keybits = 8 * keylen;
				545
				546	/* Insert, creating path along the way. */
				547	while (keybits) {
				548	if (!*cursor)
				549	*cursor = alloc_node(node);
				550	node = *cursor;
				551	if (node->nextbyte)
				552	key++;
				553	if (*key & (1 << (node->bitnum & 7)))
				554	cursor = &node->right;
				555	else
				556	cursor = &node->left;
				557	keybits--;
				558	}
				559	*cursor = leaf;
				560
				561	/* Merge subtrees if possible. */
				562	while (node) {
				563	if (*key & (1 << (node->bitnum & 7)))
				564	node->rightnode = LEAF;
				565	else
				566	node->leftnode = LEAF;
				567	if (node->nextbyte)
				568	break;
				569	if (node->leftnode == NODE \|\| node->rightnode == NODE)
				570	break;
				571	assert(node->left);
				572	assert(node->right);
				573	/* Compare */
				574	if (! tree->leaf_equal(node->left, node->right))
				575	break;
				576	/* Keep left, drop right leaf. */
				577	leaf = node->left;
				578	/* Check in parent */
				579	parent = node->parent;
				580	if (!parent) {
				581	/* root of tree! */
				582	tree->root = leaf;
				583	tree->childnode = LEAF;
				584	} else if (parent->left == node) {
				585	parent->left = leaf;
				586	parent->leftnode = LEAF;
				587	if (parent->right) {
				588	parent->keymask = 0;
				589	parent->keybits = 0;
				590	} else {
				591	parent->keymask \|= (1 << node->bitnum);
				592	}
				593	} else if (parent->right == node) {
				594	parent->right = leaf;
				595	parent->rightnode = LEAF;
				596	if (parent->left) {
				597	parent->keymask = 0;
				598	parent->keybits = 0;
				599	} else {
				600	parent->keymask \|= (1 << node->bitnum);
				601	parent->keybits \|= (1 << node->bitnum);
				602	}
				603	} else {
				604	/* internal tree error */
				605	assert(0);
				606	}
				607	free(node);
				608	node = parent;
				609	}
				610
				611	/* Propagate keymasks up along singleton chains. */
				612	while (node) {
				613	parent = node->parent;
				614	if (!parent)
				615	break;
				616	/* Nix the mask for parents with two children. */
				617	if (node->keymask == 0) {
				618	parent->keymask = 0;
				619	parent->keybits = 0;
				620	} else if (parent->left && parent->right) {
				621	parent->keymask = 0;
				622	parent->keybits = 0;
				623	} else {
				624	assert((parent->keymask & node->keymask) == 0);
				625	parent->keymask \|= node->keymask;
				626	parent->keymask \|= (1 << parent->bitnum);
				627	parent->keybits \|= node->keybits;
				628	if (parent->right)
				629	parent->keybits \|= (1 << parent->bitnum);
				630	}
				631	node = parent;
				632	}
				633
				634	return 0;
				635	}
				636
				637	/*
				638	* Prune internal nodes.
				639	*
				640	* Fully populated subtrees that end at the same leaf have already
				641	* been collapsed. There are still internal nodes that have for both
				642	* their left and right branches a sequence of singletons that make
				643	* identical choices and end in identical leaves. The keymask and
				644	* keybits collected in the nodes describe the choices made in these
				645	* singleton chains. When they are identical for the left and right
				646	* branch of a node, and the two leaves comare identical, the node in
				647	* question can be removed.
				648	*
				649	* Note that nodes with the nextbyte tag set will not be removed by
				650	* this to ensure tree integrity. Note as well that the structure of
				651	* utf8 ensures that these nodes would not have been candidates for
				652	* removal in any case.
				653	*/
				654	static void prune(struct tree *tree)
				655	{
				656	struct node *node;
				657	struct node *left;
				658	struct node *right;
				659	struct node *parent;
				660	void *leftleaf;
				661	void *rightleaf;
				662	unsigned int leftmask;
				663	unsigned int rightmask;
				664	unsigned int bitmask;
				665	int count;
				666
				667	if (verbose > 0)
				668	printf("Pruning %s_%x\n", tree->type, tree->maxage);
				669
				670	count = 0;
				671	if (tree->childnode == LEAF)
				672	return;
				673	if (!tree->root)
				674	return;
				675
				676	leftmask = rightmask = 0;
				677	node = tree->root;
				678	while (node) {
				679	if (node->nextbyte)
				680	goto advance;
				681	if (node->leftnode == LEAF)
				682	goto advance;
				683	if (node->rightnode == LEAF)
				684	goto advance;
				685	if (!node->left)
				686	goto advance;
				687	if (!node->right)
				688	goto advance;
				689	left = node->left;
				690	right = node->right;
				691	if (left->keymask == 0)
				692	goto advance;
				693	if (right->keymask == 0)
				694	goto advance;
				695	if (left->keymask != right->keymask)
				696	goto advance;
				697	if (left->keybits != right->keybits)
				698	goto advance;
				699	leftleaf = NULL;
				700	while (!leftleaf) {
				701	assert(left->left \|\| left->right);
				702	if (left->leftnode == LEAF)
				703	leftleaf = left->left;
				704	else if (left->rightnode == LEAF)
				705	leftleaf = left->right;
				706	else if (left->left)
				707	left = left->left;
				708	else if (left->right)
				709	left = left->right;
				710	else
				711	assert(0);
				712	}
				713	rightleaf = NULL;
				714	while (!rightleaf) {
				715	assert(right->left \|\| right->right);
				716	if (right->leftnode == LEAF)
				717	rightleaf = right->left;
				718	else if (right->rightnode == LEAF)
				719	rightleaf = right->right;
				720	else if (right->left)
				721	right = right->left;
				722	else if (right->right)
				723	right = right->right;
				724	else
				725	assert(0);
				726	}
				727	if (! tree->leaf_equal(leftleaf, rightleaf))
				728	goto advance;
				729	/*
				730	* This node has identical singleton-only subtrees.
				731	* Remove it.
				732	*/
				733	parent = node->parent;
				734	left = node->left;
				735	right = node->right;
				736	if (parent->left == node)
				737	parent->left = left;
				738	else if (parent->right == node)
				739	parent->right = left;
				740	else
				741	assert(0);
				742	left->parent = parent;
				743	left->keymask \|= (1 << node->bitnum);
				744	node->left = NULL;
				745	while (node) {
				746	bitmask = 1 << node->bitnum;
				747	leftmask &= ~bitmask;
				748	rightmask &= ~bitmask;
				749	if (node->leftnode == NODE && node->left) {
				750	left = node->left;
				751	free(node);
				752	count++;
				753	node = left;
				754	} else if (node->rightnode == NODE && node->right) {
				755	right = node->right;
				756	free(node);
				757	count++;
				758	node = right;
				759	} else {
				760	node = NULL;
				761	}
				762	}
				763	/* Propagate keymasks up along singleton chains. */
				764	node = parent;
				765	/* Force re-check */
				766	bitmask = 1 << node->bitnum;
				767	leftmask &= ~bitmask;
				768	rightmask &= ~bitmask;
				769	for (;;) {
				770	if (node->left && node->right)
				771	break;
				772	if (node->left) {
				773	left = node->left;
				774	node->keymask \|= left->keymask;
				775	node->keybits \|= left->keybits;
				776	}
				777	if (node->right) {
				778	right = node->right;
				779	node->keymask \|= right->keymask;
				780	node->keybits \|= right->keybits;
				781	}
				782	node->keymask \|= (1 << node->bitnum);
				783	node = node->parent;
				784	/* Force re-check */
				785	bitmask = 1 << node->bitnum;
				786	leftmask &= ~bitmask;
				787	rightmask &= ~bitmask;
				788	}
				789	advance:
				790	bitmask = 1 << node->bitnum;
				791	if ((leftmask & bitmask) == 0 &&
				792	node->leftnode == NODE &&
				793	node->left) {
				794	leftmask \|= bitmask;
				795	node = node->left;
				796	} else if ((rightmask & bitmask) == 0 &&
				797	node->rightnode == NODE &&
				798	node->right) {
				799	rightmask \|= bitmask;
				800	node = node->right;
				801	} else {
				802	leftmask &= ~bitmask;
				803	rightmask &= ~bitmask;
				804	node = node->parent;
				805	}
				806	}
				807	if (verbose > 0)
				808	printf("Pruned %d nodes\n", count);
				809	}
				810
				811	/*
				812	* Mark the nodes in the tree that lead to leaves that must be
				813	* emitted.
				814	*/
				815	static void mark_nodes(struct tree *tree)
				816	{
				817	struct node *node;
				818	struct node *n;
				819	unsigned int leftmask;
				820	unsigned int rightmask;
				821	unsigned int bitmask;
				822	int marked;
				823
				824	marked = 0;
				825	if (verbose > 0)
				826	printf("Marking %s_%x\n", tree->type, tree->maxage);
				827	if (tree->childnode == LEAF)
				828	goto done;
				829
				830	assert(tree->childnode == NODE);
				831	node = tree->root;
				832	leftmask = rightmask = 0;
				833	while (node) {
				834	bitmask = 1 << node->bitnum;
				835	if ((leftmask & bitmask) == 0) {
				836	leftmask \|= bitmask;
				837	if (node->leftnode == LEAF) {
				838	assert(node->left);
				839	if (tree->leaf_mark(node->left)) {
				840	n = node;
				841	while (n && !n->mark) {
				842	marked++;
				843	n->mark = 1;
				844	n = n->parent;
				845	}
				846	}
				847	} else if (node->left) {
				848	assert(node->leftnode == NODE);
				849	node = node->left;
				850	continue;
				851	}
				852	}
				853	if ((rightmask & bitmask) == 0) {
				854	rightmask \|= bitmask;
				855	if (node->rightnode == LEAF) {
				856	assert(node->right);
				857	if (tree->leaf_mark(node->right)) {
				858	n = node;
				859	while (n && !n->mark) {
				860	marked++;
				861	n->mark = 1;
				862	n = n->parent;
				863	}
				864	}
				865	} else if (node->right) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	866	assert(node->rightnode == NODE);
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	867	node = node->right;
				868	continue;
				869	}
				870	}
				871	leftmask &= ~bitmask;
				872	rightmask &= ~bitmask;
				873	node = node->parent;
				874	}
				875
				876	/* second pass: left siblings and singletons */
				877
				878	assert(tree->childnode == NODE);
				879	node = tree->root;
				880	leftmask = rightmask = 0;
				881	while (node) {
				882	bitmask = 1 << node->bitnum;
				883	if ((leftmask & bitmask) == 0) {
				884	leftmask \|= bitmask;
				885	if (node->leftnode == LEAF) {
				886	assert(node->left);
				887	if (tree->leaf_mark(node->left)) {
				888	n = node;
				889	while (n && !n->mark) {
				890	marked++;
				891	n->mark = 1;
				892	n = n->parent;
				893	}
				894	}
				895	} else if (node->left) {
				896	assert(node->leftnode == NODE);
				897	node = node->left;
				898	if (!node->mark && node->parent->mark) {
				899	marked++;
				900	node->mark = 1;
				901	}
				902	continue;
				903	}
				904	}
				905	if ((rightmask & bitmask) == 0) {
				906	rightmask \|= bitmask;
				907	if (node->rightnode == LEAF) {
				908	assert(node->right);
				909	if (tree->leaf_mark(node->right)) {
				910	n = node;
				911	while (n && !n->mark) {
				912	marked++;
				913	n->mark = 1;
				914	n = n->parent;
				915	}
				916	}
				917	} else if (node->right) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	918	assert(node->rightnode == NODE);
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	919	node = node->right;
				920	if (!node->mark && node->parent->mark &&
				921	!node->parent->left) {
				922	marked++;
				923	node->mark = 1;
				924	}
				925	continue;
				926	}
				927	}
				928	leftmask &= ~bitmask;
				929	rightmask &= ~bitmask;
				930	node = node->parent;
				931	}
				932	done:
				933	if (verbose > 0)
				934	printf("Marked %d nodes\n", marked);
				935	}
				936
				937	/*
				938	* Compute the index of each node and leaf, which is the offset in the
				939	* emitted trie. These values must be pre-computed because relative
				940	* offsets between nodes are used to navigate the tree.
				941	*/
				942	static int index_nodes(struct tree *tree, int index)
				943	{
				944	struct node *node;
				945	unsigned int leftmask;
				946	unsigned int rightmask;
				947	unsigned int bitmask;
				948	int count;
				949	int indent;
				950
				951	/* Align to a cache line (or half a cache line?). */
				952	while (index % 64)
				953	index++;
				954	tree->index = index;
				955	indent = 1;
				956	count = 0;
				957
				958	if (verbose > 0)
				959	printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
				960	if (tree->childnode == LEAF) {
				961	index += tree->leaf_size(tree->root);
				962	goto done;
				963	}
				964
				965	assert(tree->childnode == NODE);
				966	node = tree->root;
				967	leftmask = rightmask = 0;
				968	while (node) {
				969	if (!node->mark)
				970	goto skip;
				971	count++;
				972	if (node->index != index)
				973	node->index = index;
				974	index += node->size;
				975	skip:
				976	while (node) {
				977	bitmask = 1 << node->bitnum;
				978	if (node->mark && (leftmask & bitmask) == 0) {
				979	leftmask \|= bitmask;
				980	if (node->leftnode == LEAF) {
				981	assert(node->left);
				982	*tree->leaf_index(tree, node->left) =
				983	index;
				984	index += tree->leaf_size(node->left);
				985	count++;
				986	} else if (node->left) {
				987	assert(node->leftnode == NODE);
				988	indent += 1;
				989	node = node->left;
				990	break;
				991	}
				992	}
				993	if (node->mark && (rightmask & bitmask) == 0) {
				994	rightmask \|= bitmask;
				995	if (node->rightnode == LEAF) {
				996	assert(node->right);
				997	*tree->leaf_index(tree, node->right) = index;
				998	index += tree->leaf_size(node->right);
				999	count++;
				1000	} else if (node->right) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1001	assert(node->rightnode == NODE);
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1002	indent += 1;
				1003	node = node->right;
				1004	break;
				1005	}
				1006	}
				1007	leftmask &= ~bitmask;
				1008	rightmask &= ~bitmask;
				1009	node = node->parent;
				1010	indent -= 1;
				1011	}
				1012	}
				1013	done:
				1014	/* Round up to a multiple of 16 */
				1015	while (index % 16)
				1016	index++;
				1017	if (verbose > 0)
				1018	printf("Final index %d\n", index);
				1019	return index;
				1020	}
				1021
				1022	/*
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1023	* Mark the nodes in a subtree, helper for size_nodes().
				1024	*/
				1025	static int mark_subtree(struct node *node)
				1026	{
				1027	int changed;
				1028
				1029	if (!node \|\| node->mark)
				1030	return 0;
				1031	node->mark = 1;
				1032	node->index = node->parent->index;
				1033	changed = 1;
				1034	if (node->leftnode == NODE)
				1035	changed += mark_subtree(node->left);
				1036	if (node->rightnode == NODE)
				1037	changed += mark_subtree(node->right);
				1038	return changed;
				1039	}
				1040
				1041	/*
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1042	* Compute the size of nodes and leaves. We start by assuming that
				1043	* each node needs to store a three-byte offset. The indexes of the
				1044	* nodes are calculated based on that, and then this function is
				1045	* called to see if the sizes of some nodes can be reduced. This is
				1046	* repeated until no more changes are seen.
				1047	*/
				1048	static int size_nodes(struct tree *tree)
				1049	{
				1050	struct tree *next;
				1051	struct node *node;
				1052	struct node *right;
				1053	struct node *n;
				1054	unsigned int leftmask;
				1055	unsigned int rightmask;
				1056	unsigned int bitmask;
				1057	unsigned int pathbits;
				1058	unsigned int pathmask;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1059	unsigned int nbit;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1060	int changed;
				1061	int offset;
				1062	int size;
				1063	int indent;
				1064
				1065	indent = 1;
				1066	changed = 0;
				1067	size = 0;
				1068
				1069	if (verbose > 0)
				1070	printf("Sizing %s_%x\n", tree->type, tree->maxage);
				1071	if (tree->childnode == LEAF)
				1072	goto done;
				1073
				1074	assert(tree->childnode == NODE);
				1075	pathbits = 0;
				1076	pathmask = 0;
				1077	node = tree->root;
				1078	leftmask = rightmask = 0;
				1079	while (node) {
				1080	if (!node->mark)
				1081	goto skip;
				1082	offset = 0;
				1083	if (!node->left \|\| !node->right) {
				1084	size = 1;
				1085	} else {
				1086	if (node->rightnode == NODE) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1087	/*
				1088	* If the right node is not marked,
				1089	* look for a corresponding node in
				1090	* the next tree. Such a node need
				1091	* not exist.
				1092	*/
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1093	right = node->right;
				1094	next = tree->next;
				1095	while (!right->mark) {
				1096	assert(next);
				1097	n = next->root;
				1098	while (n->bitnum != node->bitnum) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1099	nbit = 1 << n->bitnum;
				1100	if (!(pathmask & nbit))
				1101	break;
				1102	if (pathbits & nbit) {
				1103	if (n->rightnode == LEAF)
				1104	break;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1105	n = n->right;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1106	} else {
				1107	if (n->leftnode == LEAF)
				1108	break;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1109	n = n->left;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1110	}
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1111	}
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1112	if (n->bitnum != node->bitnum)
				1113	break;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1114	n = n->right;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1115	right = n;
				1116	next = next->next;
				1117	}
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1118	/* Make sure the right node is marked. */
				1119	if (!right->mark)
				1120	changed += mark_subtree(right);
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1121	offset = right->index - node->index;
				1122	} else {
				1123	offset = *tree->leaf_index(tree, node->right);
				1124	offset -= node->index;
				1125	}
				1126	assert(offset >= 0);
				1127	assert(offset <= 0xffffff);
				1128	if (offset <= 0xff) {
				1129	size = 2;
				1130	} else if (offset <= 0xffff) {
				1131	size = 3;
				1132	} else { /* offset <= 0xffffff */
				1133	size = 4;
				1134	}
				1135	}
				1136	if (node->size != size \|\| node->offset != offset) {
				1137	node->size = size;
				1138	node->offset = offset;
				1139	changed++;
				1140	}
				1141	skip:
				1142	while (node) {
				1143	bitmask = 1 << node->bitnum;
				1144	pathmask \|= bitmask;
				1145	if (node->mark && (leftmask & bitmask) == 0) {
				1146	leftmask \|= bitmask;
				1147	if (node->leftnode == LEAF) {
				1148	assert(node->left);
				1149	} else if (node->left) {
				1150	assert(node->leftnode == NODE);
				1151	indent += 1;
				1152	node = node->left;
				1153	break;
				1154	}
				1155	}
				1156	if (node->mark && (rightmask & bitmask) == 0) {
				1157	rightmask \|= bitmask;
				1158	pathbits \|= bitmask;
				1159	if (node->rightnode == LEAF) {
				1160	assert(node->right);
				1161	} else if (node->right) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1162	assert(node->rightnode == NODE);
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1163	indent += 1;
				1164	node = node->right;
				1165	break;
				1166	}
				1167	}
				1168	leftmask &= ~bitmask;
				1169	rightmask &= ~bitmask;
				1170	pathmask &= ~bitmask;
				1171	pathbits &= ~bitmask;
				1172	node = node->parent;
				1173	indent -= 1;
				1174	}
				1175	}
				1176	done:
				1177	if (verbose > 0)
				1178	printf("Found %d changes\n", changed);
				1179	return changed;
				1180	}
				1181
				1182	/*
				1183	* Emit a trie for the given tree into the data array.
				1184	*/
				1185	static void emit(struct tree tree, unsigned char data)
				1186	{
				1187	struct node *node;
				1188	unsigned int leftmask;
				1189	unsigned int rightmask;
				1190	unsigned int bitmask;
				1191	int offlen;
				1192	int offset;
				1193	int index;
				1194	int indent;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1195	int size;
				1196	int bytes;
				1197	int leaves;
				1198	int nodes[4];
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1199	unsigned char byte;
				1200
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1201	nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
				1202	leaves = 0;
				1203	bytes = 0;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1204	index = tree->index;
				1205	data += index;
				1206	indent = 1;
				1207	if (verbose > 0)
				1208	printf("Emitting %s_%x\n", tree->type, tree->maxage);
				1209	if (tree->childnode == LEAF) {
				1210	assert(tree->root);
				1211	tree->leaf_emit(tree->root, data);
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1212	size = tree->leaf_size(tree->root);
				1213	index += size;
				1214	leaves++;
				1215	goto done;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1216	}
				1217
				1218	assert(tree->childnode == NODE);
				1219	node = tree->root;
				1220	leftmask = rightmask = 0;
				1221	while (node) {
				1222	if (!node->mark)
				1223	goto skip;
				1224	assert(node->offset != -1);
				1225	assert(node->index == index);
				1226
				1227	byte = 0;
				1228	if (node->nextbyte)
				1229	byte \|= NEXTBYTE;
				1230	byte \|= (node->bitnum & BITNUM);
				1231	if (node->left && node->right) {
				1232	if (node->leftnode == NODE)
				1233	byte \|= LEFTNODE;
				1234	if (node->rightnode == NODE)
				1235	byte \|= RIGHTNODE;
				1236	if (node->offset <= 0xff)
				1237	offlen = 1;
				1238	else if (node->offset <= 0xffff)
				1239	offlen = 2;
				1240	else
				1241	offlen = 3;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1242	nodes[offlen]++;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1243	offset = node->offset;
				1244	byte \|= offlen << OFFLEN_SHIFT;
				1245	*data++ = byte;
				1246	index++;
				1247	while (offlen--) {
				1248	*data++ = offset & 0xff;
				1249	index++;
				1250	offset >>= 8;
				1251	}
				1252	} else if (node->left) {
				1253	if (node->leftnode == NODE)
				1254	byte \|= TRIENODE;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1255	nodes[0]++;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1256	*data++ = byte;
				1257	index++;
				1258	} else if (node->right) {
				1259	byte \|= RIGHTNODE;
				1260	if (node->rightnode == NODE)
				1261	byte \|= TRIENODE;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1262	nodes[0]++;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1263	*data++ = byte;
				1264	index++;
				1265	} else {
				1266	assert(0);
				1267	}
				1268	skip:
				1269	while (node) {
				1270	bitmask = 1 << node->bitnum;
				1271	if (node->mark && (leftmask & bitmask) == 0) {
				1272	leftmask \|= bitmask;
				1273	if (node->leftnode == LEAF) {
				1274	assert(node->left);
				1275	data = tree->leaf_emit(node->left,
				1276	data);
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1277	size = tree->leaf_size(node->left);
				1278	index += size;
				1279	bytes += size;
				1280	leaves++;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1281	} else if (node->left) {
				1282	assert(node->leftnode == NODE);
				1283	indent += 1;
				1284	node = node->left;
				1285	break;
				1286	}
				1287	}
				1288	if (node->mark && (rightmask & bitmask) == 0) {
				1289	rightmask \|= bitmask;
				1290	if (node->rightnode == LEAF) {
				1291	assert(node->right);
				1292	data = tree->leaf_emit(node->right,
				1293	data);
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1294	size = tree->leaf_size(node->right);
				1295	index += size;
				1296	bytes += size;
				1297	leaves++;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1298	} else if (node->right) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1299	assert(node->rightnode == NODE);
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1300	indent += 1;
				1301	node = node->right;
				1302	break;
				1303	}
				1304	}
				1305	leftmask &= ~bitmask;
				1306	rightmask &= ~bitmask;
				1307	node = node->parent;
				1308	indent -= 1;
				1309	}
				1310	}
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1311	done:
				1312	if (verbose > 0) {
				1313	printf("Emitted %d (%d) leaves",
				1314	leaves, bytes);
				1315	printf(" %d (%d+%d+%d+%d) nodes",
				1316	nodes[0] + nodes[1] + nodes[2] + nodes[3],
				1317	nodes[0], nodes[1], nodes[2], nodes[3]);
				1318	printf(" %d total\n", index - tree->index);
				1319	}
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1320	}
				1321
				1322	/* ------------------------------------------------------------------ */
				1323
				1324	/*
				1325	* Unicode data.
				1326	*
				1327	* We need to keep track of the Canonical Combining Class, the Age,
				1328	* and decompositions for a code point.
				1329	*
				1330	* For the Age, we store the index into the ages table. Effectively
				1331	* this is a generation number that the table maps to a unicode
				1332	* version.
				1333	*
				1334	* The correction field is used to indicate that this entry is in the
				1335	* corrections array, which contains decompositions that were
				1336	* corrected in later revisions. The value of the correction field is
				1337	* the Unicode version in which the mapping was corrected.
				1338	*/
				1339	struct unicode_data {
				1340	unsigned int code;
				1341	int ccc;
				1342	int gen;
				1343	int correction;
				1344	unsigned int *utf32nfdi;
				1345	unsigned int *utf32nfdicf;
				1346	char *utf8nfdi;
				1347	char *utf8nfdicf;
				1348	};
				1349
				1350	struct unicode_data unicode_data[0x110000];
				1351	struct unicode_data *corrections;
				1352	int corrections_count;
				1353
				1354	struct tree *nfdi_tree;
				1355	struct tree *nfdicf_tree;
				1356
				1357	struct tree *trees;
				1358	int trees_count;
				1359
				1360	/*
				1361	* Check the corrections array to see if this entry was corrected at
				1362	* some point.
				1363	*/
				1364	static struct unicode_data corrections_lookup(struct unicode_data u)
				1365	{
				1366	int i;
				1367
				1368	for (i = 0; i != corrections_count; i++)
				1369	if (u->code == corrections[i].code)
				1370	return &corrections[i];
				1371	return u;
				1372	}
				1373
				1374	static int nfdi_equal(void l, void r)
				1375	{
				1376	struct unicode_data *left = l;
				1377	struct unicode_data *right = r;
				1378
				1379	if (left->gen != right->gen)
				1380	return 0;
				1381	if (left->ccc != right->ccc)
				1382	return 0;
				1383	if (left->utf8nfdi && right->utf8nfdi &&
				1384	strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
				1385	return 1;
				1386	if (left->utf8nfdi \|\| right->utf8nfdi)
				1387	return 0;
				1388	return 1;
				1389	}
				1390
				1391	static int nfdicf_equal(void l, void r)
				1392	{
				1393	struct unicode_data *left = l;
				1394	struct unicode_data *right = r;
				1395
				1396	if (left->gen != right->gen)
				1397	return 0;
				1398	if (left->ccc != right->ccc)
				1399	return 0;
				1400	if (left->utf8nfdicf && right->utf8nfdicf &&
				1401	strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
				1402	return 1;
				1403	if (left->utf8nfdicf && right->utf8nfdicf)
				1404	return 0;
				1405	if (left->utf8nfdicf \|\| right->utf8nfdicf)
				1406	return 0;
				1407	if (left->utf8nfdi && right->utf8nfdi &&
				1408	strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
				1409	return 1;
				1410	if (left->utf8nfdi \|\| right->utf8nfdi)
				1411	return 0;
				1412	return 1;
				1413	}
				1414
				1415	static void nfdi_print(void *l, int indent)
				1416	{
				1417	struct unicode_data *leaf = l;
				1418
				1419	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
				1420	leaf->code, leaf->ccc, leaf->gen);
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1421
				1422	if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
				1423	printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
				1424	else if (leaf->utf8nfdi)
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1425	printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1426
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1427	printf("\n");
				1428	}
				1429
				1430	static void nfdicf_print(void *l, int indent)
				1431	{
				1432	struct unicode_data *leaf = l;
				1433
				1434	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
				1435	leaf->code, leaf->ccc, leaf->gen);
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1436
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1437	if (leaf->utf8nfdicf)
				1438	printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1439	else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
				1440	printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1441	else if (leaf->utf8nfdi)
				1442	printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
				1443	printf("\n");
				1444	}
				1445
				1446	static int nfdi_mark(void *l)
				1447	{
				1448	return 1;
				1449	}
				1450
				1451	static int nfdicf_mark(void *l)
				1452	{
				1453	struct unicode_data *leaf = l;
				1454
				1455	if (leaf->utf8nfdicf)
				1456	return 1;
				1457	return 0;
				1458	}
				1459
				1460	static int correction_mark(void *l)
				1461	{
				1462	struct unicode_data *leaf = l;
				1463
				1464	return leaf->correction;
				1465	}
				1466
				1467	static int nfdi_size(void *l)
				1468	{
				1469	struct unicode_data *leaf = l;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1470	int size = 2;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1471
				1472	if (HANGUL_SYLLABLE(leaf->code))
				1473	size += 1;
				1474	else if (leaf->utf8nfdi)
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1475	size += strlen(leaf->utf8nfdi) + 1;
				1476	return size;
				1477	}
				1478
				1479	static int nfdicf_size(void *l)
				1480	{
				1481	struct unicode_data *leaf = l;
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1482	int size = 2;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1483
				1484	if (HANGUL_SYLLABLE(leaf->code))
				1485	size += 1;
				1486	else if (leaf->utf8nfdicf)
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1487	size += strlen(leaf->utf8nfdicf) + 1;
				1488	else if (leaf->utf8nfdi)
				1489	size += strlen(leaf->utf8nfdi) + 1;
				1490	return size;
				1491	}
				1492
				1493	static int nfdi_index(struct tree tree, void *l)
				1494	{
				1495	struct unicode_data *leaf = l;
				1496
				1497	return &tree->leafindex[leaf->code];
				1498	}
				1499
				1500	static int nfdicf_index(struct tree tree, void *l)
				1501	{
				1502	struct unicode_data *leaf = l;
				1503
				1504	return &tree->leafindex[leaf->code];
				1505	}
				1506
				1507	static unsigned char nfdi_emit(void l, unsigned char *data)
				1508	{
				1509	struct unicode_data *leaf = l;
				1510	unsigned char *s;
				1511
				1512	*data++ = leaf->gen;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1513
				1514	if (HANGUL_SYLLABLE(leaf->code)) {
				1515	*data++ = DECOMPOSE;
				1516	*data++ = HANGUL;
				1517	} else if (leaf->utf8nfdi) {
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1518	*data++ = DECOMPOSE;
				1519	s = (unsigned char*)leaf->utf8nfdi;
				1520	while ((data++ = s++) != 0)
				1521	;
				1522	} else {
				1523	*data++ = leaf->ccc;
				1524	}
				1525	return data;
				1526	}
				1527
				1528	static unsigned char nfdicf_emit(void l, unsigned char *data)
				1529	{
				1530	struct unicode_data *leaf = l;
				1531	unsigned char *s;
				1532
				1533	*data++ = leaf->gen;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1534
				1535	if (HANGUL_SYLLABLE(leaf->code)) {
				1536	*data++ = DECOMPOSE;
				1537	*data++ = HANGUL;
				1538	} else if (leaf->utf8nfdicf) {
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1539	*data++ = DECOMPOSE;
				1540	s = (unsigned char*)leaf->utf8nfdicf;
				1541	while ((data++ = s++) != 0)
				1542	;
				1543	} else if (leaf->utf8nfdi) {
				1544	*data++ = DECOMPOSE;
				1545	s = (unsigned char*)leaf->utf8nfdi;
				1546	while ((data++ = s++) != 0)
				1547	;
				1548	} else {
				1549	*data++ = leaf->ccc;
				1550	}
				1551	return data;
				1552	}
				1553
				1554	static void utf8_create(struct unicode_data *data)
				1555	{
				1556	char utf[18*4+1];
				1557	char *u;
				1558	unsigned int *um;
				1559	int i;
				1560
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1561	if (data->utf8nfdi) {
				1562	assert(data->utf8nfdi[0] == HANGUL);
				1563	return;
				1564	}
				1565
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1566	u = utf;
				1567	um = data->utf32nfdi;
				1568	if (um) {
				1569	for (i = 0; um[i]; i++)
				1570	u += utf8encode(u, um[i]);
				1571	*u = '\0';
				1572	data->utf8nfdi = strdup(utf);
				1573	}
				1574	u = utf;
				1575	um = data->utf32nfdicf;
				1576	if (um) {
				1577	for (i = 0; um[i]; i++)
				1578	u += utf8encode(u, um[i]);
				1579	*u = '\0';
				1580	if (!data->utf8nfdi \|\| strcmp(data->utf8nfdi, utf))
				1581	data->utf8nfdicf = strdup(utf);
				1582	}
				1583	}
				1584
				1585	static void utf8_init(void)
				1586	{
				1587	unsigned int unichar;
				1588	int i;
				1589
				1590	for (unichar = 0; unichar != 0x110000; unichar++)
				1591	utf8_create(&unicode_data[unichar]);
				1592
				1593	for (i = 0; i != corrections_count; i++)
				1594	utf8_create(&corrections[i]);
				1595	}
				1596
				1597	static void trees_init(void)
				1598	{
				1599	struct unicode_data *data;
				1600	unsigned int maxage;
				1601	unsigned int nextage;
				1602	int count;
				1603	int i;
				1604	int j;
				1605
				1606	/* Count the number of different ages. */
				1607	count = 0;
				1608	nextage = (unsigned int)-1;
				1609	do {
				1610	maxage = nextage;
				1611	nextage = 0;
				1612	for (i = 0; i <= corrections_count; i++) {
				1613	data = &corrections[i];
				1614	if (nextage < data->correction &&
				1615	data->correction < maxage)
				1616	nextage = data->correction;
				1617	}
				1618	count++;
				1619	} while (nextage);
				1620
				1621	/* Two trees per age: nfdi and nfdicf */
				1622	trees_count = count * 2;
				1623	trees = calloc(trees_count, sizeof(struct tree));
				1624
				1625	/* Assign ages to the trees. */
				1626	count = trees_count;
				1627	nextage = (unsigned int)-1;
				1628	do {
				1629	maxage = nextage;
				1630	trees[--count].maxage = maxage;
				1631	trees[--count].maxage = maxage;
				1632	nextage = 0;
				1633	for (i = 0; i <= corrections_count; i++) {
				1634	data = &corrections[i];
				1635	if (nextage < data->correction &&
				1636	data->correction < maxage)
				1637	nextage = data->correction;
				1638	}
				1639	} while (nextage);
				1640
				1641	/* The ages assigned above are off by one. */
				1642	for (i = 0; i != trees_count; i++) {
				1643	j = 0;
				1644	while (ages[j] < trees[i].maxage)
				1645	j++;
				1646	trees[i].maxage = ages[j-1];
				1647	}
				1648
				1649	/* Set up the forwarding between trees. */
				1650	trees[trees_count-2].next = &trees[trees_count-1];
				1651	trees[trees_count-1].leaf_mark = nfdi_mark;
				1652	trees[trees_count-2].leaf_mark = nfdicf_mark;
				1653	for (i = 0; i != trees_count-2; i += 2) {
				1654	trees[i].next = &trees[trees_count-2];
				1655	trees[i].leaf_mark = correction_mark;
				1656	trees[i+1].next = &trees[trees_count-1];
				1657	trees[i+1].leaf_mark = correction_mark;
				1658	}
				1659
				1660	/* Assign the callouts. */
				1661	for (i = 0; i != trees_count; i += 2) {
				1662	trees[i].type = "nfdicf";
				1663	trees[i].leaf_equal = nfdicf_equal;
				1664	trees[i].leaf_print = nfdicf_print;
				1665	trees[i].leaf_size = nfdicf_size;
				1666	trees[i].leaf_index = nfdicf_index;
				1667	trees[i].leaf_emit = nfdicf_emit;
				1668
				1669	trees[i+1].type = "nfdi";
				1670	trees[i+1].leaf_equal = nfdi_equal;
				1671	trees[i+1].leaf_print = nfdi_print;
				1672	trees[i+1].leaf_size = nfdi_size;
				1673	trees[i+1].leaf_index = nfdi_index;
				1674	trees[i+1].leaf_emit = nfdi_emit;
				1675	}
				1676
				1677	/* Finish init. */
				1678	for (i = 0; i != trees_count; i++)
				1679	trees[i].childnode = NODE;
				1680	}
				1681
				1682	static void trees_populate(void)
				1683	{
				1684	struct unicode_data *data;
				1685	unsigned int unichar;
				1686	char keyval[4];
				1687	int keylen;
				1688	int i;
				1689
				1690	for (i = 0; i != trees_count; i++) {
				1691	if (verbose > 0) {
				1692	printf("Populating %s_%x\n",
				1693	trees[i].type, trees[i].maxage);
				1694	}
				1695	for (unichar = 0; unichar != 0x110000; unichar++) {
				1696	if (unicode_data[unichar].gen < 0)
				1697	continue;
				1698	keylen = utf8encode(keyval, unichar);
				1699	data = corrections_lookup(&unicode_data[unichar]);
				1700	if (data->correction <= trees[i].maxage)
				1701	data = &unicode_data[unichar];
				1702	insert(&trees[i], keyval, keylen, data);
				1703	}
				1704	}
				1705	}
				1706
				1707	static void trees_reduce(void)
				1708	{
				1709	int i;
				1710	int size;
				1711	int changed;
				1712
				1713	for (i = 0; i != trees_count; i++)
				1714	prune(&trees[i]);
				1715	for (i = 0; i != trees_count; i++)
				1716	mark_nodes(&trees[i]);
				1717	do {
				1718	size = 0;
				1719	for (i = 0; i != trees_count; i++)
				1720	size = index_nodes(&trees[i], size);
				1721	changed = 0;
				1722	for (i = 0; i != trees_count; i++)
				1723	changed += size_nodes(&trees[i]);
				1724	} while (changed);
				1725
				1726	utf8data = calloc(size, 1);
				1727	utf8data_size = size;
				1728	for (i = 0; i != trees_count; i++)
				1729	emit(&trees[i], utf8data);
				1730
				1731	if (verbose > 0) {
				1732	for (i = 0; i != trees_count; i++) {
				1733	printf("%s_%x idx %d\n",
				1734	trees[i].type, trees[i].maxage, trees[i].index);
				1735	}
				1736	}
				1737
				1738	nfdi = utf8data + trees[trees_count-1].index;
				1739	nfdicf = utf8data + trees[trees_count-2].index;
				1740
				1741	nfdi_tree = &trees[trees_count-1];
				1742	nfdicf_tree = &trees[trees_count-2];
				1743	}
				1744
				1745	static void verify(struct tree *tree)
				1746	{
				1747	struct unicode_data *data;
				1748	utf8leaf_t *leaf;
				1749	unsigned int unichar;
				1750	char key[4];
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1751	unsigned char hangul[UTF8HANGULLEAF];
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1752	int report;
				1753	int nocf;
				1754
				1755	if (verbose > 0)
				1756	printf("Verifying %s_%x\n", tree->type, tree->maxage);
				1757	nocf = strcmp(tree->type, "nfdicf");
				1758
				1759	for (unichar = 0; unichar != 0x110000; unichar++) {
				1760	report = 0;
				1761	data = corrections_lookup(&unicode_data[unichar]);
				1762	if (data->correction <= tree->maxage)
				1763	data = &unicode_data[unichar];
				1764	utf8encode(key,unichar);
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1765	leaf = utf8lookup(tree, hangul, key);
				1766
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1767	if (!leaf) {
				1768	if (data->gen != -1)
				1769	report++;
				1770	if (unichar < 0xd800 \|\| unichar > 0xdfff)
				1771	report++;
				1772	} else {
				1773	if (unichar >= 0xd800 && unichar <= 0xdfff)
				1774	report++;
				1775	if (data->gen == -1)
				1776	report++;
				1777	if (data->gen != LEAF_GEN(leaf))
				1778	report++;
				1779	if (LEAF_CCC(leaf) == DECOMPOSE) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	1780	if (HANGUL_SYLLABLE(data->code)) {
				1781	if (data->utf8nfdi[0] != HANGUL)
				1782	report++;
				1783	} else if (nocf) {
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	1784	if (!data->utf8nfdi) {
				1785	report++;
				1786	} else if (strcmp(data->utf8nfdi,
				1787	LEAF_STR(leaf))) {
				1788	report++;
				1789	}
				1790	} else {
				1791	if (!data->utf8nfdicf &&
				1792	!data->utf8nfdi) {
				1793	report++;
				1794	} else if (data->utf8nfdicf) {
				1795	if (strcmp(data->utf8nfdicf,
				1796	LEAF_STR(leaf)))
				1797	report++;
				1798	} else if (strcmp(data->utf8nfdi,
				1799	LEAF_STR(leaf))) {
				1800	report++;
				1801	}
				1802	}
				1803	} else if (data->ccc != LEAF_CCC(leaf)) {
				1804	report++;
				1805	}
				1806	}
				1807	if (report) {
				1808	printf("%X code %X gen %d ccc %d"
				1809	" nfdi -> \"%s\"",
				1810	unichar, data->code, data->gen,
				1811	data->ccc,
				1812	data->utf8nfdi);
				1813	if (leaf) {
				1814	printf(" gen %d ccc %d"
				1815	" nfdi -> \"%s\"",
				1816	LEAF_GEN(leaf),
				1817	LEAF_CCC(leaf),
				1818	LEAF_CCC(leaf) == DECOMPOSE ?
				1819	LEAF_STR(leaf) : "");
				1820	}
				1821	printf("\n");
				1822	}
				1823	}
				1824	}
				1825
				1826	static void trees_verify(void)
				1827	{
				1828	int i;
				1829
				1830	for (i = 0; i != trees_count; i++)
				1831	verify(&trees[i]);
				1832	}
				1833
				1834	/* ------------------------------------------------------------------ */
				1835
				1836	static void help(void)
				1837	{
				1838	printf("Usage: %s [options]\n", argv0);
				1839	printf("\n");
				1840	printf("This program creates an a data trie used for parsing and\n");
				1841	printf("normalization of UTF-8 strings. The trie is derived from\n");
				1842	printf("a set of input files from the Unicode character database\n");
				1843	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
				1844	printf("\n");
				1845	printf("The generated tree supports two normalization forms:\n");
				1846	printf("\n");
				1847	printf("\tnfdi:\n");
				1848	printf("\t- Apply unicode normalization form NFD.\n");
				1849	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
				1850	printf("\n");
				1851	printf("\tnfdicf:\n");
				1852	printf("\t- Apply unicode normalization form NFD.\n");
				1853	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
				1854	printf("\t- Apply a full casefold (C + F).\n");
				1855	printf("\n");
				1856	printf("These forms were chosen as being most useful when dealing\n");
				1857	printf("with file names: NFD catches most cases where characters\n");
				1858	printf("should be considered equivalent. The ignorables are mostly\n");
				1859	printf("invisible, making names hard to type.\n");
				1860	printf("\n");
				1861	printf("The options to specify the files to be used are listed\n");
				1862	printf("below with their default values, which are the names used\n");
				1863	printf("by version 11.0.0 of the Unicode Character Database.\n");
				1864	printf("\n");
				1865	printf("The input files:\n");
				1866	printf("\t-a %s\n", AGE_NAME);
				1867	printf("\t-c %s\n", CCC_NAME);
				1868	printf("\t-p %s\n", PROP_NAME);
				1869	printf("\t-d %s\n", DATA_NAME);
				1870	printf("\t-f %s\n", FOLD_NAME);
				1871	printf("\t-n %s\n", NORM_NAME);
				1872	printf("\n");
				1873	printf("Additionally, the generated tables are tested using:\n");
				1874	printf("\t-t %s\n", TEST_NAME);
				1875	printf("\n");
				1876	printf("Finally, the output file:\n");
				1877	printf("\t-o %s\n", UTF8_NAME);
				1878	printf("\n");
				1879	}
				1880
				1881	static void usage(void)
				1882	{
				1883	help();
				1884	exit(1);
				1885	}
				1886
				1887	static void open_fail(const char *name, int error)
				1888	{
				1889	printf("Error %d opening %s: %s\n", error, name, strerror(error));
				1890	exit(1);
				1891	}
				1892
				1893	static void file_fail(const char *filename)
				1894	{
				1895	printf("Error parsing %s\n", filename);
				1896	exit(1);
				1897	}
				1898
				1899	static void line_fail(const char filename, const char line)
				1900	{
				1901	printf("Error parsing %s:%s\n", filename, line);
				1902	exit(1);
				1903	}
				1904
				1905	/* ------------------------------------------------------------------ */
				1906
				1907	static void print_utf32(unsigned int *utf32str)
				1908	{
				1909	int i;
				1910
				1911	for (i = 0; utf32str[i]; i++)
				1912	printf(" %X", utf32str[i]);
				1913	}
				1914
				1915	static void print_utf32nfdi(unsigned int unichar)
				1916	{
				1917	printf(" %X ->", unichar);
				1918	print_utf32(unicode_data[unichar].utf32nfdi);
				1919	printf("\n");
				1920	}
				1921
				1922	static void print_utf32nfdicf(unsigned int unichar)
				1923	{
				1924	printf(" %X ->", unichar);
				1925	print_utf32(unicode_data[unichar].utf32nfdicf);
				1926	printf("\n");
				1927	}
				1928
				1929	/* ------------------------------------------------------------------ */
				1930
				1931	static void age_init(void)
				1932	{
				1933	FILE *file;
				1934	unsigned int first;
				1935	unsigned int last;
				1936	unsigned int unichar;
				1937	unsigned int major;
				1938	unsigned int minor;
				1939	unsigned int revision;
				1940	int gen;
				1941	int count;
				1942	int ret;
				1943
				1944	if (verbose > 0)
				1945	printf("Parsing %s\n", age_name);
				1946
				1947	file = fopen(age_name, "r");
				1948	if (!file)
				1949	open_fail(age_name, errno);
				1950	count = 0;
				1951
				1952	gen = 0;
				1953	while (fgets(line, LINESIZE, file)) {
				1954	ret = sscanf(line, "# Age=V%d_%d_%d",
				1955	&major, &minor, &revision);
				1956	if (ret == 3) {
				1957	ages_count++;
				1958	if (verbose > 1)
				1959	printf(" Age V%d_%d_%d\n",
				1960	major, minor, revision);
				1961	if (!age_valid(major, minor, revision))
				1962	line_fail(age_name, line);
				1963	continue;
				1964	}
				1965	ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
				1966	if (ret == 2) {
				1967	ages_count++;
				1968	if (verbose > 1)
				1969	printf(" Age V%d_%d\n", major, minor);
				1970	if (!age_valid(major, minor, 0))
				1971	line_fail(age_name, line);
				1972	continue;
				1973	}
				1974	}
				1975
				1976	/* We must have found something above. */
				1977	if (verbose > 1)
				1978	printf("%d age entries\n", ages_count);
				1979	if (ages_count == 0 \|\| ages_count > MAXGEN)
				1980	file_fail(age_name);
				1981
				1982	/* There is a 0 entry. */
				1983	ages_count++;
				1984	ages = calloc(ages_count + 1, sizeof(*ages));
				1985	/* And a guard entry. */
				1986	ages[ages_count] = (unsigned int)-1;
				1987
				1988	rewind(file);
				1989	count = 0;
				1990	gen = 0;
				1991	while (fgets(line, LINESIZE, file)) {
				1992	ret = sscanf(line, "# Age=V%d_%d_%d",
				1993	&major, &minor, &revision);
				1994	if (ret == 3) {
				1995	ages[++gen] =
				1996	UNICODE_AGE(major, minor, revision);
				1997	if (verbose > 1)
				1998	printf(" Age V%d_%d_%d = gen %d\n",
				1999	major, minor, revision, gen);
				2000	if (!age_valid(major, minor, revision))
				2001	line_fail(age_name, line);
				2002	continue;
				2003	}
				2004	ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
				2005	if (ret == 2) {
				2006	ages[++gen] = UNICODE_AGE(major, minor, 0);
				2007	if (verbose > 1)
				2008	printf(" Age V%d_%d = %d\n",
				2009	major, minor, gen);
				2010	if (!age_valid(major, minor, 0))
				2011	line_fail(age_name, line);
				2012	continue;
				2013	}
				2014	ret = sscanf(line, "%X..%X ; %d.%d #",
				2015	&first, &last, &major, &minor);
				2016	if (ret == 4) {
				2017	for (unichar = first; unichar <= last; unichar++)
				2018	unicode_data[unichar].gen = gen;
				2019	count += 1 + last - first;
				2020	if (verbose > 1)
				2021	printf(" %X..%X gen %d\n", first, last, gen);
				2022	if (!utf32valid(first) \|\| !utf32valid(last))
				2023	line_fail(age_name, line);
				2024	continue;
				2025	}
				2026	ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
				2027	if (ret == 3) {
				2028	unicode_data[unichar].gen = gen;
				2029	count++;
				2030	if (verbose > 1)
				2031	printf(" %X gen %d\n", unichar, gen);
				2032	if (!utf32valid(unichar))
				2033	line_fail(age_name, line);
				2034	continue;
				2035	}
				2036	}
				2037	unicode_maxage = ages[gen];
				2038	fclose(file);
				2039
				2040	/* Nix surrogate block */
				2041	if (verbose > 1)
				2042	printf(" Removing surrogate block D800..DFFF\n");
				2043	for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
				2044	unicode_data[unichar].gen = -1;
				2045
				2046	if (verbose > 0)
				2047	printf("Found %d entries\n", count);
				2048	if (count == 0)
				2049	file_fail(age_name);
				2050	}
				2051
				2052	static void ccc_init(void)
				2053	{
				2054	FILE *file;
				2055	unsigned int first;
				2056	unsigned int last;
				2057	unsigned int unichar;
				2058	unsigned int value;
				2059	int count;
				2060	int ret;
				2061
				2062	if (verbose > 0)
				2063	printf("Parsing %s\n", ccc_name);
				2064
				2065	file = fopen(ccc_name, "r");
				2066	if (!file)
				2067	open_fail(ccc_name, errno);
				2068
				2069	count = 0;
				2070	while (fgets(line, LINESIZE, file)) {
				2071	ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
				2072	if (ret == 3) {
				2073	for (unichar = first; unichar <= last; unichar++) {
				2074	unicode_data[unichar].ccc = value;
				2075	count++;
				2076	}
				2077	if (verbose > 1)
				2078	printf(" %X..%X ccc %d\n", first, last, value);
				2079	if (!utf32valid(first) \|\| !utf32valid(last))
				2080	line_fail(ccc_name, line);
				2081	continue;
				2082	}
				2083	ret = sscanf(line, "%X ; %d #", &unichar, &value);
				2084	if (ret == 2) {
				2085	unicode_data[unichar].ccc = value;
				2086	count++;
				2087	if (verbose > 1)
				2088	printf(" %X ccc %d\n", unichar, value);
				2089	if (!utf32valid(unichar))
				2090	line_fail(ccc_name, line);
				2091	continue;
				2092	}
				2093	}
				2094	fclose(file);
				2095
				2096	if (verbose > 0)
				2097	printf("Found %d entries\n", count);
				2098	if (count == 0)
				2099	file_fail(ccc_name);
				2100	}
				2101
				2102	static int ignore_compatibility_form(char *type)
				2103	{
				2104	int i;
				2105	char *ignored_types[] = {"font", "noBreak", "initial", "medial",
				2106	"final", "isolated", "circle", "super",
				2107	"sub", "vertical", "wide", "narrow",
				2108	"small", "square", "fraction", "compat"};
				2109
				2110	for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
				2111	if (strcmp(type, ignored_types[i]) == 0)
				2112	return 1;
				2113	return 0;
				2114	}
				2115
				2116	static void nfdi_init(void)
				2117	{
				2118	FILE *file;
				2119	unsigned int unichar;
				2120	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
				2121	char *s;
				2122	char *type;
				2123	unsigned int *um;
				2124	int count;
				2125	int i;
				2126	int ret;
				2127
				2128	if (verbose > 0)
				2129	printf("Parsing %s\n", data_name);
				2130	file = fopen(data_name, "r");
				2131	if (!file)
				2132	open_fail(data_name, errno);
				2133
				2134	count = 0;
				2135	while (fgets(line, LINESIZE, file)) {
				2136	ret = sscanf(line, "%X;%[^;];%[^;];%[^;];%[^;];%[^;];",
				2137	&unichar, buf0);
				2138	if (ret != 2)
				2139	continue;
				2140	if (!utf32valid(unichar))
				2141	line_fail(data_name, line);
				2142
				2143	s = buf0;
				2144	/* skip over <tag> */
				2145	if (*s == '<') {
				2146	type = ++s;
				2147	while (*++s != '>');
				2148	*s++ = '\0';
				2149	if(ignore_compatibility_form(type))
				2150	continue;
				2151	}
				2152	/* decode the decomposition into UTF-32 */
				2153	i = 0;
				2154	while (*s) {
				2155	mapping[i] = strtoul(s, &s, 16);
				2156	if (!utf32valid(mapping[i]))
				2157	line_fail(data_name, line);
				2158	i++;
				2159	}
				2160	mapping[i++] = 0;
				2161
				2162	um = malloc(i * sizeof(unsigned int));
				2163	memcpy(um, mapping, i * sizeof(unsigned int));
				2164	unicode_data[unichar].utf32nfdi = um;
				2165
				2166	if (verbose > 1)
				2167	print_utf32nfdi(unichar);
				2168	count++;
				2169	}
				2170	fclose(file);
				2171	if (verbose > 0)
				2172	printf("Found %d entries\n", count);
				2173	if (count == 0)
				2174	file_fail(data_name);
				2175	}
				2176
				2177	static void nfdicf_init(void)
				2178	{
				2179	FILE *file;
				2180	unsigned int unichar;
				2181	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
				2182	char status;
				2183	char *s;
				2184	unsigned int *um;
				2185	int i;
				2186	int count;
				2187	int ret;
				2188
				2189	if (verbose > 0)
				2190	printf("Parsing %s\n", fold_name);
				2191	file = fopen(fold_name, "r");
				2192	if (!file)
				2193	open_fail(fold_name, errno);
				2194
				2195	count = 0;
				2196	while (fgets(line, LINESIZE, file)) {
				2197	ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
				2198	if (ret != 3)
				2199	continue;
				2200	if (!utf32valid(unichar))
				2201	line_fail(fold_name, line);
				2202	/* Use the C+F casefold. */
				2203	if (status != 'C' && status != 'F')
				2204	continue;
				2205	s = buf0;
				2206	if (*s == '<')
				2207	while (*s++ != ' ')
				2208	;
				2209	i = 0;
				2210	while (*s) {
				2211	mapping[i] = strtoul(s, &s, 16);
				2212	if (!utf32valid(mapping[i]))
				2213	line_fail(fold_name, line);
				2214	i++;
				2215	}
				2216	mapping[i++] = 0;
				2217
				2218	um = malloc(i * sizeof(unsigned int));
				2219	memcpy(um, mapping, i * sizeof(unsigned int));
				2220	unicode_data[unichar].utf32nfdicf = um;
				2221
				2222	if (verbose > 1)
				2223	print_utf32nfdicf(unichar);
				2224	count++;
				2225	}
				2226	fclose(file);
				2227	if (verbose > 0)
				2228	printf("Found %d entries\n", count);
				2229	if (count == 0)
				2230	file_fail(fold_name);
				2231	}
				2232
				2233	static void ignore_init(void)
				2234	{
				2235	FILE *file;
				2236	unsigned int unichar;
				2237	unsigned int first;
				2238	unsigned int last;
				2239	unsigned int *um;
				2240	int count;
				2241	int ret;
				2242
				2243	if (verbose > 0)
				2244	printf("Parsing %s\n", prop_name);
				2245	file = fopen(prop_name, "r");
				2246	if (!file)
				2247	open_fail(prop_name, errno);
				2248	assert(file);
				2249	count = 0;
				2250	while (fgets(line, LINESIZE, file)) {
				2251	ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
				2252	if (ret == 3) {
				2253	if (strcmp(buf0, "Default_Ignorable_Code_Point"))
				2254	continue;
				2255	if (!utf32valid(first) \|\| !utf32valid(last))
				2256	line_fail(prop_name, line);
				2257	for (unichar = first; unichar <= last; unichar++) {
				2258	free(unicode_data[unichar].utf32nfdi);
				2259	um = malloc(sizeof(unsigned int));
				2260	*um = 0;
				2261	unicode_data[unichar].utf32nfdi = um;
				2262	free(unicode_data[unichar].utf32nfdicf);
				2263	um = malloc(sizeof(unsigned int));
				2264	*um = 0;
				2265	unicode_data[unichar].utf32nfdicf = um;
				2266	count++;
				2267	}
				2268	if (verbose > 1)
				2269	printf(" %X..%X Default_Ignorable_Code_Point\n",
				2270	first, last);
				2271	continue;
				2272	}
				2273	ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
				2274	if (ret == 2) {
				2275	if (strcmp(buf0, "Default_Ignorable_Code_Point"))
				2276	continue;
				2277	if (!utf32valid(unichar))
				2278	line_fail(prop_name, line);
				2279	free(unicode_data[unichar].utf32nfdi);
				2280	um = malloc(sizeof(unsigned int));
				2281	*um = 0;
				2282	unicode_data[unichar].utf32nfdi = um;
				2283	free(unicode_data[unichar].utf32nfdicf);
				2284	um = malloc(sizeof(unsigned int));
				2285	*um = 0;
				2286	unicode_data[unichar].utf32nfdicf = um;
				2287	if (verbose > 1)
				2288	printf(" %X Default_Ignorable_Code_Point\n",
				2289	unichar);
				2290	count++;
				2291	continue;
				2292	}
				2293	}
				2294	fclose(file);
				2295
				2296	if (verbose > 0)
				2297	printf("Found %d entries\n", count);
				2298	if (count == 0)
				2299	file_fail(prop_name);
				2300	}
				2301
				2302	static void corrections_init(void)
				2303	{
				2304	FILE *file;
				2305	unsigned int unichar;
				2306	unsigned int major;
				2307	unsigned int minor;
				2308	unsigned int revision;
				2309	unsigned int age;
				2310	unsigned int *um;
				2311	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
				2312	char *s;
				2313	int i;
				2314	int count;
				2315	int ret;
				2316
				2317	if (verbose > 0)
				2318	printf("Parsing %s\n", norm_name);
				2319	file = fopen(norm_name, "r");
				2320	if (!file)
				2321	open_fail(norm_name, errno);
				2322
				2323	count = 0;
				2324	while (fgets(line, LINESIZE, file)) {
				2325	ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
				2326	&unichar, buf0, buf1,
				2327	&major, &minor, &revision);
				2328	if (ret != 6)
				2329	continue;
				2330	if (!utf32valid(unichar) \|\| !age_valid(major, minor, revision))
				2331	line_fail(norm_name, line);
				2332	count++;
				2333	}
				2334	corrections = calloc(count, sizeof(struct unicode_data));
				2335	corrections_count = count;
				2336	rewind(file);
				2337
				2338	count = 0;
				2339	while (fgets(line, LINESIZE, file)) {
				2340	ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
				2341	&unichar, buf0, buf1,
				2342	&major, &minor, &revision);
				2343	if (ret != 6)
				2344	continue;
				2345	if (!utf32valid(unichar) \|\| !age_valid(major, minor, revision))
				2346	line_fail(norm_name, line);
				2347	corrections[count] = unicode_data[unichar];
				2348	assert(corrections[count].code == unichar);
				2349	age = UNICODE_AGE(major, minor, revision);
				2350	corrections[count].correction = age;
				2351
				2352	i = 0;
				2353	s = buf0;
				2354	while (*s) {
				2355	mapping[i] = strtoul(s, &s, 16);
				2356	if (!utf32valid(mapping[i]))
				2357	line_fail(norm_name, line);
				2358	i++;
				2359	}
				2360	mapping[i++] = 0;
				2361
				2362	um = malloc(i * sizeof(unsigned int));
				2363	memcpy(um, mapping, i * sizeof(unsigned int));
				2364	corrections[count].utf32nfdi = um;
				2365
				2366	if (verbose > 1)
				2367	printf(" %X -> %s -> %s V%d_%d_%d\n",
				2368	unichar, buf0, buf1, major, minor, revision);
				2369	count++;
				2370	}
				2371	fclose(file);
				2372
				2373	if (verbose > 0)
				2374	printf("Found %d entries\n", count);
				2375	if (count == 0)
				2376	file_fail(norm_name);
				2377	}
				2378
				2379	/* ------------------------------------------------------------------ */
				2380
				2381	/*
				2382	* Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
				2383	*
				2384	* AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
				2385	* D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
				2386	*
				2387	* SBase = 0xAC00
				2388	* LBase = 0x1100
				2389	* VBase = 0x1161
				2390	* TBase = 0x11A7
				2391	* LCount = 19
				2392	* VCount = 21
				2393	* TCount = 28
				2394	* NCount = 588 (VCount * TCount)
				2395	* SCount = 11172 (LCount * NCount)
				2396	*
				2397	* Decomposition:
				2398	* SIndex = s - SBase
				2399	*
				2400	* LV (Canonical/Full)
				2401	* LIndex = SIndex / NCount
				2402	* VIndex = (Sindex % NCount) / TCount
				2403	* LPart = LBase + LIndex
				2404	* VPart = VBase + VIndex
				2405	*
				2406	* LVT (Canonical)
				2407	* LVIndex = (SIndex / TCount) * TCount
				2408	* TIndex = (Sindex % TCount)
				2409	* LVPart = SBase + LVIndex
				2410	* TPart = TBase + TIndex
				2411	*
				2412	* LVT (Full)
				2413	* LIndex = SIndex / NCount
				2414	* VIndex = (Sindex % NCount) / TCount
				2415	* TIndex = (Sindex % TCount)
				2416	* LPart = LBase + LIndex
				2417	* VPart = VBase + VIndex
				2418	* if (TIndex == 0) {
				2419	* d = <LPart, VPart>
				2420	* } else {
				2421	* TPart = TBase + TIndex
				2422	* d = <LPart, VPart, TPart>
				2423	* }
				2424	*
				2425	*/
				2426
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2427	static void hangul_decompose(void)
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2428	{
				2429	unsigned int sb = 0xAC00;
				2430	unsigned int lb = 0x1100;
				2431	unsigned int vb = 0x1161;
				2432	unsigned int tb = 0x11a7;
				2433	/* unsigned int lc = 19; */
				2434	unsigned int vc = 21;
				2435	unsigned int tc = 28;
				2436	unsigned int nc = (vc * tc);
				2437	/* unsigned int sc = (lc * nc); */
				2438	unsigned int unichar;
				2439	unsigned int mapping[4];
				2440	unsigned int *um;
				2441	int count;
				2442	int i;
				2443
				2444	if (verbose > 0)
				2445	printf("Decomposing hangul\n");
				2446	/* Hangul */
				2447	count = 0;
				2448	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
				2449	unsigned int si = unichar - sb;
				2450	unsigned int li = si / nc;
				2451	unsigned int vi = (si % nc) / tc;
				2452	unsigned int ti = si % tc;
				2453
				2454	i = 0;
				2455	mapping[i++] = lb + li;
				2456	mapping[i++] = vb + vi;
				2457	if (ti)
				2458	mapping[i++] = tb + ti;
				2459	mapping[i++] = 0;
				2460
				2461	assert(!unicode_data[unichar].utf32nfdi);
				2462	um = malloc(i * sizeof(unsigned int));
				2463	memcpy(um, mapping, i * sizeof(unsigned int));
				2464	unicode_data[unichar].utf32nfdi = um;
				2465
				2466	assert(!unicode_data[unichar].utf32nfdicf);
				2467	um = malloc(i * sizeof(unsigned int));
				2468	memcpy(um, mapping, i * sizeof(unsigned int));
				2469	unicode_data[unichar].utf32nfdicf = um;
				2470
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2471	/*
				2472	* Add a cookie as a reminder that the hangul syllable
				2473	* decompositions must not be stored in the generated
				2474	* trie.
				2475	*/
				2476	unicode_data[unichar].utf8nfdi = malloc(2);
				2477	unicode_data[unichar].utf8nfdi[0] = HANGUL;
				2478	unicode_data[unichar].utf8nfdi[1] = '\0';
				2479
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2480	if (verbose > 1)
				2481	print_utf32nfdi(unichar);
				2482
				2483	count++;
				2484	}
				2485	if (verbose > 0)
				2486	printf("Created %d entries\n", count);
				2487	}
				2488
				2489	static void nfdi_decompose(void)
				2490	{
				2491	unsigned int unichar;
				2492	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
				2493	unsigned int *um;
				2494	unsigned int *dc;
				2495	int count;
				2496	int i;
				2497	int j;
				2498	int ret;
				2499
				2500	if (verbose > 0)
				2501	printf("Decomposing nfdi\n");
				2502
				2503	count = 0;
				2504	for (unichar = 0; unichar != 0x110000; unichar++) {
				2505	if (!unicode_data[unichar].utf32nfdi)
				2506	continue;
				2507	for (;;) {
				2508	ret = 1;
				2509	i = 0;
				2510	um = unicode_data[unichar].utf32nfdi;
				2511	while (*um) {
				2512	dc = unicode_data[*um].utf32nfdi;
				2513	if (dc) {
				2514	for (j = 0; dc[j]; j++)
				2515	mapping[i++] = dc[j];
				2516	ret = 0;
				2517	} else {
				2518	mapping[i++] = *um;
				2519	}
				2520	um++;
				2521	}
				2522	mapping[i++] = 0;
				2523	if (ret)
				2524	break;
				2525	free(unicode_data[unichar].utf32nfdi);
				2526	um = malloc(i * sizeof(unsigned int));
				2527	memcpy(um, mapping, i * sizeof(unsigned int));
				2528	unicode_data[unichar].utf32nfdi = um;
				2529	}
				2530	/* Add this decomposition to nfdicf if there is no entry. */
				2531	if (!unicode_data[unichar].utf32nfdicf) {
				2532	um = malloc(i * sizeof(unsigned int));
				2533	memcpy(um, mapping, i * sizeof(unsigned int));
				2534	unicode_data[unichar].utf32nfdicf = um;
				2535	}
				2536	if (verbose > 1)
				2537	print_utf32nfdi(unichar);
				2538	count++;
				2539	}
				2540	if (verbose > 0)
				2541	printf("Processed %d entries\n", count);
				2542	}
				2543
				2544	static void nfdicf_decompose(void)
				2545	{
				2546	unsigned int unichar;
				2547	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
				2548	unsigned int *um;
				2549	unsigned int *dc;
				2550	int count;
				2551	int i;
				2552	int j;
				2553	int ret;
				2554
				2555	if (verbose > 0)
				2556	printf("Decomposing nfdicf\n");
				2557	count = 0;
				2558	for (unichar = 0; unichar != 0x110000; unichar++) {
				2559	if (!unicode_data[unichar].utf32nfdicf)
				2560	continue;
				2561	for (;;) {
				2562	ret = 1;
				2563	i = 0;
				2564	um = unicode_data[unichar].utf32nfdicf;
				2565	while (*um) {
				2566	dc = unicode_data[*um].utf32nfdicf;
				2567	if (dc) {
				2568	for (j = 0; dc[j]; j++)
				2569	mapping[i++] = dc[j];
				2570	ret = 0;
				2571	} else {
				2572	mapping[i++] = *um;
				2573	}
				2574	um++;
				2575	}
				2576	mapping[i++] = 0;
				2577	if (ret)
				2578	break;
				2579	free(unicode_data[unichar].utf32nfdicf);
				2580	um = malloc(i * sizeof(unsigned int));
				2581	memcpy(um, mapping, i * sizeof(unsigned int));
				2582	unicode_data[unichar].utf32nfdicf = um;
				2583	}
				2584	if (verbose > 1)
				2585	print_utf32nfdicf(unichar);
				2586	count++;
				2587	}
				2588	if (verbose > 0)
				2589	printf("Processed %d entries\n", count);
				2590	}
				2591
				2592	/* ------------------------------------------------------------------ */
				2593
				2594	int utf8agemax(struct tree , const char );
				2595	int utf8nagemax(struct tree , const char , size_t);
				2596	int utf8agemin(struct tree , const char );
				2597	int utf8nagemin(struct tree , const char , size_t);
				2598	ssize_t utf8len(struct tree , const char );
				2599	ssize_t utf8nlen(struct tree , const char , size_t);
				2600	struct utf8cursor;
				2601	int utf8cursor(struct utf8cursor , struct tree , const char *);
				2602	int utf8ncursor(struct utf8cursor , struct tree , const char *, size_t);
				2603	int utf8byte(struct utf8cursor *);
				2604
				2605	/*
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2606	* Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
				2607	*
				2608	* AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
				2609	* D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
				2610	*
				2611	* SBase = 0xAC00
				2612	* LBase = 0x1100
				2613	* VBase = 0x1161
				2614	* TBase = 0x11A7
				2615	* LCount = 19
				2616	* VCount = 21
				2617	* TCount = 28
				2618	* NCount = 588 (VCount * TCount)
				2619	* SCount = 11172 (LCount * NCount)
				2620	*
				2621	* Decomposition:
				2622	* SIndex = s - SBase
				2623	*
				2624	* LV (Canonical/Full)
				2625	* LIndex = SIndex / NCount
				2626	* VIndex = (Sindex % NCount) / TCount
				2627	* LPart = LBase + LIndex
				2628	* VPart = VBase + VIndex
				2629	*
				2630	* LVT (Canonical)
				2631	* LVIndex = (SIndex / TCount) * TCount
				2632	* TIndex = (Sindex % TCount)
				2633	* LVPart = SBase + LVIndex
				2634	* TPart = TBase + TIndex
				2635	*
				2636	* LVT (Full)
				2637	* LIndex = SIndex / NCount
				2638	* VIndex = (Sindex % NCount) / TCount
				2639	* TIndex = (Sindex % TCount)
				2640	* LPart = LBase + LIndex
				2641	* VPart = VBase + VIndex
				2642	* if (TIndex == 0) {
				2643	* d = <LPart, VPart>
				2644	* } else {
				2645	* TPart = TBase + TIndex
				2646	* d = <LPart, VPart, TPart>
				2647	* }
				2648	*/
				2649
				2650	/* Constants */
				2651	#define SB (0xAC00)
				2652	#define LB (0x1100)
				2653	#define VB (0x1161)
				2654	#define TB (0x11A7)
				2655	#define LC (19)
				2656	#define VC (21)
				2657	#define TC (28)
				2658	#define NC (VC * TC)
				2659	#define SC (LC * NC)
				2660
				2661	/* Algorithmic decomposition of hangul syllable. */
				2662	static utf8leaf_t utf8hangul(const char str, unsigned char *hangul)
				2663	{
				2664	unsigned int si;
				2665	unsigned int li;
				2666	unsigned int vi;
				2667	unsigned int ti;
				2668	unsigned char *h;
				2669
				2670	/* Calculate the SI, LI, VI, and TI values. */
				2671	si = utf8decode(str) - SB;
				2672	li = si / NC;
				2673	vi = (si % NC) / TC;
				2674	ti = si % TC;
				2675
				2676	/* Fill in base of leaf. */
				2677	h = hangul;
				2678	LEAF_GEN(h) = 2;
				2679	LEAF_CCC(h) = DECOMPOSE;
				2680	h += 2;
				2681
				2682	/* Add LPart, a 3-byte UTF-8 sequence. */
				2683	h += utf8encode((char *)h, li + LB);
				2684
				2685	/* Add VPart, a 3-byte UTF-8 sequence. */
				2686	h += utf8encode((char *)h, vi + VB);
				2687
				2688	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
				2689	if (ti)
				2690	h += utf8encode((char *)h, ti + TB);
				2691
				2692	/* Terminate string. */
				2693	h[0] = '\0';
				2694
				2695	return hangul;
				2696	}
				2697
				2698	/*
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2699	* Use trie to scan s, touching at most len bytes.
				2700	* Returns the leaf if one exists, NULL otherwise.
				2701	*
				2702	* A non-NULL return guarantees that the UTF-8 sequence starting at s
				2703	* is well-formed and corresponds to a known unicode code point. The
				2704	* shorthand for this will be "is valid UTF-8 unicode".
				2705	*/
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2706	static utf8leaf_t utf8nlookup(struct tree tree, unsigned char *hangul,
				2707	const char *s, size_t len)
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2708	{
				2709	utf8trie_t *trie;
				2710	int offlen;
				2711	int offset;
				2712	int mask;
				2713	int node;
				2714
				2715	if (!tree)
				2716	return NULL;
				2717	if (len == 0)
				2718	return NULL;
				2719	node = 1;
				2720	trie = utf8data + tree->index;
				2721	while (node) {
				2722	offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
				2723	if (*trie & NEXTBYTE) {
				2724	if (--len == 0)
				2725	return NULL;
				2726	s++;
				2727	}
				2728	mask = 1 << (*trie & BITNUM);
				2729	if (*s & mask) {
				2730	/* Right leg */
				2731	if (offlen) {
				2732	/* Right node at offset of trie */
				2733	node = (*trie & RIGHTNODE);
				2734	offset = trie[offlen];
				2735	while (--offlen) {
				2736	offset <<= 8;
				2737	offset \|= trie[offlen];
				2738	}
				2739	trie += offset;
				2740	} else if (*trie & RIGHTPATH) {
				2741	/* Right node after this node */
				2742	node = (*trie & TRIENODE);
				2743	trie++;
				2744	} else {
				2745	/* No right node. */
				2746	return NULL;
				2747	}
				2748	} else {
				2749	/* Left leg */
				2750	if (offlen) {
				2751	/* Left node after this node. */
				2752	node = (*trie & LEFTNODE);
				2753	trie += offlen + 1;
				2754	} else if (*trie & RIGHTPATH) {
				2755	/* No left node. */
				2756	return NULL;
				2757	} else {
				2758	/* Left node after this node */
				2759	node = (*trie & TRIENODE);
				2760	trie++;
				2761	}
				2762	}
				2763	}
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2764	/*
				2765	* Hangul decomposition is done algorithmically. These are the
				2766	* codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
				2767	* always 3 bytes long, so s has been advanced twice, and the
				2768	* start of the sequence is at s-2.
				2769	*/
				2770	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
				2771	trie = utf8hangul(s - 2, hangul);
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2772	return trie;
				2773	}
				2774
				2775	/*
				2776	* Use trie to scan s.
				2777	* Returns the leaf if one exists, NULL otherwise.
				2778	*
				2779	* Forwards to trie_nlookup().
				2780	*/
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2781	static utf8leaf_t utf8lookup(struct tree tree, unsigned char *hangul,
				2782	const char *s)
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2783	{
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2784	return utf8nlookup(tree, hangul, s, (size_t)-1);
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2785	}
				2786
				2787	/*
				2788	* Return the number of bytes used by the current UTF-8 sequence.
				2789	* Assumes the input points to the first byte of a valid UTF-8
				2790	* sequence.
				2791	*/
				2792	static inline int utf8clen(const char *s)
				2793	{
				2794	unsigned char c = *s;
				2795	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
				2796	}
				2797
				2798	/*
				2799	* Maximum age of any character in s.
				2800	* Return -1 if s is not valid UTF-8 unicode.
				2801	* Return 0 if only non-assigned code points are used.
				2802	*/
				2803	int utf8agemax(struct tree tree, const char s)
				2804	{
				2805	utf8leaf_t *leaf;
				2806	int age = 0;
				2807	int leaf_age;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2808	unsigned char hangul[UTF8HANGULLEAF];
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2809
				2810	if (!tree)
				2811	return -1;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2812
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2813	while (*s) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2814	leaf = utf8lookup(tree, hangul, s);
				2815	if (!leaf)
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2816	return -1;
				2817	leaf_age = ages[LEAF_GEN(leaf)];
				2818	if (leaf_age <= tree->maxage && leaf_age > age)
				2819	age = leaf_age;
				2820	s += utf8clen(s);
				2821	}
				2822	return age;
				2823	}
				2824
				2825	/*
				2826	* Minimum age of any character in s.
				2827	* Return -1 if s is not valid UTF-8 unicode.
				2828	* Return 0 if non-assigned code points are used.
				2829	*/
				2830	int utf8agemin(struct tree tree, const char s)
				2831	{
				2832	utf8leaf_t *leaf;
				2833	int age;
				2834	int leaf_age;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2835	unsigned char hangul[UTF8HANGULLEAF];
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2836
				2837	if (!tree)
				2838	return -1;
				2839	age = tree->maxage;
				2840	while (*s) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2841	leaf = utf8lookup(tree, hangul, s);
				2842	if (!leaf)
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2843	return -1;
				2844	leaf_age = ages[LEAF_GEN(leaf)];
				2845	if (leaf_age <= tree->maxage && leaf_age < age)
				2846	age = leaf_age;
				2847	s += utf8clen(s);
				2848	}
				2849	return age;
				2850	}
				2851
				2852	/*
				2853	* Maximum age of any character in s, touch at most len bytes.
				2854	* Return -1 if s is not valid UTF-8 unicode.
				2855	*/
				2856	int utf8nagemax(struct tree tree, const char s, size_t len)
				2857	{
				2858	utf8leaf_t *leaf;
				2859	int age = 0;
				2860	int leaf_age;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2861	unsigned char hangul[UTF8HANGULLEAF];
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2862
				2863	if (!tree)
				2864	return -1;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2865
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2866	while (len && *s) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2867	leaf = utf8nlookup(tree, hangul, s, len);
				2868	if (!leaf)
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2869	return -1;
				2870	leaf_age = ages[LEAF_GEN(leaf)];
				2871	if (leaf_age <= tree->maxage && leaf_age > age)
				2872	age = leaf_age;
				2873	len -= utf8clen(s);
				2874	s += utf8clen(s);
				2875	}
				2876	return age;
				2877	}
				2878
				2879	/*
				2880	* Maximum age of any character in s, touch at most len bytes.
				2881	* Return -1 if s is not valid UTF-8 unicode.
				2882	*/
				2883	int utf8nagemin(struct tree tree, const char s, size_t len)
				2884	{
				2885	utf8leaf_t *leaf;
				2886	int leaf_age;
				2887	int age;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2888	unsigned char hangul[UTF8HANGULLEAF];
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2889
				2890	if (!tree)
				2891	return -1;
				2892	age = tree->maxage;
				2893	while (len && *s) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2894	leaf = utf8nlookup(tree, hangul, s, len);
				2895	if (!leaf)
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2896	return -1;
				2897	leaf_age = ages[LEAF_GEN(leaf)];
				2898	if (leaf_age <= tree->maxage && leaf_age < age)
				2899	age = leaf_age;
				2900	len -= utf8clen(s);
				2901	s += utf8clen(s);
				2902	}
				2903	return age;
				2904	}
				2905
				2906	/*
				2907	* Length of the normalization of s.
				2908	* Return -1 if s is not valid UTF-8 unicode.
				2909	*
				2910	* A string of Default_Ignorable_Code_Point has length 0.
				2911	*/
				2912	ssize_t utf8len(struct tree tree, const char s)
				2913	{
				2914	utf8leaf_t *leaf;
				2915	size_t ret = 0;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2916	unsigned char hangul[UTF8HANGULLEAF];
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2917
				2918	if (!tree)
				2919	return -1;
				2920	while (*s) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2921	leaf = utf8lookup(tree, hangul, s);
				2922	if (!leaf)
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2923	return -1;
				2924	if (ages[LEAF_GEN(leaf)] > tree->maxage)
				2925	ret += utf8clen(s);
				2926	else if (LEAF_CCC(leaf) == DECOMPOSE)
				2927	ret += strlen(LEAF_STR(leaf));
				2928	else
				2929	ret += utf8clen(s);
				2930	s += utf8clen(s);
				2931	}
				2932	return ret;
				2933	}
				2934
				2935	/*
				2936	* Length of the normalization of s, touch at most len bytes.
				2937	* Return -1 if s is not valid UTF-8 unicode.
				2938	*/
				2939	ssize_t utf8nlen(struct tree tree, const char s, size_t len)
				2940	{
				2941	utf8leaf_t *leaf;
				2942	size_t ret = 0;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2943	unsigned char hangul[UTF8HANGULLEAF];
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2944
				2945	if (!tree)
				2946	return -1;
				2947	while (len && *s) {
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2948	leaf = utf8nlookup(tree, hangul, s, len);
				2949	if (!leaf)
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2950	return -1;
				2951	if (ages[LEAF_GEN(leaf)] > tree->maxage)
				2952	ret += utf8clen(s);
				2953	else if (LEAF_CCC(leaf) == DECOMPOSE)
				2954	ret += strlen(LEAF_STR(leaf));
				2955	else
				2956	ret += utf8clen(s);
				2957	len -= utf8clen(s);
				2958	s += utf8clen(s);
				2959	}
				2960	return ret;
				2961	}
				2962
				2963	/*
				2964	* Cursor structure used by the normalizer.
				2965	*/
				2966	struct utf8cursor {
				2967	struct tree *tree;
				2968	const char *s;
				2969	const char *p;
				2970	const char *ss;
				2971	const char *sp;
				2972	unsigned int len;
				2973	unsigned int slen;
				2974	short int ccc;
				2975	short int nccc;
				2976	unsigned int unichar;
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	2977	unsigned char hangul[UTF8HANGULLEAF];
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	2978	};
				2979
				2980	/*
				2981	* Set up an utf8cursor for use by utf8byte().
				2982	*
				2983	* s : string.
				2984	* len : length of s.
				2985	* u8c : pointer to cursor.
				2986	* trie : utf8trie_t to use for normalization.
				2987	*
				2988	* Returns -1 on error, 0 on success.
				2989	*/
				2990	int utf8ncursor(struct utf8cursor u8c, struct tree tree, const char *s,
				2991	size_t len)
				2992	{
				2993	if (!tree)
				2994	return -1;
				2995	if (!s)
				2996	return -1;
				2997	u8c->tree = tree;
				2998	u8c->s = s;
				2999	u8c->p = NULL;
				3000	u8c->ss = NULL;
				3001	u8c->sp = NULL;
				3002	u8c->len = len;
				3003	u8c->slen = 0;
				3004	u8c->ccc = STOPPER;
				3005	u8c->nccc = STOPPER;
				3006	u8c->unichar = 0;
				3007	/* Check we didn't clobber the maximum length. */
				3008	if (u8c->len != len)
				3009	return -1;
				3010	/* The first byte of s may not be an utf8 continuation. */
				3011	if (len > 0 && (*s & 0xC0) == 0x80)
				3012	return -1;
				3013	return 0;
				3014	}
				3015
				3016	/*
				3017	* Set up an utf8cursor for use by utf8byte().
				3018	*
				3019	* s : NUL-terminated string.
				3020	* u8c : pointer to cursor.
				3021	* trie : utf8trie_t to use for normalization.
				3022	*
				3023	* Returns -1 on error, 0 on success.
				3024	*/
				3025	int utf8cursor(struct utf8cursor u8c, struct tree tree, const char *s)
				3026	{
				3027	return utf8ncursor(u8c, tree, s, (unsigned int)-1);
				3028	}
				3029
				3030	/*
				3031	* Get one byte from the normalized form of the string described by u8c.
				3032	*
				3033	* Returns the byte cast to an unsigned char on succes, and -1 on failure.
				3034	*
				3035	* The cursor keeps track of the location in the string in u8c->s.
				3036	* When a character is decomposed, the current location is stored in
				3037	* u8c->p, and u8c->s is set to the start of the decomposition. Note
				3038	* that bytes from a decomposition do not count against u8c->len.
				3039	*
				3040	* Characters are emitted if they match the current CCC in u8c->ccc.
				3041	* Hitting end-of-string while u8c->ccc == STOPPER means we're done,
				3042	* and the function returns 0 in that case.
				3043	*
				3044	* Sorting by CCC is done by repeatedly scanning the string. The
				3045	* values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
				3046	* the start of the scan. The first pass finds the lowest CCC to be
				3047	* emitted and stores it in u8c->nccc, the second pass emits the
				3048	* characters with this CCC and finds the next lowest CCC. This limits
				3049	* the number of passes to 1 + the number of different CCCs in the
				3050	* sequence being scanned.
				3051	*
				3052	* Therefore:
				3053	* u8c->p != NULL -> a decomposition is being scanned.
				3054	* u8c->ss != NULL -> this is a repeating scan.
				3055	* u8c->ccc == -1 -> this is the first scan of a repeating scan.
				3056	*/
				3057	int utf8byte(struct utf8cursor *u8c)
				3058	{
				3059	utf8leaf_t *leaf;
				3060	int ccc;
				3061
				3062	for (;;) {
				3063	/* Check for the end of a decomposed character. */
				3064	if (u8c->p && *u8c->s == '\0') {
				3065	u8c->s = u8c->p;
				3066	u8c->p = NULL;
				3067	}
				3068
				3069	/* Check for end-of-string. */
				3070	if (!u8c->p && (u8c->len == 0 \|\| *u8c->s == '\0')) {
				3071	/* There is no next byte. */
				3072	if (u8c->ccc == STOPPER)
				3073	return 0;
				3074	/* End-of-string during a scan counts as a stopper. */
				3075	ccc = STOPPER;
				3076	goto ccc_mismatch;
				3077	} else if ((*u8c->s & 0xC0) == 0x80) {
				3078	/* This is a continuation of the current character. */
				3079	if (!u8c->p)
				3080	u8c->len--;
				3081	return (unsigned char)*u8c->s++;
				3082	}
				3083
				3084	/* Look up the data for the current character. */
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	3085	if (u8c->p) {
				3086	leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
				3087	} else {
				3088	leaf = utf8nlookup(u8c->tree, u8c->hangul,
				3089	u8c->s, u8c->len);
				3090	}
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	3091
				3092	/* No leaf found implies that the input is a binary blob. */
				3093	if (!leaf)
				3094	return -1;
				3095
				3096	/* Characters that are too new have CCC 0. */
				3097	if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
				3098	ccc = STOPPER;
				3099	} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
				3100	u8c->len -= utf8clen(u8c->s);
				3101	u8c->p = u8c->s + utf8clen(u8c->s);
				3102	u8c->s = LEAF_STR(leaf);
				3103	/* Empty decomposition implies CCC 0. */
				3104	if (*u8c->s == '\0') {
				3105	if (u8c->ccc == STOPPER)
				3106	continue;
				3107	ccc = STOPPER;
				3108	goto ccc_mismatch;
				3109	}
Olaf Weber	a8384c6	2019-04-25 13:49:18 -0400	[diff] [blame]	3110	leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	3111	ccc = LEAF_CCC(leaf);
				3112	}
				3113	u8c->unichar = utf8decode(u8c->s);
				3114
				3115	/*
				3116	* If this is not a stopper, then see if it updates
				3117	* the next canonical class to be emitted.
				3118	*/
				3119	if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
				3120	u8c->nccc = ccc;
				3121
				3122	/*
				3123	* Return the current byte if this is the current
				3124	* combining class.
				3125	*/
				3126	if (ccc == u8c->ccc) {
				3127	if (!u8c->p)
				3128	u8c->len--;
				3129	return (unsigned char)*u8c->s++;
				3130	}
				3131
				3132	/* Current combining class mismatch. */
				3133	ccc_mismatch:
				3134	if (u8c->nccc == STOPPER) {
				3135	/*
				3136	* Scan forward for the first canonical class
				3137	* to be emitted. Save the position from
				3138	* which to restart.
				3139	*/
				3140	assert(u8c->ccc == STOPPER);
				3141	u8c->ccc = MINCCC - 1;
				3142	u8c->nccc = ccc;
				3143	u8c->sp = u8c->p;
				3144	u8c->ss = u8c->s;
				3145	u8c->slen = u8c->len;
				3146	if (!u8c->p)
				3147	u8c->len -= utf8clen(u8c->s);
				3148	u8c->s += utf8clen(u8c->s);
				3149	} else if (ccc != STOPPER) {
				3150	/* Not a stopper, and not the ccc we're emitting. */
				3151	if (!u8c->p)
				3152	u8c->len -= utf8clen(u8c->s);
				3153	u8c->s += utf8clen(u8c->s);
				3154	} else if (u8c->nccc != MAXCCC + 1) {
				3155	/* At a stopper, restart for next ccc. */
				3156	u8c->ccc = u8c->nccc;
				3157	u8c->nccc = MAXCCC + 1;
				3158	u8c->s = u8c->ss;
				3159	u8c->p = u8c->sp;
				3160	u8c->len = u8c->slen;
				3161	} else {
				3162	/* All done, proceed from here. */
				3163	u8c->ccc = STOPPER;
				3164	u8c->nccc = STOPPER;
				3165	u8c->sp = NULL;
				3166	u8c->ss = NULL;
				3167	u8c->slen = 0;
				3168	}
				3169	}
				3170	}
				3171
				3172	/* ------------------------------------------------------------------ */
				3173
				3174	static int normalize_line(struct tree *tree)
				3175	{
				3176	char *s;
				3177	char *t;
				3178	int c;
				3179	struct utf8cursor u8c;
				3180
				3181	/* First test: null-terminated string. */
				3182	s = buf2;
				3183	t = buf3;
				3184	if (utf8cursor(&u8c, tree, s))
				3185	return -1;
				3186	while ((c = utf8byte(&u8c)) > 0)
				3187	if (c != (unsigned char)*t++)
				3188	return -1;
				3189	if (c < 0)
				3190	return -1;
				3191	if (*t != 0)
				3192	return -1;
				3193
				3194	/* Second test: length-limited string. */
				3195	s = buf2;
				3196	/* Replace NUL with a value that will cause an error if seen. */
				3197	s[strlen(s) + 1] = -1;
				3198	t = buf3;
				3199	if (utf8cursor(&u8c, tree, s))
				3200	return -1;
				3201	while ((c = utf8byte(&u8c)) > 0)
				3202	if (c != (unsigned char)*t++)
				3203	return -1;
				3204	if (c < 0)
				3205	return -1;
				3206	if (*t != 0)
				3207	return -1;
				3208
				3209	return 0;
				3210	}
				3211
				3212	static void normalization_test(void)
				3213	{
				3214	FILE *file;
				3215	unsigned int unichar;
				3216	struct unicode_data *data;
				3217	char *s;
				3218	char *t;
				3219	int ret;
				3220	int ignorables;
				3221	int tests = 0;
				3222	int failures = 0;
				3223
				3224	if (verbose > 0)
				3225	printf("Parsing %s\n", test_name);
				3226	/* Step one, read data from file. */
				3227	file = fopen(test_name, "r");
				3228	if (!file)
				3229	open_fail(test_name, errno);
				3230
				3231	while (fgets(line, LINESIZE, file)) {
				3232	ret = sscanf(line, "%[^;];%[^;];%[^;];%[^;];%*[^;];",
				3233	buf0, buf1);
				3234	if (ret != 2 \|\| *line == '#')
				3235	continue;
				3236	s = buf0;
				3237	t = buf2;
				3238	while (*s) {
				3239	unichar = strtoul(s, &s, 16);
				3240	t += utf8encode(t, unichar);
				3241	}
				3242	*t = '\0';
				3243
				3244	ignorables = 0;
				3245	s = buf1;
				3246	t = buf3;
				3247	while (*s) {
				3248	unichar = strtoul(s, &s, 16);
				3249	data = &unicode_data[unichar];
				3250	if (data->utf8nfdi && !*data->utf8nfdi)
				3251	ignorables = 1;
				3252	else
				3253	t += utf8encode(t, unichar);
				3254	}
				3255	*t = '\0';
				3256
				3257	tests++;
				3258	if (normalize_line(nfdi_tree) < 0) {
				3259	printf("Line %s -> %s", buf0, buf1);
				3260	if (ignorables)
				3261	printf(" (ignorables removed)");
				3262	printf(" failure\n");
				3263	failures++;
				3264	}
				3265	}
				3266	fclose(file);
				3267	if (verbose > 0)
				3268	printf("Ran %d tests with %d failures\n", tests, failures);
				3269	if (failures)
				3270	file_fail(test_name);
				3271	}
				3272
				3273	/* ------------------------------------------------------------------ */
				3274
				3275	static void write_file(void)
				3276	{
				3277	FILE *file;
				3278	int i;
				3279	int j;
				3280	int t;
				3281	int gen;
				3282
				3283	if (verbose > 0)
				3284	printf("Writing %s\n", utf8_name);
				3285	file = fopen(utf8_name, "w");
				3286	if (!file)
				3287	open_fail(utf8_name, errno);
				3288
				3289	fprintf(file, "/* This file is generated code, do not edit. */\n");
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	3290	fprintf(file, "\n");
Christoph Hellwig	2b3d047	2021-09-15 09:00:05 +0200	[diff] [blame]	3291	fprintf(file, "#include <linux/module.h>\n");
				3292	fprintf(file, "#include <linux/kernel.h>\n");
				3293	fprintf(file, "#include \"utf8n.h\"\n");
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	3294	fprintf(file, "\n");
				3295	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
				3296	for (i = 0; i != ages_count; i++)
				3297	fprintf(file, "\t%#x%s\n", ages[i],
				3298	ages[i] == unicode_maxage ? "" : ",");
				3299	fprintf(file, "};\n");
				3300	fprintf(file, "\n");
				3301	fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
				3302	t = 0;
				3303	for (gen = 0; gen < ages_count; gen++) {
				3304	fprintf(file, "\t{ %#x, %d }%s\n",
				3305	ages[gen], trees[t].index,
				3306	ages[gen] == unicode_maxage ? "" : ",");
				3307	if (trees[t].maxage == ages[gen])
				3308	t += 2;
				3309	}
				3310	fprintf(file, "};\n");
				3311	fprintf(file, "\n");
				3312	fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
				3313	t = 1;
				3314	for (gen = 0; gen < ages_count; gen++) {
				3315	fprintf(file, "\t{ %#x, %d }%s\n",
				3316	ages[gen], trees[t].index,
				3317	ages[gen] == unicode_maxage ? "" : ",");
				3318	if (trees[t].maxage == ages[gen])
				3319	t += 2;
				3320	}
				3321	fprintf(file, "};\n");
				3322	fprintf(file, "\n");
				3323	fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
				3324	utf8data_size);
				3325	t = 0;
				3326	for (i = 0; i != utf8data_size; i += 16) {
				3327	if (i == trees[t].index) {
				3328	fprintf(file, "\t/* %s_%x */\n",
				3329	trees[t].type, trees[t].maxage);
				3330	if (t < trees_count-1)
				3331	t++;
				3332	}
				3333	fprintf(file, "\t");
				3334	for (j = i; j != i + 16; j++)
				3335	fprintf(file, "0x%.2x%s", utf8data[j],
				3336	(j < utf8data_size -1 ? "," : ""));
				3337	fprintf(file, "\n");
				3338	}
				3339	fprintf(file, "};\n");
Christoph Hellwig	2b3d047	2021-09-15 09:00:05 +0200	[diff] [blame]	3340	fprintf(file, "\n");
				3341	fprintf(file, "struct utf8data_table utf8_data_table = {\n");
				3342	fprintf(file, "\t.utf8agetab = utf8agetab,\n");
				3343	fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
				3344	fprintf(file, "\n");
				3345	fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
				3346	fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
				3347	fprintf(file, "\n");
				3348	fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
				3349	fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
				3350	fprintf(file, "\n");
				3351	fprintf(file, "\t.utf8data = utf8data,\n");
				3352	fprintf(file, "};\n");
				3353	fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
				3354	fprintf(file, "\n");
				3355	fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
Gabriel Krisman Bertazi	955405d	2019-04-25 13:38:44 -0400	[diff] [blame]	3356	fclose(file);
				3357	}
				3358
				3359	/* ------------------------------------------------------------------ */
				3360
				3361	int main(int argc, char *argv[])
				3362	{
				3363	unsigned int unichar;
				3364	int opt;
				3365
				3366	argv0 = argv[0];
				3367
				3368	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
				3369	switch (opt) {
				3370	case 'a':
				3371	age_name = optarg;
				3372	break;
				3373	case 'c':
				3374	ccc_name = optarg;
				3375	break;
				3376	case 'd':
				3377	data_name = optarg;
				3378	break;
				3379	case 'f':
				3380	fold_name = optarg;
				3381	break;
				3382	case 'n':
				3383	norm_name = optarg;
				3384	break;
				3385	case 'o':
				3386	utf8_name = optarg;
				3387	break;
				3388	case 'p':
				3389	prop_name = optarg;
				3390	break;
				3391	case 't':
				3392	test_name = optarg;
				3393	break;
				3394	case 'v':
				3395	verbose++;
				3396	break;
				3397	case 'h':
				3398	help();
				3399	exit(0);
				3400	default:
				3401	usage();
				3402	}
				3403	}
				3404
				3405	if (verbose > 1)
				3406	help();
				3407	for (unichar = 0; unichar != 0x110000; unichar++)
				3408	unicode_data[unichar].code = unichar;
				3409	age_init();
				3410	ccc_init();
				3411	nfdi_init();
				3412	nfdicf_init();
				3413	ignore_init();
				3414	corrections_init();
				3415	hangul_decompose();
				3416	nfdi_decompose();
				3417	nfdicf_decompose();
				3418	utf8_init();
				3419	trees_init();
				3420	trees_populate();
				3421	trees_reduce();
				3422	trees_verify();
				3423	/* Prevent "unused function" warning. */
				3424	(void)lookup(nfdi_tree, " ");
				3425	if (verbose > 2)
				3426	tree_walk(nfdi_tree);
				3427	if (verbose > 2)
				3428	tree_walk(nfdicf_tree);
				3429	normalization_test();
				3430	write_file();
				3431
				3432	return 0;
				3433	}