Blame - fs/unicode/utf8-selftest.c - SHIFTPHONES/mainline/linux

blob: 6fe8af7edccbb02b4747cde0ec1444745ee254ed [file] [log] [blame]

Thomas Gleixner	9c92ab6	2019-05-29 07:17:56 -0700	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Gabriel Krisman Bertazi	f0d6cc0	2019-04-25 13:56:01 -0400	[diff] [blame]	2	/*
				3	* Kernel module for testing utf-8 support.
				4	*
				5	* Copyright 2017 Collabora Ltd.
Gabriel Krisman Bertazi	f0d6cc0	2019-04-25 13:56:01 -0400	[diff] [blame]	6	*/
				7
				8	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
				9
#include <linux/dcache.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/stringify.h>
#include <linux/unicode.h>
				14
				15	#include "utf8n.h"
				16
				17	unsigned int failed_tests;
				18	unsigned int total_tests;
				19
				20	/* Tests will be based on this version. */
Gabriel Krisman Bertazi	1215d23	2019-04-25 13:59:17 -0400	[diff] [blame]	21	#define latest_maj 12
				22	#define latest_min 1
Gabriel Krisman Bertazi	f0d6cc0	2019-04-25 13:56:01 -0400	[diff] [blame]	23	#define latest_rev 0
				24
				25	#define _test(cond, func, line, fmt, ...) do { \
				26	total_tests++; \
				27	if (!cond) { \
				28	failed_tests++; \
				29	pr_err("test %s:%d Failed: %s%s", \
				30	func, line, #cond, (fmt?":":".")); \
				31	if (fmt) \
				32	pr_err(fmt, ##__VA_ARGS__); \
				33	} \
				34	} while (0)
				35	#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)
				36	#define test(cond) _test(cond, __func__, __LINE__, "")
				37
/*
 * Canonical decomposition vectors: @str is the input string and @dec the
 * byte sequence it is expected to decompose to.
 */
static const struct {
	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
	unsigned char str[10];
	unsigned char dec[10];
} nfdi_test_data[] = {
	/* Trivial sequence */
	{
		/* "aBba" decomposes to itself */
		.str = "aBba",
		.dec = "aBba",
	},
	/* Simple equivalent sequences */
	{
		/*
		 * 'VULGAR FRACTION ONE QUARTER' cannot decompose to
		 * 'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on
		 * canonical decomposition
		 */
		.str = {0xc2, 0xbc, 0x00},
		.dec = {0xc2, 0xbc, 0x00},
	},
	{
		/*
		 * 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to
		 * 'LETTER A' + 'COMBINING DIAERESIS'
		 */
		.str = {0xc3, 0xa4, 0x00},
		.dec = {0x61, 0xcc, 0x88, 0x00},
	},
	{
		/*
		 * 'LATIN SMALL LETTER LJ' can't decompose to
		 * 'LETTER L' + 'LETTER J' on canonical decomposition
		 */
		.str = {0xc7, 0x89, 0x00},
		.dec = {0xc7, 0x89, 0x00},
	},
	{
		/* GREEK ANO TELEIA decomposes to MIDDLE DOT */
		.str = {0xce, 0x87, 0x00},
		.dec = {0xc2, 0xb7, 0x00},
	},
	/* Canonical ordering */
	{
		/*
		 * A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK'
		 * decomposes to
		 * A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT'
		 */
		.str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x00},
		.dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x00},
	},
	{
		/*
		 * 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK'
		 * decomposes to
		 * 'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS'
		 */
		.str = {0xc3, 0xa4, 0xcc, 0xa8, 0x00},
		.dec = {0x61, 0xcc, 0xa8, 0xcc, 0x88, 0x00},
	},
};
				91
/*
 * Case-folding vectors: @str is the input string and @ncf the byte
 * sequence it is expected to normalize and fold to.
 */
static const struct {
	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
	unsigned char str[30];
	unsigned char ncf[30];
} nfdicf_test_data[] = {
	/* Trivial sequences */
	{
		/* "ABba" folds to lowercase */
		.str = {0x41, 0x42, 0x62, 0x61, 0x00},
		.ncf = {0x61, 0x62, 0x62, 0x61, 0x00},
	},
	{
		/* All ASCII folds to lower-case */
		.str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
		.ncf = "abcdefghijklmnopqrstuvwxyz0.1",
	},
	{
		/*
		 * LATIN SMALL LETTER SHARP S folds to
		 * LATIN SMALL LETTER S + LATIN SMALL LETTER S
		 */
		.str = {0xc3, 0x9f, 0x00},
		.ncf = {0x73, 0x73, 0x00},
	},
	{
		/*
		 * LATIN CAPITAL LETTER A WITH RING ABOVE folds to
		 * LATIN SMALL LETTER A + COMBINING RING ABOVE
		 */
		.str = {0xc3, 0x85, 0x00},
		.ncf = {0x61, 0xcc, 0x8a, 0x00},
	},
	/* Introduced by Unicode 8.0.0. */
	/*
	 * Cherokee letters are interesting test-cases because they fold
	 * to upper-case. Before 8.0.0, Cherokee lowercase were
	 * undefined, thus, the folding from LC is not stable between
	 * 7.0.0 -> 8.0.0, but it is from UC.
	 */
	{
		/* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */
		.str = {0xea, 0xad, 0xb0, 0x00},
		.ncf = {0xe1, 0x8e, 0xa0, 0x00},
	},
	{
		/* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */
		.str = {0xe1, 0x8f, 0xb8, 0x00},
		.ncf = {0xe1, 0x8f, 0xb0, 0x00},
	},
	{
		/*
		 * OLD HUNGARIAN CAPITAL LETTER AMB folds to
		 * OLD HUNGARIAN SMALL LETTER AMB
		 */
		.str = {0xf0, 0x90, 0xb2, 0x83, 0x00},
		.ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00},
	},
	/* Introduced by Unicode 9.0.0. */
	{
		/*
		 * OSAGE CAPITAL LETTER CHA folds to
		 * OSAGE SMALL LETTER CHA
		 */
		.str = {0xf0, 0x90, 0x92, 0xb5, 0x00},
		.ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00},
	},
	{
		/*
		 * LATIN CAPITAL LETTER SMALL CAPITAL I folds to
		 * LATIN LETTER SMALL CAPITAL I
		 */
		.str = {0xea, 0x9e, 0xae, 0x00},
		.ncf = {0xc9, 0xaa, 0x00},
	},
	/* Introduced by Unicode 11.0.0. */
	{
		/*
		 * GEORGIAN MTAVRULI CAPITAL LETTER AN (U+1C90) folds to
		 * GEORGIAN LETTER AN (U+10D0)
		 */
		.str = {0xe1, 0xb2, 0x90, 0x00},
		.ncf = {0xe1, 0x83, 0x90, 0x00},
	},
};
				162
				163	static void check_utf8_nfdi(void)
				164	{
				165	int i;
				166	struct utf8cursor u8c;
				167	const struct utf8data *data;
				168
				169	data = utf8nfdi(UNICODE_AGE(latest_maj, latest_min, latest_rev));
				170	if (!data) {
				171	pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
				172	__func__, latest_maj, latest_min, latest_rev);
				173	return;
				174	}
				175
				176	for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
				177	int len = strlen(nfdi_test_data[i].str);
				178	int nlen = strlen(nfdi_test_data[i].dec);
				179	int j = 0;
				180	unsigned char c;
				181
				182	test((utf8len(data, nfdi_test_data[i].str) == nlen));
				183	test((utf8nlen(data, nfdi_test_data[i].str, len) == nlen));
				184
				185	if (utf8cursor(&u8c, data, nfdi_test_data[i].str) < 0)
				186	pr_err("can't create cursor\n");
				187
				188	while ((c = utf8byte(&u8c)) > 0) {
				189	test_f((c == nfdi_test_data[i].dec[j]),
				190	"Unexpected byte 0x%x should be 0x%x\n",
				191	c, nfdi_test_data[i].dec[j]);
				192	j++;
				193	}
				194
				195	test((j == nlen));
				196	}
				197	}
				198
				199	static void check_utf8_nfdicf(void)
				200	{
				201	int i;
				202	struct utf8cursor u8c;
				203	const struct utf8data *data;
				204
				205	data = utf8nfdicf(UNICODE_AGE(latest_maj, latest_min, latest_rev));
				206	if (!data) {
				207	pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
				208	__func__, latest_maj, latest_min, latest_rev);
				209	return;
				210	}
				211
				212	for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
				213	int len = strlen(nfdicf_test_data[i].str);
				214	int nlen = strlen(nfdicf_test_data[i].ncf);
				215	int j = 0;
				216	unsigned char c;
				217
				218	test((utf8len(data, nfdicf_test_data[i].str) == nlen));
				219	test((utf8nlen(data, nfdicf_test_data[i].str, len) == nlen));
				220
				221	if (utf8cursor(&u8c, data, nfdicf_test_data[i].str) < 0)
				222	pr_err("can't create cursor\n");
				223
				224	while ((c = utf8byte(&u8c)) > 0) {
				225	test_f((c == nfdicf_test_data[i].ncf[j]),
				226	"Unexpected byte 0x%x should be 0x%x\n",
				227	c, nfdicf_test_data[i].ncf[j]);
				228	j++;
				229	}
				230
				231	test((j == nlen));
				232	}
				233	}
				234
				235	static void check_utf8_comparisons(void)
				236	{
				237	int i;
Gabriel Krisman Bertazi	1215d23	2019-04-25 13:59:17 -0400	[diff] [blame]	238	struct unicode_map *table = utf8_load("12.1.0");
Gabriel Krisman Bertazi	f0d6cc0	2019-04-25 13:56:01 -0400	[diff] [blame]	239
				240	if (IS_ERR(table)) {
				241	pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n",
				242	__func__, latest_maj, latest_min, latest_rev);
				243	return;
				244	}
				245
				246	for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
				247	const struct qstr s1 = {.name = nfdi_test_data[i].str,
				248	.len = sizeof(nfdi_test_data[i].str)};
				249	const struct qstr s2 = {.name = nfdi_test_data[i].dec,
				250	.len = sizeof(nfdi_test_data[i].dec)};
				251
				252	test_f(!utf8_strncmp(table, &s1, &s2),
				253	"%s %s comparison mismatch\n", s1.name, s2.name);
				254	}
				255
				256	for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
				257	const struct qstr s1 = {.name = nfdicf_test_data[i].str,
				258	.len = sizeof(nfdicf_test_data[i].str)};
				259	const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
				260	.len = sizeof(nfdicf_test_data[i].ncf)};
				261
				262	test_f(!utf8_strncasecmp(table, &s1, &s2),
				263	"%s %s comparison mismatch\n", s1.name, s2.name);
				264	}
				265
				266	utf8_unload(table);
				267	}
				268
				269	static void check_supported_versions(void)
				270	{
				271	/* Unicode 7.0.0 should be supported. */
				272	test(utf8version_is_supported(7, 0, 0));
				273
				274	/* Unicode 9.0.0 should be supported. */
				275	test(utf8version_is_supported(9, 0, 0));
				276
				277	/* Unicode 1x.0.0 (the latest version) should be supported. */
				278	test(utf8version_is_supported(latest_maj, latest_min, latest_rev));
				279
				280	/* Next versions don't exist. */
Gabriel Krisman Bertazi	1215d23	2019-04-25 13:59:17 -0400	[diff] [blame]	281	test(!utf8version_is_supported(13, 0, 0));
Gabriel Krisman Bertazi	f0d6cc0	2019-04-25 13:56:01 -0400	[diff] [blame]	282	test(!utf8version_is_supported(0, 0, 0));
				283	test(!utf8version_is_supported(-1, -1, -1));
				284	}
				285
				286	static int __init init_test_ucd(void)
				287	{
				288	failed_tests = 0;
				289	total_tests = 0;
				290
				291	check_supported_versions();
				292	check_utf8_nfdi();
				293	check_utf8_nfdicf();
				294	check_utf8_comparisons();
				295
				296	if (!failed_tests)
				297	pr_info("All %u tests passed\n", total_tests);
				298	else
				299	pr_err("%u out of %u tests failed\n", failed_tests,
				300	total_tests);
				301	return 0;
				302	}
				303
				304	static void __exit exit_test_ucd(void)
				305	{
				306	}
				307
				308	module_init(init_test_ucd);
				309	module_exit(exit_test_ucd);
				310
				311	MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
				312	MODULE_LICENSE("GPL");