/*
 * Copyright (c) 2014 SGI.
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */
15
16#ifndef UTF8NORM_H
17#define UTF8NORM_H
18
19#include <linux/types.h>
20#include <linux/export.h>
21#include <linux/string.h>
22#include <linux/module.h>
23
/*
 * Encoding a unicode version number as a single unsigned int.
 *
 * The major number is shifted left by UNICODE_MAJ_SHIFT, the minor
 * number by UNICODE_MIN_SHIFT, and the revision occupies the low bits;
 * the three are then OR-ed together.  The fields are not masked, so MIN
 * and REV must each fit in 8 bits to avoid spilling into the next field.
 *
 * Example: UNICODE_AGE(12, 1, 0) == 0x0c0100.
 */
#define UNICODE_MAJ_SHIFT		(16)
#define UNICODE_MIN_SHIFT		(8)

#define UNICODE_AGE(MAJ, MIN, REV)			\
	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
	 ((unsigned int)(REV)))
32
33/* Highest unicode version supported by the data tables. */
34extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
Gabriel Krisman Bertazi9d536902019-04-25 13:51:22 -040035extern int utf8version_latest(void);
Olaf Weber44594c22019-04-25 13:45:46 -040036
/*
 * Look for the correct const struct utf8data for a unicode version.
 * Returns NULL if the version requested is too new.
 *
 * NOTE(review): maxage is presumably a version number encoded with
 * UNICODE_AGE() -- confirm against the callers.
 *
 * Two normalization forms are supported: nfdi and nfdicf.
 *
 * nfdi:
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *
 * nfdicf:
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *  - Apply a full casefold (C + F).
 */
extern const struct utf8data *utf8nfdi(unsigned int maxage);
extern const struct utf8data *utf8nfdicf(unsigned int maxage);
54
/*
 * Determine the maximum age of any unicode character in the string.
 * Returns 0 if only unassigned code points are present.
 * Returns -1 if the input is not valid UTF-8.
 *
 * utf8nagemax() takes an additional length argument; presumably it
 * limits the scan to the first len bytes of s -- confirm against the
 * implementation.
 */
extern int utf8agemax(const struct utf8data *data, const char *s);
extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len);
62
/*
 * Determine the minimum age of any unicode character in the string.
 * Returns 0 if any unassigned code points are present.
 * Returns -1 if the input is not valid UTF-8.
 *
 * utf8nagemin() takes an additional length argument; presumably it
 * limits the scan to the first len bytes of s -- confirm against the
 * implementation.
 */
extern int utf8agemin(const struct utf8data *data, const char *s);
extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len);
70
/*
 * Determine the length of the normalized form of the string,
 * excluding any terminating NUL byte.
 * Returns 0 if only ignorable code points are present.
 * Returns -1 if the input is not valid UTF-8.
 *
 * utf8nlen() takes an additional length argument; presumably it limits
 * the scan to the first len bytes of s -- confirm against the
 * implementation.
 */
extern ssize_t utf8len(const struct utf8data *data, const char *s);
extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);
79
/* Needed in struct utf8cursor below: size in bytes of the hangul[] buffer. */
#define UTF8HANGULLEAF	(12)

/*
 * Cursor structure used by the normalizer.
 *
 * A cursor is set up by utf8cursor()/utf8ncursor() and then passed to
 * utf8byte() repeatedly; callers are not expected to touch the fields.
 *
 * NOTE(review): the per-field descriptions below are inferred from the
 * field names and types only -- confirm against the normalizer
 * implementation before relying on them.
 */
struct utf8cursor {
	const struct utf8data	*data;	/* normalization data in use */
	const char	*s;		/* presumably: current input position */
	const char	*p;		/* presumably: position in pending output */
	const char	*ss;		/* presumably: saved copy of s */
	const char	*sp;		/* presumably: saved copy of p */
	unsigned int	len;		/* presumably: input bytes remaining */
	unsigned int	slen;		/* presumably: saved copy of len */
	short int	ccc;		/* presumably: canonical combining class */
	short int	nccc;		/* presumably: next combining class */
	unsigned char	hangul[UTF8HANGULLEAF];	/* scratch buffer */
};
98
/*
 * Initialize a utf8cursor to normalize a string.
 * Returns 0 on success.
 * Returns -1 on failure.
 *
 * u8c is the cursor to initialize, data the normalization data to use,
 * and s the string to normalize.  utf8ncursor() takes an additional
 * length argument; presumably it limits normalization to the first len
 * bytes of s -- confirm against the implementation.
 */
extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
		      const char *s);
extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
		       const char *s, size_t len);
108
/*
 * Get the next byte in the normalization.
 * Returns a value > 0 && < 256 on success.
 * Returns 0 when the end of the normalization is reached.
 * Returns -1 if the string being normalized is not valid UTF-8.
 *
 * u8c must have been initialized with utf8cursor() or utf8ncursor().
 */
extern int utf8byte(struct utf8cursor *u8c);
116
117#endif /* UTF8NORM_H */