blob: b63a9091dc39a633ad05ab6f59bde8eb0b00c0c1 [file] [log] [blame]
/*
 * Copyright (c) 2014 SGI.
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */
15
16#ifndef UTF8NORM_H
17#define UTF8NORM_H
18
19#include <linux/types.h>
20#include <linux/export.h>
21#include <linux/string.h>
22#include <linux/module.h>
23
/*
 * Encoding a unicode version number as a single unsigned int.
 *
 * UNICODE_AGE(MAJ, MIN, REV) places the major number at bit
 * UNICODE_MAJ_SHIFT, the minor number at bit UNICODE_MIN_SHIFT and the
 * revision in the low bits.  Each argument is cast to unsigned int
 * before shifting; no masking is applied, so callers are expected to
 * pass components that fit in their slot.
 */
#define UNICODE_MAJ_SHIFT		(16)
#define UNICODE_MIN_SHIFT		(8)

#define UNICODE_AGE(MAJ, MIN, REV)			\
	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
	 ((unsigned int)(REV)))
32
33/* Highest unicode version supported by the data tables. */
34extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
35
/*
 * Look for the correct const struct utf8data for a unicode version.
 * Returns NULL if the version requested is too new.
 *
 * Two normalization forms are supported: nfdi and nfdicf.
 *
 * nfdi:
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *
 * nfdicf:
 *  - Apply unicode normalization form NFD.
 *  - Remove any Default_Ignorable_Code_Point.
 *  - Apply a full casefold (C + F).
 */
extern const struct utf8data *utf8nfdi(unsigned int maxage);
extern const struct utf8data *utf8nfdicf(unsigned int maxage);
53
/*
 * Determine the maximum age of any unicode character in the string.
 * Returns 0 if only unassigned code points are present.
 * Returns -1 if the input is not valid UTF-8.
 *
 * utf8nagemax() takes an additional length argument, len (presumably
 * the number of bytes of s to examine -- confirm against the
 * definition).
 */
extern int utf8agemax(const struct utf8data *data, const char *s);
extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len);
61
/*
 * Determine the minimum age of any unicode character in the string.
 * Returns 0 if any unassigned code points are present.
 * Returns -1 if the input is not valid UTF-8.
 *
 * utf8nagemin() takes an additional length argument, len (presumably
 * the number of bytes of s to examine -- confirm against the
 * definition).
 */
extern int utf8agemin(const struct utf8data *data, const char *s);
extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len);
69
/*
 * Determine the length of the normalized form of the string,
 * excluding any terminating NUL byte.
 * Returns 0 if only ignorable code points are present.
 * Returns -1 if the input is not valid UTF-8.
 *
 * utf8nlen() takes an additional length argument, len (presumably the
 * number of bytes of s to examine -- confirm against the definition).
 */
extern ssize_t utf8len(const struct utf8data *data, const char *s);
extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);
78
/* Needed in struct utf8cursor below: size of its hangul[] array. */
#define UTF8HANGULLEAF	(12)

/*
 * Cursor structure used by the normalizer.
 *
 * NOTE(review): the per-field notes below are inferred from the field
 * names and types only; this header does not show how the normalizer
 * uses them -- confirm against the implementation.
 */
struct utf8cursor {
	/* presumably the tables handed to utf8cursor()/utf8ncursor() */
	const struct utf8data	*data;
	/* presumably the current position in the input string */
	const char	*s;
	/* presumably a position within the current decomposition */
	const char	*p;
	/* presumably a saved copy of s */
	const char	*ss;
	/* presumably a saved copy of p */
	const char	*sp;
	/* presumably the remaining input length */
	unsigned int	len;
	/* presumably a saved copy of len */
	unsigned int	slen;
	/* presumably the current canonical combining class */
	short int	ccc;
	/* presumably the next canonical combining class */
	short int	nccc;
	/* UTF8HANGULLEAF-byte buffer; presumably for hangul decomposition */
	unsigned char	hangul[UTF8HANGULLEAF];
};
97
/*
 * Initialize a utf8cursor to normalize a string.
 * Returns 0 on success.
 * Returns -1 on failure.
 *
 * u8c is the cursor to set up, data the normalization tables to use and
 * s the string to normalize.  utf8ncursor() takes an additional length
 * argument, len (presumably the number of bytes of s to normalize --
 * confirm against the definition).
 */
extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
		      const char *s);
extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
		       const char *s, size_t len);
107
/*
 * Get the next byte in the normalization.
 * Returns a value > 0 && < 256 on success.
 * Returns 0 when the end of the normalization is reached.
 * Returns -1 if the string being normalized is not valid UTF-8.
 *
 * u8c is a cursor previously set up with utf8cursor() or utf8ncursor().
 */
extern int utf8byte(struct utf8cursor *u8c);
115
116#endif /* UTF8NORM_H */