Thomas Gleixner | 9f80685 | 2019-05-29 07:18:08 -0700 | [diff] [blame] | 1 | // SPDX-License-Identifier: GPL-2.0-only |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 2 | /* |
| 3 | * Copyright (c) 2014 SGI. |
| 4 | * All rights reserved. |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 5 | */ |
| 6 | |
| 7 | #include "utf8n.h" |
| 8 | |
| 9 | struct utf8data { |
| 10 | unsigned int maxage; |
| 11 | unsigned int offset; |
| 12 | }; |
| 13 | |
| 14 | #define __INCLUDED_FROM_UTF8NORM_C__ |
| 15 | #include "utf8data.h" |
| 16 | #undef __INCLUDED_FROM_UTF8NORM_C__ |
| 17 | |
| 18 | int utf8version_is_supported(u8 maj, u8 min, u8 rev) |
| 19 | { |
| 20 | int i = ARRAY_SIZE(utf8agetab) - 1; |
| 21 | unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev); |
| 22 | |
| 23 | while (i >= 0 && utf8agetab[i] != 0) { |
| 24 | if (sb_utf8version == utf8agetab[i]) |
| 25 | return 1; |
| 26 | i--; |
| 27 | } |
| 28 | return 0; |
| 29 | } |
| 30 | EXPORT_SYMBOL(utf8version_is_supported); |
| 31 | |
Gabriel Krisman Bertazi | 9d53690 | 2019-04-25 13:51:22 -0400 | [diff] [blame] | 32 | int utf8version_latest(void) |
| 33 | { |
| 34 | return utf8vers; |
| 35 | } |
| 36 | EXPORT_SYMBOL(utf8version_latest); |
| 37 | |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 38 | /* |
| 39 | * UTF-8 valid ranges. |
| 40 | * |
| 41 | * The UTF-8 encoding spreads the bits of a 32bit word over several |
| 42 | * bytes. This table gives the ranges that can be held and how they'd |
| 43 | * be represented. |
| 44 | * |
| 45 | * 0x00000000 0x0000007F: 0xxxxxxx |
| 46 | * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx |
| 47 | * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx |
| 48 | * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 49 | * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 50 | * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 51 | * |
| 52 | * There is an additional requirement on UTF-8, in that only the |
| 53 | * shortest representation of a 32bit value is to be used. A decoder |
| 54 | * must not decode sequences that do not satisfy this requirement. |
| 55 | * Thus the allowed ranges have a lower bound. |
| 56 | * |
| 57 | * 0x00000000 0x0000007F: 0xxxxxxx |
| 58 | * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx |
| 59 | * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx |
| 60 | * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 61 | * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 62 | * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 63 | * |
| 64 | * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, |
| 65 | * 17 planes of 65536 values. This limits the sequences actually seen |
| 66 | * even more, to just the following. |
| 67 | * |
| 68 | * 0 - 0x7F: 0 - 0x7F |
| 69 | * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF |
| 70 | * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF |
| 71 | * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF |
| 72 | * |
| 73 | * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed. |
| 74 | * |
| 75 | * Note that the longest sequence seen with valid usage is 4 bytes, |
| 76 | * the same a single UTF-32 character. This makes the UTF-8 |
| 77 | * representation of Unicode strictly smaller than UTF-32. |
| 78 | * |
| 79 | * The shortest sequence requirement was introduced by: |
| 80 | * Corrigendum #1: UTF-8 Shortest Form |
| 81 | * It can be found here: |
| 82 | * http://www.unicode.org/versions/corrigendum1.html |
| 83 | * |
| 84 | */ |
| 85 | |
| 86 | /* |
| 87 | * Return the number of bytes used by the current UTF-8 sequence. |
| 88 | * Assumes the input points to the first byte of a valid UTF-8 |
| 89 | * sequence. |
| 90 | */ |
| 91 | static inline int utf8clen(const char *s) |
| 92 | { |
| 93 | unsigned char c = *s; |
| 94 | |
| 95 | return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); |
| 96 | } |
| 97 | |
| 98 | /* |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 99 | * Decode a 3-byte UTF-8 sequence. |
| 100 | */ |
| 101 | static unsigned int |
| 102 | utf8decode3(const char *str) |
| 103 | { |
| 104 | unsigned int uc; |
| 105 | |
| 106 | uc = *str++ & 0x0F; |
| 107 | uc <<= 6; |
| 108 | uc |= *str++ & 0x3F; |
| 109 | uc <<= 6; |
| 110 | uc |= *str++ & 0x3F; |
| 111 | |
| 112 | return uc; |
| 113 | } |
| 114 | |
| 115 | /* |
| 116 | * Encode a 3-byte UTF-8 sequence. |
| 117 | */ |
| 118 | static int |
| 119 | utf8encode3(char *str, unsigned int val) |
| 120 | { |
| 121 | str[2] = (val & 0x3F) | 0x80; |
| 122 | val >>= 6; |
| 123 | str[1] = (val & 0x3F) | 0x80; |
| 124 | val >>= 6; |
| 125 | str[0] = val | 0xE0; |
| 126 | |
| 127 | return 3; |
| 128 | } |
| 129 | |
| 130 | /* |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 131 | * utf8trie_t |
| 132 | * |
| 133 | * A compact binary tree, used to decode UTF-8 characters. |
| 134 | * |
| 135 | * Internal nodes are one byte for the node itself, and up to three |
| 136 | * bytes for an offset into the tree. The first byte contains the |
| 137 | * following information: |
| 138 | * NEXTBYTE - flag - advance to next byte if set |
| 139 | * BITNUM - 3 bit field - the bit number to tested |
| 140 | * OFFLEN - 2 bit field - number of bytes in the offset |
| 141 | * if offlen == 0 (non-branching node) |
| 142 | * RIGHTPATH - 1 bit field - set if the following node is for the |
| 143 | * right-hand path (tested bit is set) |
| 144 | * TRIENODE - 1 bit field - set if the following node is an internal |
| 145 | * node, otherwise it is a leaf node |
| 146 | * if offlen != 0 (branching node) |
| 147 | * LEFTNODE - 1 bit field - set if the left-hand node is internal |
| 148 | * RIGHTNODE - 1 bit field - set if the right-hand node is internal |
| 149 | * |
| 150 | * Due to the way utf8 works, there cannot be branching nodes with |
| 151 | * NEXTBYTE set, and moreover those nodes always have a righthand |
| 152 | * descendant. |
| 153 | */ |
| 154 | typedef const unsigned char utf8trie_t; |
| 155 | #define BITNUM 0x07 |
| 156 | #define NEXTBYTE 0x08 |
| 157 | #define OFFLEN 0x30 |
| 158 | #define OFFLEN_SHIFT 4 |
| 159 | #define RIGHTPATH 0x40 |
| 160 | #define TRIENODE 0x80 |
| 161 | #define RIGHTNODE 0x40 |
| 162 | #define LEFTNODE 0x80 |
| 163 | |
| 164 | /* |
| 165 | * utf8leaf_t |
| 166 | * |
| 167 | * The leaves of the trie are embedded in the trie, and so the same |
| 168 | * underlying datatype: unsigned char. |
| 169 | * |
| 170 | * leaf[0]: The unicode version, stored as a generation number that is |
| 171 | * an index into utf8agetab[]. With this we can filter code |
| 172 | * points based on the unicode version in which they were |
| 173 | * defined. The CCC of a non-defined code point is 0. |
| 174 | * leaf[1]: Canonical Combining Class. During normalization, we need |
| 175 | * to do a stable sort into ascending order of all characters |
| 176 | * with a non-zero CCC that occur between two characters with |
| 177 | * a CCC of 0, or at the begin or end of a string. |
| 178 | * The unicode standard guarantees that all CCC values are |
| 179 | * between 0 and 254 inclusive, which leaves 255 available as |
| 180 | * a special value. |
| 181 | * Code points with CCC 0 are known as stoppers. |
| 182 | * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the |
| 183 | * start of a NUL-terminated string that is the decomposition |
| 184 | * of the character. |
| 185 | * The CCC of a decomposable character is the same as the CCC |
| 186 | * of the first character of its decomposition. |
| 187 | * Some characters decompose as the empty string: these are |
| 188 | * characters with the Default_Ignorable_Code_Point property. |
| 189 | * These do affect normalization, as they all have CCC 0. |
| 190 | * |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 191 | * The decompositions in the trie have been fully expanded, with the |
| 192 | * exception of Hangul syllables, which are decomposed algorithmically. |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 193 | * |
| 194 | * Casefolding, if applicable, is also done using decompositions. |
| 195 | * |
| 196 | * The trie is constructed in such a way that leaves exist for all |
| 197 | * UTF-8 sequences that match the criteria from the "UTF-8 valid |
| 198 | * ranges" comment above, and only for those sequences. Therefore a |
| 199 | * lookup in the trie can be used to validate the UTF-8 input. |
| 200 | */ |
| 201 | typedef const unsigned char utf8leaf_t; |
| 202 | |
| 203 | #define LEAF_GEN(LEAF) ((LEAF)[0]) |
| 204 | #define LEAF_CCC(LEAF) ((LEAF)[1]) |
| 205 | #define LEAF_STR(LEAF) ((const char *)((LEAF) + 2)) |
| 206 | |
| 207 | #define MINCCC (0) |
| 208 | #define MAXCCC (254) |
| 209 | #define STOPPER (0) |
| 210 | #define DECOMPOSE (255) |
| 211 | |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 212 | /* Marker for hangul syllable decomposition. */ |
| 213 | #define HANGUL ((char)(255)) |
| 214 | /* Size of the synthesized leaf used for Hangul syllable decomposition. */ |
| 215 | #define UTF8HANGULLEAF (12) |
| 216 | |
| 217 | /* |
| 218 | * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) |
| 219 | * |
| 220 | * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; |
| 221 | * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; |
| 222 | * |
| 223 | * SBase = 0xAC00 |
| 224 | * LBase = 0x1100 |
| 225 | * VBase = 0x1161 |
| 226 | * TBase = 0x11A7 |
| 227 | * LCount = 19 |
| 228 | * VCount = 21 |
| 229 | * TCount = 28 |
| 230 | * NCount = 588 (VCount * TCount) |
| 231 | * SCount = 11172 (LCount * NCount) |
| 232 | * |
| 233 | * Decomposition: |
| 234 | * SIndex = s - SBase |
| 235 | * |
| 236 | * LV (Canonical/Full) |
| 237 | * LIndex = SIndex / NCount |
| 238 | * VIndex = (Sindex % NCount) / TCount |
| 239 | * LPart = LBase + LIndex |
| 240 | * VPart = VBase + VIndex |
| 241 | * |
| 242 | * LVT (Canonical) |
| 243 | * LVIndex = (SIndex / TCount) * TCount |
| 244 | * TIndex = (Sindex % TCount) |
| 245 | * LVPart = SBase + LVIndex |
| 246 | * TPart = TBase + TIndex |
| 247 | * |
| 248 | * LVT (Full) |
| 249 | * LIndex = SIndex / NCount |
| 250 | * VIndex = (Sindex % NCount) / TCount |
| 251 | * TIndex = (Sindex % TCount) |
| 252 | * LPart = LBase + LIndex |
| 253 | * VPart = VBase + VIndex |
| 254 | * if (TIndex == 0) { |
| 255 | * d = <LPart, VPart> |
| 256 | * } else { |
| 257 | * TPart = TBase + TIndex |
| 258 | * d = <LPart, TPart, VPart> |
| 259 | * } |
| 260 | */ |
| 261 | |
| 262 | /* Constants */ |
| 263 | #define SB (0xAC00) |
| 264 | #define LB (0x1100) |
| 265 | #define VB (0x1161) |
| 266 | #define TB (0x11A7) |
| 267 | #define LC (19) |
| 268 | #define VC (21) |
| 269 | #define TC (28) |
| 270 | #define NC (VC * TC) |
| 271 | #define SC (LC * NC) |
| 272 | |
| 273 | /* Algorithmic decomposition of hangul syllable. */ |
| 274 | static utf8leaf_t * |
| 275 | utf8hangul(const char *str, unsigned char *hangul) |
| 276 | { |
| 277 | unsigned int si; |
| 278 | unsigned int li; |
| 279 | unsigned int vi; |
| 280 | unsigned int ti; |
| 281 | unsigned char *h; |
| 282 | |
| 283 | /* Calculate the SI, LI, VI, and TI values. */ |
| 284 | si = utf8decode3(str) - SB; |
| 285 | li = si / NC; |
| 286 | vi = (si % NC) / TC; |
| 287 | ti = si % TC; |
| 288 | |
| 289 | /* Fill in base of leaf. */ |
| 290 | h = hangul; |
| 291 | LEAF_GEN(h) = 2; |
| 292 | LEAF_CCC(h) = DECOMPOSE; |
| 293 | h += 2; |
| 294 | |
| 295 | /* Add LPart, a 3-byte UTF-8 sequence. */ |
| 296 | h += utf8encode3((char *)h, li + LB); |
| 297 | |
| 298 | /* Add VPart, a 3-byte UTF-8 sequence. */ |
| 299 | h += utf8encode3((char *)h, vi + VB); |
| 300 | |
| 301 | /* Add TPart if required, also a 3-byte UTF-8 sequence. */ |
| 302 | if (ti) |
| 303 | h += utf8encode3((char *)h, ti + TB); |
| 304 | |
| 305 | /* Terminate string. */ |
| 306 | h[0] = '\0'; |
| 307 | |
| 308 | return hangul; |
| 309 | } |
| 310 | |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 311 | /* |
| 312 | * Use trie to scan s, touching at most len bytes. |
| 313 | * Returns the leaf if one exists, NULL otherwise. |
| 314 | * |
| 315 | * A non-NULL return guarantees that the UTF-8 sequence starting at s |
| 316 | * is well-formed and corresponds to a known unicode code point. The |
| 317 | * shorthand for this will be "is valid UTF-8 unicode". |
| 318 | */ |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 319 | static utf8leaf_t *utf8nlookup(const struct utf8data *data, |
| 320 | unsigned char *hangul, const char *s, size_t len) |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 321 | { |
| 322 | utf8trie_t *trie = NULL; |
| 323 | int offlen; |
| 324 | int offset; |
| 325 | int mask; |
| 326 | int node; |
| 327 | |
| 328 | if (!data) |
| 329 | return NULL; |
| 330 | if (len == 0) |
| 331 | return NULL; |
| 332 | |
| 333 | trie = utf8data + data->offset; |
| 334 | node = 1; |
| 335 | while (node) { |
| 336 | offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; |
| 337 | if (*trie & NEXTBYTE) { |
| 338 | if (--len == 0) |
| 339 | return NULL; |
| 340 | s++; |
| 341 | } |
| 342 | mask = 1 << (*trie & BITNUM); |
| 343 | if (*s & mask) { |
| 344 | /* Right leg */ |
| 345 | if (offlen) { |
| 346 | /* Right node at offset of trie */ |
| 347 | node = (*trie & RIGHTNODE); |
| 348 | offset = trie[offlen]; |
| 349 | while (--offlen) { |
| 350 | offset <<= 8; |
| 351 | offset |= trie[offlen]; |
| 352 | } |
| 353 | trie += offset; |
| 354 | } else if (*trie & RIGHTPATH) { |
| 355 | /* Right node after this node */ |
| 356 | node = (*trie & TRIENODE); |
| 357 | trie++; |
| 358 | } else { |
| 359 | /* No right node. */ |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 360 | return NULL; |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 361 | } |
| 362 | } else { |
| 363 | /* Left leg */ |
| 364 | if (offlen) { |
| 365 | /* Left node after this node. */ |
| 366 | node = (*trie & LEFTNODE); |
| 367 | trie += offlen + 1; |
| 368 | } else if (*trie & RIGHTPATH) { |
| 369 | /* No left node. */ |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 370 | return NULL; |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 371 | } else { |
| 372 | /* Left node after this node */ |
| 373 | node = (*trie & TRIENODE); |
| 374 | trie++; |
| 375 | } |
| 376 | } |
| 377 | } |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 378 | /* |
| 379 | * Hangul decomposition is done algorithmically. These are the |
| 380 | * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is |
| 381 | * always 3 bytes long, so s has been advanced twice, and the |
| 382 | * start of the sequence is at s-2. |
| 383 | */ |
| 384 | if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) |
| 385 | trie = utf8hangul(s - 2, hangul); |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 386 | return trie; |
| 387 | } |
| 388 | |
| 389 | /* |
| 390 | * Use trie to scan s. |
| 391 | * Returns the leaf if one exists, NULL otherwise. |
| 392 | * |
| 393 | * Forwards to utf8nlookup(). |
| 394 | */ |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 395 | static utf8leaf_t *utf8lookup(const struct utf8data *data, |
| 396 | unsigned char *hangul, const char *s) |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 397 | { |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 398 | return utf8nlookup(data, hangul, s, (size_t)-1); |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 399 | } |
| 400 | |
| 401 | /* |
| 402 | * Maximum age of any character in s. |
| 403 | * Return -1 if s is not valid UTF-8 unicode. |
| 404 | * Return 0 if only non-assigned code points are used. |
| 405 | */ |
| 406 | int utf8agemax(const struct utf8data *data, const char *s) |
| 407 | { |
| 408 | utf8leaf_t *leaf; |
| 409 | int age = 0; |
| 410 | int leaf_age; |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 411 | unsigned char hangul[UTF8HANGULLEAF]; |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 412 | |
| 413 | if (!data) |
| 414 | return -1; |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 415 | |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 416 | while (*s) { |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 417 | leaf = utf8lookup(data, hangul, s); |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 418 | if (!leaf) |
| 419 | return -1; |
| 420 | |
| 421 | leaf_age = utf8agetab[LEAF_GEN(leaf)]; |
| 422 | if (leaf_age <= data->maxage && leaf_age > age) |
| 423 | age = leaf_age; |
| 424 | s += utf8clen(s); |
| 425 | } |
| 426 | return age; |
| 427 | } |
| 428 | EXPORT_SYMBOL(utf8agemax); |
| 429 | |
| 430 | /* |
| 431 | * Minimum age of any character in s. |
| 432 | * Return -1 if s is not valid UTF-8 unicode. |
| 433 | * Return 0 if non-assigned code points are used. |
| 434 | */ |
| 435 | int utf8agemin(const struct utf8data *data, const char *s) |
| 436 | { |
| 437 | utf8leaf_t *leaf; |
| 438 | int age; |
| 439 | int leaf_age; |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 440 | unsigned char hangul[UTF8HANGULLEAF]; |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 441 | |
| 442 | if (!data) |
| 443 | return -1; |
| 444 | age = data->maxage; |
| 445 | while (*s) { |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 446 | leaf = utf8lookup(data, hangul, s); |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 447 | if (!leaf) |
| 448 | return -1; |
| 449 | leaf_age = utf8agetab[LEAF_GEN(leaf)]; |
| 450 | if (leaf_age <= data->maxage && leaf_age < age) |
| 451 | age = leaf_age; |
| 452 | s += utf8clen(s); |
| 453 | } |
| 454 | return age; |
| 455 | } |
| 456 | EXPORT_SYMBOL(utf8agemin); |
| 457 | |
| 458 | /* |
| 459 | * Maximum age of any character in s, touch at most len bytes. |
| 460 | * Return -1 if s is not valid UTF-8 unicode. |
| 461 | */ |
| 462 | int utf8nagemax(const struct utf8data *data, const char *s, size_t len) |
| 463 | { |
| 464 | utf8leaf_t *leaf; |
| 465 | int age = 0; |
| 466 | int leaf_age; |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 467 | unsigned char hangul[UTF8HANGULLEAF]; |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 468 | |
| 469 | if (!data) |
| 470 | return -1; |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 471 | |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 472 | while (len && *s) { |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 473 | leaf = utf8nlookup(data, hangul, s, len); |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 474 | if (!leaf) |
| 475 | return -1; |
| 476 | leaf_age = utf8agetab[LEAF_GEN(leaf)]; |
| 477 | if (leaf_age <= data->maxage && leaf_age > age) |
| 478 | age = leaf_age; |
| 479 | len -= utf8clen(s); |
| 480 | s += utf8clen(s); |
| 481 | } |
| 482 | return age; |
| 483 | } |
| 484 | EXPORT_SYMBOL(utf8nagemax); |
| 485 | |
| 486 | /* |
| 487 | * Maximum age of any character in s, touch at most len bytes. |
| 488 | * Return -1 if s is not valid UTF-8 unicode. |
| 489 | */ |
| 490 | int utf8nagemin(const struct utf8data *data, const char *s, size_t len) |
| 491 | { |
| 492 | utf8leaf_t *leaf; |
| 493 | int leaf_age; |
| 494 | int age; |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 495 | unsigned char hangul[UTF8HANGULLEAF]; |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 496 | |
| 497 | if (!data) |
| 498 | return -1; |
| 499 | age = data->maxage; |
| 500 | while (len && *s) { |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 501 | leaf = utf8nlookup(data, hangul, s, len); |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 502 | if (!leaf) |
| 503 | return -1; |
| 504 | leaf_age = utf8agetab[LEAF_GEN(leaf)]; |
| 505 | if (leaf_age <= data->maxage && leaf_age < age) |
| 506 | age = leaf_age; |
| 507 | len -= utf8clen(s); |
| 508 | s += utf8clen(s); |
| 509 | } |
| 510 | return age; |
| 511 | } |
| 512 | EXPORT_SYMBOL(utf8nagemin); |
| 513 | |
| 514 | /* |
| 515 | * Length of the normalization of s. |
| 516 | * Return -1 if s is not valid UTF-8 unicode. |
| 517 | * |
| 518 | * A string of Default_Ignorable_Code_Point has length 0. |
| 519 | */ |
| 520 | ssize_t utf8len(const struct utf8data *data, const char *s) |
| 521 | { |
| 522 | utf8leaf_t *leaf; |
| 523 | size_t ret = 0; |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 524 | unsigned char hangul[UTF8HANGULLEAF]; |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 525 | |
| 526 | if (!data) |
| 527 | return -1; |
| 528 | while (*s) { |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 529 | leaf = utf8lookup(data, hangul, s); |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 530 | if (!leaf) |
| 531 | return -1; |
| 532 | if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) |
| 533 | ret += utf8clen(s); |
| 534 | else if (LEAF_CCC(leaf) == DECOMPOSE) |
| 535 | ret += strlen(LEAF_STR(leaf)); |
| 536 | else |
| 537 | ret += utf8clen(s); |
| 538 | s += utf8clen(s); |
| 539 | } |
| 540 | return ret; |
| 541 | } |
| 542 | EXPORT_SYMBOL(utf8len); |
| 543 | |
| 544 | /* |
| 545 | * Length of the normalization of s, touch at most len bytes. |
| 546 | * Return -1 if s is not valid UTF-8 unicode. |
| 547 | */ |
| 548 | ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len) |
| 549 | { |
| 550 | utf8leaf_t *leaf; |
| 551 | size_t ret = 0; |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 552 | unsigned char hangul[UTF8HANGULLEAF]; |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 553 | |
| 554 | if (!data) |
| 555 | return -1; |
| 556 | while (len && *s) { |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 557 | leaf = utf8nlookup(data, hangul, s, len); |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 558 | if (!leaf) |
| 559 | return -1; |
| 560 | if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) |
| 561 | ret += utf8clen(s); |
| 562 | else if (LEAF_CCC(leaf) == DECOMPOSE) |
| 563 | ret += strlen(LEAF_STR(leaf)); |
| 564 | else |
| 565 | ret += utf8clen(s); |
| 566 | len -= utf8clen(s); |
| 567 | s += utf8clen(s); |
| 568 | } |
| 569 | return ret; |
| 570 | } |
| 571 | EXPORT_SYMBOL(utf8nlen); |
| 572 | |
| 573 | /* |
| 574 | * Set up an utf8cursor for use by utf8byte(). |
| 575 | * |
| 576 | * u8c : pointer to cursor. |
| 577 | * data : const struct utf8data to use for normalization. |
| 578 | * s : string. |
| 579 | * len : length of s. |
| 580 | * |
| 581 | * Returns -1 on error, 0 on success. |
| 582 | */ |
| 583 | int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, |
| 584 | const char *s, size_t len) |
| 585 | { |
| 586 | if (!data) |
| 587 | return -1; |
| 588 | if (!s) |
| 589 | return -1; |
| 590 | u8c->data = data; |
| 591 | u8c->s = s; |
| 592 | u8c->p = NULL; |
| 593 | u8c->ss = NULL; |
| 594 | u8c->sp = NULL; |
| 595 | u8c->len = len; |
| 596 | u8c->slen = 0; |
| 597 | u8c->ccc = STOPPER; |
| 598 | u8c->nccc = STOPPER; |
| 599 | /* Check we didn't clobber the maximum length. */ |
| 600 | if (u8c->len != len) |
| 601 | return -1; |
| 602 | /* The first byte of s may not be an utf8 continuation. */ |
| 603 | if (len > 0 && (*s & 0xC0) == 0x80) |
| 604 | return -1; |
| 605 | return 0; |
| 606 | } |
| 607 | EXPORT_SYMBOL(utf8ncursor); |
| 608 | |
| 609 | /* |
| 610 | * Set up an utf8cursor for use by utf8byte(). |
| 611 | * |
| 612 | * u8c : pointer to cursor. |
| 613 | * data : const struct utf8data to use for normalization. |
| 614 | * s : NUL-terminated string. |
| 615 | * |
| 616 | * Returns -1 on error, 0 on success. |
| 617 | */ |
| 618 | int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, |
| 619 | const char *s) |
| 620 | { |
| 621 | return utf8ncursor(u8c, data, s, (unsigned int)-1); |
| 622 | } |
| 623 | EXPORT_SYMBOL(utf8cursor); |
| 624 | |
| 625 | /* |
| 626 | * Get one byte from the normalized form of the string described by u8c. |
| 627 | * |
| 628 | * Returns the byte cast to an unsigned char on succes, and -1 on failure. |
| 629 | * |
| 630 | * The cursor keeps track of the location in the string in u8c->s. |
| 631 | * When a character is decomposed, the current location is stored in |
| 632 | * u8c->p, and u8c->s is set to the start of the decomposition. Note |
| 633 | * that bytes from a decomposition do not count against u8c->len. |
| 634 | * |
| 635 | * Characters are emitted if they match the current CCC in u8c->ccc. |
| 636 | * Hitting end-of-string while u8c->ccc == STOPPER means we're done, |
| 637 | * and the function returns 0 in that case. |
| 638 | * |
| 639 | * Sorting by CCC is done by repeatedly scanning the string. The |
| 640 | * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at |
| 641 | * the start of the scan. The first pass finds the lowest CCC to be |
| 642 | * emitted and stores it in u8c->nccc, the second pass emits the |
| 643 | * characters with this CCC and finds the next lowest CCC. This limits |
| 644 | * the number of passes to 1 + the number of different CCCs in the |
| 645 | * sequence being scanned. |
| 646 | * |
| 647 | * Therefore: |
| 648 | * u8c->p != NULL -> a decomposition is being scanned. |
| 649 | * u8c->ss != NULL -> this is a repeating scan. |
| 650 | * u8c->ccc == -1 -> this is the first scan of a repeating scan. |
| 651 | */ |
| 652 | int utf8byte(struct utf8cursor *u8c) |
| 653 | { |
| 654 | utf8leaf_t *leaf; |
| 655 | int ccc; |
| 656 | |
| 657 | for (;;) { |
| 658 | /* Check for the end of a decomposed character. */ |
| 659 | if (u8c->p && *u8c->s == '\0') { |
| 660 | u8c->s = u8c->p; |
| 661 | u8c->p = NULL; |
| 662 | } |
| 663 | |
| 664 | /* Check for end-of-string. */ |
| 665 | if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { |
| 666 | /* There is no next byte. */ |
| 667 | if (u8c->ccc == STOPPER) |
| 668 | return 0; |
| 669 | /* End-of-string during a scan counts as a stopper. */ |
| 670 | ccc = STOPPER; |
| 671 | goto ccc_mismatch; |
| 672 | } else if ((*u8c->s & 0xC0) == 0x80) { |
| 673 | /* This is a continuation of the current character. */ |
| 674 | if (!u8c->p) |
| 675 | u8c->len--; |
| 676 | return (unsigned char)*u8c->s++; |
| 677 | } |
| 678 | |
| 679 | /* Look up the data for the current character. */ |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 680 | if (u8c->p) { |
| 681 | leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); |
| 682 | } else { |
| 683 | leaf = utf8nlookup(u8c->data, u8c->hangul, |
| 684 | u8c->s, u8c->len); |
| 685 | } |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 686 | |
| 687 | /* No leaf found implies that the input is a binary blob. */ |
| 688 | if (!leaf) |
| 689 | return -1; |
| 690 | |
| 691 | ccc = LEAF_CCC(leaf); |
| 692 | /* Characters that are too new have CCC 0. */ |
| 693 | if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) { |
| 694 | ccc = STOPPER; |
| 695 | } else if (ccc == DECOMPOSE) { |
| 696 | u8c->len -= utf8clen(u8c->s); |
| 697 | u8c->p = u8c->s + utf8clen(u8c->s); |
| 698 | u8c->s = LEAF_STR(leaf); |
| 699 | /* Empty decomposition implies CCC 0. */ |
| 700 | if (*u8c->s == '\0') { |
| 701 | if (u8c->ccc == STOPPER) |
| 702 | continue; |
| 703 | ccc = STOPPER; |
| 704 | goto ccc_mismatch; |
| 705 | } |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 706 | |
| 707 | leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); |
Theodore Ts'o | 15f0d8d | 2019-05-12 04:56:51 -0400 | [diff] [blame] | 708 | if (!leaf) |
| 709 | return -1; |
Olaf Weber | a8384c6 | 2019-04-25 13:49:18 -0400 | [diff] [blame] | 710 | ccc = LEAF_CCC(leaf); |
Olaf Weber | 44594c2 | 2019-04-25 13:45:46 -0400 | [diff] [blame] | 711 | } |
| 712 | |
| 713 | /* |
| 714 | * If this is not a stopper, then see if it updates |
| 715 | * the next canonical class to be emitted. |
| 716 | */ |
| 717 | if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) |
| 718 | u8c->nccc = ccc; |
| 719 | |
| 720 | /* |
| 721 | * Return the current byte if this is the current |
| 722 | * combining class. |
| 723 | */ |
| 724 | if (ccc == u8c->ccc) { |
| 725 | if (!u8c->p) |
| 726 | u8c->len--; |
| 727 | return (unsigned char)*u8c->s++; |
| 728 | } |
| 729 | |
| 730 | /* Current combining class mismatch. */ |
| 731 | ccc_mismatch: |
| 732 | if (u8c->nccc == STOPPER) { |
| 733 | /* |
| 734 | * Scan forward for the first canonical class |
| 735 | * to be emitted. Save the position from |
| 736 | * which to restart. |
| 737 | */ |
| 738 | u8c->ccc = MINCCC - 1; |
| 739 | u8c->nccc = ccc; |
| 740 | u8c->sp = u8c->p; |
| 741 | u8c->ss = u8c->s; |
| 742 | u8c->slen = u8c->len; |
| 743 | if (!u8c->p) |
| 744 | u8c->len -= utf8clen(u8c->s); |
| 745 | u8c->s += utf8clen(u8c->s); |
| 746 | } else if (ccc != STOPPER) { |
| 747 | /* Not a stopper, and not the ccc we're emitting. */ |
| 748 | if (!u8c->p) |
| 749 | u8c->len -= utf8clen(u8c->s); |
| 750 | u8c->s += utf8clen(u8c->s); |
| 751 | } else if (u8c->nccc != MAXCCC + 1) { |
| 752 | /* At a stopper, restart for next ccc. */ |
| 753 | u8c->ccc = u8c->nccc; |
| 754 | u8c->nccc = MAXCCC + 1; |
| 755 | u8c->s = u8c->ss; |
| 756 | u8c->p = u8c->sp; |
| 757 | u8c->len = u8c->slen; |
| 758 | } else { |
| 759 | /* All done, proceed from here. */ |
| 760 | u8c->ccc = STOPPER; |
| 761 | u8c->nccc = STOPPER; |
| 762 | u8c->sp = NULL; |
| 763 | u8c->ss = NULL; |
| 764 | u8c->slen = 0; |
| 765 | } |
| 766 | } |
| 767 | } |
| 768 | EXPORT_SYMBOL(utf8byte); |
| 769 | |
| 770 | const struct utf8data *utf8nfdi(unsigned int maxage) |
| 771 | { |
| 772 | int i = ARRAY_SIZE(utf8nfdidata) - 1; |
| 773 | |
| 774 | while (maxage < utf8nfdidata[i].maxage) |
| 775 | i--; |
| 776 | if (maxage > utf8nfdidata[i].maxage) |
| 777 | return NULL; |
| 778 | return &utf8nfdidata[i]; |
| 779 | } |
| 780 | EXPORT_SYMBOL(utf8nfdi); |
| 781 | |
| 782 | const struct utf8data *utf8nfdicf(unsigned int maxage) |
| 783 | { |
| 784 | int i = ARRAY_SIZE(utf8nfdicfdata) - 1; |
| 785 | |
| 786 | while (maxage < utf8nfdicfdata[i].maxage) |
| 787 | i--; |
| 788 | if (maxage > utf8nfdicfdata[i].maxage) |
| 789 | return NULL; |
| 790 | return &utf8nfdicfdata[i]; |
| 791 | } |
| 792 | EXPORT_SYMBOL(utf8nfdicf); |