Thomas Graf | 6408f79 | 2005-06-23 20:59:16 -0700 | [diff] [blame^] | 1 | /* |
| 2 | * lib/ts_fsm.c A naive finite state machine text search approach |
| 3 | * |
| 4 | * This program is free software; you can redistribute it and/or |
| 5 | * modify it under the terms of the GNU General Public License |
| 6 | * as published by the Free Software Foundation; either version |
| 7 | * 2 of the License, or (at your option) any later version. |
| 8 | * |
| 9 | * Authors: Thomas Graf <tgraf@suug.ch> |
| 10 | * |
| 11 | * ========================================================================== |
| 12 | * |
| 13 | * A finite state machine consists of n states (struct ts_fsm_token) |
| 14 | * representing the pattern as a finite automaton. The data is read |
| 15 | * sequentially on an octet basis. Every state token specifies the number |
| 16 | * of recurrences and the type of value accepted which can be either a |
| 17 | * specific character or ctype based set of characters. The available |
| 18 | * type of recurrences include 1, (0|1), [0 n], and [1 n]. |
| 19 | * |
| 20 | * The algorithm differs between strict/non-strict mode specifying |
| 21 | * whether the pattern has to start at the first octet. Strict mode |
| 22 | * is enabled by default and can be disabled by inserting |
| 23 | * TS_FSM_HEAD_IGNORE as the first token in the chain. |
| 24 | * |
| 25 | * The runtime performance of the algorithm should be around O(n), |
| 26 | * however while in strict mode the average runtime can be better. |
| 27 | */ |
| 28 | |
| 29 | #include <linux/config.h> |
| 30 | #include <linux/module.h> |
| 31 | #include <linux/types.h> |
| 32 | #include <linux/string.h> |
| 33 | #include <linux/ctype.h> |
| 34 | #include <linux/textsearch.h> |
| 35 | #include <linux/textsearch_fsm.h> |
| 36 | |
| 37 | struct ts_fsm |
| 38 | { |
| 39 | unsigned int ntokens; |
| 40 | struct ts_fsm_token tokens[0]; |
| 41 | }; |
| 42 | |
| 43 | /* other values derived from ctype.h */ |
| 44 | #define _A 0x100 /* ascii */ |
| 45 | #define _W 0x200 /* wildcard */ |
| 46 | |
| 47 | /* Map to _ctype flags and some magic numbers */ |
| 48 | static u16 token_map[TS_FSM_TYPE_MAX+1] = { |
| 49 | [TS_FSM_SPECIFIC] = 0, |
| 50 | [TS_FSM_WILDCARD] = _W, |
| 51 | [TS_FSM_CNTRL] = _C, |
| 52 | [TS_FSM_LOWER] = _L, |
| 53 | [TS_FSM_UPPER] = _U, |
| 54 | [TS_FSM_PUNCT] = _P, |
| 55 | [TS_FSM_SPACE] = _S, |
| 56 | [TS_FSM_DIGIT] = _D, |
| 57 | [TS_FSM_XDIGIT] = _D | _X, |
| 58 | [TS_FSM_ALPHA] = _U | _L, |
| 59 | [TS_FSM_ALNUM] = _U | _L | _D, |
| 60 | [TS_FSM_PRINT] = _P | _U | _L | _D | _SP, |
| 61 | [TS_FSM_GRAPH] = _P | _U | _L | _D, |
| 62 | [TS_FSM_ASCII] = _A, |
| 63 | }; |
| 64 | |
| 65 | static u16 token_lookup_tbl[256] = { |
| 66 | _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 0- 3 */ |
| 67 | _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 4- 7 */ |
| 68 | _W|_A|_C, _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C|_S, /* 8- 11 */ |
| 69 | _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C, _W|_A|_C, /* 12- 15 */ |
| 70 | _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 16- 19 */ |
| 71 | _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 20- 23 */ |
| 72 | _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 24- 27 */ |
| 73 | _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 28- 31 */ |
| 74 | _W|_A|_S|_SP, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 32- 35 */ |
| 75 | _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 36- 39 */ |
| 76 | _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 40- 43 */ |
| 77 | _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 44- 47 */ |
| 78 | _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 48- 51 */ |
| 79 | _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 52- 55 */ |
| 80 | _W|_A|_D, _W|_A|_D, _W|_A|_P, _W|_A|_P, /* 56- 59 */ |
| 81 | _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 60- 63 */ |
| 82 | _W|_A|_P, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, /* 64- 67 */ |
| 83 | _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U, /* 68- 71 */ |
| 84 | _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 72- 75 */ |
| 85 | _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 76- 79 */ |
| 86 | _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 80- 83 */ |
| 87 | _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 84- 87 */ |
| 88 | _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_P, /* 88- 91 */ |
| 89 | _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 92- 95 */ |
| 90 | _W|_A|_P, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, /* 96- 99 */ |
| 91 | _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L, /* 100-103 */ |
| 92 | _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 104-107 */ |
| 93 | _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 108-111 */ |
| 94 | _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 112-115 */ |
| 95 | _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 116-119 */ |
| 96 | _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_P, /* 120-123 */ |
| 97 | _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_C, /* 124-127 */ |
| 98 | _W, _W, _W, _W, /* 128-131 */ |
| 99 | _W, _W, _W, _W, /* 132-135 */ |
| 100 | _W, _W, _W, _W, /* 136-139 */ |
| 101 | _W, _W, _W, _W, /* 140-143 */ |
| 102 | _W, _W, _W, _W, /* 144-147 */ |
| 103 | _W, _W, _W, _W, /* 148-151 */ |
| 104 | _W, _W, _W, _W, /* 152-155 */ |
| 105 | _W, _W, _W, _W, /* 156-159 */ |
| 106 | _W|_S|_SP, _W|_P, _W|_P, _W|_P, /* 160-163 */ |
| 107 | _W|_P, _W|_P, _W|_P, _W|_P, /* 164-167 */ |
| 108 | _W|_P, _W|_P, _W|_P, _W|_P, /* 168-171 */ |
| 109 | _W|_P, _W|_P, _W|_P, _W|_P, /* 172-175 */ |
| 110 | _W|_P, _W|_P, _W|_P, _W|_P, /* 176-179 */ |
| 111 | _W|_P, _W|_P, _W|_P, _W|_P, /* 180-183 */ |
| 112 | _W|_P, _W|_P, _W|_P, _W|_P, /* 184-187 */ |
| 113 | _W|_P, _W|_P, _W|_P, _W|_P, /* 188-191 */ |
| 114 | _W|_U, _W|_U, _W|_U, _W|_U, /* 192-195 */ |
| 115 | _W|_U, _W|_U, _W|_U, _W|_U, /* 196-199 */ |
| 116 | _W|_U, _W|_U, _W|_U, _W|_U, /* 200-203 */ |
| 117 | _W|_U, _W|_U, _W|_U, _W|_U, /* 204-207 */ |
| 118 | _W|_U, _W|_U, _W|_U, _W|_U, /* 208-211 */ |
| 119 | _W|_U, _W|_U, _W|_U, _W|_P, /* 212-215 */ |
| 120 | _W|_U, _W|_U, _W|_U, _W|_U, /* 216-219 */ |
| 121 | _W|_U, _W|_U, _W|_U, _W|_L, /* 220-223 */ |
| 122 | _W|_L, _W|_L, _W|_L, _W|_L, /* 224-227 */ |
| 123 | _W|_L, _W|_L, _W|_L, _W|_L, /* 228-231 */ |
| 124 | _W|_L, _W|_L, _W|_L, _W|_L, /* 232-235 */ |
| 125 | _W|_L, _W|_L, _W|_L, _W|_L, /* 236-239 */ |
| 126 | _W|_L, _W|_L, _W|_L, _W|_L, /* 240-243 */ |
| 127 | _W|_L, _W|_L, _W|_L, _W|_P, /* 244-247 */ |
| 128 | _W|_L, _W|_L, _W|_L, _W|_L, /* 248-251 */ |
| 129 | _W|_L, _W|_L, _W|_L, _W|_L}; /* 252-255 */ |
| 130 | |
| 131 | static inline int match_token(struct ts_fsm_token *t, u8 d) |
| 132 | { |
| 133 | if (t->type) |
| 134 | return (token_lookup_tbl[d] & t->type) != 0; |
| 135 | else |
| 136 | return t->value == d; |
| 137 | } |
| 138 | |
| 139 | static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state) |
| 140 | { |
| 141 | struct ts_fsm *fsm = ts_config_priv(conf); |
| 142 | struct ts_fsm_token *cur = NULL, *next; |
| 143 | unsigned int match_start, block_idx = 0, tok_idx; |
| 144 | unsigned block_len = 0, strict, consumed = state->offset; |
| 145 | const u8 *data; |
| 146 | |
| 147 | #define GET_NEXT_BLOCK() \ |
| 148 | ({ consumed += block_idx; \ |
| 149 | block_idx = 0; \ |
| 150 | block_len = conf->get_next_block(consumed, &data, conf, state); }) |
| 151 | |
| 152 | #define TOKEN_MISMATCH() \ |
| 153 | do { \ |
| 154 | if (strict) \ |
| 155 | goto no_match; \ |
| 156 | block_idx++; \ |
| 157 | goto startover; \ |
| 158 | } while(0) |
| 159 | |
| 160 | #define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK()) |
| 161 | |
| 162 | if (end_of_data()) |
| 163 | goto no_match; |
| 164 | |
| 165 | strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE; |
| 166 | |
| 167 | startover: |
| 168 | match_start = consumed + block_idx; |
| 169 | |
| 170 | for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) { |
| 171 | cur = &fsm->tokens[tok_idx]; |
| 172 | |
| 173 | if (likely(tok_idx < (fsm->ntokens - 1))) |
| 174 | next = &fsm->tokens[tok_idx + 1]; |
| 175 | else |
| 176 | next = NULL; |
| 177 | |
| 178 | switch (cur->recur) { |
| 179 | case TS_FSM_SINGLE: |
| 180 | if (end_of_data()) |
| 181 | goto no_match; |
| 182 | |
| 183 | if (!match_token(cur, data[block_idx])) |
| 184 | TOKEN_MISMATCH(); |
| 185 | break; |
| 186 | |
| 187 | case TS_FSM_PERHAPS: |
| 188 | if (end_of_data() || |
| 189 | !match_token(cur, data[block_idx])) |
| 190 | continue; |
| 191 | break; |
| 192 | |
| 193 | case TS_FSM_MULTI: |
| 194 | if (end_of_data()) |
| 195 | goto no_match; |
| 196 | |
| 197 | if (!match_token(cur, data[block_idx])) |
| 198 | TOKEN_MISMATCH(); |
| 199 | |
| 200 | block_idx++; |
| 201 | /* fall through */ |
| 202 | |
| 203 | case TS_FSM_ANY: |
| 204 | if (next == NULL) |
| 205 | goto found_match; |
| 206 | |
| 207 | if (end_of_data()) |
| 208 | continue; |
| 209 | |
| 210 | while (!match_token(next, data[block_idx])) { |
| 211 | if (!match_token(cur, data[block_idx])) |
| 212 | TOKEN_MISMATCH(); |
| 213 | block_idx++; |
| 214 | if (end_of_data()) |
| 215 | goto no_match; |
| 216 | } |
| 217 | continue; |
| 218 | |
| 219 | /* |
| 220 | * Optimization: Prefer small local loop over jumping |
| 221 | * back and forth until garbage at head is munched. |
| 222 | */ |
| 223 | case TS_FSM_HEAD_IGNORE: |
| 224 | if (end_of_data()) |
| 225 | continue; |
| 226 | |
| 227 | while (!match_token(next, data[block_idx])) { |
| 228 | /* |
| 229 | * Special case, don't start over upon |
| 230 | * a mismatch, give the user the |
| 231 | * chance to specify the type of data |
| 232 | * allowed to be ignored. |
| 233 | */ |
| 234 | if (!match_token(cur, data[block_idx])) |
| 235 | goto no_match; |
| 236 | |
| 237 | block_idx++; |
| 238 | if (end_of_data()) |
| 239 | goto no_match; |
| 240 | } |
| 241 | |
| 242 | match_start = consumed + block_idx; |
| 243 | continue; |
| 244 | } |
| 245 | |
| 246 | block_idx++; |
| 247 | } |
| 248 | |
| 249 | if (end_of_data()) |
| 250 | goto found_match; |
| 251 | |
| 252 | no_match: |
| 253 | return UINT_MAX; |
| 254 | |
| 255 | found_match: |
| 256 | state->offset = consumed + block_idx; |
| 257 | return match_start; |
| 258 | } |
| 259 | |
| 260 | static struct ts_config *fsm_init(const void *pattern, unsigned int len, |
| 261 | int gfp_mask) |
| 262 | { |
| 263 | int i, err = -EINVAL; |
| 264 | struct ts_config *conf; |
| 265 | struct ts_fsm *fsm; |
| 266 | struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern; |
| 267 | unsigned int ntokens = len / sizeof(*tokens); |
| 268 | size_t priv_size = sizeof(*fsm) + len; |
| 269 | |
| 270 | if (len % sizeof(struct ts_fsm_token) || ntokens < 1) |
| 271 | goto errout; |
| 272 | |
| 273 | for (i = 0; i < ntokens; i++) { |
| 274 | struct ts_fsm_token *t = &tokens[i]; |
| 275 | |
| 276 | if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX) |
| 277 | goto errout; |
| 278 | |
| 279 | if (t->recur == TS_FSM_HEAD_IGNORE && |
| 280 | (i != 0 || i == (ntokens - 1))) |
| 281 | goto errout; |
| 282 | } |
| 283 | |
| 284 | conf = alloc_ts_config(priv_size, gfp_mask); |
| 285 | if (IS_ERR(conf)) |
| 286 | return conf; |
| 287 | |
| 288 | fsm = ts_config_priv(conf); |
| 289 | fsm->ntokens = ntokens; |
| 290 | memcpy(fsm->tokens, pattern, len); |
| 291 | |
| 292 | for (i = 0; i < fsm->ntokens; i++) { |
| 293 | struct ts_fsm_token *t = &fsm->tokens[i]; |
| 294 | t->type = token_map[t->type]; |
| 295 | } |
| 296 | |
| 297 | return conf; |
| 298 | |
| 299 | errout: |
| 300 | return ERR_PTR(err); |
| 301 | } |
| 302 | |
| 303 | static void *fsm_get_pattern(struct ts_config *conf) |
| 304 | { |
| 305 | struct ts_fsm *fsm = ts_config_priv(conf); |
| 306 | return fsm->tokens; |
| 307 | } |
| 308 | |
| 309 | static unsigned int fsm_get_pattern_len(struct ts_config *conf) |
| 310 | { |
| 311 | struct ts_fsm *fsm = ts_config_priv(conf); |
| 312 | return fsm->ntokens * sizeof(struct ts_fsm_token); |
| 313 | } |
| 314 | |
| 315 | static struct ts_ops fsm_ops = { |
| 316 | .name = "fsm", |
| 317 | .find = fsm_find, |
| 318 | .init = fsm_init, |
| 319 | .get_pattern = fsm_get_pattern, |
| 320 | .get_pattern_len = fsm_get_pattern_len, |
| 321 | .owner = THIS_MODULE, |
| 322 | .list = LIST_HEAD_INIT(fsm_ops.list) |
| 323 | }; |
| 324 | |
| 325 | static int __init init_fsm(void) |
| 326 | { |
| 327 | return textsearch_register(&fsm_ops); |
| 328 | } |
| 329 | |
| 330 | static void __exit exit_fsm(void) |
| 331 | { |
| 332 | textsearch_unregister(&fsm_ops); |
| 333 | } |
| 334 | |
| 335 | MODULE_LICENSE("GPL"); |
| 336 | |
| 337 | module_init(init_fsm); |
| 338 | module_exit(exit_fsm); |