Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* Generate assembler source containing symbol information |
| 2 | * |
| 3 | * Copyright 2002 by Kai Germaschewski |
| 4 | * |
| 5 | * This software may be used and distributed according to the terms |
| 6 | * of the GNU General Public License, incorporated herein by reference. |
| 7 | * |
| 8 | * Usage: nm -n vmlinux | scripts/kallsyms [--all-symbols] > symbols.S |
| 9 | * |
| 10 | * ChangeLog: |
| 11 | * |
| 12 | * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com> |
| 13 | * Changed the compression method from stem compression to "table lookup" |
| 14 | * compression |
| 15 | * |
| 16 | * Table compression uses all the unused char codes on the symbols and |
| 17 | * maps these to the most used substrings (tokens). For instance, it might |
| 18 | * map char code 0xF7 to represent "write_" and then in every symbol where |
| 19 | * "write_" appears it can be replaced by 0xF7, saving 5 bytes. |
| 20 | * The used codes themselves are also placed in the table so that the |
| 21 | * decompresion can work without "special cases". |
| 22 | * Applied to kernel symbols, this usually produces a compression ratio |
| 23 | * of about 50%. |
| 24 | * |
| 25 | */ |
| 26 | |
| 27 | #include <stdio.h> |
| 28 | #include <stdlib.h> |
| 29 | #include <string.h> |
| 30 | #include <ctype.h> |
| 31 | |
| 32 | /* maximum token length used. It doesn't pay to increase it a lot, because |
| 33 | * very long substrings probably don't repeat themselves too often. */ |
| 34 | #define MAX_TOK_SIZE 11 |
| 35 | #define KSYM_NAME_LEN 127 |
| 36 | |
| 37 | /* we use only a subset of the complete symbol table to gather the token count, |
| 38 | * to speed up compression, at the expense of a little compression ratio */ |
| 39 | #define WORKING_SET 1024 |
| 40 | |
| 41 | /* first find the best token only on the list of tokens that would profit more |
| 42 | * than GOOD_BAD_THRESHOLD. Only if this list is empty go to the "bad" list. |
| 43 | * Increasing this value will put less tokens on the "good" list, so the search |
| 44 | * is faster. However, if the good list runs out of tokens, we must painfully |
| 45 | * search the bad list. */ |
| 46 | #define GOOD_BAD_THRESHOLD 10 |
| 47 | |
| 48 | /* token hash parameters */ |
| 49 | #define HASH_BITS 18 |
| 50 | #define HASH_TABLE_SIZE (1 << HASH_BITS) |
| 51 | #define HASH_MASK (HASH_TABLE_SIZE - 1) |
| 52 | #define HASH_BASE_OFFSET 2166136261U |
| 53 | #define HASH_FOLD(a) ((a)&(HASH_MASK)) |
| 54 | |
| 55 | /* flags to mark symbols */ |
| 56 | #define SYM_FLAG_VALID 1 |
| 57 | #define SYM_FLAG_SAMPLED 2 |
| 58 | |
| 59 | struct sym_entry { |
| 60 | unsigned long long addr; |
| 61 | char type; |
| 62 | unsigned char flags; |
| 63 | unsigned char len; |
| 64 | unsigned char *sym; |
| 65 | }; |
| 66 | |
| 67 | |
| 68 | static struct sym_entry *table; |
| 69 | static int size, cnt; |
David Woodhouse | 075d6eb | 2005-05-05 16:15:09 -0700 | [diff] [blame] | 70 | static unsigned long long _stext, _etext, _sinittext, _einittext, _sextratext, _eextratext; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 71 | static int all_symbols = 0; |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 72 | static char symbol_prefix_char = '\0'; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 73 | |
| 74 | struct token { |
| 75 | unsigned char data[MAX_TOK_SIZE]; |
| 76 | unsigned char len; |
| 77 | /* profit: the number of bytes that could be saved by inserting this |
| 78 | * token into the table */ |
| 79 | int profit; |
| 80 | struct token *next; /* next token on the hash list */ |
| 81 | struct token *right; /* next token on the good/bad list */ |
| 82 | struct token *left; /* previous token on the good/bad list */ |
| 83 | struct token *smaller; /* token that is less one letter than this one */ |
| 84 | }; |
| 85 | |
| 86 | struct token bad_head, good_head; |
| 87 | struct token *hash_table[HASH_TABLE_SIZE]; |
| 88 | |
| 89 | /* the table that holds the result of the compression */ |
| 90 | unsigned char best_table[256][MAX_TOK_SIZE+1]; |
| 91 | unsigned char best_table_len[256]; |
| 92 | |
| 93 | |
| 94 | static void |
| 95 | usage(void) |
| 96 | { |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 97 | fprintf(stderr, "Usage: kallsyms [--all-symbols] [--symbol-prefix=<prefix char>] < in.map > out.S\n"); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 98 | exit(1); |
| 99 | } |
| 100 | |
| 101 | /* |
| 102 | * This ignores the intensely annoying "mapping symbols" found |
| 103 | * in ARM ELF files: $a, $t and $d. |
| 104 | */ |
| 105 | static inline int |
| 106 | is_arm_mapping_symbol(const char *str) |
| 107 | { |
| 108 | return str[0] == '$' && strchr("atd", str[1]) |
| 109 | && (str[2] == '\0' || str[2] == '.'); |
| 110 | } |
| 111 | |
| 112 | static int |
| 113 | read_symbol(FILE *in, struct sym_entry *s) |
| 114 | { |
| 115 | char str[500]; |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 116 | char *sym; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 117 | int rc; |
| 118 | |
| 119 | rc = fscanf(in, "%llx %c %499s\n", &s->addr, &s->type, str); |
| 120 | if (rc != 3) { |
| 121 | if (rc != EOF) { |
| 122 | /* skip line */ |
| 123 | fgets(str, 500, in); |
| 124 | } |
| 125 | return -1; |
| 126 | } |
| 127 | |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 128 | sym = str; |
| 129 | /* skip prefix char */ |
| 130 | if (symbol_prefix_char && str[0] == symbol_prefix_char) |
| 131 | sym++; |
| 132 | |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 133 | /* Ignore most absolute/undefined (?) symbols. */ |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 134 | if (strcmp(sym, "_stext") == 0) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 135 | _stext = s->addr; |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 136 | else if (strcmp(sym, "_etext") == 0) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 137 | _etext = s->addr; |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 138 | else if (strcmp(sym, "_sinittext") == 0) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 139 | _sinittext = s->addr; |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 140 | else if (strcmp(sym, "_einittext") == 0) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 141 | _einittext = s->addr; |
David Woodhouse | 075d6eb | 2005-05-05 16:15:09 -0700 | [diff] [blame] | 142 | else if (strcmp(sym, "_sextratext") == 0) |
| 143 | _sextratext = s->addr; |
| 144 | else if (strcmp(sym, "_eextratext") == 0) |
| 145 | _eextratext = s->addr; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 146 | else if (toupper(s->type) == 'A') |
| 147 | { |
| 148 | /* Keep these useful absolute symbols */ |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 149 | if (strcmp(sym, "__kernel_syscall_via_break") && |
| 150 | strcmp(sym, "__kernel_syscall_via_epc") && |
| 151 | strcmp(sym, "__kernel_sigtramp") && |
| 152 | strcmp(sym, "__gp")) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 153 | return -1; |
| 154 | |
| 155 | } |
| 156 | else if (toupper(s->type) == 'U' || |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 157 | is_arm_mapping_symbol(sym)) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 158 | return -1; |
| 159 | |
| 160 | /* include the type field in the symbol name, so that it gets |
| 161 | * compressed together */ |
| 162 | s->len = strlen(str) + 1; |
| 163 | s->sym = (char *) malloc(s->len + 1); |
| 164 | strcpy(s->sym + 1, str); |
| 165 | s->sym[0] = s->type; |
| 166 | |
| 167 | return 0; |
| 168 | } |
| 169 | |
| 170 | static int |
| 171 | symbol_valid(struct sym_entry *s) |
| 172 | { |
| 173 | /* Symbols which vary between passes. Passes 1 and 2 must have |
| 174 | * identical symbol lists. The kallsyms_* symbols below are only added |
| 175 | * after pass 1, they would be included in pass 2 when --all-symbols is |
| 176 | * specified so exclude them to get a stable symbol list. |
| 177 | */ |
| 178 | static char *special_symbols[] = { |
| 179 | "kallsyms_addresses", |
| 180 | "kallsyms_num_syms", |
| 181 | "kallsyms_names", |
| 182 | "kallsyms_markers", |
| 183 | "kallsyms_token_table", |
| 184 | "kallsyms_token_index", |
| 185 | |
| 186 | /* Exclude linker generated symbols which vary between passes */ |
| 187 | "_SDA_BASE_", /* ppc */ |
| 188 | "_SDA2_BASE_", /* ppc */ |
| 189 | NULL }; |
| 190 | int i; |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 191 | int offset = 1; |
| 192 | |
| 193 | /* skip prefix char */ |
| 194 | if (symbol_prefix_char && *(s->sym + 1) == symbol_prefix_char) |
| 195 | offset++; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 196 | |
| 197 | /* if --all-symbols is not specified, then symbols outside the text |
| 198 | * and inittext sections are discarded */ |
| 199 | if (!all_symbols) { |
| 200 | if ((s->addr < _stext || s->addr > _etext) |
David Woodhouse | 075d6eb | 2005-05-05 16:15:09 -0700 | [diff] [blame] | 201 | && (s->addr < _sinittext || s->addr > _einittext) |
| 202 | && (s->addr < _sextratext || s->addr > _eextratext)) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 203 | return 0; |
| 204 | /* Corner case. Discard any symbols with the same value as |
David Woodhouse | 075d6eb | 2005-05-05 16:15:09 -0700 | [diff] [blame] | 205 | * _etext _einittext or _eextratext; they can move between pass |
| 206 | * 1 and 2 when the kallsyms data are added. If these symbols |
| 207 | * move then they may get dropped in pass 2, which breaks the |
| 208 | * kallsyms rules. |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 209 | */ |
J.A. Magallon | 61d9cdf | 2005-07-15 22:14:43 +0000 | [diff] [blame] | 210 | if ((s->addr == _etext && strcmp((char*)s->sym + offset, "_etext")) || |
| 211 | (s->addr == _einittext && strcmp((char*)s->sym + offset, "_einittext")) || |
| 212 | (s->addr == _eextratext && strcmp((char*)s->sym + offset, "_eextratext"))) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 213 | return 0; |
| 214 | } |
| 215 | |
| 216 | /* Exclude symbols which vary between passes. */ |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 217 | if (strstr(s->sym + offset, "_compiled.")) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 218 | return 0; |
| 219 | |
| 220 | for (i = 0; special_symbols[i]; i++) |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 221 | if( strcmp(s->sym + offset, special_symbols[i]) == 0 ) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 222 | return 0; |
| 223 | |
| 224 | return 1; |
| 225 | } |
| 226 | |
| 227 | static void |
| 228 | read_map(FILE *in) |
| 229 | { |
| 230 | while (!feof(in)) { |
| 231 | if (cnt >= size) { |
| 232 | size += 10000; |
| 233 | table = realloc(table, sizeof(*table) * size); |
| 234 | if (!table) { |
| 235 | fprintf(stderr, "out of memory\n"); |
| 236 | exit (1); |
| 237 | } |
| 238 | } |
| 239 | if (read_symbol(in, &table[cnt]) == 0) |
| 240 | cnt++; |
| 241 | } |
| 242 | } |
| 243 | |
| 244 | static void output_label(char *label) |
| 245 | { |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 246 | if (symbol_prefix_char) |
| 247 | printf(".globl %c%s\n", symbol_prefix_char, label); |
| 248 | else |
| 249 | printf(".globl %s\n", label); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 250 | printf("\tALGN\n"); |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 251 | if (symbol_prefix_char) |
| 252 | printf("%c%s:\n", symbol_prefix_char, label); |
| 253 | else |
| 254 | printf("%s:\n", label); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 255 | } |
| 256 | |
| 257 | /* uncompress a compressed symbol. When this function is called, the best table |
| 258 | * might still be compressed itself, so the function needs to be recursive */ |
| 259 | static int expand_symbol(unsigned char *data, int len, char *result) |
| 260 | { |
| 261 | int c, rlen, total=0; |
| 262 | |
| 263 | while (len) { |
| 264 | c = *data; |
| 265 | /* if the table holds a single char that is the same as the one |
| 266 | * we are looking for, then end the search */ |
| 267 | if (best_table[c][0]==c && best_table_len[c]==1) { |
| 268 | *result++ = c; |
| 269 | total++; |
| 270 | } else { |
| 271 | /* if not, recurse and expand */ |
| 272 | rlen = expand_symbol(best_table[c], best_table_len[c], result); |
| 273 | total += rlen; |
| 274 | result += rlen; |
| 275 | } |
| 276 | data++; |
| 277 | len--; |
| 278 | } |
| 279 | *result=0; |
| 280 | |
| 281 | return total; |
| 282 | } |
| 283 | |
| 284 | static void |
| 285 | write_src(void) |
| 286 | { |
| 287 | int i, k, off, valid; |
| 288 | unsigned int best_idx[256]; |
| 289 | unsigned int *markers; |
| 290 | char buf[KSYM_NAME_LEN+1]; |
| 291 | |
| 292 | printf("#include <asm/types.h>\n"); |
| 293 | printf("#if BITS_PER_LONG == 64\n"); |
| 294 | printf("#define PTR .quad\n"); |
| 295 | printf("#define ALGN .align 8\n"); |
| 296 | printf("#else\n"); |
| 297 | printf("#define PTR .long\n"); |
| 298 | printf("#define ALGN .align 4\n"); |
| 299 | printf("#endif\n"); |
| 300 | |
| 301 | printf(".data\n"); |
| 302 | |
| 303 | output_label("kallsyms_addresses"); |
| 304 | valid = 0; |
| 305 | for (i = 0; i < cnt; i++) { |
| 306 | if (table[i].flags & SYM_FLAG_VALID) { |
| 307 | printf("\tPTR\t%#llx\n", table[i].addr); |
| 308 | valid++; |
| 309 | } |
| 310 | } |
| 311 | printf("\n"); |
| 312 | |
| 313 | output_label("kallsyms_num_syms"); |
| 314 | printf("\tPTR\t%d\n", valid); |
| 315 | printf("\n"); |
| 316 | |
| 317 | /* table of offset markers, that give the offset in the compressed stream |
| 318 | * every 256 symbols */ |
| 319 | markers = (unsigned int *) malloc(sizeof(unsigned int)*((valid + 255) / 256)); |
| 320 | |
| 321 | output_label("kallsyms_names"); |
| 322 | valid = 0; |
| 323 | off = 0; |
| 324 | for (i = 0; i < cnt; i++) { |
| 325 | |
| 326 | if (!table[i].flags & SYM_FLAG_VALID) |
| 327 | continue; |
| 328 | |
| 329 | if ((valid & 0xFF) == 0) |
| 330 | markers[valid >> 8] = off; |
| 331 | |
| 332 | printf("\t.byte 0x%02x", table[i].len); |
| 333 | for (k = 0; k < table[i].len; k++) |
| 334 | printf(", 0x%02x", table[i].sym[k]); |
| 335 | printf("\n"); |
| 336 | |
| 337 | off += table[i].len + 1; |
| 338 | valid++; |
| 339 | } |
| 340 | printf("\n"); |
| 341 | |
| 342 | output_label("kallsyms_markers"); |
| 343 | for (i = 0; i < ((valid + 255) >> 8); i++) |
| 344 | printf("\tPTR\t%d\n", markers[i]); |
| 345 | printf("\n"); |
| 346 | |
| 347 | free(markers); |
| 348 | |
| 349 | output_label("kallsyms_token_table"); |
| 350 | off = 0; |
| 351 | for (i = 0; i < 256; i++) { |
| 352 | best_idx[i] = off; |
| 353 | expand_symbol(best_table[i],best_table_len[i],buf); |
| 354 | printf("\t.asciz\t\"%s\"\n", buf); |
| 355 | off += strlen(buf) + 1; |
| 356 | } |
| 357 | printf("\n"); |
| 358 | |
| 359 | output_label("kallsyms_token_index"); |
| 360 | for (i = 0; i < 256; i++) |
| 361 | printf("\t.short\t%d\n", best_idx[i]); |
| 362 | printf("\n"); |
| 363 | } |
| 364 | |
| 365 | |
| 366 | /* table lookup compression functions */ |
| 367 | |
| 368 | static inline unsigned int rehash_token(unsigned int hash, unsigned char data) |
| 369 | { |
| 370 | return ((hash * 16777619) ^ data); |
| 371 | } |
| 372 | |
| 373 | static unsigned int hash_token(unsigned char *data, int len) |
| 374 | { |
| 375 | unsigned int hash=HASH_BASE_OFFSET; |
| 376 | int i; |
| 377 | |
| 378 | for (i = 0; i < len; i++) |
| 379 | hash = rehash_token(hash, data[i]); |
| 380 | |
| 381 | return HASH_FOLD(hash); |
| 382 | } |
| 383 | |
| 384 | /* find a token given its data and hash value */ |
| 385 | static struct token *find_token_hash(unsigned char *data, int len, unsigned int hash) |
| 386 | { |
| 387 | struct token *ptr; |
| 388 | |
| 389 | ptr = hash_table[hash]; |
| 390 | |
| 391 | while (ptr) { |
| 392 | if ((ptr->len == len) && (memcmp(ptr->data, data, len) == 0)) |
| 393 | return ptr; |
| 394 | ptr=ptr->next; |
| 395 | } |
| 396 | |
| 397 | return NULL; |
| 398 | } |
| 399 | |
| 400 | static inline void insert_token_in_group(struct token *head, struct token *ptr) |
| 401 | { |
| 402 | ptr->right = head->right; |
| 403 | ptr->right->left = ptr; |
| 404 | head->right = ptr; |
| 405 | ptr->left = head; |
| 406 | } |
| 407 | |
| 408 | static inline void remove_token_from_group(struct token *ptr) |
| 409 | { |
| 410 | ptr->left->right = ptr->right; |
| 411 | ptr->right->left = ptr->left; |
| 412 | } |
| 413 | |
| 414 | |
| 415 | /* build the counts for all the tokens that start with "data", and have lenghts |
| 416 | * from 2 to "len" */ |
| 417 | static void learn_token(unsigned char *data, int len) |
| 418 | { |
| 419 | struct token *ptr,*last_ptr; |
| 420 | int i, newprofit; |
| 421 | unsigned int hash = HASH_BASE_OFFSET; |
| 422 | unsigned int hashes[MAX_TOK_SIZE + 1]; |
| 423 | |
| 424 | if (len > MAX_TOK_SIZE) |
| 425 | len = MAX_TOK_SIZE; |
| 426 | |
| 427 | /* calculate and store the hash values for all the sub-tokens */ |
| 428 | hash = rehash_token(hash, data[0]); |
| 429 | for (i = 2; i <= len; i++) { |
| 430 | hash = rehash_token(hash, data[i-1]); |
| 431 | hashes[i] = HASH_FOLD(hash); |
| 432 | } |
| 433 | |
| 434 | last_ptr = NULL; |
| 435 | ptr = NULL; |
| 436 | |
| 437 | for (i = len; i >= 2; i--) { |
| 438 | hash = hashes[i]; |
| 439 | |
| 440 | if (!ptr) ptr = find_token_hash(data, i, hash); |
| 441 | |
| 442 | if (!ptr) { |
| 443 | /* create a new token entry */ |
| 444 | ptr = (struct token *) malloc(sizeof(*ptr)); |
| 445 | |
| 446 | memcpy(ptr->data, data, i); |
| 447 | ptr->len = i; |
| 448 | |
| 449 | /* when we create an entry, it's profit is 0 because |
| 450 | * we also take into account the size of the token on |
| 451 | * the compressed table. We then subtract GOOD_BAD_THRESHOLD |
| 452 | * so that the test to see if this token belongs to |
| 453 | * the good or bad list, is a comparison to zero */ |
| 454 | ptr->profit = -GOOD_BAD_THRESHOLD; |
| 455 | |
| 456 | ptr->next = hash_table[hash]; |
| 457 | hash_table[hash] = ptr; |
| 458 | |
| 459 | insert_token_in_group(&bad_head, ptr); |
| 460 | |
| 461 | ptr->smaller = NULL; |
| 462 | } else { |
| 463 | newprofit = ptr->profit + (ptr->len - 1); |
| 464 | /* check to see if this token needs to be moved to a |
| 465 | * different list */ |
| 466 | if((ptr->profit < 0) && (newprofit >= 0)) { |
| 467 | remove_token_from_group(ptr); |
| 468 | insert_token_in_group(&good_head,ptr); |
| 469 | } |
| 470 | ptr->profit = newprofit; |
| 471 | } |
| 472 | |
| 473 | if (last_ptr) last_ptr->smaller = ptr; |
| 474 | last_ptr = ptr; |
| 475 | |
| 476 | ptr = ptr->smaller; |
| 477 | } |
| 478 | } |
| 479 | |
| 480 | /* decrease the counts for all the tokens that start with "data", and have lenghts |
| 481 | * from 2 to "len". This function is much simpler than learn_token because we have |
| 482 | * more guarantees (tho tokens exist, the ->smaller pointer is set, etc.) |
| 483 | * The two separate functions exist only because of compression performance */ |
| 484 | static void forget_token(unsigned char *data, int len) |
| 485 | { |
| 486 | struct token *ptr; |
| 487 | int i, newprofit; |
| 488 | unsigned int hash=0; |
| 489 | |
| 490 | if (len > MAX_TOK_SIZE) len = MAX_TOK_SIZE; |
| 491 | |
| 492 | hash = hash_token(data, len); |
| 493 | ptr = find_token_hash(data, len, hash); |
| 494 | |
| 495 | for (i = len; i >= 2; i--) { |
| 496 | |
| 497 | newprofit = ptr->profit - (ptr->len - 1); |
| 498 | if ((ptr->profit >= 0) && (newprofit < 0)) { |
| 499 | remove_token_from_group(ptr); |
| 500 | insert_token_in_group(&bad_head, ptr); |
| 501 | } |
| 502 | ptr->profit=newprofit; |
| 503 | |
| 504 | ptr=ptr->smaller; |
| 505 | } |
| 506 | } |
| 507 | |
| 508 | /* count all the possible tokens in a symbol */ |
| 509 | static void learn_symbol(unsigned char *symbol, int len) |
| 510 | { |
| 511 | int i; |
| 512 | |
| 513 | for (i = 0; i < len - 1; i++) |
| 514 | learn_token(symbol + i, len - i); |
| 515 | } |
| 516 | |
| 517 | /* decrease the count for all the possible tokens in a symbol */ |
| 518 | static void forget_symbol(unsigned char *symbol, int len) |
| 519 | { |
| 520 | int i; |
| 521 | |
| 522 | for (i = 0; i < len - 1; i++) |
| 523 | forget_token(symbol + i, len - i); |
| 524 | } |
| 525 | |
| 526 | /* set all the symbol flags and do the initial token count */ |
| 527 | static void build_initial_tok_table(void) |
| 528 | { |
| 529 | int i, use_it, valid; |
| 530 | |
| 531 | valid = 0; |
| 532 | for (i = 0; i < cnt; i++) { |
| 533 | table[i].flags = 0; |
| 534 | if ( symbol_valid(&table[i]) ) { |
| 535 | table[i].flags |= SYM_FLAG_VALID; |
| 536 | valid++; |
| 537 | } |
| 538 | } |
| 539 | |
| 540 | use_it = 0; |
| 541 | for (i = 0; i < cnt; i++) { |
| 542 | |
| 543 | /* subsample the available symbols. This method is almost like |
| 544 | * a Bresenham's algorithm to get uniformly distributed samples |
| 545 | * across the symbol table */ |
| 546 | if (table[i].flags & SYM_FLAG_VALID) { |
| 547 | |
| 548 | use_it += WORKING_SET; |
| 549 | |
| 550 | if (use_it >= valid) { |
| 551 | table[i].flags |= SYM_FLAG_SAMPLED; |
| 552 | use_it -= valid; |
| 553 | } |
| 554 | } |
| 555 | if (table[i].flags & SYM_FLAG_SAMPLED) |
| 556 | learn_symbol(table[i].sym, table[i].len); |
| 557 | } |
| 558 | } |
| 559 | |
| 560 | /* replace a given token in all the valid symbols. Use the sampled symbols |
| 561 | * to update the counts */ |
| 562 | static void compress_symbols(unsigned char *str, int tlen, int idx) |
| 563 | { |
| 564 | int i, len, learn, size; |
| 565 | unsigned char *p; |
| 566 | |
| 567 | for (i = 0; i < cnt; i++) { |
| 568 | |
| 569 | if (!(table[i].flags & SYM_FLAG_VALID)) continue; |
| 570 | |
| 571 | len = table[i].len; |
| 572 | learn = 0; |
| 573 | p = table[i].sym; |
| 574 | |
| 575 | do { |
| 576 | /* find the token on the symbol */ |
| 577 | p = (unsigned char *) strstr((char *) p, (char *) str); |
| 578 | if (!p) break; |
| 579 | |
| 580 | if (!learn) { |
| 581 | /* if this symbol was used to count, decrease it */ |
| 582 | if (table[i].flags & SYM_FLAG_SAMPLED) |
| 583 | forget_symbol(table[i].sym, len); |
| 584 | learn = 1; |
| 585 | } |
| 586 | |
| 587 | *p = idx; |
| 588 | size = (len - (p - table[i].sym)) - tlen + 1; |
| 589 | memmove(p + 1, p + tlen, size); |
| 590 | p++; |
| 591 | len -= tlen - 1; |
| 592 | |
| 593 | } while (size >= tlen); |
| 594 | |
| 595 | if(learn) { |
| 596 | table[i].len = len; |
| 597 | /* if this symbol was used to count, learn it again */ |
| 598 | if(table[i].flags & SYM_FLAG_SAMPLED) |
| 599 | learn_symbol(table[i].sym, len); |
| 600 | } |
| 601 | } |
| 602 | } |
| 603 | |
| 604 | /* search the token with the maximum profit */ |
| 605 | static struct token *find_best_token(void) |
| 606 | { |
| 607 | struct token *ptr,*best,*head; |
| 608 | int bestprofit; |
| 609 | |
| 610 | bestprofit=-10000; |
| 611 | |
| 612 | /* failsafe: if the "good" list is empty search from the "bad" list */ |
| 613 | if(good_head.right == &good_head) head = &bad_head; |
| 614 | else head = &good_head; |
| 615 | |
| 616 | ptr = head->right; |
| 617 | best = NULL; |
| 618 | while (ptr != head) { |
| 619 | if (ptr->profit > bestprofit) { |
| 620 | bestprofit = ptr->profit; |
| 621 | best = ptr; |
| 622 | } |
| 623 | ptr = ptr->right; |
| 624 | } |
| 625 | |
| 626 | return best; |
| 627 | } |
| 628 | |
| 629 | /* this is the core of the algorithm: calculate the "best" table */ |
| 630 | static void optimize_result(void) |
| 631 | { |
| 632 | struct token *best; |
| 633 | int i; |
| 634 | |
| 635 | /* using the '\0' symbol last allows compress_symbols to use standard |
| 636 | * fast string functions */ |
| 637 | for (i = 255; i >= 0; i--) { |
| 638 | |
| 639 | /* if this table slot is empty (it is not used by an actual |
| 640 | * original char code */ |
| 641 | if (!best_table_len[i]) { |
| 642 | |
| 643 | /* find the token with the breates profit value */ |
| 644 | best = find_best_token(); |
| 645 | |
| 646 | /* place it in the "best" table */ |
| 647 | best_table_len[i] = best->len; |
| 648 | memcpy(best_table[i], best->data, best_table_len[i]); |
| 649 | /* zero terminate the token so that we can use strstr |
| 650 | in compress_symbols */ |
| 651 | best_table[i][best_table_len[i]]='\0'; |
| 652 | |
| 653 | /* replace this token in all the valid symbols */ |
| 654 | compress_symbols(best_table[i], best_table_len[i], i); |
| 655 | } |
| 656 | } |
| 657 | } |
| 658 | |
| 659 | /* start by placing the symbols that are actually used on the table */ |
| 660 | static void insert_real_symbols_in_table(void) |
| 661 | { |
| 662 | int i, j, c; |
| 663 | |
| 664 | memset(best_table, 0, sizeof(best_table)); |
| 665 | memset(best_table_len, 0, sizeof(best_table_len)); |
| 666 | |
| 667 | for (i = 0; i < cnt; i++) { |
| 668 | if (table[i].flags & SYM_FLAG_VALID) { |
| 669 | for (j = 0; j < table[i].len; j++) { |
| 670 | c = table[i].sym[j]; |
| 671 | best_table[c][0]=c; |
| 672 | best_table_len[c]=1; |
| 673 | } |
| 674 | } |
| 675 | } |
| 676 | } |
| 677 | |
| 678 | static void optimize_token_table(void) |
| 679 | { |
| 680 | memset(hash_table, 0, sizeof(hash_table)); |
| 681 | |
| 682 | good_head.left = &good_head; |
| 683 | good_head.right = &good_head; |
| 684 | |
| 685 | bad_head.left = &bad_head; |
| 686 | bad_head.right = &bad_head; |
| 687 | |
| 688 | build_initial_tok_table(); |
| 689 | |
| 690 | insert_real_symbols_in_table(); |
| 691 | |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 692 | /* When valid symbol is not registered, exit to error */ |
| 693 | if (good_head.left == good_head.right && |
| 694 | bad_head.left == bad_head.right) { |
| 695 | fprintf(stderr, "No valid symbol.\n"); |
| 696 | exit(1); |
| 697 | } |
| 698 | |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 699 | optimize_result(); |
| 700 | } |
| 701 | |
| 702 | |
| 703 | int |
| 704 | main(int argc, char **argv) |
| 705 | { |
Yoshinori Sato | 41f11a4 | 2005-05-01 08:59:06 -0700 | [diff] [blame] | 706 | if (argc >= 2) { |
| 707 | int i; |
| 708 | for (i = 1; i < argc; i++) { |
| 709 | if(strcmp(argv[i], "--all-symbols") == 0) |
| 710 | all_symbols = 1; |
| 711 | else if (strncmp(argv[i], "--symbol-prefix=", 16) == 0) { |
| 712 | char *p = &argv[i][16]; |
| 713 | /* skip quote */ |
| 714 | if ((*p == '"' && *(p+2) == '"') || (*p == '\'' && *(p+2) == '\'')) |
| 715 | p++; |
| 716 | symbol_prefix_char = *p; |
| 717 | } else |
| 718 | usage(); |
| 719 | } |
| 720 | } else if (argc != 1) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 721 | usage(); |
| 722 | |
| 723 | read_map(stdin); |
| 724 | optimize_token_table(); |
| 725 | write_src(); |
| 726 | |
| 727 | return 0; |
| 728 | } |