/*
 * kerneltop.c: show top kernel functions - performance counters showcase

   Build with:

     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt

   Sample output:

------------------------------------------------------------------------------
 KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
------------------------------------------------------------------------------

             weight         RIP          kernel function
             ______   ________________   _______________

              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
              33.00 - ffffffff804cb740 : sock_alloc_send_skb
              31.26 - ffffffff804ce808 : skb_push
              22.43 - ffffffff80510004 : tcp_established_options
              19.00 - ffffffff8027d250 : find_get_page
              15.76 - ffffffff804e4fc9 : eth_type_trans
              15.20 - ffffffff804d8baa : dst_release
              14.86 - ffffffff804cf5d8 : skb_release_head_state
              14.00 - ffffffff802217d5 : read_hpet
              12.00 - ffffffff804ffb7f : __ip_local_out
              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
               8.54 - ffffffff805001a3 : ip_queue_xmit
 */
30
/*
 * perfstat:  /usr/bin/time -alike performance counter statistics utility

          It summarizes the counter events of all tasks (and child tasks),
          covering all CPUs that the command (or workload) executes on.
          It only counts the per-task events of the workload started,
          independent of how many other tasks run on those CPUs.

   Sample output:

   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null

   Performance counter stats for 'ls':

           163516953 instructions
                2295 cache-misses
             2855182 branch-misses
 */
49
 /*
  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
  *
  * Improvements and fixes by:
  *
  *   Arjan van de Ven <arjan@linux.intel.com>
  *   Yanmin Zhang <yanmin.zhang@intel.com>
  *   Wu Fengguang <fengguang.wu@intel.com>
  *   Mike Galbraith <efault@gmx.de>
  *   Paul Mackerras <paulus@samba.org>
  *
  * Released under the GPL v2. (and only v2, not any later version)
  */
63
64#include "util.h"
65
66#define _GNU_SOURCE
67#include <sys/types.h>
68#include <sys/stat.h>
69#include <sys/time.h>
70#include <unistd.h>
71#include <stdint.h>
72#include <stdlib.h>
73#include <string.h>
74#include <limits.h>
75#include <getopt.h>
76#include <assert.h>
77#include <fcntl.h>
78#include <stdio.h>
79#include <errno.h>
80#include <ctype.h>
81#include <time.h>
82#include <sched.h>
83#include <pthread.h>
84
85#include <sys/syscall.h>
86#include <sys/ioctl.h>
87#include <sys/poll.h>
88#include <sys/prctl.h>
89#include <sys/wait.h>
90#include <sys/uio.h>
91#include <sys/mman.h>
92
93#include <linux/unistd.h>
94#include <linux/types.h>
95
96#include "../../include/linux/perf_counter.h"
97
98
99/*
100 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
101 * counters in the current task.
102 */
103#define PR_TASK_PERF_COUNTERS_DISABLE 31
104#define PR_TASK_PERF_COUNTERS_ENABLE 32
105
106#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
107
/*
 * rdclock(): sample CLOCK_MONOTONIC and evaluate to the reading expressed
 * in nanoseconds (tv_sec * 10^9 + tv_nsec).  Implemented as a GNU
 * statement expression so it can be used as a value.
 */
#define rdclock()						\
({								\
	struct timespec __now;					\
								\
	clock_gettime(CLOCK_MONOTONIC, &__now);			\
	__now.tv_sec * 1000000000ULL + __now.tv_nsec;		\
})
115
/*
 * Pick up some kernel type conventions:
 */
#define __user
#define asmlinkage

/*
 * Per-architecture glue: the perf_counter_open syscall number, a read
 * memory barrier and a busy-wait hint.
 *
 * The macros deliberately do NOT end in a semicolon: the caller supplies
 * it, so that "if (x) cpu_relax(); else ..." parses as expected.
 */
#ifdef __x86_64__
#define __NR_perf_counter_open	295
#define rmb()			asm volatile("lfence" ::: "memory")
#define cpu_relax()		asm volatile("rep; nop" ::: "memory")
#endif

#ifdef __i386__
#define __NR_perf_counter_open	333
#define rmb()			asm volatile("lfence" ::: "memory")
#define cpu_relax()		asm volatile("rep; nop" ::: "memory")
#endif

#ifdef __powerpc__
#define __NR_perf_counter_open	319
#define rmb()			asm volatile ("sync" ::: "memory")
#define cpu_relax()		asm volatile ("" ::: "memory")
#endif
139
/* Branch-prediction hint: the condition is expected to be false. */
#define unlikely(x)	__builtin_expect(!!(x), 0)

/*
 * Type-checked minimum that evaluates each argument exactly once.
 *
 * The pointer comparison is never used for its value; it only makes the
 * compiler complain when x and y have different types.  __typeof__ is
 * used instead of typeof so the macro also works in strict ISO modes
 * (-std=c99/-std=c11), where plain typeof is not a keyword.
 */
#define min(x, y) ({				\
	__typeof__(x) _min1 = (x);		\
	__typeof__(y) _min2 = (y);		\
	(void) (&_min1 == &_min2);		\
	_min1 < _min2 ? _min1 : _min2; })
146
147asmlinkage int sys_perf_counter_open(
148 struct perf_counter_hw_event *hw_event_uptr __user,
149 pid_t pid,
150 int cpu,
151 int group_fd,
152 unsigned long flags)
153{
154 return syscall(
155 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
156}
157
158#define MAX_COUNTERS 64
159#define MAX_NR_CPUS 256
160
161#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
162
163static int run_perfstat = 0;
164static int system_wide = 0;
165
166static int nr_counters = 0;
167static __u64 event_id[MAX_COUNTERS] = {
168 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
169 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
170 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
171 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
172
173 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
174 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
175 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
176 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
177};
178static int default_interval = 100000;
179static int event_count[MAX_COUNTERS];
180static int fd[MAX_NR_CPUS][MAX_COUNTERS];
181
182static __u64 count_filter = 100;
183
184static int tid = -1;
185static int profile_cpu = -1;
186static int nr_cpus = 0;
187static int nmi = 1;
188static unsigned int realtime_prio = 0;
189static int group = 0;
190static unsigned int page_size;
191static unsigned int mmap_pages = 16;
192static int use_mmap = 0;
193static int use_munmap = 0;
194
195static char *vmlinux;
196
197static char *sym_filter;
198static unsigned long filter_start;
199static unsigned long filter_end;
200
201static int delay_secs = 2;
202static int zero;
203static int dump_symtab;
204
205static int scale;
206
/*
 * One line of objdump output, kept in a singly linked list in the order
 * it was read.
 */
struct source_line {
	uint64_t		EIP;	/* address parsed from the line, 0 if none */
	unsigned long		count;	/* hits recorded on exactly this address */
	char			*line;	/* text of the line, newline stripped */
	struct source_line	*next;	/* following line, NULL at the end */
};

/* List head, and the link where the next node gets appended. */
static struct source_line	*lines;
static struct source_line	**lines_tail;
216
/* Table of six default count values (not referenced in this part of the file). */
const unsigned int default_count[] = {
	1000000, 1000000,
	  10000,   10000,
	1000000,   10000,
};
225
/* Display names for hardware events; event_name() indexes this by event id. */
static char *hw_event_names[] = {
	[0] = "CPU cycles",
	[1] = "instructions",
	[2] = "cache references",
	[3] = "cache misses",
	[4] = "branches",
	[5] = "branch misses",
	[6] = "bus cycles",
};

/* Display names for software events; event_name() indexes this by event id. */
static char *sw_event_names[] = {
	[0] = "cpu clock ticks",
	[1] = "task clock ticks",
	[2] = "pagefaults",
	[3] = "context switches",
	[4] = "CPU migrations",
	[5] = "minor faults",
	[6] = "major faults",
};
245
246struct event_symbol {
247 __u64 event;
248 char *symbol;
249};
250
251static struct event_symbol event_symbols[] = {
252 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
253 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
254 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
255 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
256 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
257 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
258 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
259 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
260 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
261
262 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
263 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
264 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
265 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
266 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
267 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
268 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
269 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
270 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
271 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
272};
273
/*
 * Extract one bit-field from an event config word using the
 * PERF_COUNTER_<name>_MASK / PERF_COUNTER_<name>_SHIFT pair.
 *
 * The argument is parenthesized so that expressions with operators of
 * lower precedence than '&' (e.g. "a | b") are masked as a whole.
 */
#define __PERF_COUNTER_FIELD(config, name) \
	(((config) & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
281
282static void display_events_help(void)
283{
284 unsigned int i;
285 __u64 e;
286
287 printf(
288 " -e EVENT --event=EVENT # symbolic-name abbreviations");
289
290 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
291 int type, id;
292
293 e = event_symbols[i].event;
294 type = PERF_COUNTER_TYPE(e);
295 id = PERF_COUNTER_ID(e);
296
297 printf("\n %d:%d: %-20s",
298 type, id, event_symbols[i].symbol);
299 }
300
301 printf("\n"
302 " rNNN: raw PMU events (eventsel+umask)\n\n");
303}
304
305static void display_perfstat_help(void)
306{
307 printf(
308 "Usage: perfstat [<events...>] <cmd...>\n\n"
309 "PerfStat Options (up to %d event types can be specified):\n\n",
310 MAX_COUNTERS);
311
312 display_events_help();
313
314 printf(
315 " -l # scale counter values\n"
316 " -a # system-wide collection\n");
317 exit(0);
318}
319
320static void display_help(void)
321{
322 if (run_perfstat)
323 return display_perfstat_help();
324
325 printf(
326 "Usage: kerneltop [<options>]\n"
327 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
328 "KernelTop Options (up to %d event types can be specified at once):\n\n",
329 MAX_COUNTERS);
330
331 display_events_help();
332
333 printf(
334 " -S --stat # perfstat COMMAND\n"
335 " -a # system-wide collection (for perfstat)\n\n"
336 " -c CNT --count=CNT # event period to sample\n\n"
337 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
338 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
339 " -l # show scale factor for RR events\n"
340 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
341 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
342 " -r prio --realtime=<prio> # event acquisition runs with SCHED_FIFO policy\n"
343 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
344 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
345 " -z --zero # zero counts after display\n"
346 " -D --dump_symtab # dump symbol table to stderr on startup\n"
347 " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
348 " -M --mmap_info # print mmap info stream\n"
349 " -U --munmap_info # print munmap info stream\n"
350 );
351
352 exit(0);
353}
354
355static char *event_name(int ctr)
356{
357 __u64 config = event_id[ctr];
358 int type = PERF_COUNTER_TYPE(config);
359 int id = PERF_COUNTER_ID(config);
360 static char buf[32];
361
362 if (PERF_COUNTER_RAW(config)) {
363 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
364 return buf;
365 }
366
367 switch (type) {
368 case PERF_TYPE_HARDWARE:
369 if (id < PERF_HW_EVENTS_MAX)
370 return hw_event_names[id];
371 return "unknown-hardware";
372
373 case PERF_TYPE_SOFTWARE:
374 if (id < PERF_SW_EVENTS_MAX)
375 return sw_event_names[id];
376 return "unknown-software";
377
378 default:
379 break;
380 }
381
382 return "unknown";
383}
384
385/*
386 * Each event can have multiple symbolic names.
387 * Symbolic names are (almost) exactly matched.
388 */
389static __u64 match_event_symbols(char *str)
390{
391 __u64 config, id;
392 int type;
393 unsigned int i;
394
395 if (sscanf(str, "r%llx", &config) == 1)
396 return config | PERF_COUNTER_RAW_MASK;
397
398 if (sscanf(str, "%d:%llu", &type, &id) == 2)
399 return EID(type, id);
400
401 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
402 if (!strncmp(str, event_symbols[i].symbol,
403 strlen(event_symbols[i].symbol)))
404 return event_symbols[i].event;
405 }
406
407 return ~0ULL;
408}
409
410static int parse_events(char *str)
411{
412 __u64 config;
413
414again:
415 if (nr_counters == MAX_COUNTERS)
416 return -1;
417
418 config = match_event_symbols(str);
419 if (config == ~0ULL)
420 return -1;
421
422 event_id[nr_counters] = config;
423 nr_counters++;
424
425 str = strstr(str, ",");
426 if (str) {
427 str++;
428 goto again;
429 }
430
431 return 0;
432}
433
434
435/*
436 * perfstat
437 */
438
439char fault_here[1000000];
440
441static void create_perfstat_counter(int counter)
442{
443 struct perf_counter_hw_event hw_event;
444
445 memset(&hw_event, 0, sizeof(hw_event));
446 hw_event.config = event_id[counter];
447 hw_event.record_type = 0;
448 hw_event.nmi = 0;
449 if (scale)
450 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
451 PERF_FORMAT_TOTAL_TIME_RUNNING;
452
453 if (system_wide) {
454 int cpu;
455 for (cpu = 0; cpu < nr_cpus; cpu ++) {
456 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
457 if (fd[cpu][counter] < 0) {
458 printf("perfstat error: syscall returned with %d (%s)\n",
459 fd[cpu][counter], strerror(errno));
460 exit(-1);
461 }
462 }
463 } else {
464 hw_event.inherit = 1;
465 hw_event.disabled = 1;
466
467 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
468 if (fd[0][counter] < 0) {
469 printf("perfstat error: syscall returned with %d (%s)\n",
470 fd[0][counter], strerror(errno));
471 exit(-1);
472 }
473 }
474}
475
476int do_perfstat(int argc, char *argv[])
477{
478 unsigned long long t0, t1;
479 int counter;
480 ssize_t res;
481 int status;
482 int pid;
483
484 if (!system_wide)
485 nr_cpus = 1;
486
487 for (counter = 0; counter < nr_counters; counter++)
488 create_perfstat_counter(counter);
489
490 argc -= optind;
491 argv += optind;
492
493 if (!argc)
494 display_help();
495
496 /*
497 * Enable counters and exec the command:
498 */
499 t0 = rdclock();
500 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
501
502 if ((pid = fork()) < 0)
503 perror("failed to fork");
504 if (!pid) {
505 if (execvp(argv[0], argv)) {
506 perror(argv[0]);
507 exit(-1);
508 }
509 }
510 while (wait(&status) >= 0)
511 ;
512 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
513 t1 = rdclock();
514
515 fflush(stdout);
516
517 fprintf(stderr, "\n");
518 fprintf(stderr, " Performance counter stats for \'%s\':\n",
519 argv[0]);
520 fprintf(stderr, "\n");
521
522 for (counter = 0; counter < nr_counters; counter++) {
523 int cpu, nv;
524 __u64 count[3], single_count[3];
525 int scaled;
526
527 count[0] = count[1] = count[2] = 0;
528 nv = scale ? 3 : 1;
529 for (cpu = 0; cpu < nr_cpus; cpu ++) {
530 res = read(fd[cpu][counter],
531 single_count, nv * sizeof(__u64));
532 assert(res == nv * sizeof(__u64));
533
534 count[0] += single_count[0];
535 if (scale) {
536 count[1] += single_count[1];
537 count[2] += single_count[2];
538 }
539 }
540
541 scaled = 0;
542 if (scale) {
543 if (count[2] == 0) {
544 fprintf(stderr, " %14s %-20s\n",
545 "<not counted>", event_name(counter));
546 continue;
547 }
548 if (count[2] < count[1]) {
549 scaled = 1;
550 count[0] = (unsigned long long)
551 ((double)count[0] * count[1] / count[2] + 0.5);
552 }
553 }
554
555 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
556 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
557
558 double msecs = (double)count[0] / 1000000;
559
560 fprintf(stderr, " %14.6f %-20s (msecs)",
561 msecs, event_name(counter));
562 } else {
563 fprintf(stderr, " %14Ld %-20s (events)",
564 count[0], event_name(counter));
565 }
566 if (scaled)
567 fprintf(stderr, " (scaled from %.2f%%)",
568 (double) count[2] / count[1] * 100);
569 fprintf(stderr, "\n");
570 }
571 fprintf(stderr, "\n");
572 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
573 (double)(t1-t0)/1e6);
574 fprintf(stderr, "\n");
575
576 return 0;
577}
578
579/*
580 * Symbols
581 */
582
583static uint64_t min_ip;
584static uint64_t max_ip = -1ll;
585
586struct sym_entry {
587 unsigned long long addr;
588 char *sym;
589 unsigned long count[MAX_COUNTERS];
590 int skip;
591 struct source_line *source;
592};
593
594#define MAX_SYMS 100000
595
596static int sym_table_count;
597
598struct sym_entry *sym_filter_entry;
599
600static struct sym_entry sym_table[MAX_SYMS];
601
602static void show_details(struct sym_entry *sym);
603
604/*
605 * Ordering weight: count-1 * count-2 * ... / count-n
606 */
607static double sym_weight(const struct sym_entry *sym)
608{
609 double weight;
610 int counter;
611
612 weight = sym->count[0];
613
614 for (counter = 1; counter < nr_counters-1; counter++)
615 weight *= sym->count[counter];
616
617 weight /= (sym->count[counter] + 1);
618
619 return weight;
620}
621
/*
 * qsort() comparator: orders sym_entry records by descending weight.
 *
 * Returns <0, 0 or >0 as qsort() requires.  The previous implementation
 * returned only 0 or 1, i.e. it reported "equal" whenever the first
 * element was the heavier one, which is not a consistent ordering.
 */
static int compare(const void *__sym1, const void *__sym2)
{
	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
	double w1 = sym_weight(sym1);
	double w2 = sym_weight(sym2);

	if (w1 > w2)
		return -1;	/* heavier symbols sort first */
	if (w1 < w2)
		return 1;
	return 0;
}
628
/*
 * Events seen since the last display refresh, and how many of those fell
 * outside the kernel symbol range.
 */
static long			events;
static long			userspace_events;

/*
 * ANSI escape sequence written before each refresh: cursor home (ESC[H)
 * followed by erase display (ESC[2J).  The escapes are spelled as \033
 * so they cannot be lost by tools that strip control characters; with an
 * empty string the screen is never cleared.
 */
static const char CONSOLE_CLEAR[] = "\033[H\033[2J";
632
633static struct sym_entry tmp[MAX_SYMS];
634
635static void print_sym_table(void)
636{
637 int i, printed;
638 int counter;
639 float events_per_sec = events/delay_secs;
640 float kevents_per_sec = (events-userspace_events)/delay_secs;
641 float sum_kevents = 0.0;
642
643 events = userspace_events = 0;
644 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
645 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
646
647 for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
648 sum_kevents += tmp[i].count[0];
649
650 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
651
652 printf(
653"------------------------------------------------------------------------------\n");
654 printf( " KernelTop:%8.0f irqs/sec kernel:%4.1f%% [%s, ",
655 events_per_sec,
656 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
657 nmi ? "NMI" : "IRQ");
658
659 if (nr_counters == 1)
660 printf("%d ", event_count[0]);
661
662 for (counter = 0; counter < nr_counters; counter++) {
663 if (counter)
664 printf("/");
665
666 printf("%s", event_name(counter));
667 }
668
669 printf( "], ");
670
671 if (tid != -1)
672 printf(" (tid: %d", tid);
673 else
674 printf(" (all");
675
676 if (profile_cpu != -1)
677 printf(", cpu: %d)\n", profile_cpu);
678 else {
679 if (tid != -1)
680 printf(")\n");
681 else
682 printf(", %d CPUs)\n", nr_cpus);
683 }
684
685 printf("------------------------------------------------------------------------------\n\n");
686
687 if (nr_counters == 1)
688 printf(" events pcnt");
689 else
690 printf(" weight events pcnt");
691
692 printf(" RIP kernel function\n"
693 " ______ ______ _____ ________________ _______________\n\n"
694 );
695
696 for (i = 0, printed = 0; i < sym_table_count; i++) {
697 float pcnt;
698 int count;
699
700 if (printed <= 18 && tmp[i].count[0] >= count_filter) {
701 pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
702
703 if (nr_counters == 1)
704 printf("%19.2f - %4.1f%% - %016llx : %s\n",
705 sym_weight(tmp + i),
706 pcnt, tmp[i].addr, tmp[i].sym);
707 else
708 printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
709 sym_weight(tmp + i),
710 tmp[i].count[0],
711 pcnt, tmp[i].addr, tmp[i].sym);
712 printed++;
713 }
714 /*
715 * Add decay to the counts:
716 */
717 for (count = 0; count < nr_counters; count++)
718 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
719 }
720
721 if (sym_filter_entry)
722 show_details(sym_filter_entry);
723
724 {
725 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
726
727 if (poll(&stdin_poll, 1, 0) == 1) {
728 printf("key pressed - exiting.\n");
729 exit(0);
730 }
731 }
732}
733
734static void *display_thread(void *arg)
735{
736 printf("KernelTop refresh period: %d seconds\n", delay_secs);
737
738 while (!sleep(delay_secs))
739 print_sym_table();
740
741 return NULL;
742}
743
744static int read_symbol(FILE *in, struct sym_entry *s)
745{
746 static int filter_match = 0;
747 char *sym, stype;
748 char str[500];
749 int rc, pos;
750
751 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
752 if (rc == EOF)
753 return -1;
754
755 assert(rc == 3);
756
757 /* skip until end of line: */
758 pos = strlen(str);
759 do {
760 rc = fgetc(in);
761 if (rc == '\n' || rc == EOF || pos >= 499)
762 break;
763 str[pos] = rc;
764 pos++;
765 } while (1);
766 str[pos] = 0;
767
768 sym = str;
769
770 /* Filter out known duplicates and non-text symbols. */
771 if (!strcmp(sym, "_text"))
772 return 1;
773 if (!min_ip && !strcmp(sym, "_stext"))
774 return 1;
775 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
776 return 1;
777 if (stype != 'T' && stype != 't')
778 return 1;
779 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
780 return 1;
781 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
782 return 1;
783
784 s->sym = malloc(strlen(str));
785 assert(s->sym);
786
787 strcpy((char *)s->sym, str);
788 s->skip = 0;
789
790 /* Tag events to be skipped. */
791 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
792 s->skip = 1;
793 else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
794 s->skip = 1;
795 else if (!strcmp("mwait_idle", s->sym))
796 s->skip = 1;
797
798 if (filter_match == 1) {
799 filter_end = s->addr;
800 filter_match = -1;
801 if (filter_end - filter_start > 10000) {
802 printf("hm, too large filter symbol <%s> - skipping.\n",
803 sym_filter);
804 printf("symbol filter start: %016lx\n", filter_start);
805 printf(" end: %016lx\n", filter_end);
806 filter_end = filter_start = 0;
807 sym_filter = NULL;
808 sleep(1);
809 }
810 }
811 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
812 filter_match = 1;
813 filter_start = s->addr;
814 }
815
816 return 0;
817}
818
819int compare_addr(const void *__sym1, const void *__sym2)
820{
821 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
822
823 return sym1->addr > sym2->addr;
824}
825
826static void sort_symbol_table(void)
827{
828 int i, dups;
829
830 do {
831 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
832 for (i = 0, dups = 0; i < sym_table_count; i++) {
833 if (sym_table[i].addr == sym_table[i+1].addr) {
834 sym_table[i+1].addr = -1ll;
835 dups++;
836 }
837 }
838 sym_table_count -= dups;
839 } while(dups);
840}
841
842static void parse_symbols(void)
843{
844 struct sym_entry *last;
845
846 FILE *kallsyms = fopen("/proc/kallsyms", "r");
847
848 if (!kallsyms) {
849 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
850 exit(-1);
851 }
852
853 while (!feof(kallsyms)) {
854 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
855 sym_table_count++;
856 assert(sym_table_count <= MAX_SYMS);
857 }
858 }
859
860 sort_symbol_table();
861 min_ip = sym_table[0].addr;
862 max_ip = sym_table[sym_table_count-1].addr;
863 last = sym_table + sym_table_count++;
864
865 last->addr = -1ll;
866 last->sym = "<end>";
867
868 if (filter_end) {
869 int count;
870 for (count=0; count < sym_table_count; count ++) {
871 if (!strcmp(sym_table[count].sym, sym_filter)) {
872 sym_filter_entry = &sym_table[count];
873 break;
874 }
875 }
876 }
877 if (dump_symtab) {
878 int i;
879
880 for (i = 0; i < sym_table_count; i++)
881 fprintf(stderr, "%llx %s\n",
882 sym_table[i].addr, sym_table[i].sym);
883 }
884}
885
886/*
887 * Source lines
888 */
889
890static void parse_vmlinux(char *filename)
891{
892 FILE *file;
893 char command[PATH_MAX*2];
894 if (!filename)
895 return;
896
897 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
898
899 file = popen(command, "r");
900 if (!file)
901 return;
902
903 lines_tail = &lines;
904 while (!feof(file)) {
905 struct source_line *src;
906 size_t dummy = 0;
907 char *c;
908
909 src = malloc(sizeof(struct source_line));
910 assert(src != NULL);
911 memset(src, 0, sizeof(struct source_line));
912
913 if (getline(&src->line, &dummy, file) < 0)
914 break;
915 if (!src->line)
916 break;
917
918 c = strchr(src->line, '\n');
919 if (c)
920 *c = 0;
921
922 src->next = NULL;
923 *lines_tail = src;
924 lines_tail = &src->next;
925
926 if (strlen(src->line)>8 && src->line[8] == ':')
927 src->EIP = strtoull(src->line, NULL, 16);
928 if (strlen(src->line)>8 && src->line[16] == ':')
929 src->EIP = strtoull(src->line, NULL, 16);
930 }
931 pclose(file);
932}
933
934static void record_precise_ip(uint64_t ip)
935{
936 struct source_line *line;
937
938 for (line = lines; line; line = line->next) {
939 if (line->EIP == ip)
940 line->count++;
941 if (line->EIP > ip)
942 break;
943 }
944}
945
946static void lookup_sym_in_vmlinux(struct sym_entry *sym)
947{
948 struct source_line *line;
949 char pattern[PATH_MAX];
950 sprintf(pattern, "<%s>:", sym->sym);
951
952 for (line = lines; line; line = line->next) {
953 if (strstr(line->line, pattern)) {
954 sym->source = line;
955 break;
956 }
957 }
958}
959
960static void show_lines(struct source_line *line_queue, int line_queue_count)
961{
962 int i;
963 struct source_line *line;
964
965 line = line_queue;
966 for (i = 0; i < line_queue_count; i++) {
967 printf("%8li\t%s\n", line->count, line->line);
968 line = line->next;
969 }
970}
971
972#define TRACE_COUNT 3
973
974static void show_details(struct sym_entry *sym)
975{
976 struct source_line *line;
977 struct source_line *line_queue = NULL;
978 int displayed = 0;
979 int line_queue_count = 0;
980
981 if (!sym->source)
982 lookup_sym_in_vmlinux(sym);
983 if (!sym->source)
984 return;
985
986 printf("Showing details for %s\n", sym->sym);
987
988 line = sym->source;
989 while (line) {
990 if (displayed && strstr(line->line, ">:"))
991 break;
992
993 if (!line_queue_count)
994 line_queue = line;
995 line_queue_count ++;
996
997 if (line->count >= count_filter) {
998 show_lines(line_queue, line_queue_count);
999 line_queue_count = 0;
1000 line_queue = NULL;
1001 } else if (line_queue_count > TRACE_COUNT) {
1002 line_queue = line_queue->next;
1003 line_queue_count --;
1004 }
1005
1006 line->count = 0;
1007 displayed++;
1008 if (displayed > 300)
1009 break;
1010 line = line->next;
1011 }
1012}
1013
1014/*
1015 * Binary search in the histogram table and record the hit:
1016 */
1017static void record_ip(uint64_t ip, int counter)
1018{
1019 int left_idx, middle_idx, right_idx, idx;
1020 unsigned long left, middle, right;
1021
1022 record_precise_ip(ip);
1023
1024 left_idx = 0;
1025 right_idx = sym_table_count-1;
1026 assert(ip <= max_ip && ip >= min_ip);
1027
1028 while (left_idx + 1 < right_idx) {
1029 middle_idx = (left_idx + right_idx) / 2;
1030
1031 left = sym_table[ left_idx].addr;
1032 middle = sym_table[middle_idx].addr;
1033 right = sym_table[ right_idx].addr;
1034
1035 if (!(left <= middle && middle <= right)) {
1036 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
1037 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
1038 }
1039 assert(left <= middle && middle <= right);
1040 if (!(left <= ip && ip <= right)) {
1041 printf(" left: %016lx\n", left);
1042 printf(" ip: %016lx\n", (unsigned long)ip);
1043 printf("right: %016lx\n", right);
1044 }
1045 assert(left <= ip && ip <= right);
1046 /*
1047 * [ left .... target .... middle .... right ]
1048 * => right := middle
1049 */
1050 if (ip < middle) {
1051 right_idx = middle_idx;
1052 continue;
1053 }
1054 /*
1055 * [ left .... middle ... target ... right ]
1056 * => left := middle
1057 */
1058 left_idx = middle_idx;
1059 }
1060
1061 idx = left_idx;
1062
1063 if (!sym_table[idx].skip)
1064 sym_table[idx].count[counter]++;
1065 else events--;
1066}
1067
1068static void process_event(uint64_t ip, int counter)
1069{
1070 events++;
1071
1072 if (ip < min_ip || ip > max_ip) {
1073 userspace_events++;
1074 return;
1075 }
1076
1077 record_ip(ip, counter);
1078}
1079
1080static void process_options(int argc, char *argv[])
1081{
1082 int error = 0, counter;
1083
1084 if (strstr(argv[0], "perfstat"))
1085 run_perfstat = 1;
1086
1087 for (;;) {
1088 int option_index = 0;
1089 /** Options for getopt */
1090 static struct option long_options[] = {
1091 {"count", required_argument, NULL, 'c'},
1092 {"cpu", required_argument, NULL, 'C'},
1093 {"delay", required_argument, NULL, 'd'},
1094 {"dump_symtab", no_argument, NULL, 'D'},
1095 {"event", required_argument, NULL, 'e'},
1096 {"filter", required_argument, NULL, 'f'},
1097 {"group", required_argument, NULL, 'g'},
1098 {"help", no_argument, NULL, 'h'},
1099 {"nmi", required_argument, NULL, 'n'},
1100 {"mmap_info", no_argument, NULL, 'M'},
1101 {"mmap_pages", required_argument, NULL, 'm'},
1102 {"munmap_info", no_argument, NULL, 'U'},
1103 {"pid", required_argument, NULL, 'p'},
1104 {"realtime", required_argument, NULL, 'r'},
1105 {"scale", no_argument, NULL, 'l'},
1106 {"symbol", required_argument, NULL, 's'},
1107 {"stat", no_argument, NULL, 'S'},
1108 {"vmlinux", required_argument, NULL, 'x'},
1109 {"zero", no_argument, NULL, 'z'},
1110 {NULL, 0, NULL, 0 }
1111 };
1112 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
1113 long_options, &option_index);
1114 if (c == -1)
1115 break;
1116
1117 switch (c) {
1118 case 'a': system_wide = 1; break;
1119 case 'c': default_interval = atoi(optarg); break;
1120 case 'C':
1121 /* CPU and PID are mutually exclusive */
1122 if (tid != -1) {
1123 printf("WARNING: CPU switch overriding PID\n");
1124 sleep(1);
1125 tid = -1;
1126 }
1127 profile_cpu = atoi(optarg); break;
1128 case 'd': delay_secs = atoi(optarg); break;
1129 case 'D': dump_symtab = 1; break;
1130
1131 case 'e': error = parse_events(optarg); break;
1132
1133 case 'f': count_filter = atoi(optarg); break;
1134 case 'g': group = atoi(optarg); break;
1135 case 'h': display_help(); break;
1136 case 'l': scale = 1; break;
1137 case 'n': nmi = atoi(optarg); break;
1138 case 'p':
1139 /* CPU and PID are mutually exclusive */
1140 if (profile_cpu != -1) {
1141 printf("WARNING: PID switch overriding CPU\n");
1142 sleep(1);
1143 profile_cpu = -1;
1144 }
1145 tid = atoi(optarg); break;
1146 case 'r': realtime_prio = atoi(optarg); break;
1147 case 's': sym_filter = strdup(optarg); break;
1148 case 'S': run_perfstat = 1; break;
1149 case 'x': vmlinux = strdup(optarg); break;
1150 case 'z': zero = 1; break;
1151 case 'm': mmap_pages = atoi(optarg); break;
1152 case 'M': use_mmap = 1; break;
1153 case 'U': use_munmap = 1; break;
1154 default: error = 1; break;
1155 }
1156 }
1157 if (error)
1158 display_help();
1159
1160 if (!nr_counters) {
1161 if (run_perfstat)
1162 nr_counters = 8;
1163 else {
1164 nr_counters = 1;
1165 event_id[0] = 0;
1166 }
1167 }
1168
1169 for (counter = 0; counter < nr_counters; counter++) {
1170 if (event_count[counter])
1171 continue;
1172
1173 event_count[counter] = default_interval;
1174 }
1175}
1176
/* Reader-side state for one mmap()ed counter buffer. */
struct mmap_data {
	int			counter;	/* counter index passed on to process_event() */
	void			*base;		/* start of the mapping; the data area begins one page in */
	unsigned int		mask;		/* ANDed with an offset to wrap it into the data area */
	unsigned int		prev;		/* offset up to which data has been consumed */
};
1183
1184static unsigned int mmap_read_head(struct mmap_data *md)
1185{
1186 struct perf_counter_mmap_page *pc = md->base;
1187 int head;
1188
1189 head = pc->data_head;
1190 rmb();
1191
1192 return head;
1193}
1194
1195struct timeval last_read, this_read;
1196
1197static void mmap_read(struct mmap_data *md)
1198{
1199 unsigned int head = mmap_read_head(md);
1200 unsigned int old = md->prev;
1201 unsigned char *data = md->base + page_size;
1202 int diff;
1203
1204 gettimeofday(&this_read, NULL);
1205
1206 /*
1207 * If we're further behind than half the buffer, there's a chance
1208 * the writer will bite our tail and screw up the events under us.
1209 *
1210 * If we somehow ended up ahead of the head, we got messed up.
1211 *
1212 * In either case, truncate and restart at head.
1213 */
1214 diff = head - old;
1215 if (diff > md->mask / 2 || diff < 0) {
1216 struct timeval iv;
1217 unsigned long msecs;
1218
1219 timersub(&this_read, &last_read, &iv);
1220 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
1221
1222 fprintf(stderr, "WARNING: failed to keep up with mmap data."
1223 " Last read %lu msecs ago.\n", msecs);
1224
1225 /*
1226 * head points to a known good entry, start there.
1227 */
1228 old = head;
1229 }
1230
1231 last_read = this_read;
1232
1233 for (; old != head;) {
1234 struct ip_event {
1235 struct perf_event_header header;
1236 __u64 ip;
1237 __u32 pid, tid;
1238 };
1239 struct mmap_event {
1240 struct perf_event_header header;
1241 __u32 pid, tid;
1242 __u64 start;
1243 __u64 len;
1244 __u64 pgoff;
1245 char filename[PATH_MAX];
1246 };
1247
1248 typedef union event_union {
1249 struct perf_event_header header;
1250 struct ip_event ip;
1251 struct mmap_event mmap;
1252 } event_t;
1253
1254 event_t *event = (event_t *)&data[old & md->mask];
1255
1256 event_t event_copy;
1257
1258 unsigned int size = event->header.size;
1259
1260 /*
1261 * Event straddles the mmap boundary -- header should always
1262 * be inside due to u64 alignment of output.
1263 */
1264 if ((old & md->mask) + size != ((old + size) & md->mask)) {
1265 unsigned int offset = old;
1266 unsigned int len = min(sizeof(*event), size), cpy;
1267 void *dst = &event_copy;
1268
1269 do {
1270 cpy = min(md->mask + 1 - (offset & md->mask), len);
1271 memcpy(dst, &data[offset & md->mask], cpy);
1272 offset += cpy;
1273 dst += cpy;
1274 len -= cpy;
1275 } while (len);
1276
1277 event = &event_copy;
1278 }
1279
1280 old += size;
1281
1282 if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
1283 if (event->header.type & PERF_RECORD_IP)
1284 process_event(event->ip.ip, md->counter);
1285 } else {
1286 switch (event->header.type) {
1287 case PERF_EVENT_MMAP:
1288 case PERF_EVENT_MUNMAP:
1289 printf("%s: %Lu %Lu %Lu %s\n",
1290 event->header.type == PERF_EVENT_MMAP
1291 ? "mmap" : "munmap",
1292 event->mmap.start,
1293 event->mmap.len,
1294 event->mmap.pgoff,
1295 event->mmap.filename);
1296 break;
1297 }
1298 }
1299 }
1300
1301 md->prev = old;
1302}
1303
1304int cmd_top(int argc, const char **argv, const char *prefix)
1305{
1306 struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
1307 struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1308 struct perf_counter_hw_event hw_event;
1309 pthread_t thread;
1310 int i, counter, group_fd, nr_poll = 0;
1311 unsigned int cpu;
1312 int ret;
1313
1314 page_size = sysconf(_SC_PAGE_SIZE);
1315
1316 process_options(argc, argv);
1317
1318 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1319 assert(nr_cpus <= MAX_NR_CPUS);
1320 assert(nr_cpus >= 0);
1321
1322 if (run_perfstat)
1323 return do_perfstat(argc, argv);
1324
1325 if (tid != -1 || profile_cpu != -1)
1326 nr_cpus = 1;
1327
1328 parse_symbols();
1329 if (vmlinux && sym_filter_entry)
1330 parse_vmlinux(vmlinux);
1331
1332 for (i = 0; i < nr_cpus; i++) {
1333 group_fd = -1;
1334 for (counter = 0; counter < nr_counters; counter++) {
1335
1336 cpu = profile_cpu;
1337 if (tid == -1 && profile_cpu == -1)
1338 cpu = i;
1339
1340 memset(&hw_event, 0, sizeof(hw_event));
1341 hw_event.config = event_id[counter];
1342 hw_event.irq_period = event_count[counter];
1343 hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
1344 hw_event.nmi = nmi;
1345 hw_event.mmap = use_mmap;
1346 hw_event.munmap = use_munmap;
1347
1348 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1349 if (fd[i][counter] < 0) {
1350 int err = errno;
1351 printf("kerneltop error: syscall returned with %d (%s)\n",
1352 fd[i][counter], strerror(err));
1353 if (err == EPERM)
1354 printf("Are you root?\n");
1355 exit(-1);
1356 }
1357 assert(fd[i][counter] >= 0);
1358 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1359
1360 /*
1361 * First counter acts as the group leader:
1362 */
1363 if (group && group_fd == -1)
1364 group_fd = fd[i][counter];
1365
1366 event_array[nr_poll].fd = fd[i][counter];
1367 event_array[nr_poll].events = POLLIN;
1368 nr_poll++;
1369
1370 mmap_array[i][counter].counter = counter;
1371 mmap_array[i][counter].prev = 0;
1372 mmap_array[i][counter].mask = mmap_pages*page_size - 1;
1373 mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
1374 PROT_READ, MAP_SHARED, fd[i][counter], 0);
1375 if (mmap_array[i][counter].base == MAP_FAILED) {
1376 printf("kerneltop error: failed to mmap with %d (%s)\n",
1377 errno, strerror(errno));
1378 exit(-1);
1379 }
1380 }
1381 }
1382
1383 if (pthread_create(&thread, NULL, display_thread, NULL)) {
1384 printf("Could not create display thread.\n");
1385 exit(-1);
1386 }
1387
1388 if (realtime_prio) {
1389 struct sched_param param;
1390
1391 param.sched_priority = realtime_prio;
1392 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1393 printf("Could not set realtime priority.\n");
1394 exit(-1);
1395 }
1396 }
1397
1398 while (1) {
1399 int hits = events;
1400
1401 for (i = 0; i < nr_cpus; i++) {
1402 for (counter = 0; counter < nr_counters; counter++)
1403 mmap_read(&mmap_array[i][counter]);
1404 }
1405
1406 if (hits == events)
1407 ret = poll(event_array, nr_poll, 100);
1408 }
1409
1410 return 0;
1411}