blob: 1fde12762ca4d00e704b51afcaaf8cf2d9106de1 [file] [log] [blame]
Ingo Molnarddcacfa2009-04-20 15:37:32 +02001/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31/*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
60 *
61 * Released under the GPL v2. (and only v2, not any later version)
62 */
63
Ingo Molnar148be2c2009-04-27 08:02:14 +020064#include "util/util.h"
Ingo Molnarddcacfa2009-04-20 15:37:32 +020065
66#include <getopt.h>
67#include <assert.h>
68#include <fcntl.h>
69#include <stdio.h>
70#include <errno.h>
Ingo Molnarddcacfa2009-04-20 15:37:32 +020071#include <time.h>
72#include <sched.h>
73#include <pthread.h>
74
75#include <sys/syscall.h>
76#include <sys/ioctl.h>
77#include <sys/poll.h>
78#include <sys/prctl.h>
79#include <sys/wait.h>
80#include <sys/uio.h>
81#include <sys/mman.h>
82
83#include <linux/unistd.h>
84#include <linux/types.h>
85
86#include "../../include/linux/perf_counter.h"
87
88
89/*
90 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
91 * counters in the current task.
92 */
#define PR_TASK_PERF_COUNTERS_DISABLE	31
#define PR_TASK_PERF_COUNTERS_ENABLE	32

/*
 * rdclock(): read CLOCK_MONOTONIC and yield the timestamp in nanoseconds
 * (seconds * 10^9 + nanoseconds) as an unsigned long long.
 *
 * Written as a GNU statement expression so that it can be used as a value.
 * The return code of clock_gettime() is not checked.
 */
#define rdclock()					\
({							\
	struct timespec ts;				\
							\
	clock_gettime(CLOCK_MONOTONIC, &ts);		\
	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
})
103
104/*
105 * Pick up some kernel type conventions:
106 */
#define __user
#define asmlinkage

/*
 * Per-architecture definitions:
 *
 *   __NR_perf_counter_open - number used for the perf_counter_open syscall
 *   rmb()                  - read memory barrier
 *   cpu_relax()            - busy-wait hint / compiler barrier
 *
 * Both helper macros expand to a single statement WITHOUT a trailing
 * semicolon; the caller supplies it.  A semicolon inside the macro would
 * turn "if (x) cpu_relax(); else ..." into a syntax error.
 */
#ifdef __x86_64__
#define __NR_perf_counter_open	295
#define rmb()			asm volatile("lfence" ::: "memory")
#define cpu_relax()		asm volatile("rep; nop" ::: "memory")
#endif

#ifdef __i386__
#define __NR_perf_counter_open	333
#define rmb()			asm volatile("lfence" ::: "memory")
#define cpu_relax()		asm volatile("rep; nop" ::: "memory")
#endif

#ifdef __powerpc__
#define __NR_perf_counter_open	319
#define rmb()			asm volatile ("sync" ::: "memory")
#define cpu_relax()		asm volatile ("" ::: "memory")
#endif
127
/* Branch-prediction hint: tell the compiler that x is expected to be false. */
#define unlikely(x)	__builtin_expect(!!(x), 0)

/*
 * Minimum of x and y, evaluating each argument exactly once.
 *
 * The pointer comparison's result is discarded; it is there so that the
 * compiler can complain when x and y do not have the same type.
 */
#define min(x, y) ({				\
	typeof(x) _min1 = (x);			\
	typeof(y) _min2 = (y);			\
	(void) (&_min1 == &_min2);		\
	_min1 < _min2 ? _min1 : _min2; })
134
/*
 * Opens one performance counter described by *hw_event_uptr.  Only the
 * declaration is visible here; the definition lives outside this section.
 *
 * Callers in this file pass (pid = 0, cpu = -1) for a per-task counter and
 * (pid = -1, cpu = N) for a per-CPU counter, always with group_fd = -1 and
 * flags = 0.  They treat a negative return value as failure and report errno.
 */
extern asmlinkage int sys_perf_counter_open(
	struct perf_counter_hw_event	*hw_event_uptr		__user,
	pid_t				pid,
	int				cpu,
	int				group_fd,
	unsigned long			flags);

/* Capacity of event_id[]/event_count[] and of the second dimension of fd[][]. */
#define MAX_COUNTERS			64
/* Capacity of the first dimension of fd[][]; cmd_stat() asserts nr_cpus fits. */
#define MAX_NR_CPUS			256

/* Build a 64-bit event config: type shifted up by PERF_COUNTER_TYPE_SHIFT, OR'ed with id. */
#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
146
/* Set by -a: open one counter per CPU instead of one per task. */
static int			system_wide			=  0;

/* Number of valid entries in event_id[]. */
static int			nr_counters			=  0;
/*
 * Event configs to count.  The initializer is the default set, used when no
 * -e option is given (process_options() then sets nr_counters to 8).
 * parse_events() overwrites entries starting at index 0.
 */
static __u64			event_id[MAX_COUNTERS]		= {
	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),

	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
};
/* Set by -c; copied into every still-zero event_count[] slot by process_options(). */
static int			default_interval = 100000;
/* Per-counter interval; filled in process_options(), not read in the visible code. */
static int			event_count[MAX_COUNTERS];
/* Counter file descriptors, indexed [cpu][counter]; per-task mode uses row 0 only. */
static int			fd[MAX_NR_CPUS][MAX_COUNTERS];

/* Set by -p and -C respectively (mutually exclusive); not consumed in the visible code. */
static int			tid				= -1;
static int			profile_cpu			= -1;
/* Online CPU count from sysconf(); do_perfstat() forces it to 1 unless system_wide. */
static int			nr_cpus				=  0;
/* Set by -n and -g respectively; not consumed in the visible code. */
static int			nmi				=  1;
static int			group				=  0;
/* Filled from sysconf(_SC_PAGE_SIZE) in cmd_stat(). */
static unsigned int		page_size;

/* Set by -z; not consumed in the visible code. */
static int			zero;

/*
 * When non-zero, counters are read together with their enabled/running times
 * and the counts are scaled up if the counter was not running all the time.
 * On by default; -l also sets it to 1.
 */
static int			scale				=  1;
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200175
/* Table of default sample counts; not referenced in the visible part of this file. */
static const unsigned int default_count[] = {
	1000000,
	1000000,
	  10000,
	  10000,
	1000000,
	  10000,
};

/*
 * Display names for hardware events, indexed by event id in event_name().
 * NOTE(review): the order presumably has to match the PERF_COUNT_* hardware
 * enum in perf_counter.h - confirm when that enum changes.
 */
static char *hw_event_names[] = {
	"CPU cycles",
	"instructions",
	"cache references",
	"cache misses",
	"branches",
	"branch misses",
	"bus cycles",
};

/*
 * Display names for software events, indexed by event id in event_name().
 * NOTE(review): the order presumably has to match the PERF_COUNT_* software
 * enum in perf_counter.h - confirm when that enum changes.
 */
static char *sw_event_names[] = {
	"cpu clock ticks",
	"task clock ticks",
	"pagefaults",
	"context switches",
	"CPU migrations",
	"minor faults",
	"major faults",
};
204
/* Maps one symbolic command-line name to an event config built with EID(). */
struct event_symbol {
	__u64 event;
	char *symbol;
};

/*
 * Symbolic names accepted by -e.  Several names may map to the same event.
 * match_event_symbols() scans this table front to back with a prefix
 * comparison and returns the first hit, so entry order matters whenever one
 * name is a prefix of another.  display_events_help() prints every entry.
 */
static struct event_symbol event_symbols[] = {
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},

	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
};
232
/*
 * Extract one bit-field from a 64-bit event config: mask with
 * PERF_COUNTER_<name>_MASK, then shift down by PERF_COUNTER_<name>_SHIFT.
 *
 * The argument is parenthesized so that an expression such as (a | b) is
 * masked as a whole; '&' binds tighter than '|', so without the parentheses
 * only the right-hand operand would be masked.
 */
#define __PERF_COUNTER_FIELD(config, name) \
	(((config) & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

/* Accessors for the individual fields of an event config. */
#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
240
241static void display_events_help(void)
242{
243 unsigned int i;
244 __u64 e;
245
246 printf(
247 " -e EVENT --event=EVENT # symbolic-name abbreviations");
248
249 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
250 int type, id;
251
252 e = event_symbols[i].event;
253 type = PERF_COUNTER_TYPE(e);
254 id = PERF_COUNTER_ID(e);
255
256 printf("\n %d:%d: %-20s",
257 type, id, event_symbols[i].symbol);
258 }
259
260 printf("\n"
261 " rNNN: raw PMU events (eventsel+umask)\n\n");
262}
263
264static void display_help(void)
265{
266 printf(
267 "Usage: perfstat [<events...>] <cmd...>\n\n"
268 "PerfStat Options (up to %d event types can be specified):\n\n",
269 MAX_COUNTERS);
270
271 display_events_help();
272
273 printf(
274 " -l # scale counter values\n"
275 " -a # system-wide collection\n");
276 exit(0);
277}
278
279static char *event_name(int ctr)
280{
281 __u64 config = event_id[ctr];
282 int type = PERF_COUNTER_TYPE(config);
283 int id = PERF_COUNTER_ID(config);
284 static char buf[32];
285
286 if (PERF_COUNTER_RAW(config)) {
287 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
288 return buf;
289 }
290
291 switch (type) {
292 case PERF_TYPE_HARDWARE:
293 if (id < PERF_HW_EVENTS_MAX)
294 return hw_event_names[id];
295 return "unknown-hardware";
296
297 case PERF_TYPE_SOFTWARE:
298 if (id < PERF_SW_EVENTS_MAX)
299 return sw_event_names[id];
300 return "unknown-software";
301
302 default:
303 break;
304 }
305
306 return "unknown";
307}
308
309/*
310 * Each event can have multiple symbolic names.
311 * Symbolic names are (almost) exactly matched.
312 */
313static __u64 match_event_symbols(char *str)
314{
315 __u64 config, id;
316 int type;
317 unsigned int i;
318
319 if (sscanf(str, "r%llx", &config) == 1)
320 return config | PERF_COUNTER_RAW_MASK;
321
322 if (sscanf(str, "%d:%llu", &type, &id) == 2)
323 return EID(type, id);
324
325 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
326 if (!strncmp(str, event_symbols[i].symbol,
327 strlen(event_symbols[i].symbol)))
328 return event_symbols[i].event;
329 }
330
331 return ~0ULL;
332}
333
334static int parse_events(char *str)
335{
336 __u64 config;
337
338again:
339 if (nr_counters == MAX_COUNTERS)
340 return -1;
341
342 config = match_event_symbols(str);
343 if (config == ~0ULL)
344 return -1;
345
346 event_id[nr_counters] = config;
347 nr_counters++;
348
349 str = strstr(str, ",");
350 if (str) {
351 str++;
352 goto again;
353 }
354
355 return 0;
356}
357
358
359/*
360 * perfstat
361 */
362
/*
 * Large global buffer with external linkage; nothing in the visible part of
 * this file reads or writes it.
 * NOTE(review): looks like a leftover scratch area for fault testing -
 * confirm there are no users elsewhere before removing it.
 */
char fault_here[1000000];
364
365static void create_perfstat_counter(int counter)
366{
367 struct perf_counter_hw_event hw_event;
368
369 memset(&hw_event, 0, sizeof(hw_event));
370 hw_event.config = event_id[counter];
371 hw_event.record_type = 0;
372 hw_event.nmi = 0;
373 if (scale)
374 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
375 PERF_FORMAT_TOTAL_TIME_RUNNING;
376
377 if (system_wide) {
378 int cpu;
379 for (cpu = 0; cpu < nr_cpus; cpu ++) {
380 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
381 if (fd[cpu][counter] < 0) {
382 printf("perfstat error: syscall returned with %d (%s)\n",
383 fd[cpu][counter], strerror(errno));
384 exit(-1);
385 }
386 }
387 } else {
388 hw_event.inherit = 1;
389 hw_event.disabled = 1;
390
391 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
392 if (fd[0][counter] < 0) {
393 printf("perfstat error: syscall returned with %d (%s)\n",
394 fd[0][counter], strerror(errno));
395 exit(-1);
396 }
397 }
398}
399
400int do_perfstat(int argc, char *argv[])
401{
402 unsigned long long t0, t1;
403 int counter;
404 ssize_t res;
405 int status;
406 int pid;
407
408 if (!system_wide)
409 nr_cpus = 1;
410
411 for (counter = 0; counter < nr_counters; counter++)
412 create_perfstat_counter(counter);
413
414 argc -= optind;
415 argv += optind;
416
417 if (!argc)
418 display_help();
419
420 /*
421 * Enable counters and exec the command:
422 */
423 t0 = rdclock();
424 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
425
426 if ((pid = fork()) < 0)
427 perror("failed to fork");
428 if (!pid) {
429 if (execvp(argv[0], argv)) {
430 perror(argv[0]);
431 exit(-1);
432 }
433 }
434 while (wait(&status) >= 0)
435 ;
436 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
437 t1 = rdclock();
438
439 fflush(stdout);
440
441 fprintf(stderr, "\n");
442 fprintf(stderr, " Performance counter stats for \'%s\':\n",
443 argv[0]);
444 fprintf(stderr, "\n");
445
446 for (counter = 0; counter < nr_counters; counter++) {
447 int cpu, nv;
448 __u64 count[3], single_count[3];
449 int scaled;
450
451 count[0] = count[1] = count[2] = 0;
452 nv = scale ? 3 : 1;
453 for (cpu = 0; cpu < nr_cpus; cpu ++) {
454 res = read(fd[cpu][counter],
455 single_count, nv * sizeof(__u64));
456 assert(res == nv * sizeof(__u64));
457
458 count[0] += single_count[0];
459 if (scale) {
460 count[1] += single_count[1];
461 count[2] += single_count[2];
462 }
463 }
464
465 scaled = 0;
466 if (scale) {
467 if (count[2] == 0) {
468 fprintf(stderr, " %14s %-20s\n",
469 "<not counted>", event_name(counter));
470 continue;
471 }
472 if (count[2] < count[1]) {
473 scaled = 1;
474 count[0] = (unsigned long long)
475 ((double)count[0] * count[1] / count[2] + 0.5);
476 }
477 }
478
479 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
480 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
481
482 double msecs = (double)count[0] / 1000000;
483
484 fprintf(stderr, " %14.6f %-20s (msecs)",
485 msecs, event_name(counter));
486 } else {
487 fprintf(stderr, " %14Ld %-20s (events)",
488 count[0], event_name(counter));
489 }
490 if (scaled)
491 fprintf(stderr, " (scaled from %.2f%%)",
492 (double) count[2] / count[1] * 100);
493 fprintf(stderr, "\n");
494 }
495 fprintf(stderr, "\n");
496 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
497 (double)(t1-t0)/1e6);
498 fprintf(stderr, "\n");
499
500 return 0;
501}
502
503static void process_options(int argc, char **argv)
504{
505 int error = 0, counter;
506
507 for (;;) {
508 int option_index = 0;
509 /** Options for getopt */
510 static struct option long_options[] = {
511 {"count", required_argument, NULL, 'c'},
512 {"cpu", required_argument, NULL, 'C'},
513 {"delay", required_argument, NULL, 'd'},
514 {"dump_symtab", no_argument, NULL, 'D'},
515 {"event", required_argument, NULL, 'e'},
516 {"filter", required_argument, NULL, 'f'},
517 {"group", required_argument, NULL, 'g'},
518 {"help", no_argument, NULL, 'h'},
519 {"nmi", required_argument, NULL, 'n'},
520 {"munmap_info", no_argument, NULL, 'U'},
521 {"pid", required_argument, NULL, 'p'},
522 {"realtime", required_argument, NULL, 'r'},
523 {"scale", no_argument, NULL, 'l'},
524 {"symbol", required_argument, NULL, 's'},
525 {"stat", no_argument, NULL, 'S'},
526 {"vmlinux", required_argument, NULL, 'x'},
527 {"zero", no_argument, NULL, 'z'},
528 {NULL, 0, NULL, 0 }
529 };
530 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
531 long_options, &option_index);
532 if (c == -1)
533 break;
534
535 switch (c) {
536 case 'a': system_wide = 1; break;
537 case 'c': default_interval = atoi(optarg); break;
538 case 'C':
539 /* CPU and PID are mutually exclusive */
540 if (tid != -1) {
541 printf("WARNING: CPU switch overriding PID\n");
542 sleep(1);
543 tid = -1;
544 }
545 profile_cpu = atoi(optarg); break;
546
547 case 'e': error = parse_events(optarg); break;
548
549 case 'g': group = atoi(optarg); break;
550 case 'h': display_help(); break;
551 case 'l': scale = 1; break;
552 case 'n': nmi = atoi(optarg); break;
553 case 'p':
554 /* CPU and PID are mutually exclusive */
555 if (profile_cpu != -1) {
556 printf("WARNING: PID switch overriding CPU\n");
557 sleep(1);
558 profile_cpu = -1;
559 }
560 tid = atoi(optarg); break;
561 case 'z': zero = 1; break;
562 default: error = 1; break;
563 }
564 }
565 if (error)
566 display_help();
567
568 if (!nr_counters) {
569 nr_counters = 8;
570 }
571
572 for (counter = 0; counter < nr_counters; counter++) {
573 if (event_count[counter])
574 continue;
575
576 event_count[counter] = default_interval;
577 }
578}
579
580int cmd_stat(int argc, char **argv, const char *prefix)
581{
582 page_size = sysconf(_SC_PAGE_SIZE);
583
584 process_options(argc, argv);
585
586 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
587 assert(nr_cpus <= MAX_NR_CPUS);
588 assert(nr_cpus >= 0);
589
590 return do_perfstat(argc, argv);
591}