/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>

#include <asm/irq_regs.h>

struct remote_function_call {
        struct task_struct *p;
        int (*func)(void *info);
        void *info;
        int ret;
};

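/*
 * IPI handler shared by task_function_call() and cpu_function_call().
 * When a target task is specified, report -EAGAIN if that task is no
 * longer running on this CPU (it moved away between the check and the
 * IPI); otherwise invoke the requested function and record its result.
 */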
static void remote_function(void *data)
{
        struct remote_function_call *tfc = data;
        struct task_struct *p = tfc->p;

        if (p) {
                tfc->ret = -EAGAIN;
                if (task_cpu(p) != smp_processor_id() || !task_curr(p))
                        return;
        }

        tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, in which case the function is called directly.
 *
 * returns: @func return value, or
 *	    -ESRCH  - when the process isn't running
 *	    -EAGAIN - when the process moved away
 */
static int
task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
{
        struct remote_function_call data = {
                .p = p,
                .func = func,
                .info = info,
                .ret = -ESRCH, /* No such (running) process */
        };

        if (task_curr(p))
                smp_call_function_single(task_cpu(p), remote_function, &data, 1);

        return data.ret;
}

/**
 * cpu_function_call - call a function on the cpu
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
{
        struct remote_function_call data = {
                .p = NULL,
                .func = func,
                .info = info,
                .ret = -ENXIO, /* No such CPU */
        };

        smp_call_function_single(cpu, remote_function, &data, 1);

        return data.ret;
}

#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                       PERF_FLAG_FD_OUTPUT  |\
                       PERF_FLAG_PID_CGROUP)

enum event_type_t {
        EVENT_FLEXIBLE = 0x1,
        EVENT_PINNED = 0x2,
        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */
atomic_t perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE 100000
int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
static int max_samples_per_tick __read_mostly =
        DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);

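/*
 * sysctl handler for the perf_event sample-rate limit: on a successful
 * write, recompute the per-tick sample budget (max_samples_per_tick)
 * from the new rate.
 */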
int perf_proc_update_handler(struct ctl_table *table, int write,
                             void __user *buffer, size_t *lenp,
                             loff_t *ppos)
{
        int ret = proc_dointvec(table, write, buffer, lenp, ppos);

        if (ret || !write)
                return ret;

        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);

        return 0;
}

static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
                              enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
                             enum event_type_t event_type,
                             struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void) { }

extern __weak const char *perf_pmu_name(void)
{
        return "pmu";
}

static inline u64 perf_clock(void)
{
        return local_clock();
}

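/*
 * Resolve the per-cpu context belonging to this event context's pmu on
 * the current CPU.
 */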
static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
        return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

#ifdef CONFIG_CGROUP_PERF

/*
 * Must ensure cgroup is pinned (css_get) before calling
 * this function. In other words, we cannot call this function
 * if there is no cgroup event for the current CPU context.
 */
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task)
{
        return container_of(task_subsys_state(task, perf_subsys_id),
                            struct perf_cgroup, css);
}

static inline bool
perf_cgroup_match(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

        return !event->cgrp || event->cgrp == cpuctx->cgrp;
}

static inline void perf_get_cgroup(struct perf_event *event)
{
        css_get(&event->cgrp->css);
}

static inline void perf_put_cgroup(struct perf_event *event)
{
        css_put(&event->cgrp->css);
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
        perf_put_cgroup(event);
        event->cgrp = NULL;
}

static inline int is_cgroup_event(struct perf_event *event)
{
        return event->cgrp != NULL;
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        struct perf_cgroup_info *t;

        t = per_cpu_ptr(event->cgrp->info, event->cpu);
        return t->time;
}

static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
        struct perf_cgroup_info *info;
        u64 now;

        now = perf_clock();

        info = this_cpu_ptr(cgrp->info);

        info->time += now - info->timestamp;
        info->timestamp = now;
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
        struct perf_cgroup *cgrp_out = cpuctx->cgrp;
        if (cgrp_out)
                __update_cgrp_time(cgrp_out);
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
        struct perf_cgroup *cgrp;

        /*
         * ensure we access cgroup data only when needed and
         * when we know the cgroup is pinned (css_get)
         */
        if (!is_cgroup_event(event))
                return;

        cgrp = perf_cgroup_from_task(current);
        /*
         * Do not update time when cgroup is not active
         */
        if (cgrp == event->cgrp)
                __update_cgrp_time(event->cgrp);
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
                          struct perf_event_context *ctx)
{
        struct perf_cgroup *cgrp;
        struct perf_cgroup_info *info;

        /*
         * ctx->lock held by caller
         * ensure we do not access cgroup data
         * unless we have the cgroup pinned (css_get)
         */
        if (!task || !ctx->nr_cgroups)
                return;

        cgrp = perf_cgroup_from_task(task);
        info = this_cpu_ptr(cgrp->info);
        info->timestamp = ctx->timestamp;
}

#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN  : schedule in based on cgroup for next
 */
void perf_cgroup_switch(struct task_struct *task, int mode)
{
        struct perf_cpu_context *cpuctx;
        struct pmu *pmu;
        unsigned long flags;

        /*
         * disable interrupts to avoid getting nr_cgroup
         * changes via __perf_event_disable(). Also
         * avoids preemption.
         */
        local_irq_save(flags);

        /*
         * we reschedule only in the presence of cgroup
         * constrained events.
         */
        rcu_read_lock();

        list_for_each_entry_rcu(pmu, &pmus, entry) {

                cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

                perf_pmu_disable(cpuctx->ctx.pmu);

                /*
                 * perf_cgroup_events says at least one
                 * context on this CPU has cgroup events.
                 *
                 * ctx->nr_cgroups reports the number of cgroup
                 * events for a context.
                 */
                if (cpuctx->ctx.nr_cgroups > 0) {

                        if (mode & PERF_CGROUP_SWOUT) {
                                cpu_ctx_sched_out(cpuctx, EVENT_ALL);
                                /*
                                 * must not be done before ctxswout due
                                 * to event_filter_match() in event_sched_out()
                                 */
                                cpuctx->cgrp = NULL;
                        }

                        if (mode & PERF_CGROUP_SWIN) {
                                /*
                                 * set cgrp before ctxsw in to
                                 * allow event_filter_match() to not
                                 * have to pass task around
                                 */
                                cpuctx->cgrp = perf_cgroup_from_task(task);
                                cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
                        }
                }

                perf_pmu_enable(cpuctx->ctx.pmu);
        }

        rcu_read_unlock();

        local_irq_restore(flags);
}

static inline void perf_cgroup_sched_out(struct task_struct *task)
{
        perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
}

static inline void perf_cgroup_sched_in(struct task_struct *task)
{
        perf_cgroup_switch(task, PERF_CGROUP_SWIN);
}

static inline int perf_cgroup_connect(int fd, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        struct perf_cgroup *cgrp;
        struct cgroup_subsys_state *css;
        struct file *file;
        int ret = 0, fput_needed;

        file = fget_light(fd, &fput_needed);
        if (!file)
                return -EBADF;

        css = cgroup_css_from_dir(file, perf_subsys_id);
        if (IS_ERR(css)) {
                ret = PTR_ERR(css);
                goto out;
        }

        cgrp = container_of(css, struct perf_cgroup, css);
        event->cgrp = cgrp;

        /* must be done before we fput() the file */
        perf_get_cgroup(event);

        /*
         * all events in a group must monitor
         * the same cgroup because a task belongs
         * to only one perf cgroup at a time
         */
        if (group_leader && group_leader->cgrp != cgrp) {
                perf_detach_cgroup(event);
                ret = -EINVAL;
        }
out:
        fput_light(file, fput_needed);
        return ret;
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
        struct perf_cgroup_info *t;
        t = per_cpu_ptr(event->cgrp->info, event->cpu);
        event->shadow_ctx_time = now - t->timestamp;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
        /*
         * when the current task's perf cgroup does not match
         * the event's, we need to remember to call the
         * perf_cgroup_mark_enabled() function the first time a task with
         * a matching perf cgroup is scheduled in.
         */
        if (is_cgroup_event(event) && !perf_cgroup_match(event))
                event->cgrp_defer_enabled = 1;
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
                         struct perf_event_context *ctx)
{
        struct perf_event *sub;
        u64 tstamp = perf_event_time(event);

        if (!event->cgrp_defer_enabled)
                return;

        event->cgrp_defer_enabled = 0;

        event->tstamp_enabled = tstamp - event->total_time_enabled;
        list_for_each_entry(sub, &event->sibling_list, group_entry) {
                if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
                        sub->tstamp_enabled = tstamp - sub->total_time_enabled;
                        sub->cgrp_defer_enabled = 0;
                }
        }
}
#else /* !CONFIG_CGROUP_PERF */

static inline bool
perf_cgroup_match(struct perf_event *event)
{
        return true;
}

static inline void perf_detach_cgroup(struct perf_event *event)
{}

static inline int is_cgroup_event(struct perf_event *event)
{
        return 0;
}

static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
{
        return 0;
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}

static inline void perf_cgroup_sched_out(struct task_struct *task)
{
}

static inline void perf_cgroup_sched_in(struct task_struct *task)
{
}

static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        return -EINVAL;
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
                          struct perf_event_context *ctx)
{
}

void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        return 0;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
                         struct perf_event_context *ctx)
{
}
#endif

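/*
 * perf_pmu_disable()/perf_pmu_enable() nest: the pmu is disabled on the
 * first call and re-enabled only when the per-cpu disable count drops
 * back to zero.
 */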
void perf_pmu_disable(struct pmu *pmu)
{
        int *count = this_cpu_ptr(pmu->pmu_disable_count);
        if (!(*count)++)
                pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
        int *count = this_cpu_ptr(pmu->pmu_disable_count);
        if (!--(*count))
                pmu->pmu_enable(pmu);
}

static DEFINE_PER_CPU(struct list_head, rotation_list);

/*
 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
 * because they're strictly cpu affine and rotate_start is called with IRQs
 * disabled, while rotate_context is called from IRQ context.
 */
static void perf_pmu_rotate_start(struct pmu *pmu)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
        struct list_head *head = &__get_cpu_var(rotation_list);

        WARN_ON(!irqs_disabled());

        if (list_empty(&cpuctx->rotation_list))
                list_add(&cpuctx->rotation_list, head);
}

static void get_ctx(struct perf_event_context *ctx)
{
        WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
        struct perf_event_context *ctx;

        ctx = container_of(head, struct perf_event_context, rcu_head);
        kfree(ctx);
}

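/*
 * Drop a context reference. On the final put, the references on the
 * parent context (for cloned contexts) and on the owning task are
 * released and the context itself is freed after an RCU grace period.
 */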
static void put_ctx(struct perf_event_context *ctx)
{
        if (atomic_dec_and_test(&ctx->refcount)) {
                if (ctx->parent_ctx)
                        put_ctx(ctx->parent_ctx);
                if (ctx->task)
                        put_task_struct(ctx->task);
                call_rcu(&ctx->rcu_head, free_ctx);
        }
}

static void unclone_ctx(struct perf_event_context *ctx)
{
        if (ctx->parent_ctx) {
                put_ctx(ctx->parent_ctx);
                ctx->parent_ctx = NULL;
        }
}

static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
        /*
         * only top level events have the pid namespace they were created in
         */
        if (event->parent)
                event = event->parent;

        return task_tgid_nr_ns(p, event->ns);
}

static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
        /*
         * only top level events have the pid namespace they were created in
         */
        if (event->parent)
                event = event->parent;

        return task_pid_nr_ns(p, event->ns);
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
        u64 id = event->id;

        if (event->parent)
                id = event->parent->id;

        return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
        struct perf_event_context *ctx;

        rcu_read_lock();
retry:
        ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
        if (ctx) {
                /*
                 * If this context is a clone of another, it might
                 * get swapped for another underneath us by
                 * perf_event_task_sched_out, though the
                 * rcu_read_lock() protects us from any context
                 * getting freed. Lock the context and check if it
                 * got swapped before we could get the lock, and retry
                 * if so. If we locked the right context, then it
                 * can't get swapped on us any more.
                 */
                raw_spin_lock_irqsave(&ctx->lock, *flags);
                if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
                        raw_spin_unlock_irqrestore(&ctx->lock, *flags);
                        goto retry;
                }

                if (!atomic_inc_not_zero(&ctx->refcount)) {
                        raw_spin_unlock_irqrestore(&ctx->lock, *flags);
                        ctx = NULL;
                }
        }
        rcu_read_unlock();
        return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task. This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
        struct perf_event_context *ctx;
        unsigned long flags;

        ctx = perf_lock_task_context(task, ctxn, &flags);
        if (ctx) {
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
        return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        --ctx->pin_count;
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
        u64 now = perf_clock();

        ctx->time += now - ctx->timestamp;
        ctx->timestamp = now;
}

static u64 perf_event_time(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        if (is_cgroup_event(event))
                return perf_cgroup_event_time(event);

        return ctx ? ctx->time : 0;
}

/*
 * Update the total_time_enabled and total_time_running fields for an event.
 */
static void update_event_times(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        u64 run_end;

        if (event->state < PERF_EVENT_STATE_INACTIVE ||
            event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
                return;
        /*
         * in cgroup mode, time_enabled represents
         * the time the event was enabled AND active
         * tasks were in the monitored cgroup. This is
         * independent of the activity of the context as
         * there may be a mix of cgroup and non-cgroup events.
         *
         * That is why we treat cgroup events differently
         * here.
         */
        if (is_cgroup_event(event))
                run_end = perf_event_time(event);
        else if (ctx->is_active)
                run_end = ctx->time;
        else
                run_end = event->tstamp_stopped;

        event->total_time_enabled = run_end - event->tstamp_enabled;

        if (event->state == PERF_EVENT_STATE_INACTIVE)
                run_end = event->tstamp_stopped;
        else
                run_end = perf_event_time(event);

        event->total_time_running = run_end - event->tstamp_running;
}

/*
 * Update total_time_enabled and total_time_running for all events in a group.
 */
static void update_group_times(struct perf_event *leader)
{
        struct perf_event *event;

        update_event_times(leader);
        list_for_each_entry(event, &leader->sibling_list, group_entry)
                update_event_times(event);
}

static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
        if (event->attr.pinned)
                return &ctx->pinned_groups;
        else
                return &ctx->flexible_groups;
}

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
        event->attach_state |= PERF_ATTACH_CONTEXT;

        /*
         * If we're a stand alone event or group leader, we go to the context
         * list, group events are kept attached to the group so that
         * perf_group_detach can, at all times, locate all siblings.
         */
        if (event->group_leader == event) {
                struct list_head *list;

                if (is_software_event(event))
                        event->group_flags |= PERF_GROUP_SOFTWARE;

                list = ctx_group_list(event, ctx);
                list_add_tail(&event->group_entry, list);
        }

        if (is_cgroup_event(event))
                ctx->nr_cgroups++;

        list_add_rcu(&event->event_entry, &ctx->event_list);
        if (!ctx->nr_events)
                perf_pmu_rotate_start(ctx->pmu);
        ctx->nr_events++;
        if (event->attr.inherit_stat)
                ctx->nr_stat++;
}

/*
 * Called at perf_event creation and when events are attached/detached from a
 * group.
 */
static void perf_event__read_size(struct perf_event *event)
{
        int entry = sizeof(u64); /* value */
        int size = 0;
        int nr = 1;

        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                size += sizeof(u64);

        if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                size += sizeof(u64);

        if (event->attr.read_format & PERF_FORMAT_ID)
                entry += sizeof(u64);

        if (event->attr.read_format & PERF_FORMAT_GROUP) {
                nr += event->group_leader->nr_siblings;
                size += sizeof(u64);
        }

        size += entry * nr;
        event->read_size = size;
}

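/*
 * Compute the size of the fixed part of a sample record: the fields
 * selected by attr.sample_type below plus the read_format payload.
 * Cached in event->header_size for use when sample records are
 * generated.
 */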
static void perf_event__header_size(struct perf_event *event)
{
        struct perf_sample_data *data;
        u64 sample_type = event->attr.sample_type;
        u16 size = 0;

        perf_event__read_size(event);

        if (sample_type & PERF_SAMPLE_IP)
                size += sizeof(data->ip);

        if (sample_type & PERF_SAMPLE_ADDR)
                size += sizeof(data->addr);

        if (sample_type & PERF_SAMPLE_PERIOD)
                size += sizeof(data->period);

        if (sample_type & PERF_SAMPLE_READ)
                size += event->read_size;

        event->header_size = size;
}

static void perf_event__id_header_size(struct perf_event *event)
{
        struct perf_sample_data *data;
        u64 sample_type = event->attr.sample_type;
        u16 size = 0;

        if (sample_type & PERF_SAMPLE_TID)
                size += sizeof(data->tid_entry);

        if (sample_type & PERF_SAMPLE_TIME)
                size += sizeof(data->time);

        if (sample_type & PERF_SAMPLE_ID)
                size += sizeof(data->id);

        if (sample_type & PERF_SAMPLE_STREAM_ID)
                size += sizeof(data->stream_id);

        if (sample_type & PERF_SAMPLE_CPU)
                size += sizeof(data->cpu_entry);

        event->id_header_size = size;
}

static void perf_group_attach(struct perf_event *event)
{
        struct perf_event *group_leader = event->group_leader, *pos;

        /*
         * We can have double attach due to group movement in perf_event_open.
         */
        if (event->attach_state & PERF_ATTACH_GROUP)
                return;

        event->attach_state |= PERF_ATTACH_GROUP;

        if (group_leader == event)
                return;

        if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
            !is_software_event(event))
                group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;

        list_add_tail(&event->group_entry, &group_leader->sibling_list);
        group_leader->nr_siblings++;

        perf_event__header_size(group_leader);

        list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
                perf_event__header_size(pos);
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_CONTEXT))
                return;

        event->attach_state &= ~PERF_ATTACH_CONTEXT;

        if (is_cgroup_event(event))
                ctx->nr_cgroups--;

        ctx->nr_events--;
        if (event->attr.inherit_stat)
                ctx->nr_stat--;

        list_del_rcu(&event->event_entry);

        if (event->group_leader == event)
                list_del_init(&event->group_entry);

        update_group_times(event);

        /*
         * If event was in error state, then keep it
         * that way, otherwise bogus counts will be
         * returned on read(). The only way to get out
         * of error state is by explicit re-enabling
         * of the event
         */
        if (event->state > PERF_EVENT_STATE_OFF)
                event->state = PERF_EVENT_STATE_OFF;
}

static void perf_group_detach(struct perf_event *event)
{
        struct perf_event *sibling, *tmp;
        struct list_head *list = NULL;

        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_GROUP))
                return;

        event->attach_state &= ~PERF_ATTACH_GROUP;

        /*
         * If this is a sibling, remove it from its group.
         */
        if (event->group_leader != event) {
                list_del_init(&event->group_entry);
                event->group_leader->nr_siblings--;
                goto out;
        }

        if (!list_empty(&event->group_entry))
                list = &event->group_entry;

        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to whatever list we are on.
         */
        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
                if (list)
                        list_move_tail(&sibling->group_entry, list);
                sibling->group_leader = sibling;

                /* Inherit group flags from the previous leader */
                sibling->group_flags = event->group_flags;
        }

out:
        perf_event__header_size(event->group_leader);

        list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
                perf_event__header_size(tmp);
}

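/*
 * An event may run here only if it is bound to this CPU (or to no
 * particular CPU) and, for cgroup events, its cgroup is the one
 * currently active on this CPU.
 */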
static inline int
event_filter_match(struct perf_event *event)
{
        return (event->cpu == -1 || event->cpu == smp_processor_id())
            && perf_cgroup_match(event);
}

static void
event_sched_out(struct perf_event *event,
                struct perf_cpu_context *cpuctx,
                struct perf_event_context *ctx)
{
        u64 tstamp = perf_event_time(event);
        u64 delta;
        /*
         * An event which could not be activated because of
         * filter mismatch still needs to have its timings
         * maintained, otherwise bogus information is returned
         * via read() for time_enabled, time_running:
         */
        if (event->state == PERF_EVENT_STATE_INACTIVE
            && !event_filter_match(event)) {
                delta = tstamp - event->tstamp_stopped;
                event->tstamp_running += delta;
                event->tstamp_stopped = tstamp;
        }

        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;

        event->state = PERF_EVENT_STATE_INACTIVE;
        if (event->pending_disable) {
                event->pending_disable = 0;
                event->state = PERF_EVENT_STATE_OFF;
        }
        event->tstamp_stopped = tstamp;
        event->pmu->del(event, 0);
        event->oncpu = -1;

        if (!is_software_event(event))
                cpuctx->active_oncpu--;
        ctx->nr_active--;
        if (event->attr.exclusive || !cpuctx->active_oncpu)
                cpuctx->exclusive = 0;
}

static void
group_sched_out(struct perf_event *group_event,
                struct perf_cpu_context *cpuctx,
                struct perf_event_context *ctx)
{
        struct perf_event *event;
        int state = group_event->state;

        event_sched_out(group_event, cpuctx, ctx);

        /*
         * Schedule out siblings (if any):
         */
        list_for_each_entry(event, &group_event->sibling_list, group_entry)
                event_sched_out(event, cpuctx, ctx);

        if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
                cpuctx->exclusive = 0;
}

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static int __perf_remove_from_context(void *info)
{
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

        raw_spin_lock(&ctx->lock);
        event_sched_out(event, cpuctx, ctx);
        list_del_event(event, ctx);
        raw_spin_unlock(&ctx->lock);

        return 0;
}


/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * CPU events are removed with a smp call. For task events we only
 * call when the task is on a CPU.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid. This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_remove_from_context(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;

        lockdep_assert_held(&ctx->mutex);

        if (!task) {
                /*
                 * Per cpu events are removed via an smp call and
                 * the removal is always successful.
                 */
                cpu_function_call(event->cpu, __perf_remove_from_context, event);
                return;
        }

retry:
        if (!task_function_call(task, __perf_remove_from_context, event))
                return;

        raw_spin_lock_irq(&ctx->lock);
        /*
         * If we failed to find a running task, but find the context active now
         * that we've acquired the ctx->lock, retry.
         */
        if (ctx->is_active) {
                raw_spin_unlock_irq(&ctx->lock);
                goto retry;
        }

        /*
         * Since the task isn't running, it's safe to remove the event; our
         * holding the ctx->lock ensures the task won't get scheduled in.
         */
        list_del_event(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to disable a performance event
 */
static int __perf_event_disable(void *info)
{
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

        /*
         * If this is a per-task event, need to check whether this
         * event's task is the current task on this cpu.
         *
         * Can trigger due to concurrent perf_event_context_sched_out()
         * flipping contexts around.
         */
        if (ctx->task && cpuctx->task_ctx != ctx)
                return -EINVAL;

        raw_spin_lock(&ctx->lock);

        /*
         * If the event is on, turn it off.
         * If it is in error state, leave it in error state.
         */
        if (event->state >= PERF_EVENT_STATE_INACTIVE) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
                update_group_times(event);
                if (event == event->group_leader)
                        group_sched_out(event, cpuctx, ctx);
                else
                        event_sched_out(event, cpuctx, ctx);
                event->state = PERF_EVENT_STATE_OFF;
        }

        raw_spin_unlock(&ctx->lock);

        return 0;
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid. This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in sync_child_event.
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
void perf_event_disable(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;

        if (!task) {
                /*
                 * Disable the event on the cpu that it's on
                 */
                cpu_function_call(event->cpu, __perf_event_disable, event);
                return;
        }

retry:
        if (!task_function_call(task, __perf_event_disable, event))
                return;

        raw_spin_lock_irq(&ctx->lock);
        /*
         * If the event is still active, we need to retry the cross-call.
         */
        if (event->state == PERF_EVENT_STATE_ACTIVE) {
                raw_spin_unlock_irq(&ctx->lock);
                /*
                 * Reload the task pointer, it might have been changed by
                 * a concurrent perf_event_context_sched_out().
                 */
                task = ctx->task;
                goto retry;
        }

        /*
         * Since we have the lock this context can't be scheduled
         * in, so we can change the state safely.
         */
        if (event->state == PERF_EVENT_STATE_INACTIVE) {
                update_group_times(event);
                event->state = PERF_EVENT_STATE_OFF;
        }
        raw_spin_unlock_irq(&ctx->lock);
}

static void perf_set_shadow_time(struct perf_event *event,
                                 struct perf_event_context *ctx,
                                 u64 tstamp)
{
        /*
         * use the correct time source for the time snapshot
         *
         * We could get by without this by leveraging the
         * fact that to get to this function, the caller
         * has most likely already called update_context_time()
         * and update_cgrp_time_xx() and thus both timestamps
         * are identical (or very close). Given that tstamp is
         * already adjusted for cgroup, we could say that:
         *     tstamp - ctx->timestamp
         * is equivalent to
         *     tstamp - cgrp->timestamp.
         *
         * Then, in perf_output_read(), the calculation would
         * work with no changes because:
         * - event is guaranteed scheduled in
         * - no scheduled out in between
         * - thus the timestamp would be the same
         *
         * But this is a bit hairy.
         *
         * So instead, we have an explicit cgroup call to remain
         * within the time source all along. We believe it
         * is cleaner and simpler to understand.
         */
        if (is_cgroup_event(event))
                perf_cgroup_set_shadow_time(event, tstamp);
        else
                event->shadow_ctx_time = tstamp - ctx->timestamp;
}

#define MAX_INTERRUPTS (~0ULL)

static void perf_log_throttle(struct perf_event *event, int enable);

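/*
 * (Re)start a single event on this CPU: mark it ACTIVE, clear any stale
 * throttling state and hand it to pmu->add(); on failure the event is
 * put back to INACTIVE and -EAGAIN is returned so the caller can undo
 * any group scheduling in progress.
 */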
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001296static int
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001297event_sched_in(struct perf_event *event,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001298 struct perf_cpu_context *cpuctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01001299 struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001300{
Stephane Eranian41587552011-01-03 18:20:01 +02001301 u64 tstamp = perf_event_time(event);
1302
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001303 if (event->state <= PERF_EVENT_STATE_OFF)
1304 return 0;
1305
1306 event->state = PERF_EVENT_STATE_ACTIVE;
Peter Zijlstra6e377382010-02-11 13:21:58 +01001307 event->oncpu = smp_processor_id();
Peter Zijlstra4fe757d2011-02-15 22:26:07 +01001308
1309 /*
1310 * Unthrottle events, since we scheduled we might have missed several
1311 * ticks already, also for a heavily scheduling task there is little
1312 * guarantee it'll get a tick in a timely manner.
1313 */
1314 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1315 perf_log_throttle(event, 1);
1316 event->hw.interrupts = 0;
1317 }
1318
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001319 /*
1320 * The new state must be visible before we turn it on in the hardware:
1321 */
1322 smp_wmb();
1323
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02001324 if (event->pmu->add(event, PERF_EF_START)) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001325 event->state = PERF_EVENT_STATE_INACTIVE;
1326 event->oncpu = -1;
1327 return -EAGAIN;
1328 }
1329
Stephane Eranian41587552011-01-03 18:20:01 +02001330 event->tstamp_running += tstamp - event->tstamp_stopped;
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001331
Stephane Eraniane5d13672011-02-14 11:20:01 +02001332 perf_set_shadow_time(event, ctx, tstamp);
Stephane Eranianeed01522010-10-26 16:08:01 +02001333
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001334 if (!is_software_event(event))
1335 cpuctx->active_oncpu++;
1336 ctx->nr_active++;
1337
1338 if (event->attr.exclusive)
1339 cpuctx->exclusive = 1;
1340
1341 return 0;
1342}
1343
1344static int
1345group_sched_in(struct perf_event *group_event,
1346 struct perf_cpu_context *cpuctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01001347 struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001348{
Lin Ming6bde9b62010-04-23 13:56:00 +08001349 struct perf_event *event, *partial_group = NULL;
Peter Zijlstra51b0fe32010-06-11 13:35:57 +02001350 struct pmu *pmu = group_event->pmu;
Stephane Eraniand7842da2010-10-20 15:25:01 +02001351 u64 now = ctx->time;
1352 bool simulate = false;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001353
1354 if (group_event->state == PERF_EVENT_STATE_OFF)
1355 return 0;
1356
Peter Zijlstraad5133b2010-06-15 12:22:39 +02001357 pmu->start_txn(pmu);
Lin Ming6bde9b62010-04-23 13:56:00 +08001358
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001359 if (event_sched_in(group_event, cpuctx, ctx)) {
Peter Zijlstraad5133b2010-06-15 12:22:39 +02001360 pmu->cancel_txn(pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001361 return -EAGAIN;
Stephane Eranian90151c352010-05-25 16:23:10 +02001362 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001363
1364 /*
1365 * Schedule in siblings as one group (if any):
1366 */
1367 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001368 if (event_sched_in(event, cpuctx, ctx)) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001369 partial_group = event;
1370 goto group_error;
1371 }
1372 }
1373
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001374 if (!pmu->commit_txn(pmu))
Paul Mackerras6e851582010-05-08 20:58:00 +10001375 return 0;
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001376
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001377group_error:
1378 /*
1379 * Groups can be scheduled in as one unit only, so undo any
1380 * partial group before returning:
Stephane Eraniand7842da2010-10-20 15:25:01 +02001381 * The events up to the failed event are scheduled out normally,
1382 * tstamp_stopped will be updated.
1383 *
1384 * The failed events and the remaining siblings need to have
1385 * their timings updated as if they had gone thru event_sched_in()
1386 * and event_sched_out(). This is required to get consistent timings
1387 * across the group. This also takes care of the case where the group
1388 * could never be scheduled by ensuring tstamp_stopped is set to mark
1389 * the time the event was actually stopped, such that time delta
1390 * calculation in update_event_times() is correct.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001391 */
1392 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1393 if (event == partial_group)
Stephane Eraniand7842da2010-10-20 15:25:01 +02001394 simulate = true;
1395
1396 if (simulate) {
1397 event->tstamp_running += now - event->tstamp_stopped;
1398 event->tstamp_stopped = now;
1399 } else {
1400 event_sched_out(event, cpuctx, ctx);
1401 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001402 }
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001403 event_sched_out(group_event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001404
Peter Zijlstraad5133b2010-06-15 12:22:39 +02001405 pmu->cancel_txn(pmu);
Stephane Eranian90151c352010-05-25 16:23:10 +02001406
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001407 return -EAGAIN;
1408}
1409
1410/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001411 * Work out whether we can put this event group on the CPU now.
1412 */
1413static int group_can_go_on(struct perf_event *event,
1414 struct perf_cpu_context *cpuctx,
1415 int can_add_hw)
1416{
1417 /*
1418 * Groups consisting entirely of software events can always go on.
1419 */
Frederic Weisbeckerd6f962b2010-01-10 01:25:51 +01001420 if (event->group_flags & PERF_GROUP_SOFTWARE)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001421 return 1;
1422 /*
1423 * If an exclusive group is already on, no other hardware
1424 * events can go on.
1425 */
1426 if (cpuctx->exclusive)
1427 return 0;
1428 /*
1429 * If this group is exclusive and there are already
1430 * events on the CPU, it can't go on.
1431 */
1432 if (event->attr.exclusive && cpuctx->active_oncpu)
1433 return 0;
1434 /*
1435 * Otherwise, try to add it if all previous groups were able
1436 * to go on.
1437 */
1438 return can_add_hw;
1439}
1440
1441static void add_event_to_ctx(struct perf_event *event,
1442 struct perf_event_context *ctx)
1443{
Stephane Eranian41587552011-01-03 18:20:01 +02001444 u64 tstamp = perf_event_time(event);
1445
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001446 list_add_event(event, ctx);
Peter Zijlstra8a495422010-05-27 15:47:49 +02001447 perf_group_attach(event);
Stephane Eranian41587552011-01-03 18:20:01 +02001448 event->tstamp_enabled = tstamp;
1449 event->tstamp_running = tstamp;
1450 event->tstamp_stopped = tstamp;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001451}
1452
Stephane Eraniane5d13672011-02-14 11:20:01 +02001453static void perf_event_context_sched_in(struct perf_event_context *ctx,
1454 struct task_struct *tsk);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001455
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001456/*
1457 * Cross CPU call to install and enable a performance event
1458 *
1459 * Must be called with ctx->mutex held
1460 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001461static int __perf_install_in_context(void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001462{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001463 struct perf_event *event = info;
1464 struct perf_event_context *ctx = event->ctx;
1465 struct perf_event *leader = event->group_leader;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001466 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001467 int err;
1468
1469 /*
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001470 * In case we're installing a new context to an already running task,
1471 * could also happen before perf_event_task_sched_in() on architectures
1472 * which do context switches with IRQs enabled.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001473 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001474 if (ctx->task && !cpuctx->task_ctx)
Stephane Eraniane5d13672011-02-14 11:20:01 +02001475 perf_event_context_sched_in(ctx, ctx->task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001476
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001477 raw_spin_lock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001478 ctx->is_active = 1;
1479 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001480 /*
1481 * update cgrp time only if current cgrp
1482 * matches event->cgrp. Must be done before
1483 * calling add_event_to_ctx()
1484 */
1485 update_cgrp_time_from_event(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001486
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001487 add_event_to_ctx(event, ctx);
1488
Stephane Eranian5632ab12011-01-03 18:20:01 +02001489 if (!event_filter_match(event))
Peter Zijlstraf4c41762009-12-16 17:55:54 +01001490 goto unlock;
1491
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001492 /*
1493 * Don't put the event on if it is disabled or if
1494 * it is in a group and the group isn't on.
1495 */
1496 if (event->state != PERF_EVENT_STATE_INACTIVE ||
1497 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
1498 goto unlock;
1499
1500 /*
1501 * An exclusive event can't go on if there are already active
1502 * hardware events, and no hardware event can go on if there
1503 * is already an exclusive event on.
1504 */
1505 if (!group_can_go_on(event, cpuctx, 1))
1506 err = -EEXIST;
1507 else
Peter Zijlstra6e377382010-02-11 13:21:58 +01001508 err = event_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001509
1510 if (err) {
1511 /*
1512 * This event couldn't go on. If it is in a group
1513 * then we have to pull the whole group off.
1514 * If the event group is pinned then put it in error state.
1515 */
1516 if (leader != event)
1517 group_sched_out(leader, cpuctx, ctx);
1518 if (leader->attr.pinned) {
1519 update_group_times(leader);
1520 leader->state = PERF_EVENT_STATE_ERROR;
1521 }
1522 }
1523
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001524unlock:
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001525 raw_spin_unlock(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001526
1527 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001528}
1529
1530/*
1531 * Attach a performance event to a context
1532 *
1533 * First we add the event to the list with the hardware enable bit
1534 * in event->hw_config cleared.
1535 *
 1536	 * If the event is attached to a task which is on a CPU, we use an smp
1537 * call to enable it in the task context. The task might have been
1538 * scheduled away, but we check this in the smp call again.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001539 */
1540static void
1541perf_install_in_context(struct perf_event_context *ctx,
1542 struct perf_event *event,
1543 int cpu)
1544{
1545 struct task_struct *task = ctx->task;
1546
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001547 lockdep_assert_held(&ctx->mutex);
1548
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02001549 event->ctx = ctx;
1550
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001551 if (!task) {
1552 /*
1553 * Per cpu events are installed via an smp call and
André Goddard Rosaaf901ca2009-11-14 13:09:05 -02001554 * the install is always successful.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001555 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001556 cpu_function_call(cpu, __perf_install_in_context, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001557 return;
1558 }
1559
1560retry:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001561 if (!task_function_call(task, __perf_install_in_context, event))
1562 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001563
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001564 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001565 /*
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001566 * If we failed to find a running task, but find the context active now
1567 * that we've acquired the ctx->lock, retry.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001568 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001569 if (ctx->is_active) {
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001570 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001571 goto retry;
1572 }
1573
1574 /*
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001575	 * Since the task isn't running, it's safe to add the event; holding
 1576	 * the ctx->lock ensures the task won't get scheduled in.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001577 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001578 add_event_to_ctx(event, ctx);
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001579 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001580}
1581
1582/*
 1583	 * Put an event into inactive state and update time fields.
1584 * Enabling the leader of a group effectively enables all
1585 * the group members that aren't explicitly disabled, so we
1586 * have to update their ->tstamp_enabled also.
1587 * Note: this works for group members as well as group leaders
1588 * since the non-leader members' sibling_lists will be empty.
1589 */
1590static void __perf_event_mark_enabled(struct perf_event *event,
1591 struct perf_event_context *ctx)
1592{
1593 struct perf_event *sub;
Stephane Eranian41587552011-01-03 18:20:01 +02001594 u64 tstamp = perf_event_time(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001595
1596 event->state = PERF_EVENT_STATE_INACTIVE;
Stephane Eranian41587552011-01-03 18:20:01 +02001597 event->tstamp_enabled = tstamp - event->total_time_enabled;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001598 list_for_each_entry(sub, &event->sibling_list, group_entry) {
Stephane Eranian41587552011-01-03 18:20:01 +02001599 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
1600 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001601 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001602}
1603
1604/*
1605 * Cross CPU call to enable a performance event
1606 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001607static int __perf_event_enable(void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001608{
1609 struct perf_event *event = info;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001610 struct perf_event_context *ctx = event->ctx;
1611 struct perf_event *leader = event->group_leader;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001612 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001613 int err;
1614
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001615 if (WARN_ON_ONCE(!ctx->is_active))
1616 return -EINVAL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001617
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001618 raw_spin_lock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001619 update_context_time(ctx);
1620
1621 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1622 goto unlock;
Stephane Eraniane5d13672011-02-14 11:20:01 +02001623
1624 /*
1625 * set current task's cgroup time reference point
1626 */
Stephane Eranian3f7cce32011-02-18 14:40:01 +02001627 perf_cgroup_set_timestamp(current, ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001628
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001629 __perf_event_mark_enabled(event, ctx);
1630
Stephane Eraniane5d13672011-02-14 11:20:01 +02001631 if (!event_filter_match(event)) {
1632 if (is_cgroup_event(event))
1633 perf_cgroup_defer_enabled(event);
Peter Zijlstraf4c41762009-12-16 17:55:54 +01001634 goto unlock;
Stephane Eraniane5d13672011-02-14 11:20:01 +02001635 }
Peter Zijlstraf4c41762009-12-16 17:55:54 +01001636
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001637 /*
1638 * If the event is in a group and isn't the group leader,
1639 * then don't put it on unless the group is on.
1640 */
1641 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
1642 goto unlock;
1643
1644 if (!group_can_go_on(event, cpuctx, 1)) {
1645 err = -EEXIST;
1646 } else {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001647 if (event == leader)
Peter Zijlstra6e377382010-02-11 13:21:58 +01001648 err = group_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001649 else
Peter Zijlstra6e377382010-02-11 13:21:58 +01001650 err = event_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001651 }
1652
1653 if (err) {
1654 /*
1655 * If this event can't go on and it's part of a
1656 * group, then the whole group has to come off.
1657 */
1658 if (leader != event)
1659 group_sched_out(leader, cpuctx, ctx);
1660 if (leader->attr.pinned) {
1661 update_group_times(leader);
1662 leader->state = PERF_EVENT_STATE_ERROR;
1663 }
1664 }
1665
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001666unlock:
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001667 raw_spin_unlock(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001668
1669 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001670}
1671
1672/*
 1673	 * Enable an event.
1674 *
1675 * If event->ctx is a cloned context, callers must make sure that
1676 * every task struct that event->ctx->task could possibly point to
1677 * remains valid. This condition is satisfied when called through
1678 * perf_event_for_each_child or perf_event_for_each as described
1679 * for perf_event_disable.
1680 */
Frederic Weisbecker44234ad2009-12-09 09:25:48 +01001681void perf_event_enable(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001682{
1683 struct perf_event_context *ctx = event->ctx;
1684 struct task_struct *task = ctx->task;
1685
1686 if (!task) {
1687 /*
1688 * Enable the event on the cpu that it's on
1689 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001690 cpu_function_call(event->cpu, __perf_event_enable, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001691 return;
1692 }
1693
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001694 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001695 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1696 goto out;
1697
1698 /*
1699 * If the event is in error state, clear that first.
1700 * That way, if we see the event in error state below, we
1701 * know that it has gone back into error state, as distinct
1702 * from the task having been scheduled away before the
1703 * cross-call arrived.
1704 */
1705 if (event->state == PERF_EVENT_STATE_ERROR)
1706 event->state = PERF_EVENT_STATE_OFF;
1707
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001708retry:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001709 if (!ctx->is_active) {
1710 __perf_event_mark_enabled(event, ctx);
1711 goto out;
1712 }
1713
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001714 raw_spin_unlock_irq(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001715
1716 if (!task_function_call(task, __perf_event_enable, event))
1717 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001718
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001719 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001720
1721 /*
1722 * If the context is active and the event is still off,
1723 * we need to retry the cross-call.
1724 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001725 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1726 /*
1727 * task could have been flipped by a concurrent
1728 * perf_event_context_sched_out()
1729 */
1730 task = ctx->task;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001731 goto retry;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001732 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001733
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001734out:
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001735 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001736}
1737
1738static int perf_event_refresh(struct perf_event *event, int refresh)
1739{
1740 /*
1741 * not supported on inherited events
1742 */
Franck Bui-Huu2e939d12010-11-23 16:21:44 +01001743 if (event->attr.inherit || !is_sampling_event(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001744 return -EINVAL;
1745
1746 atomic_add(refresh, &event->event_limit);
1747 perf_event_enable(event);
1748
1749 return 0;
1750}
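/*
 * User-space sketch (assumptions: a sampling fd created with
 * perf_event_open() and attr.sample_period set, with SIGIO delivery
 * configured on the fd). PERF_EVENT_IOC_REFRESH is the ioctl that reaches
 * this helper and re-arms the event for @refresh more overflows:
 *
 *	fcntl(fd, F_SETFL, O_ASYNC);
 *	fcntl(fd, F_SETOWN, getpid());
 *	ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);	allow one more overflow
 *
 * After the next overflow the event limit is exhausted and the event is
 * disabled again until another refresh.
 */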
1751
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001752static void ctx_sched_out(struct perf_event_context *ctx,
1753 struct perf_cpu_context *cpuctx,
1754 enum event_type_t event_type)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001755{
1756 struct perf_event *event;
1757
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001758 raw_spin_lock(&ctx->lock);
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02001759 perf_pmu_disable(ctx->pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001760 ctx->is_active = 0;
1761 if (likely(!ctx->nr_events))
1762 goto out;
1763 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001764 update_cgrp_time_from_cpuctx(cpuctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001765
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001766 if (!ctx->nr_active)
Peter Zijlstra24cd7f52010-06-11 17:32:03 +02001767 goto out;
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001768
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001769 if (event_type & EVENT_PINNED) {
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001770 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1771 group_sched_out(event, cpuctx, ctx);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001772 }
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001773
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001774 if (event_type & EVENT_FLEXIBLE) {
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001775 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
Xiao Guangrong8c9ed8e2009-09-25 13:51:17 +08001776 group_sched_out(event, cpuctx, ctx);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001777 }
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001778out:
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02001779 perf_pmu_enable(ctx->pmu);
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001780 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001781}
1782
1783/*
1784 * Test whether two contexts are equivalent, i.e. whether they
1785 * have both been cloned from the same version of the same context
1786 * and they both have the same number of enabled events.
1787 * If the number of enabled events is the same, then the set
1788 * of enabled events should be the same, because these are both
1789 * inherited contexts, therefore we can't access individual events
1790 * in them directly with an fd; we can only enable/disable all
1791 * events via prctl, or enable/disable all events in a family
1792 * via ioctl, which will have the same effect on both contexts.
1793 */
1794static int context_equiv(struct perf_event_context *ctx1,
1795 struct perf_event_context *ctx2)
1796{
1797 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1798 && ctx1->parent_gen == ctx2->parent_gen
1799 && !ctx1->pin_count && !ctx2->pin_count;
1800}
1801
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001802static void __perf_event_sync_stat(struct perf_event *event,
1803 struct perf_event *next_event)
1804{
1805 u64 value;
1806
1807 if (!event->attr.inherit_stat)
1808 return;
1809
1810 /*
 1811	 * Update the event value. We cannot use perf_event_read()
 1812	 * because we're in the middle of a context switch and have IRQs
 1813	 * disabled, which upsets smp_call_function_single(). However,
 1814	 * we know the event must be on the current CPU, therefore we
 1815	 * don't need to use it.
1816 */
1817 switch (event->state) {
1818 case PERF_EVENT_STATE_ACTIVE:
Peter Zijlstra3dbebf12009-11-20 22:19:52 +01001819 event->pmu->read(event);
1820 /* fall-through */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001821
1822 case PERF_EVENT_STATE_INACTIVE:
1823 update_event_times(event);
1824 break;
1825
1826 default:
1827 break;
1828 }
1829
1830 /*
1831 * In order to keep per-task stats reliable we need to flip the event
1832 * values when we flip the contexts.
1833 */
Peter Zijlstrae7850592010-05-21 14:43:08 +02001834 value = local64_read(&next_event->count);
1835 value = local64_xchg(&event->count, value);
1836 local64_set(&next_event->count, value);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001837
1838 swap(event->total_time_enabled, next_event->total_time_enabled);
1839 swap(event->total_time_running, next_event->total_time_running);
1840
1841 /*
1842 * Since we swizzled the values, update the user visible data too.
1843 */
1844 perf_event_update_userpage(event);
1845 perf_event_update_userpage(next_event);
1846}
1847
1848#define list_next_entry(pos, member) \
1849 list_entry(pos->member.next, typeof(*pos), member)
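/*
 * For reference (a sketch, mirroring the list.h helpers): with the
 * list_head embedded in struct perf_event as ->event_entry, the macro
 * above expands container_of()-style, e.g.
 *
 *	next_event = list_next_entry(event, event_entry);
 *
 * is equivalent to
 *
 *	next_event = list_entry(event->event_entry.next,
 *				struct perf_event, event_entry);
 */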
1850
1851static void perf_event_sync_stat(struct perf_event_context *ctx,
1852 struct perf_event_context *next_ctx)
1853{
1854 struct perf_event *event, *next_event;
1855
1856 if (!ctx->nr_stat)
1857 return;
1858
Peter Zijlstra02ffdbc2009-11-20 22:19:50 +01001859 update_context_time(ctx);
1860
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001861 event = list_first_entry(&ctx->event_list,
1862 struct perf_event, event_entry);
1863
1864 next_event = list_first_entry(&next_ctx->event_list,
1865 struct perf_event, event_entry);
1866
1867 while (&event->event_entry != &ctx->event_list &&
1868 &next_event->event_entry != &next_ctx->event_list) {
1869
1870 __perf_event_sync_stat(event, next_event);
1871
1872 event = list_next_entry(event, event_entry);
1873 next_event = list_next_entry(next_event, event_entry);
1874 }
1875}
1876
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001877static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1878 struct task_struct *next)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001879{
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02001880 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001881 struct perf_event_context *next_ctx;
1882 struct perf_event_context *parent;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001883 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001884 int do_switch = 1;
1885
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001886 if (likely(!ctx))
1887 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001888
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001889 cpuctx = __get_cpu_context(ctx);
1890 if (!cpuctx->task_ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001891 return;
1892
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001893 rcu_read_lock();
1894 parent = rcu_dereference(ctx->parent_ctx);
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02001895 next_ctx = next->perf_event_ctxp[ctxn];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001896 if (parent && next_ctx &&
1897 rcu_dereference(next_ctx->parent_ctx) == parent) {
1898 /*
1899 * Looks like the two contexts are clones, so we might be
1900 * able to optimize the context switch. We lock both
1901 * contexts and check that they are clones under the
1902 * lock (including re-checking that neither has been
1903 * uncloned in the meantime). It doesn't matter which
1904 * order we take the locks because no other cpu could
1905 * be trying to lock both of these tasks.
1906 */
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001907 raw_spin_lock(&ctx->lock);
1908 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001909 if (context_equiv(ctx, next_ctx)) {
1910 /*
1911 * XXX do we need a memory barrier of sorts
1912 * wrt to rcu_dereference() of perf_event_ctxp
1913 */
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02001914 task->perf_event_ctxp[ctxn] = next_ctx;
1915 next->perf_event_ctxp[ctxn] = ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001916 ctx->task = next;
1917 next_ctx->task = task;
1918 do_switch = 0;
1919
1920 perf_event_sync_stat(ctx, next_ctx);
1921 }
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001922 raw_spin_unlock(&next_ctx->lock);
1923 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001924 }
1925 rcu_read_unlock();
1926
1927 if (do_switch) {
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001928 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001929 cpuctx->task_ctx = NULL;
1930 }
1931}
1932
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02001933#define for_each_task_context_nr(ctxn) \
1934 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
1935
1936/*
1937 * Called from scheduler to remove the events of the current task,
1938 * with interrupts disabled.
1939 *
1940 * We stop each event and update the event value in event->count.
1941 *
1942 * This does not protect us against NMI, but disable()
1943 * sets the disabled bit in the control field of event _before_
 1944	 * accessing the event control register. If an NMI hits, then it will
1945 * not restart the event.
1946 */
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02001947void __perf_event_task_sched_out(struct task_struct *task,
1948 struct task_struct *next)
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02001949{
1950 int ctxn;
1951
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02001952 for_each_task_context_nr(ctxn)
1953 perf_event_context_sched_out(task, ctxn, next);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001954
1955 /*
1956 * if cgroup events exist on this CPU, then we need
1957 * to check if we have to switch out PMU state.
 1958	 * cgroup events are system-wide mode only
1959 */
1960 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1961 perf_cgroup_sched_out(task);
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02001962}
1963
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001964static void task_ctx_sched_out(struct perf_event_context *ctx,
1965 enum event_type_t event_type)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001966{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001967 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001968
1969 if (!cpuctx->task_ctx)
1970 return;
1971
1972 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1973 return;
1974
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001975 ctx_sched_out(ctx, cpuctx, event_type);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001976 cpuctx->task_ctx = NULL;
1977}
1978
1979/*
1980 * Called with IRQs disabled
1981 */
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001982static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1983 enum event_type_t event_type)
1984{
1985 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001986}
1987
1988static void
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001989ctx_pinned_sched_in(struct perf_event_context *ctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01001990 struct perf_cpu_context *cpuctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001991{
1992 struct perf_event *event;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001993
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001994 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1995 if (event->state <= PERF_EVENT_STATE_OFF)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001996 continue;
Stephane Eranian5632ab12011-01-03 18:20:01 +02001997 if (!event_filter_match(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001998 continue;
1999
Stephane Eraniane5d13672011-02-14 11:20:01 +02002000 /* may need to reset tstamp_enabled */
2001 if (is_cgroup_event(event))
2002 perf_cgroup_mark_enabled(event, ctx);
2003
Xiao Guangrong8c9ed8e2009-09-25 13:51:17 +08002004 if (group_can_go_on(event, cpuctx, 1))
Peter Zijlstra6e377382010-02-11 13:21:58 +01002005 group_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002006
2007 /*
2008 * If this pinned group hasn't been scheduled,
2009 * put it in error state.
2010 */
2011 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2012 update_group_times(event);
2013 event->state = PERF_EVENT_STATE_ERROR;
2014 }
2015 }
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002016}
2017
2018static void
2019ctx_flexible_sched_in(struct perf_event_context *ctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01002020 struct perf_cpu_context *cpuctx)
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002021{
2022 struct perf_event *event;
2023 int can_add_hw = 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002024
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002025 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2026 /* Ignore events in OFF or ERROR state */
2027 if (event->state <= PERF_EVENT_STATE_OFF)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002028 continue;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002029 /*
2030 * Listen to the 'cpu' scheduling filter constraint
2031 * of events:
2032 */
Stephane Eranian5632ab12011-01-03 18:20:01 +02002033 if (!event_filter_match(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002034 continue;
2035
Stephane Eraniane5d13672011-02-14 11:20:01 +02002036 /* may need to reset tstamp_enabled */
2037 if (is_cgroup_event(event))
2038 perf_cgroup_mark_enabled(event, ctx);
2039
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002040 if (group_can_go_on(event, cpuctx, can_add_hw)) {
Peter Zijlstra6e377382010-02-11 13:21:58 +01002041 if (group_sched_in(event, cpuctx, ctx))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002042 can_add_hw = 0;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002043 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002044 }
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002045}
2046
2047static void
2048ctx_sched_in(struct perf_event_context *ctx,
2049 struct perf_cpu_context *cpuctx,
Stephane Eraniane5d13672011-02-14 11:20:01 +02002050 enum event_type_t event_type,
2051 struct task_struct *task)
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002052{
Stephane Eraniane5d13672011-02-14 11:20:01 +02002053 u64 now;
2054
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002055 raw_spin_lock(&ctx->lock);
2056 ctx->is_active = 1;
2057 if (likely(!ctx->nr_events))
2058 goto out;
2059
Stephane Eraniane5d13672011-02-14 11:20:01 +02002060 now = perf_clock();
2061 ctx->timestamp = now;
Stephane Eranian3f7cce32011-02-18 14:40:01 +02002062 perf_cgroup_set_timestamp(task, ctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002063 /*
2064 * First go through the list and put on any pinned groups
2065 * in order to give them the best chance of going on.
2066 */
2067 if (event_type & EVENT_PINNED)
Peter Zijlstra6e377382010-02-11 13:21:58 +01002068 ctx_pinned_sched_in(ctx, cpuctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002069
2070 /* Then walk through the lower prio flexible groups */
2071 if (event_type & EVENT_FLEXIBLE)
Peter Zijlstra6e377382010-02-11 13:21:58 +01002072 ctx_flexible_sched_in(ctx, cpuctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002073
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002074out:
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002075 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002076}
2077
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002078static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
Stephane Eraniane5d13672011-02-14 11:20:01 +02002079 enum event_type_t event_type,
2080 struct task_struct *task)
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002081{
2082 struct perf_event_context *ctx = &cpuctx->ctx;
2083
Stephane Eraniane5d13672011-02-14 11:20:01 +02002084 ctx_sched_in(ctx, cpuctx, event_type, task);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002085}
2086
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002087static void task_ctx_sched_in(struct perf_event_context *ctx,
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002088 enum event_type_t event_type)
2089{
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002090 struct perf_cpu_context *cpuctx;
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002091
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002092 cpuctx = __get_cpu_context(ctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002093 if (cpuctx->task_ctx == ctx)
2094 return;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002095
Stephane Eraniane5d13672011-02-14 11:20:01 +02002096 ctx_sched_in(ctx, cpuctx, event_type, NULL);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002097 cpuctx->task_ctx = ctx;
2098}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002099
Stephane Eraniane5d13672011-02-14 11:20:01 +02002100static void perf_event_context_sched_in(struct perf_event_context *ctx,
2101 struct task_struct *task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002102{
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002103 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002104
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002105 cpuctx = __get_cpu_context(ctx);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002106 if (cpuctx->task_ctx == ctx)
2107 return;
2108
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02002109 perf_pmu_disable(ctx->pmu);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002110 /*
2111 * We want to keep the following priority order:
2112 * cpu pinned (that don't need to move), task pinned,
2113 * cpu flexible, task flexible.
2114 */
2115 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2116
Stephane Eraniane5d13672011-02-14 11:20:01 +02002117 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2118 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2119 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002120
2121 cpuctx->task_ctx = ctx;
eranian@google.com9b33fa62010-03-10 22:26:05 -08002122
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002123 /*
2124 * Since these rotations are per-cpu, we need to ensure the
2125 * cpu-context we got scheduled on is actually rotating.
2126 */
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002127 perf_pmu_rotate_start(ctx->pmu);
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02002128 perf_pmu_enable(ctx->pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002129}
2130
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002131/*
2132 * Called from scheduler to add the events of the current task
2133 * with interrupts disabled.
2134 *
2135 * We restore the event value and then enable it.
2136 *
2137 * This does not protect us against NMI, but enable()
2138 * sets the enabled bit in the control field of event _before_
 2139	 * accessing the event control register. If an NMI hits, then it will
2140 * keep the event running.
2141 */
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02002142void __perf_event_task_sched_in(struct task_struct *task)
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002143{
2144 struct perf_event_context *ctx;
2145 int ctxn;
2146
2147 for_each_task_context_nr(ctxn) {
2148 ctx = task->perf_event_ctxp[ctxn];
2149 if (likely(!ctx))
2150 continue;
2151
Stephane Eraniane5d13672011-02-14 11:20:01 +02002152 perf_event_context_sched_in(ctx, task);
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002153 }
Stephane Eraniane5d13672011-02-14 11:20:01 +02002154 /*
2155 * if cgroup events exist on this CPU, then we need
2156 * to check if we have to switch in PMU state.
 2157	 * cgroup events are system-wide mode only
2158 */
2159 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2160 perf_cgroup_sched_in(task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002161}
2162
Peter Zijlstraabd50712010-01-26 18:50:16 +01002163static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2164{
2165 u64 frequency = event->attr.sample_freq;
2166 u64 sec = NSEC_PER_SEC;
2167 u64 divisor, dividend;
2168
2169 int count_fls, nsec_fls, frequency_fls, sec_fls;
2170
2171 count_fls = fls64(count);
2172 nsec_fls = fls64(nsec);
2173 frequency_fls = fls64(frequency);
2174 sec_fls = 30;
2175
2176 /*
2177 * We got @count in @nsec, with a target of sample_freq HZ
2178 * the target period becomes:
2179 *
2180 * @count * 10^9
2181 * period = -------------------
2182 * @nsec * sample_freq
2183 *
2184 */
2185
2186 /*
2187 * Reduce accuracy by one bit such that @a and @b converge
2188 * to a similar magnitude.
2189 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002190#define REDUCE_FLS(a, b) \
Peter Zijlstraabd50712010-01-26 18:50:16 +01002191do { \
2192 if (a##_fls > b##_fls) { \
2193 a >>= 1; \
2194 a##_fls--; \
2195 } else { \
2196 b >>= 1; \
2197 b##_fls--; \
2198 } \
2199} while (0)
2200
2201 /*
2202 * Reduce accuracy until either term fits in a u64, then proceed with
2203 * the other, so that finally we can do a u64/u64 division.
2204 */
2205 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2206 REDUCE_FLS(nsec, frequency);
2207 REDUCE_FLS(sec, count);
2208 }
2209
2210 if (count_fls + sec_fls > 64) {
2211 divisor = nsec * frequency;
2212
2213 while (count_fls + sec_fls > 64) {
2214 REDUCE_FLS(count, sec);
2215 divisor >>= 1;
2216 }
2217
2218 dividend = count * sec;
2219 } else {
2220 dividend = count * sec;
2221
2222 while (nsec_fls + frequency_fls > 64) {
2223 REDUCE_FLS(nsec, frequency);
2224 dividend >>= 1;
2225 }
2226
2227 divisor = nsec * frequency;
2228 }
2229
Peter Zijlstraf6ab91ad2010-06-04 15:18:01 +02002230 if (!divisor)
2231 return dividend;
2232
Peter Zijlstraabd50712010-01-26 18:50:16 +01002233 return div64_u64(dividend, divisor);
2234}
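/*
 * Worked example (illustrative numbers): an event counted 1,000,000 times
 * in 10,000,000 ns (10 ms) with attr.sample_freq = 1000. The target period
 * from the formula above is
 *
 *	period = (1000000 * 10^9) / (10000000 * 1000) = 100000
 *
 * i.e. the event fires ~10^8 times per second, so sampling every 100000
 * events yields roughly the requested 1000 samples/sec. The REDUCE_FLS()
 * shifting only kicks in when the intermediate products would overflow
 * 64 bits; for numbers this small it changes nothing.
 */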
2235
2236static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002237{
2238 struct hw_perf_event *hwc = &event->hw;
Peter Zijlstraf6ab91ad2010-06-04 15:18:01 +02002239 s64 period, sample_period;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002240 s64 delta;
2241
Peter Zijlstraabd50712010-01-26 18:50:16 +01002242 period = perf_calculate_period(event, nsec, count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002243
2244 delta = (s64)(period - hwc->sample_period);
2245 delta = (delta + 7) / 8; /* low pass filter */
2246
2247 sample_period = hwc->sample_period + delta;
2248
2249 if (!sample_period)
2250 sample_period = 1;
2251
2252 hwc->sample_period = sample_period;
Peter Zijlstraabd50712010-01-26 18:50:16 +01002253
Peter Zijlstrae7850592010-05-21 14:43:08 +02002254 if (local64_read(&hwc->period_left) > 8*sample_period) {
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02002255 event->pmu->stop(event, PERF_EF_UPDATE);
Peter Zijlstrae7850592010-05-21 14:43:08 +02002256 local64_set(&hwc->period_left, 0);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02002257 event->pmu->start(event, PERF_EF_RELOAD);
Peter Zijlstraabd50712010-01-26 18:50:16 +01002258 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002259}
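/*
 * Worked example (illustrative numbers) for the low-pass filter above:
 * with hwc->sample_period = 10000 and a newly computed period of 18000,
 *
 *	delta = 18000 - 10000 = 8000
 *	delta = (8000 + 7) / 8 = 1000
 *	sample_period = 10000 + 1000 = 11000
 *
 * so the period moves only 1/8th of the way toward the new estimate each
 * adjustment, smoothing out bursty workloads. The pmu->stop()/start()
 * reprogramming only happens when the leftover count is more than eight
 * periods worth.
 */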
2260
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002261static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002262{
2263 struct perf_event *event;
2264 struct hw_perf_event *hwc;
Peter Zijlstraabd50712010-01-26 18:50:16 +01002265 u64 interrupts, now;
2266 s64 delta;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002267
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002268 raw_spin_lock(&ctx->lock);
Paul Mackerras03541f82009-10-14 16:58:03 +11002269 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002270 if (event->state != PERF_EVENT_STATE_ACTIVE)
2271 continue;
2272
Stephane Eranian5632ab12011-01-03 18:20:01 +02002273 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01002274 continue;
2275
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002276 hwc = &event->hw;
2277
2278 interrupts = hwc->interrupts;
2279 hwc->interrupts = 0;
2280
2281 /*
2282 * unthrottle events on the tick
2283 */
2284 if (interrupts == MAX_INTERRUPTS) {
2285 perf_log_throttle(event, 1);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02002286 event->pmu->start(event, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002287 }
2288
2289 if (!event->attr.freq || !event->attr.sample_freq)
2290 continue;
2291
Peter Zijlstraabd50712010-01-26 18:50:16 +01002292 event->pmu->read(event);
Peter Zijlstrae7850592010-05-21 14:43:08 +02002293 now = local64_read(&event->count);
Peter Zijlstraabd50712010-01-26 18:50:16 +01002294 delta = now - hwc->freq_count_stamp;
2295 hwc->freq_count_stamp = now;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002296
Peter Zijlstraabd50712010-01-26 18:50:16 +01002297 if (delta > 0)
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002298 perf_adjust_period(event, period, delta);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002299 }
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002300 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002301}
2302
2303/*
2304 * Round-robin a context's events:
2305 */
2306static void rotate_ctx(struct perf_event_context *ctx)
2307{
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002308 raw_spin_lock(&ctx->lock);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002309
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01002310 /*
2311 * Rotate the first entry last of non-pinned groups. Rotation might be
2312 * disabled by the inheritance code.
2313 */
2314 if (!ctx->rotate_disable)
2315 list_rotate_left(&ctx->flexible_groups);
Frederic Weisbeckere2864172010-01-09 21:05:28 +01002316
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002317 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002318}
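/*
 * Illustrative scenario (not from this file): with three flexible groups
 * A, B and C and only enough counters for two of them, successive
 * rotations give each group a turn, roughly:
 *
 *	tick 0: list = A B C	-> A, B get counters, C is over-committed
 *	tick 1: list = B C A	-> B, C get counters, A waits
 *	tick 2: list = C A B	-> C, A get counters, B waits
 *
 * The actual scheduling happens in ctx_flexible_sched_in() after the
 * rotation; rotate_ctx() only reorders the list.
 */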
2319
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002320/*
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002321 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2322 * because they're strictly cpu affine and rotate_start is called with IRQs
2323 * disabled, while rotate_context is called from IRQ context.
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002324 */
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002325static void perf_rotate_context(struct perf_cpu_context *cpuctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002326{
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002327 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002328 struct perf_event_context *ctx = NULL;
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002329 int rotate = 0, remove = 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002330
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002331 if (cpuctx->ctx.nr_events) {
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002332 remove = 0;
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002333 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2334 rotate = 1;
2335 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002336
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002337 ctx = cpuctx->task_ctx;
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002338 if (ctx && ctx->nr_events) {
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002339 remove = 0;
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002340 if (ctx->nr_events != ctx->nr_active)
2341 rotate = 1;
2342 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002343
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02002344 perf_pmu_disable(cpuctx->ctx.pmu);
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002345 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002346 if (ctx)
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002347 perf_ctx_adjust_freq(ctx, interval);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002348
Peter Zijlstrad4944a02010-03-08 13:51:20 +01002349 if (!rotate)
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002350 goto done;
Peter Zijlstrad4944a02010-03-08 13:51:20 +01002351
Frederic Weisbecker7defb0f2010-01-17 12:15:31 +01002352 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002353 if (ctx)
Frederic Weisbecker7defb0f2010-01-17 12:15:31 +01002354 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002355
2356 rotate_ctx(&cpuctx->ctx);
2357 if (ctx)
2358 rotate_ctx(ctx);
2359
Stephane Eraniane5d13672011-02-14 11:20:01 +02002360 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002361 if (ctx)
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002362 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002363
2364done:
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002365 if (remove)
2366 list_del_init(&cpuctx->rotation_list);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002367
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002368 perf_pmu_enable(cpuctx->ctx.pmu);
2369}
2370
2371void perf_event_task_tick(void)
2372{
2373 struct list_head *head = &__get_cpu_var(rotation_list);
2374 struct perf_cpu_context *cpuctx, *tmp;
2375
2376 WARN_ON(!irqs_disabled());
2377
2378 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2379 if (cpuctx->jiffies_interval == 1 ||
2380 !(jiffies % cpuctx->jiffies_interval))
2381 perf_rotate_context(cpuctx);
2382 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002383}
2384
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002385static int event_enable_on_exec(struct perf_event *event,
2386 struct perf_event_context *ctx)
2387{
2388 if (!event->attr.enable_on_exec)
2389 return 0;
2390
2391 event->attr.enable_on_exec = 0;
2392 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2393 return 0;
2394
2395 __perf_event_mark_enabled(event, ctx);
2396
2397 return 1;
2398}
2399
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002400/*
2401 * Enable all of a task's events that have been marked enable-on-exec.
2402 * This expects task == current.
2403 */
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002404static void perf_event_enable_on_exec(struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002405{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002406 struct perf_event *event;
2407 unsigned long flags;
2408 int enabled = 0;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002409 int ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002410
2411 local_irq_save(flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002412 if (!ctx || !ctx->nr_events)
2413 goto out;
2414
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002415 task_ctx_sched_out(ctx, EVENT_ALL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002416
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002417 raw_spin_lock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002418
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002419 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2420 ret = event_enable_on_exec(event, ctx);
2421 if (ret)
2422 enabled = 1;
2423 }
2424
2425 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2426 ret = event_enable_on_exec(event, ctx);
2427 if (ret)
2428 enabled = 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002429 }
2430
2431 /*
2432 * Unclone this context if we enabled any event.
2433 */
2434 if (enabled)
2435 unclone_ctx(ctx);
2436
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002437 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002438
Stephane Eraniane5d13672011-02-14 11:20:01 +02002439 perf_event_context_sched_in(ctx, ctx->task);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002440out:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002441 local_irq_restore(flags);
2442}
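/*
 * User-space sketch (assumed usage, as done by tools such as perf): an
 * exec'd workload can be measured from roughly its first instruction by
 * creating the event disabled and letting exec flip it on via the
 * enable_on_exec path above:
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_INSTRUCTIONS,
 *		.size		= sizeof(attr),
 *		.disabled	= 1,
 *		.enable_on_exec	= 1,
 *	};
 *	fd = perf_event_open(&attr, child_pid, -1, -1, 0);
 *	...			then the child calls execvp()
 *
 * (perf_event_open() stands for the raw syscall here; glibc provides no
 * wrapper.)
 */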
2443
2444/*
2445 * Cross CPU call to read the hardware event
2446 */
2447static void __perf_event_read(void *info)
2448{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002449 struct perf_event *event = info;
2450 struct perf_event_context *ctx = event->ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002451 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002452
2453 /*
2454 * If this is a task context, we need to check whether it is
 2455	 * the current task context of this cpu. If not, it has been
2456 * scheduled out before the smp call arrived. In that case
2457 * event->count would have been updated to a recent sample
2458 * when the event was scheduled out.
2459 */
2460 if (ctx->task && cpuctx->task_ctx != ctx)
2461 return;
2462
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002463 raw_spin_lock(&ctx->lock);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002464 if (ctx->is_active) {
Peter Zijlstra542e72f2011-01-26 15:38:35 +01002465 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002466 update_cgrp_time_from_event(event);
2467 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002468 update_event_times(event);
Peter Zijlstra542e72f2011-01-26 15:38:35 +01002469 if (event->state == PERF_EVENT_STATE_ACTIVE)
2470 event->pmu->read(event);
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002471 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002472}
2473
Peter Zijlstrab5e58792010-05-21 14:43:12 +02002474static inline u64 perf_event_count(struct perf_event *event)
2475{
Peter Zijlstrae7850592010-05-21 14:43:08 +02002476 return local64_read(&event->count) + atomic64_read(&event->child_count);
Peter Zijlstrab5e58792010-05-21 14:43:12 +02002477}
2478
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002479static u64 perf_event_read(struct perf_event *event)
2480{
2481 /*
2482 * If event is enabled and currently active on a CPU, update the
2483 * value in the event structure:
2484 */
2485 if (event->state == PERF_EVENT_STATE_ACTIVE) {
2486 smp_call_function_single(event->oncpu,
2487 __perf_event_read, event, 1);
2488 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
Peter Zijlstra2b8988c2009-11-20 22:19:54 +01002489 struct perf_event_context *ctx = event->ctx;
2490 unsigned long flags;
2491
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002492 raw_spin_lock_irqsave(&ctx->lock, flags);
Stephane Eranianc530ccd2010-10-15 15:26:01 +02002493 /*
 2494	 * We may read while the context is not active
 2495	 * (e.g., the thread is blocked); in that case
 2496	 * we cannot update the context time.
2497 */
Stephane Eraniane5d13672011-02-14 11:20:01 +02002498 if (ctx->is_active) {
Stephane Eranianc530ccd2010-10-15 15:26:01 +02002499 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002500 update_cgrp_time_from_event(event);
2501 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002502 update_event_times(event);
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002503 raw_spin_unlock_irqrestore(&ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002504 }
2505
Peter Zijlstrab5e58792010-05-21 14:43:12 +02002506 return perf_event_count(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002507}
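/*
 * A self-contained user-space sketch (assumptions: Linux with the
 * perf_event_open syscall available and the default read_format, so that
 * read() returns a single u64 count). It exercises the read path that ends
 * up in perf_event_read() above:
 *
 *	#include <linux/perf_event.h>
 *	#include <sys/syscall.h>
 *	#include <sys/ioctl.h>
 *	#include <unistd.h>
 *	#include <string.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	static long sys_perf_event_open(struct perf_event_attr *attr,
 *					pid_t pid, int cpu, int group_fd,
 *					unsigned long flags)
 *	{
 *		return syscall(__NR_perf_event_open, attr, pid, cpu,
 *			       group_fd, flags);
 *	}
 *
 *	int main(void)
 *	{
 *		struct perf_event_attr attr;
 *		uint64_t count;
 *		volatile long i;
 *		int fd;
 *
 *		memset(&attr, 0, sizeof(attr));
 *		attr.type = PERF_TYPE_HARDWARE;
 *		attr.size = sizeof(attr);
 *		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
 *		attr.disabled = 1;
 *		attr.exclude_kernel = 1;
 *
 *		fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
 *		if (fd < 0)
 *			return 1;
 *
 *		ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *		for (i = 0; i < 10000000; i++)
 *			;
 *		ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *
 *		read(fd, &count, sizeof(count));
 *		printf("instructions: %llu\n", (unsigned long long)count);
 *		close(fd);
 *		return 0;
 *	}
 */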
2508
2509/*
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002510 * Callchain support
2511 */
2512
2513struct callchain_cpus_entries {
2514 struct rcu_head rcu_head;
2515 struct perf_callchain_entry *cpu_entries[0];
2516};
2517
Frederic Weisbecker7ae07ea2010-08-14 20:45:13 +02002518static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002519static atomic_t nr_callchain_events;
2520static DEFINE_MUTEX(callchain_mutex);
2521struct callchain_cpus_entries *callchain_cpus_entries;
2522
2523
2524__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2525 struct pt_regs *regs)
2526{
2527}
2528
2529__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2530 struct pt_regs *regs)
2531{
2532}
2533
2534static void release_callchain_buffers_rcu(struct rcu_head *head)
2535{
2536 struct callchain_cpus_entries *entries;
2537 int cpu;
2538
2539 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2540
2541 for_each_possible_cpu(cpu)
2542 kfree(entries->cpu_entries[cpu]);
2543
2544 kfree(entries);
2545}
2546
2547static void release_callchain_buffers(void)
2548{
2549 struct callchain_cpus_entries *entries;
2550
2551 entries = callchain_cpus_entries;
2552 rcu_assign_pointer(callchain_cpus_entries, NULL);
2553 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2554}
2555
2556static int alloc_callchain_buffers(void)
2557{
2558 int cpu;
2559 int size;
2560 struct callchain_cpus_entries *entries;
2561
2562 /*
2563 * We can't use the percpu allocation API for data that can be
2564 * accessed from NMI. Use a temporary manual per cpu allocation
2565 * until that gets sorted out.
2566 */
Eric Dumazet88d4f0d2011-01-25 19:40:51 +01002567 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002568
2569 entries = kzalloc(size, GFP_KERNEL);
2570 if (!entries)
2571 return -ENOMEM;
2572
Frederic Weisbecker7ae07ea2010-08-14 20:45:13 +02002573 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002574
2575 for_each_possible_cpu(cpu) {
2576 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2577 cpu_to_node(cpu));
2578 if (!entries->cpu_entries[cpu])
2579 goto fail;
2580 }
2581
2582 rcu_assign_pointer(callchain_cpus_entries, entries);
2583
2584 return 0;
2585
2586fail:
2587 for_each_possible_cpu(cpu)
2588 kfree(entries->cpu_entries[cpu]);
2589 kfree(entries);
2590
2591 return -ENOMEM;
2592}
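/*
 * The size computation above, spelled out (illustrative numbers, ignoring
 * padding): with nr_cpu_ids == 8,
 *
 *	size = offsetof(struct callchain_cpus_entries, cpu_entries[8])
 *	     = offsetof(struct callchain_cpus_entries, cpu_entries)
 *	       + 8 * sizeof(struct perf_callchain_entry *)
 *
 * i.e. the header plus exactly eight slots of the trailing flexible array,
 * each slot later pointing at a per-cpu buffer of PERF_NR_CONTEXTS
 * callchain entries.
 */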
2593
2594static int get_callchain_buffers(void)
2595{
2596 int err = 0;
2597 int count;
2598
2599 mutex_lock(&callchain_mutex);
2600
2601 count = atomic_inc_return(&nr_callchain_events);
2602 if (WARN_ON_ONCE(count < 1)) {
2603 err = -EINVAL;
2604 goto exit;
2605 }
2606
2607 if (count > 1) {
2608 /* If the allocation failed, give up */
2609 if (!callchain_cpus_entries)
2610 err = -ENOMEM;
2611 goto exit;
2612 }
2613
2614 err = alloc_callchain_buffers();
2615 if (err)
2616 release_callchain_buffers();
2617exit:
2618 mutex_unlock(&callchain_mutex);
2619
2620 return err;
2621}
2622
2623static void put_callchain_buffers(void)
2624{
2625 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2626 release_callchain_buffers();
2627 mutex_unlock(&callchain_mutex);
2628 }
2629}
2630
2631static int get_recursion_context(int *recursion)
2632{
2633 int rctx;
2634
2635 if (in_nmi())
2636 rctx = 3;
2637 else if (in_irq())
2638 rctx = 2;
2639 else if (in_softirq())
2640 rctx = 1;
2641 else
2642 rctx = 0;
2643
2644 if (recursion[rctx])
2645 return -1;
2646
2647 recursion[rctx]++;
2648 barrier();
2649
2650 return rctx;
2651}
2652
2653static inline void put_recursion_context(int *recursion, int rctx)
2654{
2655 barrier();
2656 recursion[rctx]--;
2657}
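/*
 * Typical usage pattern for the two helpers above (a sketch; the real
 * callers are get_callchain_entry()/put_callchain_entry() below):
 *
 *	int rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
 *	if (rctx < 0)
 *		return;		same level already busy: bail out rather
 *				than recurse
 *	... use the per-context buffer for this level ...
 *	put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
 *
 * The four levels (task, softirq, hardirq, NMI) each get their own slot,
 * so an NMI arriving in the middle of an interrupt-level callchain does
 * not clobber it.
 */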
2658
2659static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2660{
2661 int cpu;
2662 struct callchain_cpus_entries *entries;
2663
2664 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2665 if (*rctx == -1)
2666 return NULL;
2667
2668 entries = rcu_dereference(callchain_cpus_entries);
2669 if (!entries)
2670 return NULL;
2671
2672 cpu = smp_processor_id();
2673
2674 return &entries->cpu_entries[cpu][*rctx];
2675}
2676
2677static void
2678put_callchain_entry(int rctx)
2679{
2680 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2681}
2682
2683static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2684{
2685 int rctx;
2686 struct perf_callchain_entry *entry;
2687
2688
2689 entry = get_callchain_entry(&rctx);
2690 if (rctx == -1)
2691 return NULL;
2692
2693 if (!entry)
2694 goto exit_put;
2695
2696 entry->nr = 0;
2697
2698 if (!user_mode(regs)) {
2699 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2700 perf_callchain_kernel(entry, regs);
2701 if (current->mm)
2702 regs = task_pt_regs(current);
2703 else
2704 regs = NULL;
2705 }
2706
2707 if (regs) {
2708 perf_callchain_store(entry, PERF_CONTEXT_USER);
2709 perf_callchain_user(entry, regs);
2710 }
2711
2712exit_put:
2713 put_callchain_entry(rctx);
2714
2715 return entry;
2716}
2717
2718/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002719 * Initialize the perf_event context in a task_struct:
2720 */
Peter Zijlstraeb184472010-09-07 15:55:13 +02002721static void __perf_event_init_context(struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002722{
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002723 raw_spin_lock_init(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002724 mutex_init(&ctx->mutex);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002725 INIT_LIST_HEAD(&ctx->pinned_groups);
2726 INIT_LIST_HEAD(&ctx->flexible_groups);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002727 INIT_LIST_HEAD(&ctx->event_list);
2728 atomic_set(&ctx->refcount, 1);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002729}
2730
Peter Zijlstraeb184472010-09-07 15:55:13 +02002731static struct perf_event_context *
2732alloc_perf_context(struct pmu *pmu, struct task_struct *task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002733{
2734 struct perf_event_context *ctx;
Peter Zijlstraeb184472010-09-07 15:55:13 +02002735
2736 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
2737 if (!ctx)
2738 return NULL;
2739
2740 __perf_event_init_context(ctx);
2741 if (task) {
2742 ctx->task = task;
2743 get_task_struct(task);
2744 }
2745 ctx->pmu = pmu;
2746
2747 return ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002748}
2749
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002750static struct task_struct *
2751find_lively_task_by_vpid(pid_t vpid)
2752{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002753 struct task_struct *task;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002754 int err;
2755
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002756 rcu_read_lock();
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002757 if (!vpid)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002758 task = current;
2759 else
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002760 task = find_task_by_vpid(vpid);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002761 if (task)
2762 get_task_struct(task);
2763 rcu_read_unlock();
2764
2765 if (!task)
2766 return ERR_PTR(-ESRCH);
2767
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002768 /* Reuse ptrace permission checks for now. */
2769 err = -EACCES;
2770 if (!ptrace_may_access(task, PTRACE_MODE_READ))
2771 goto errout;
2772
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002773 return task;
2774errout:
2775 put_task_struct(task);
2776 return ERR_PTR(err);
2777
2778}
2779
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002780/*
2781 * Returns a matching context with refcount and pincount.
2782 */
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002783static struct perf_event_context *
Matt Helsley38a81da2010-09-13 13:01:20 -07002784find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002785{
2786 struct perf_event_context *ctx;
2787 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002788 unsigned long flags;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002789 int ctxn, err;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002790
Oleg Nesterov22a4ec72011-01-18 17:10:08 +01002791 if (!task) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002792 /* Must be root to operate on a CPU event: */
2793 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2794 return ERR_PTR(-EACCES);
2795
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002796 /*
 2797	 * We could be clever and allow attaching an event to an
2798 * offline CPU and activate it when the CPU comes up, but
2799 * that's for later.
2800 */
2801 if (!cpu_online(cpu))
2802 return ERR_PTR(-ENODEV);
2803
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002804 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002805 ctx = &cpuctx->ctx;
2806 get_ctx(ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002807 ++ctx->pin_count;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002808
2809 return ctx;
2810 }
2811
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002812 err = -EINVAL;
2813 ctxn = pmu->task_ctx_nr;
2814 if (ctxn < 0)
2815 goto errout;
2816
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002817retry:
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02002818 ctx = perf_lock_task_context(task, ctxn, &flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002819 if (ctx) {
2820 unclone_ctx(ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002821 ++ctx->pin_count;
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002822 raw_spin_unlock_irqrestore(&ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002823 }
2824
2825 if (!ctx) {
Peter Zijlstraeb184472010-09-07 15:55:13 +02002826 ctx = alloc_perf_context(pmu, task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002827 err = -ENOMEM;
2828 if (!ctx)
2829 goto errout;
Peter Zijlstraeb184472010-09-07 15:55:13 +02002830
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002831 get_ctx(ctx);
Peter Zijlstraeb184472010-09-07 15:55:13 +02002832
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002833 err = 0;
2834 mutex_lock(&task->perf_event_mutex);
2835 /*
 2836	 * If it has already passed perf_event_exit_task(),
 2837	 * we must see PF_EXITING; it takes this mutex too.
2838 */
2839 if (task->flags & PF_EXITING)
2840 err = -ESRCH;
2841 else if (task->perf_event_ctxp[ctxn])
2842 err = -EAGAIN;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002843 else {
2844 ++ctx->pin_count;
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002845 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002846 }
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002847 mutex_unlock(&task->perf_event_mutex);
2848
2849 if (unlikely(err)) {
Peter Zijlstraeb184472010-09-07 15:55:13 +02002850 put_task_struct(task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002851 kfree(ctx);
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002852
2853 if (err == -EAGAIN)
2854 goto retry;
2855 goto errout;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002856 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002857 }
2858
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002859 return ctx;
2860
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002861errout:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002862 return ERR_PTR(err);
2863}
2864
Li Zefan6fb29152009-10-15 11:21:42 +08002865static void perf_event_free_filter(struct perf_event *event);
2866
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002867static void free_event_rcu(struct rcu_head *head)
2868{
2869 struct perf_event *event;
2870
2871 event = container_of(head, struct perf_event, rcu_head);
2872 if (event->ns)
2873 put_pid_ns(event->ns);
Li Zefan6fb29152009-10-15 11:21:42 +08002874 perf_event_free_filter(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002875 kfree(event);
2876}
2877
Peter Zijlstraca5135e2010-05-28 19:33:23 +02002878static void perf_buffer_put(struct perf_buffer *buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002879
2880static void free_event(struct perf_event *event)
2881{
Peter Zijlstrae360adb2010-10-14 14:01:34 +08002882 irq_work_sync(&event->pending);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002883
2884 if (!event->parent) {
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02002885 if (event->attach_state & PERF_ATTACH_TASK)
Stephane Eraniane5d13672011-02-14 11:20:01 +02002886 jump_label_dec(&perf_sched_events);
Eric B Munson3af9e852010-05-18 15:30:49 +01002887 if (event->attr.mmap || event->attr.mmap_data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002888 atomic_dec(&nr_mmap_events);
2889 if (event->attr.comm)
2890 atomic_dec(&nr_comm_events);
2891 if (event->attr.task)
2892 atomic_dec(&nr_task_events);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002893 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2894 put_callchain_buffers();
Peter Zijlstra08309372011-03-03 11:31:20 +01002895 if (is_cgroup_event(event)) {
2896 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2897 jump_label_dec(&perf_sched_events);
2898 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002899 }
2900
Peter Zijlstraca5135e2010-05-28 19:33:23 +02002901 if (event->buffer) {
2902 perf_buffer_put(event->buffer);
2903 event->buffer = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002904 }
2905
Stephane Eraniane5d13672011-02-14 11:20:01 +02002906 if (is_cgroup_event(event))
2907 perf_detach_cgroup(event);
2908
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002909 if (event->destroy)
2910 event->destroy(event);
2911
Peter Zijlstra0c67b402010-09-13 11:15:58 +02002912 if (event->ctx)
2913 put_ctx(event->ctx);
2914
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002915 call_rcu(&event->rcu_head, free_event_rcu);
2916}
2917
Arjan van de Venfb0459d2009-09-25 12:25:56 +02002918int perf_event_release_kernel(struct perf_event *event)
2919{
2920 struct perf_event_context *ctx = event->ctx;
2921
Peter Zijlstra050735b2010-05-11 11:51:53 +02002922 /*
2923 * Remove from the PMU, can't get re-enabled since we got
2924 * here because the last ref went.
2925 */
2926 perf_event_disable(event);
2927
Arjan van de Venfb0459d2009-09-25 12:25:56 +02002928 WARN_ON_ONCE(ctx->parent_ctx);
Peter Zijlstraa0507c82010-05-06 15:42:53 +02002929 /*
2930 * There are two ways this annotation is useful:
2931 *
2932 * 1) there is a lock recursion from perf_event_exit_task
2933 * see the comment there.
2934 *
2935 * 2) there is a lock-inversion with mmap_sem through
2936 * perf_event_read_group(), which takes faults while
2937	 * holding ctx->mutex; however, this is called after
2938 * the last filedesc died, so there is no possibility
2939 * to trigger the AB-BA case.
2940 */
2941 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
Peter Zijlstra050735b2010-05-11 11:51:53 +02002942 raw_spin_lock_irq(&ctx->lock);
Peter Zijlstra8a495422010-05-27 15:47:49 +02002943 perf_group_detach(event);
Peter Zijlstra050735b2010-05-11 11:51:53 +02002944 list_del_event(event, ctx);
Peter Zijlstra050735b2010-05-11 11:51:53 +02002945 raw_spin_unlock_irq(&ctx->lock);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02002946 mutex_unlock(&ctx->mutex);
2947
Arjan van de Venfb0459d2009-09-25 12:25:56 +02002948 free_event(event);
2949
2950 return 0;
2951}
2952EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2953
Peter Zijlstraa66a3052009-11-23 11:37:23 +01002954/*
2955 * Called when the last reference to the file is gone.
2956 */
2957static int perf_release(struct inode *inode, struct file *file)
2958{
2959 struct perf_event *event = file->private_data;
Peter Zijlstra88821352010-11-09 19:01:43 +01002960 struct task_struct *owner;
Peter Zijlstraa66a3052009-11-23 11:37:23 +01002961
2962 file->private_data = NULL;
2963
Peter Zijlstra88821352010-11-09 19:01:43 +01002964 rcu_read_lock();
2965 owner = ACCESS_ONCE(event->owner);
2966 /*
2967 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2968 * !owner it means the list deletion is complete and we can indeed
2969 * free this event, otherwise we need to serialize on
2970 * owner->perf_event_mutex.
2971 */
2972 smp_read_barrier_depends();
2973 if (owner) {
2974 /*
2975 * Since delayed_put_task_struct() also drops the last
2976 * task reference we can safely take a new reference
2977 * while holding the rcu_read_lock().
2978 */
2979 get_task_struct(owner);
2980 }
2981 rcu_read_unlock();
2982
2983 if (owner) {
2984 mutex_lock(&owner->perf_event_mutex);
2985 /*
2986	 * We have to re-check the event->owner field; if it is cleared
2987	 * we raced with perf_event_exit_task(). Acquiring the mutex
2988	 * ensures they are done, and we can proceed with freeing the
2989 * event.
2990 */
2991 if (event->owner)
2992 list_del_init(&event->owner_entry);
2993 mutex_unlock(&owner->perf_event_mutex);
2994 put_task_struct(owner);
2995 }
2996
Peter Zijlstraa66a3052009-11-23 11:37:23 +01002997 return perf_event_release_kernel(event);
2998}
2999
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003000u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003001{
3002 struct perf_event *child;
3003 u64 total = 0;
3004
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003005 *enabled = 0;
3006 *running = 0;
3007
Peter Zijlstra6f105812009-11-20 22:19:56 +01003008 mutex_lock(&event->child_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003009 total += perf_event_read(event);
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003010 *enabled += event->total_time_enabled +
3011 atomic64_read(&event->child_total_time_enabled);
3012 *running += event->total_time_running +
3013 atomic64_read(&event->child_total_time_running);
3014
3015 list_for_each_entry(child, &event->child_list, child_list) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003016 total += perf_event_read(child);
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003017 *enabled += child->total_time_enabled;
3018 *running += child->total_time_running;
3019 }
Peter Zijlstra6f105812009-11-20 22:19:56 +01003020 mutex_unlock(&event->child_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003021
3022 return total;
3023}
Arjan van de Venfb0459d2009-09-25 12:25:56 +02003024EXPORT_SYMBOL_GPL(perf_event_read_value);
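/*
 * Illustrative sketch, not part of the original file: how an in-kernel user
 * of the two exports above (perf_event_read_value() and
 * perf_event_release_kernel()) might sample a counter and then drop its last
 * reference.  The event is assumed to have been set up elsewhere, e.g. via
 * perf_event_create_kernel_counter().
 */
static void example_read_and_release(struct perf_event *event)
{
	u64 count, enabled, running;

	count = perf_event_read_value(event, &enabled, &running);
	pr_info("count=%llu enabled=%llu running=%llu\n",
		(unsigned long long)count,
		(unsigned long long)enabled,
		(unsigned long long)running);

	/* Drops the final reference; must not race with other users. */
	perf_event_release_kernel(event);
}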
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003025
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003026static int perf_event_read_group(struct perf_event *event,
3027 u64 read_format, char __user *buf)
3028{
3029 struct perf_event *leader = event->group_leader, *sub;
Peter Zijlstra6f105812009-11-20 22:19:56 +01003030 int n = 0, size = 0, ret = -EFAULT;
3031 struct perf_event_context *ctx = leader->ctx;
Peter Zijlstraabf48682009-11-20 22:19:49 +01003032 u64 values[5];
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003033 u64 count, enabled, running;
Peter Zijlstraabf48682009-11-20 22:19:49 +01003034
Peter Zijlstra6f105812009-11-20 22:19:56 +01003035 mutex_lock(&ctx->mutex);
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003036 count = perf_event_read_value(leader, &enabled, &running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003037
3038 values[n++] = 1 + leader->nr_siblings;
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003039 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3040 values[n++] = enabled;
3041 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3042 values[n++] = running;
Peter Zijlstraabf48682009-11-20 22:19:49 +01003043 values[n++] = count;
3044 if (read_format & PERF_FORMAT_ID)
3045 values[n++] = primary_event_id(leader);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003046
3047 size = n * sizeof(u64);
3048
3049 if (copy_to_user(buf, values, size))
Peter Zijlstra6f105812009-11-20 22:19:56 +01003050 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003051
Peter Zijlstra6f105812009-11-20 22:19:56 +01003052 ret = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003053
3054 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
Peter Zijlstraabf48682009-11-20 22:19:49 +01003055 n = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003056
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003057 values[n++] = perf_event_read_value(sub, &enabled, &running);
Peter Zijlstraabf48682009-11-20 22:19:49 +01003058 if (read_format & PERF_FORMAT_ID)
3059 values[n++] = primary_event_id(sub);
3060
3061 size = n * sizeof(u64);
3062
Stephane Eranian184d3da2009-11-23 21:40:49 -08003063 if (copy_to_user(buf + ret, values, size)) {
Peter Zijlstra6f105812009-11-20 22:19:56 +01003064 ret = -EFAULT;
3065 goto unlock;
3066 }
Peter Zijlstraabf48682009-11-20 22:19:49 +01003067
3068 ret += size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003069 }
Peter Zijlstra6f105812009-11-20 22:19:56 +01003070unlock:
3071 mutex_unlock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003072
Peter Zijlstraabf48682009-11-20 22:19:49 +01003073 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003074}
3075
3076static int perf_event_read_one(struct perf_event *event,
3077 u64 read_format, char __user *buf)
3078{
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003079 u64 enabled, running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003080 u64 values[4];
3081 int n = 0;
3082
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003083 values[n++] = perf_event_read_value(event, &enabled, &running);
3084 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3085 values[n++] = enabled;
3086 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3087 values[n++] = running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003088 if (read_format & PERF_FORMAT_ID)
3089 values[n++] = primary_event_id(event);
3090
3091 if (copy_to_user(buf, values, n * sizeof(u64)))
3092 return -EFAULT;
3093
3094 return n * sizeof(u64);
3095}
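/*
 * Illustrative user-space sketch, not part of the original file: decoding the
 * buffer returned by read() on a perf event fd opened without
 * PERF_FORMAT_GROUP, i.e. the layout written by perf_event_read_one() above.
 * Assumes <stdio.h>, <unistd.h> and <linux/perf_event.h>; "fd" and
 * "read_format" come from a prior perf_event_open() call.
 */
static int example_parse_read_one(int fd, __u64 read_format)
{
	__u64 values[4];
	int i = 0;

	if (read(fd, values, sizeof(values)) < (ssize_t)sizeof(__u64))
		return -1;

	printf("value    %llu\n", (unsigned long long)values[i++]);
	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		printf("enabled  %llu\n", (unsigned long long)values[i++]);
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		printf("running  %llu\n", (unsigned long long)values[i++]);
	if (read_format & PERF_FORMAT_ID)
		printf("id       %llu\n", (unsigned long long)values[i++]);

	return 0;
}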
3096
3097/*
3098 * Read the performance event - simple non-blocking version for now
3099 */
3100static ssize_t
3101perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3102{
3103 u64 read_format = event->attr.read_format;
3104 int ret;
3105
3106 /*
3107	 * Return end-of-file for a read on an event that is in
3108 * error state (i.e. because it was pinned but it couldn't be
3109 * scheduled on to the CPU at some point).
3110 */
3111 if (event->state == PERF_EVENT_STATE_ERROR)
3112 return 0;
3113
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02003114 if (count < event->read_size)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003115 return -ENOSPC;
3116
3117 WARN_ON_ONCE(event->ctx->parent_ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003118 if (read_format & PERF_FORMAT_GROUP)
3119 ret = perf_event_read_group(event, read_format, buf);
3120 else
3121 ret = perf_event_read_one(event, read_format, buf);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003122
3123 return ret;
3124}
3125
3126static ssize_t
3127perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3128{
3129 struct perf_event *event = file->private_data;
3130
3131 return perf_read_hw(event, buf, count);
3132}
3133
3134static unsigned int perf_poll(struct file *file, poll_table *wait)
3135{
3136 struct perf_event *event = file->private_data;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003137 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003138 unsigned int events = POLL_HUP;
3139
3140 rcu_read_lock();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003141 buffer = rcu_dereference(event->buffer);
3142 if (buffer)
3143 events = atomic_xchg(&buffer->poll, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003144 rcu_read_unlock();
3145
3146 poll_wait(file, &event->waitq, wait);
3147
3148 return events;
3149}
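/*
 * Illustrative user-space sketch, not part of the original file: waiting for
 * buffer data with poll(), which lands in perf_poll() above.  Assumes
 * <poll.h>; consume_ring_buffer() is a hypothetical helper.
 */
static void example_wait_for_data(int perf_fd)
{
	struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };

	/* Returns once perf_output_wakeup() has flagged the buffer. */
	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
		consume_ring_buffer(perf_fd);	/* hypothetical helper */
}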
3150
3151static void perf_event_reset(struct perf_event *event)
3152{
3153 (void)perf_event_read(event);
Peter Zijlstrae7850592010-05-21 14:43:08 +02003154 local64_set(&event->count, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003155 perf_event_update_userpage(event);
3156}
3157
3158/*
3159 * Holding the top-level event's child_mutex means that any
3160 * descendant process that has inherited this event will block
3161 * in sync_child_event if it goes to exit, thus satisfying the
3162 * task existence requirements of perf_event_enable/disable.
3163 */
3164static void perf_event_for_each_child(struct perf_event *event,
3165 void (*func)(struct perf_event *))
3166{
3167 struct perf_event *child;
3168
3169 WARN_ON_ONCE(event->ctx->parent_ctx);
3170 mutex_lock(&event->child_mutex);
3171 func(event);
3172 list_for_each_entry(child, &event->child_list, child_list)
3173 func(child);
3174 mutex_unlock(&event->child_mutex);
3175}
3176
3177static void perf_event_for_each(struct perf_event *event,
3178 void (*func)(struct perf_event *))
3179{
3180 struct perf_event_context *ctx = event->ctx;
3181 struct perf_event *sibling;
3182
3183 WARN_ON_ONCE(ctx->parent_ctx);
3184 mutex_lock(&ctx->mutex);
3185 event = event->group_leader;
3186
3187 perf_event_for_each_child(event, func);
3188 func(event);
3189 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3190 perf_event_for_each_child(event, func);
3191 mutex_unlock(&ctx->mutex);
3192}
3193
3194static int perf_event_period(struct perf_event *event, u64 __user *arg)
3195{
3196 struct perf_event_context *ctx = event->ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003197 int ret = 0;
3198 u64 value;
3199
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01003200 if (!is_sampling_event(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003201 return -EINVAL;
3202
John Blackwoodad0cf342010-09-28 18:03:11 -04003203 if (copy_from_user(&value, arg, sizeof(value)))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003204 return -EFAULT;
3205
3206 if (!value)
3207 return -EINVAL;
3208
Thomas Gleixnere625cce12009-11-17 18:02:06 +01003209 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003210 if (event->attr.freq) {
3211 if (value > sysctl_perf_event_sample_rate) {
3212 ret = -EINVAL;
3213 goto unlock;
3214 }
3215
3216 event->attr.sample_freq = value;
3217 } else {
3218 event->attr.sample_period = value;
3219 event->hw.sample_period = value;
3220 }
3221unlock:
Thomas Gleixnere625cce12009-11-17 18:02:06 +01003222 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003223
3224 return ret;
3225}
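/*
 * Illustrative user-space sketch, not part of the original file: changing the
 * sample period at runtime.  The ioctl argument is a pointer to a u64, which
 * is what the copy_from_user() in perf_event_period() above expects.  Assumes
 * <sys/ioctl.h> and <linux/perf_event.h>.
 */
static int example_set_period(int perf_fd, __u64 period)
{
	return ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &period);
}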
3226
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003227static const struct file_operations perf_fops;
3228
3229static struct perf_event *perf_fget_light(int fd, int *fput_needed)
3230{
3231 struct file *file;
3232
3233 file = fget_light(fd, fput_needed);
3234 if (!file)
3235 return ERR_PTR(-EBADF);
3236
3237 if (file->f_op != &perf_fops) {
3238 fput_light(file, *fput_needed);
3239 *fput_needed = 0;
3240 return ERR_PTR(-EBADF);
3241 }
3242
3243 return file->private_data;
3244}
3245
3246static int perf_event_set_output(struct perf_event *event,
3247 struct perf_event *output_event);
Li Zefan6fb29152009-10-15 11:21:42 +08003248static int perf_event_set_filter(struct perf_event *event, void __user *arg);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003249
3250static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3251{
3252 struct perf_event *event = file->private_data;
3253 void (*func)(struct perf_event *);
3254 u32 flags = arg;
3255
3256 switch (cmd) {
3257 case PERF_EVENT_IOC_ENABLE:
3258 func = perf_event_enable;
3259 break;
3260 case PERF_EVENT_IOC_DISABLE:
3261 func = perf_event_disable;
3262 break;
3263 case PERF_EVENT_IOC_RESET:
3264 func = perf_event_reset;
3265 break;
3266
3267 case PERF_EVENT_IOC_REFRESH:
3268 return perf_event_refresh(event, arg);
3269
3270 case PERF_EVENT_IOC_PERIOD:
3271 return perf_event_period(event, (u64 __user *)arg);
3272
3273 case PERF_EVENT_IOC_SET_OUTPUT:
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003274 {
3275 struct perf_event *output_event = NULL;
3276 int fput_needed = 0;
3277 int ret;
3278
3279 if (arg != -1) {
3280 output_event = perf_fget_light(arg, &fput_needed);
3281 if (IS_ERR(output_event))
3282 return PTR_ERR(output_event);
3283 }
3284
3285 ret = perf_event_set_output(event, output_event);
3286 if (output_event)
3287 fput_light(output_event->filp, fput_needed);
3288
3289 return ret;
3290 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003291
Li Zefan6fb29152009-10-15 11:21:42 +08003292 case PERF_EVENT_IOC_SET_FILTER:
3293 return perf_event_set_filter(event, (void __user *)arg);
3294
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003295 default:
3296 return -ENOTTY;
3297 }
3298
3299 if (flags & PERF_IOC_FLAG_GROUP)
3300 perf_event_for_each(event, func);
3301 else
3302 perf_event_for_each_child(event, func);
3303
3304 return 0;
3305}
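/*
 * Illustrative user-space sketch, not part of the original file: two common
 * uses of the ioctl()s dispatched above.  fd_leader and fd_other are assumed
 * to be perf event fds returned by perf_event_open().
 */
static void example_ioctls(int fd_leader, int fd_other)
{
	/* Enable the group leader together with all of its siblings. */
	ioctl(fd_leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);

	/* Redirect fd_other's output into fd_leader's ring buffer. */
	ioctl(fd_other, PERF_EVENT_IOC_SET_OUTPUT, fd_leader);
}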
3306
3307int perf_event_task_enable(void)
3308{
3309 struct perf_event *event;
3310
3311 mutex_lock(&current->perf_event_mutex);
3312 list_for_each_entry(event, &current->perf_event_list, owner_entry)
3313 perf_event_for_each_child(event, perf_event_enable);
3314 mutex_unlock(&current->perf_event_mutex);
3315
3316 return 0;
3317}
3318
3319int perf_event_task_disable(void)
3320{
3321 struct perf_event *event;
3322
3323 mutex_lock(&current->perf_event_mutex);
3324 list_for_each_entry(event, &current->perf_event_list, owner_entry)
3325 perf_event_for_each_child(event, perf_event_disable);
3326 mutex_unlock(&current->perf_event_mutex);
3327
3328 return 0;
3329}
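/*
 * Illustrative user-space sketch, not part of the original file: the two
 * functions above are reached through prctl(), which toggles every counter
 * owned by the calling task.  Assumes <sys/prctl.h>.
 */
static void example_toggle_own_counters(void)
{
	prctl(PR_TASK_PERF_EVENTS_DISABLE);	/* -> perf_event_task_disable() */
	/* ... work that should not be measured ... */
	prctl(PR_TASK_PERF_EVENTS_ENABLE);	/* -> perf_event_task_enable() */
}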
3330
3331#ifndef PERF_EVENT_INDEX_OFFSET
3332# define PERF_EVENT_INDEX_OFFSET 0
3333#endif
3334
3335static int perf_event_index(struct perf_event *event)
3336{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02003337 if (event->hw.state & PERF_HES_STOPPED)
3338 return 0;
3339
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003340 if (event->state != PERF_EVENT_STATE_ACTIVE)
3341 return 0;
3342
3343 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3344}
3345
3346/*
3347 * Callers need to ensure there can be no nesting of this function, otherwise
3348 * the seqlock logic goes bad. We cannot serialize this because the arch
3349 * code calls this from NMI context.
3350 */
3351void perf_event_update_userpage(struct perf_event *event)
3352{
3353 struct perf_event_mmap_page *userpg;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003354 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003355
3356 rcu_read_lock();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003357 buffer = rcu_dereference(event->buffer);
3358 if (!buffer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003359 goto unlock;
3360
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003361 userpg = buffer->user_page;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003362
3363 /*
3364 * Disable preemption so as to not let the corresponding user-space
3365 * spin too long if we get preempted.
3366 */
3367 preempt_disable();
3368 ++userpg->lock;
3369 barrier();
3370 userpg->index = perf_event_index(event);
Peter Zijlstrab5e58792010-05-21 14:43:12 +02003371 userpg->offset = perf_event_count(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003372 if (event->state == PERF_EVENT_STATE_ACTIVE)
Peter Zijlstrae7850592010-05-21 14:43:08 +02003373 userpg->offset -= local64_read(&event->hw.prev_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003374
3375 userpg->time_enabled = event->total_time_enabled +
3376 atomic64_read(&event->child_total_time_enabled);
3377
3378 userpg->time_running = event->total_time_running +
3379 atomic64_read(&event->child_total_time_running);
3380
3381 barrier();
3382 ++userpg->lock;
3383 preempt_enable();
3384unlock:
3385 rcu_read_unlock();
3386}
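/*
 * Illustrative user-space sketch, not part of the original file: the seqlock
 * style protocol a reader of the mmap()ed control page must follow, mirroring
 * the ++lock / barrier() pairs in perf_event_update_userpage() above.  "pg"
 * points at offset 0 of the mapping; barrier() stands for a compiler barrier
 * the application has to provide.  A self-monitoring task would additionally
 * read the hardware counter selected by pg->index and add it to the offset.
 */
static __u64 example_read_user_page(volatile struct perf_event_mmap_page *pg,
				    __u64 *enabled, __u64 *running)
{
	__u32 seq;
	__u64 offset;

	do {
		seq = pg->lock;
		barrier();
		offset	 = pg->offset;
		*enabled = pg->time_enabled;
		*running = pg->time_running;
		barrier();
	} while (pg->lock != seq);

	return offset;
}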
3387
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003388static unsigned long perf_data_size(struct perf_buffer *buffer);
3389
3390static void
3391perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
3392{
3393 long max_size = perf_data_size(buffer);
3394
3395 if (watermark)
3396 buffer->watermark = min(max_size, watermark);
3397
3398 if (!buffer->watermark)
3399 buffer->watermark = max_size / 2;
3400
3401 if (flags & PERF_BUFFER_WRITABLE)
3402 buffer->writable = 1;
3403
3404 atomic_set(&buffer->refcount, 1);
3405}
3406
Peter Zijlstra906010b2009-09-21 16:08:49 +02003407#ifndef CONFIG_PERF_USE_VMALLOC
3408
3409/*
3410 * Back perf_mmap() with regular order-0 GFP_KERNEL pages.
3411 */
3412
3413static struct page *
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003414perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003415{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003416 if (pgoff > buffer->nr_pages)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003417 return NULL;
3418
3419 if (pgoff == 0)
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003420 return virt_to_page(buffer->user_page);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003421
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003422 return virt_to_page(buffer->data_pages[pgoff - 1]);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003423}
3424
Peter Zijlstraa19d35c2010-05-17 18:48:00 +02003425static void *perf_mmap_alloc_page(int cpu)
3426{
3427 struct page *page;
3428 int node;
3429
3430 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
3431 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
3432 if (!page)
3433 return NULL;
3434
3435 return page_address(page);
3436}
3437
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003438static struct perf_buffer *
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003439perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003440{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003441 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003442 unsigned long size;
3443 int i;
3444
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003445 size = sizeof(struct perf_buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003446 size += nr_pages * sizeof(void *);
3447
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003448 buffer = kzalloc(size, GFP_KERNEL);
3449 if (!buffer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003450 goto fail;
3451
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003452 buffer->user_page = perf_mmap_alloc_page(cpu);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003453 if (!buffer->user_page)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003454 goto fail_user_page;
3455
3456 for (i = 0; i < nr_pages; i++) {
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003457 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003458 if (!buffer->data_pages[i])
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003459 goto fail_data_pages;
3460 }
3461
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003462 buffer->nr_pages = nr_pages;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003463
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003464 perf_buffer_init(buffer, watermark, flags);
3465
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003466 return buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003467
3468fail_data_pages:
3469 for (i--; i >= 0; i--)
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003470 free_page((unsigned long)buffer->data_pages[i]);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003471
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003472 free_page((unsigned long)buffer->user_page);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003473
3474fail_user_page:
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003475 kfree(buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003476
3477fail:
Peter Zijlstra906010b2009-09-21 16:08:49 +02003478 return NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003479}
3480
3481static void perf_mmap_free_page(unsigned long addr)
3482{
3483 struct page *page = virt_to_page((void *)addr);
3484
3485 page->mapping = NULL;
3486 __free_page(page);
3487}
3488
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003489static void perf_buffer_free(struct perf_buffer *buffer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003490{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003491 int i;
3492
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003493 perf_mmap_free_page((unsigned long)buffer->user_page);
3494 for (i = 0; i < buffer->nr_pages; i++)
3495 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
3496 kfree(buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003497}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003498
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003499static inline int page_order(struct perf_buffer *buffer)
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003500{
3501 return 0;
3502}
3503
Peter Zijlstra906010b2009-09-21 16:08:49 +02003504#else
3505
3506/*
3507 * Back perf_mmap() with vmalloc memory.
3508 *
3509 * Required for architectures that have d-cache aliasing issues.
3510 */
3511
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003512static inline int page_order(struct perf_buffer *buffer)
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003513{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003514 return buffer->page_order;
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003515}
3516
Peter Zijlstra906010b2009-09-21 16:08:49 +02003517static struct page *
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003518perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003519{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003520 if (pgoff > (1UL << page_order(buffer)))
Peter Zijlstra906010b2009-09-21 16:08:49 +02003521 return NULL;
3522
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003523 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003524}
3525
3526static void perf_mmap_unmark_page(void *addr)
3527{
3528 struct page *page = vmalloc_to_page(addr);
3529
3530 page->mapping = NULL;
3531}
3532
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003533static void perf_buffer_free_work(struct work_struct *work)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003534{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003535 struct perf_buffer *buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003536 void *base;
3537 int i, nr;
3538
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003539 buffer = container_of(work, struct perf_buffer, work);
3540 nr = 1 << page_order(buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003541
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003542 base = buffer->user_page;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003543 for (i = 0; i < nr + 1; i++)
3544 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
3545
3546 vfree(base);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003547 kfree(buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003548}
3549
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003550static void perf_buffer_free(struct perf_buffer *buffer)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003551{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003552 schedule_work(&buffer->work);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003553}
3554
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003555static struct perf_buffer *
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003556perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003557{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003558 struct perf_buffer *buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003559 unsigned long size;
3560 void *all_buf;
3561
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003562 size = sizeof(struct perf_buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003563 size += sizeof(void *);
3564
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003565 buffer = kzalloc(size, GFP_KERNEL);
3566 if (!buffer)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003567 goto fail;
3568
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003569 INIT_WORK(&buffer->work, perf_buffer_free_work);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003570
3571 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
3572 if (!all_buf)
3573 goto fail_all_buf;
3574
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003575 buffer->user_page = all_buf;
3576 buffer->data_pages[0] = all_buf + PAGE_SIZE;
3577 buffer->page_order = ilog2(nr_pages);
3578 buffer->nr_pages = 1;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003579
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003580 perf_buffer_init(buffer, watermark, flags);
3581
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003582 return buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003583
3584fail_all_buf:
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003585 kfree(buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003586
3587fail:
3588 return NULL;
3589}
3590
3591#endif
3592
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003593static unsigned long perf_data_size(struct perf_buffer *buffer)
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003594{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003595 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003596}
3597
Peter Zijlstra906010b2009-09-21 16:08:49 +02003598static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3599{
3600 struct perf_event *event = vma->vm_file->private_data;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003601 struct perf_buffer *buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003602 int ret = VM_FAULT_SIGBUS;
3603
3604 if (vmf->flags & FAULT_FLAG_MKWRITE) {
3605 if (vmf->pgoff == 0)
3606 ret = 0;
3607 return ret;
3608 }
3609
3610 rcu_read_lock();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003611 buffer = rcu_dereference(event->buffer);
3612 if (!buffer)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003613 goto unlock;
3614
3615 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3616 goto unlock;
3617
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003618 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003619 if (!vmf->page)
3620 goto unlock;
3621
3622 get_page(vmf->page);
3623 vmf->page->mapping = vma->vm_file->f_mapping;
3624 vmf->page->index = vmf->pgoff;
3625
3626 ret = 0;
3627unlock:
3628 rcu_read_unlock();
3629
3630 return ret;
3631}
3632
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003633static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003634{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003635 struct perf_buffer *buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003636
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003637 buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
3638 perf_buffer_free(buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003639}
3640
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003641static struct perf_buffer *perf_buffer_get(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003642{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003643 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003644
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003645 rcu_read_lock();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003646 buffer = rcu_dereference(event->buffer);
3647 if (buffer) {
3648 if (!atomic_inc_not_zero(&buffer->refcount))
3649 buffer = NULL;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003650 }
3651 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003652
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003653 return buffer;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003654}
3655
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003656static void perf_buffer_put(struct perf_buffer *buffer)
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003657{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003658 if (!atomic_dec_and_test(&buffer->refcount))
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003659 return;
3660
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003661 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003662}
3663
3664static void perf_mmap_open(struct vm_area_struct *vma)
3665{
3666 struct perf_event *event = vma->vm_file->private_data;
3667
3668 atomic_inc(&event->mmap_count);
3669}
3670
3671static void perf_mmap_close(struct vm_area_struct *vma)
3672{
3673 struct perf_event *event = vma->vm_file->private_data;
3674
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003675 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003676 unsigned long size = perf_data_size(event->buffer);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003677 struct user_struct *user = event->mmap_user;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003678 struct perf_buffer *buffer = event->buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003679
Peter Zijlstra906010b2009-09-21 16:08:49 +02003680 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003681 vma->vm_mm->locked_vm -= event->mmap_locked;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003682 rcu_assign_pointer(event->buffer, NULL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003683 mutex_unlock(&event->mmap_mutex);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003684
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003685 perf_buffer_put(buffer);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003686 free_uid(user);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003687 }
3688}
3689
Alexey Dobriyanf0f37e2f2009-09-27 22:29:37 +04003690static const struct vm_operations_struct perf_mmap_vmops = {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003691 .open = perf_mmap_open,
3692 .close = perf_mmap_close,
3693 .fault = perf_mmap_fault,
3694 .page_mkwrite = perf_mmap_fault,
3695};
3696
3697static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3698{
3699 struct perf_event *event = file->private_data;
3700 unsigned long user_locked, user_lock_limit;
3701 struct user_struct *user = current_user();
3702 unsigned long locked, lock_limit;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003703 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003704 unsigned long vma_size;
3705 unsigned long nr_pages;
3706 long user_extra, extra;
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003707 int ret = 0, flags = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003708
Peter Zijlstrac7920612010-05-18 10:33:24 +02003709 /*
3710 * Don't allow mmap() of inherited per-task counters. This would
3711 * create a performance issue due to all children writing to the
3712 * same buffer.
3713 */
3714 if (event->cpu == -1 && event->attr.inherit)
3715 return -EINVAL;
3716
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003717 if (!(vma->vm_flags & VM_SHARED))
3718 return -EINVAL;
3719
3720 vma_size = vma->vm_end - vma->vm_start;
3721 nr_pages = (vma_size / PAGE_SIZE) - 1;
3722
3723 /*
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003724	 * If we have buffer pages, ensure their count is a power of two so we
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003725	 * can use bitmasks instead of modulo.
3726 */
3727 if (nr_pages != 0 && !is_power_of_2(nr_pages))
3728 return -EINVAL;
3729
3730 if (vma_size != PAGE_SIZE * (1 + nr_pages))
3731 return -EINVAL;
3732
3733 if (vma->vm_pgoff != 0)
3734 return -EINVAL;
3735
3736 WARN_ON_ONCE(event->ctx->parent_ctx);
3737 mutex_lock(&event->mmap_mutex);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003738 if (event->buffer) {
3739 if (event->buffer->nr_pages == nr_pages)
3740 atomic_inc(&event->buffer->refcount);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003741 else
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003742 ret = -EINVAL;
3743 goto unlock;
3744 }
3745
3746 user_extra = nr_pages + 1;
3747 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
3748
3749 /*
3750 * Increase the limit linearly with more CPUs:
3751 */
3752 user_lock_limit *= num_online_cpus();
3753
3754 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
3755
3756 extra = 0;
3757 if (user_locked > user_lock_limit)
3758 extra = user_locked - user_lock_limit;
3759
Jiri Slaby78d7d402010-03-05 13:42:54 -08003760 lock_limit = rlimit(RLIMIT_MEMLOCK);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003761 lock_limit >>= PAGE_SHIFT;
3762 locked = vma->vm_mm->locked_vm + extra;
3763
3764 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3765 !capable(CAP_IPC_LOCK)) {
3766 ret = -EPERM;
3767 goto unlock;
3768 }
3769
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003770 WARN_ON(event->buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003771
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003772 if (vma->vm_flags & VM_WRITE)
3773 flags |= PERF_BUFFER_WRITABLE;
3774
3775 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
3776 event->cpu, flags);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003777 if (!buffer) {
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003778 ret = -ENOMEM;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003779 goto unlock;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003780 }
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003781 rcu_assign_pointer(event->buffer, buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003782
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003783 atomic_long_add(user_extra, &user->locked_vm);
3784 event->mmap_locked = extra;
3785 event->mmap_user = get_current_user();
3786 vma->vm_mm->locked_vm += event->mmap_locked;
3787
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003788unlock:
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003789 if (!ret)
3790 atomic_inc(&event->mmap_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003791 mutex_unlock(&event->mmap_mutex);
3792
3793 vma->vm_flags |= VM_RESERVED;
3794 vma->vm_ops = &perf_mmap_vmops;
3795
3796 return ret;
3797}
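/*
 * Illustrative user-space sketch, not part of the original file: mapping the
 * ring buffer.  perf_mmap() above requires a shared mapping of 1 + 2^n pages
 * at file offset 0, so the length must be computed that way; mapping it
 * PROT_WRITE additionally lets the kernel honour data_tail (the writable
 * buffer case).  Assumes <sys/mman.h> and <unistd.h>; data_pages must be a
 * power of two.
 */
static void *example_map_ring_buffer(int perf_fd, unsigned int data_pages)
{
	size_t len = (data_pages + 1) * sysconf(_SC_PAGESIZE);

	return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, perf_fd, 0);
}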
3798
3799static int perf_fasync(int fd, struct file *filp, int on)
3800{
3801 struct inode *inode = filp->f_path.dentry->d_inode;
3802 struct perf_event *event = filp->private_data;
3803 int retval;
3804
3805 mutex_lock(&inode->i_mutex);
3806 retval = fasync_helper(fd, filp, on, &event->fasync);
3807 mutex_unlock(&inode->i_mutex);
3808
3809 if (retval < 0)
3810 return retval;
3811
3812 return 0;
3813}
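/*
 * Illustrative user-space sketch, not part of the original file: asking for
 * SIGIO delivery through the fasync path above.  Assumes <fcntl.h>,
 * <signal.h> and <unistd.h>.
 */
static void example_enable_sigio(int perf_fd, void (*handler)(int))
{
	signal(SIGIO, handler);
	fcntl(perf_fd, F_SETOWN, getpid());
	fcntl(perf_fd, F_SETFL, fcntl(perf_fd, F_GETFL) | O_ASYNC);
}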
3814
3815static const struct file_operations perf_fops = {
Arnd Bergmann3326c1c2010-03-23 19:09:33 +01003816 .llseek = no_llseek,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003817 .release = perf_release,
3818 .read = perf_read,
3819 .poll = perf_poll,
3820 .unlocked_ioctl = perf_ioctl,
3821 .compat_ioctl = perf_ioctl,
3822 .mmap = perf_mmap,
3823 .fasync = perf_fasync,
3824};
3825
3826/*
3827 * Perf event wakeup
3828 *
3829 * If there's data, ensure we set the poll() state and publish everything
3830 * to user-space before waking everybody up.
3831 */
3832
3833void perf_event_wakeup(struct perf_event *event)
3834{
3835 wake_up_all(&event->waitq);
3836
3837 if (event->pending_kill) {
3838 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
3839 event->pending_kill = 0;
3840 }
3841}
3842
Peter Zijlstrae360adb2010-10-14 14:01:34 +08003843static void perf_pending_event(struct irq_work *entry)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003844{
3845 struct perf_event *event = container_of(entry,
3846 struct perf_event, pending);
3847
3848 if (event->pending_disable) {
3849 event->pending_disable = 0;
3850 __perf_event_disable(event);
3851 }
3852
3853 if (event->pending_wakeup) {
3854 event->pending_wakeup = 0;
3855 perf_event_wakeup(event);
3856 }
3857}
3858
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003859/*
Zhang, Yanmin39447b32010-04-19 13:32:41 +08003860 * We assume KVM is the only user of these callbacks.
3861 * Later on, we might change this to a list if other
3862 * virtualization implementations need them as well.
3863 */
3864struct perf_guest_info_callbacks *perf_guest_cbs;
3865
3866int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3867{
3868 perf_guest_cbs = cbs;
3869 return 0;
3870}
3871EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
3872
3873int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3874{
3875 perf_guest_cbs = NULL;
3876 return 0;
3877}
3878EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
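/*
 * Illustrative sketch, not part of the original file: how a hypervisor module
 * might hook into the registration functions above.  The three callbacks
 * shown follow what the x86 KVM code provides; treat the field names as an
 * assumption taken from that user rather than from this file.
 */
static int example_is_in_guest(void)
{
	return 0;	/* stub: report "not in guest" */
}

static int example_is_user_mode(void)
{
	return 0;
}

static unsigned long example_get_guest_ip(void)
{
	return 0;
}

static struct perf_guest_info_callbacks example_guest_cbs = {
	.is_in_guest	= example_is_in_guest,
	.is_user_mode	= example_is_user_mode,
	.get_guest_ip	= example_get_guest_ip,
};

/* module init:	perf_register_guest_info_callbacks(&example_guest_cbs);   */
/* module exit:	perf_unregister_guest_info_callbacks(&example_guest_cbs); */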
3879
3880/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003881 * Output
3882 */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003883static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003884 unsigned long offset, unsigned long head)
3885{
3886 unsigned long mask;
3887
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003888 if (!buffer->writable)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003889 return true;
3890
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003891 mask = perf_data_size(buffer) - 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003892
3893 offset = (offset - tail) & mask;
3894 head = (head - tail) & mask;
3895
3896 if ((int)(head - offset) < 0)
3897 return false;
3898
3899 return true;
3900}
3901
3902static void perf_output_wakeup(struct perf_output_handle *handle)
3903{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003904 atomic_set(&handle->buffer->poll, POLL_IN);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003905
3906 if (handle->nmi) {
3907 handle->event->pending_wakeup = 1;
Peter Zijlstrae360adb2010-10-14 14:01:34 +08003908 irq_work_queue(&handle->event->pending);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003909 } else
3910 perf_event_wakeup(handle->event);
3911}
3912
3913/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003914 * We need to ensure a later event_id doesn't publish a head when a former
Peter Zijlstraef607772010-05-18 10:50:41 +02003915 * event isn't done writing. However since we need to deal with NMIs we
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003916 * cannot fully serialize things.
3917 *
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003918 * We only publish the head (and generate a wakeup) when the outer-most
Peter Zijlstraef607772010-05-18 10:50:41 +02003919 * event completes.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003920 */
Peter Zijlstraef607772010-05-18 10:50:41 +02003921static void perf_output_get_handle(struct perf_output_handle *handle)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003922{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003923 struct perf_buffer *buffer = handle->buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003924
Peter Zijlstraef607772010-05-18 10:50:41 +02003925 preempt_disable();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003926 local_inc(&buffer->nest);
3927 handle->wakeup = local_read(&buffer->wakeup);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003928}
3929
Peter Zijlstraef607772010-05-18 10:50:41 +02003930static void perf_output_put_handle(struct perf_output_handle *handle)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003931{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003932 struct perf_buffer *buffer = handle->buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003933 unsigned long head;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003934
3935again:
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003936 head = local_read(&buffer->head);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003937
3938 /*
Peter Zijlstraef607772010-05-18 10:50:41 +02003939 * IRQ/NMI can happen here, which means we can miss a head update.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003940 */
3941
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003942 if (!local_dec_and_test(&buffer->nest))
Frederic Weisbeckeracd35a42010-05-20 21:28:34 +02003943 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003944
3945 /*
Peter Zijlstraef607772010-05-18 10:50:41 +02003946 * Publish the known good head. Rely on the full barrier implied
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003947	 * by atomic_dec_and_test() to order the buffer->head read and this
Peter Zijlstraef607772010-05-18 10:50:41 +02003948 * write.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003949 */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003950 buffer->user_page->data_head = head;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003951
Peter Zijlstraef607772010-05-18 10:50:41 +02003952 /*
3953 * Now check if we missed an update, rely on the (compiler)
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003954 * barrier in atomic_dec_and_test() to re-read buffer->head.
Peter Zijlstraef607772010-05-18 10:50:41 +02003955 */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003956 if (unlikely(head != local_read(&buffer->head))) {
3957 local_inc(&buffer->nest);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003958 goto again;
3959 }
3960
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003961 if (handle->wakeup != local_read(&buffer->wakeup))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003962 perf_output_wakeup(handle);
Peter Zijlstraef607772010-05-18 10:50:41 +02003963
Peter Zijlstra9ed60602010-06-11 17:36:35 +02003964out:
Peter Zijlstraef607772010-05-18 10:50:41 +02003965 preempt_enable();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003966}
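/*
 * Illustrative user-space sketch, not part of the original file: the consumer
 * side of the data_head published by perf_output_put_handle() above.  The
 * reader samples data_head, walks the records, and only then stores
 * data_tail, matching the ordering the kernel relies on (see the tail-pointer
 * comment in perf_output_begin() below).  rmb()/mb() stand for memory
 * barriers the application must supply; records that wrap around the end of
 * the buffer are not handled here.
 */
static void example_consume(struct perf_event_mmap_page *pg, char *data,
			    __u64 data_size, void (*handle)(void *record))
{
	__u64 head = pg->data_head;
	__u64 tail = pg->data_tail;

	rmb();		/* read data_head before any record bytes */

	while (tail < head) {
		struct perf_event_header *hdr = (void *)
			(data + (tail & (data_size - 1)));

		handle(hdr);
		tail += hdr->size;
	}

	mb();		/* finish reading before letting the kernel reuse the space */
	pg->data_tail = tail;
}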
3967
Peter Zijlstraa94ffaa2010-05-20 19:50:07 +02003968__always_inline void perf_output_copy(struct perf_output_handle *handle,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003969 const void *buf, unsigned int len)
3970{
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003971 do {
Peter Zijlstraa94ffaa2010-05-20 19:50:07 +02003972 unsigned long size = min_t(unsigned long, handle->size, len);
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003973
3974 memcpy(handle->addr, buf, size);
3975
3976 len -= size;
3977 handle->addr += size;
Frederic Weisbecker74048f82010-05-27 21:34:58 +02003978 buf += size;
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003979 handle->size -= size;
3980 if (!handle->size) {
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003981 struct perf_buffer *buffer = handle->buffer;
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003982
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003983 handle->page++;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003984 handle->page &= buffer->nr_pages - 1;
3985 handle->addr = buffer->data_pages[handle->page];
3986 handle->size = PAGE_SIZE << page_order(buffer);
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003987 }
3988 } while (len);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003989}
3990
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02003991static void __perf_event_header__init_id(struct perf_event_header *header,
3992 struct perf_sample_data *data,
3993 struct perf_event *event)
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02003994{
3995 u64 sample_type = event->attr.sample_type;
3996
3997 data->type = sample_type;
3998 header->size += event->id_header_size;
3999
4000 if (sample_type & PERF_SAMPLE_TID) {
4001 /* namespace issues */
4002 data->tid_entry.pid = perf_event_pid(event, current);
4003 data->tid_entry.tid = perf_event_tid(event, current);
4004 }
4005
4006 if (sample_type & PERF_SAMPLE_TIME)
4007 data->time = perf_clock();
4008
4009 if (sample_type & PERF_SAMPLE_ID)
4010 data->id = primary_event_id(event);
4011
4012 if (sample_type & PERF_SAMPLE_STREAM_ID)
4013 data->stream_id = event->id;
4014
4015 if (sample_type & PERF_SAMPLE_CPU) {
4016 data->cpu_entry.cpu = raw_smp_processor_id();
4017 data->cpu_entry.reserved = 0;
4018 }
4019}
4020
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004021static void perf_event_header__init_id(struct perf_event_header *header,
4022 struct perf_sample_data *data,
4023 struct perf_event *event)
4024{
4025 if (event->attr.sample_id_all)
4026 __perf_event_header__init_id(header, data, event);
4027}
4028
4029static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4030 struct perf_sample_data *data)
4031{
4032 u64 sample_type = data->type;
4033
4034 if (sample_type & PERF_SAMPLE_TID)
4035 perf_output_put(handle, data->tid_entry);
4036
4037 if (sample_type & PERF_SAMPLE_TIME)
4038 perf_output_put(handle, data->time);
4039
4040 if (sample_type & PERF_SAMPLE_ID)
4041 perf_output_put(handle, data->id);
4042
4043 if (sample_type & PERF_SAMPLE_STREAM_ID)
4044 perf_output_put(handle, data->stream_id);
4045
4046 if (sample_type & PERF_SAMPLE_CPU)
4047 perf_output_put(handle, data->cpu_entry);
4048}
4049
4050static void perf_event__output_id_sample(struct perf_event *event,
4051 struct perf_output_handle *handle,
4052 struct perf_sample_data *sample)
4053{
4054 if (event->attr.sample_id_all)
4055 __perf_event__output_id_sample(handle, sample);
4056}
4057
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004058int perf_output_begin(struct perf_output_handle *handle,
4059 struct perf_event *event, unsigned int size,
4060 int nmi, int sample)
4061{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004062 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004063 unsigned long tail, offset, head;
4064 int have_lost;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004065 struct perf_sample_data sample_data;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004066 struct {
4067 struct perf_event_header header;
4068 u64 id;
4069 u64 lost;
4070 } lost_event;
4071
4072 rcu_read_lock();
4073 /*
4074 * For inherited events we send all the output towards the parent.
4075 */
4076 if (event->parent)
4077 event = event->parent;
4078
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004079 buffer = rcu_dereference(event->buffer);
4080 if (!buffer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004081 goto out;
4082
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004083 handle->buffer = buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004084 handle->event = event;
4085 handle->nmi = nmi;
4086 handle->sample = sample;
4087
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004088 if (!buffer->nr_pages)
Stephane Eranian00d1d0b2010-05-17 12:46:01 +02004089 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004090
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004091 have_lost = local_read(&buffer->lost);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004092 if (have_lost) {
4093 lost_event.header.size = sizeof(lost_event);
4094 perf_event_header__init_id(&lost_event.header, &sample_data,
4095 event);
4096 size += lost_event.header.size;
4097 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004098
Peter Zijlstraef607772010-05-18 10:50:41 +02004099 perf_output_get_handle(handle);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004100
4101 do {
4102 /*
4103		 * Userspace could choose to issue a mb() before updating the
4104		 * tail pointer, so that all reads will be completed before the
4105		 * write is issued.
4106 */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004107 tail = ACCESS_ONCE(buffer->user_page->data_tail);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004108 smp_rmb();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004109 offset = head = local_read(&buffer->head);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004110 head += size;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004111 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004112 goto fail;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004113 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004114
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004115 if (head - local_read(&buffer->wakeup) > buffer->watermark)
4116 local_add(buffer->watermark, &buffer->wakeup);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004117
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004118 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
4119 handle->page &= buffer->nr_pages - 1;
4120 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
4121 handle->addr = buffer->data_pages[handle->page];
Peter Zijlstra5d967a82010-05-20 16:46:39 +02004122 handle->addr += handle->size;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004123 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
Peter Zijlstra5d967a82010-05-20 16:46:39 +02004124
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004125 if (have_lost) {
4126 lost_event.header.type = PERF_RECORD_LOST;
4127 lost_event.header.misc = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004128 lost_event.id = event->id;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004129 lost_event.lost = local_xchg(&buffer->lost, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004130
4131 perf_output_put(handle, lost_event);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004132 perf_event__output_id_sample(event, handle, &sample_data);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004133 }
4134
4135 return 0;
4136
4137fail:
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004138 local_inc(&buffer->lost);
Peter Zijlstraef607772010-05-18 10:50:41 +02004139 perf_output_put_handle(handle);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004140out:
4141 rcu_read_unlock();
4142
4143 return -ENOSPC;
4144}
4145
4146void perf_output_end(struct perf_output_handle *handle)
4147{
4148 struct perf_event *event = handle->event;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004149 struct perf_buffer *buffer = handle->buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004150
4151 int wakeup_events = event->attr.wakeup_events;
4152
4153 if (handle->sample && wakeup_events) {
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004154 int events = local_inc_return(&buffer->events);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004155 if (events >= wakeup_events) {
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004156 local_sub(wakeup_events, &buffer->events);
4157 local_inc(&buffer->wakeup);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004158 }
4159 }
4160
Peter Zijlstraef607772010-05-18 10:50:41 +02004161 perf_output_put_handle(handle);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004162 rcu_read_unlock();
4163}
4164
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004165static void perf_output_read_one(struct perf_output_handle *handle,
Stephane Eranianeed01522010-10-26 16:08:01 +02004166 struct perf_event *event,
4167 u64 enabled, u64 running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004168{
4169 u64 read_format = event->attr.read_format;
4170 u64 values[4];
4171 int n = 0;
4172
Peter Zijlstrab5e58792010-05-21 14:43:12 +02004173 values[n++] = perf_event_count(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004174 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
Stephane Eranianeed01522010-10-26 16:08:01 +02004175 values[n++] = enabled +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004176 atomic64_read(&event->child_total_time_enabled);
4177 }
4178 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
Stephane Eranianeed01522010-10-26 16:08:01 +02004179 values[n++] = running +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004180 atomic64_read(&event->child_total_time_running);
4181 }
4182 if (read_format & PERF_FORMAT_ID)
4183 values[n++] = primary_event_id(event);
4184
4185 perf_output_copy(handle, values, n * sizeof(u64));
4186}
4187
4188/*
4189 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
4190 */
4191static void perf_output_read_group(struct perf_output_handle *handle,
Stephane Eranianeed01522010-10-26 16:08:01 +02004192 struct perf_event *event,
4193 u64 enabled, u64 running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004194{
4195 struct perf_event *leader = event->group_leader, *sub;
4196 u64 read_format = event->attr.read_format;
4197 u64 values[5];
4198 int n = 0;
4199
4200 values[n++] = 1 + leader->nr_siblings;
4201
4202 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
Stephane Eranianeed01522010-10-26 16:08:01 +02004203 values[n++] = enabled;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004204
4205 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
Stephane Eranianeed01522010-10-26 16:08:01 +02004206 values[n++] = running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004207
4208 if (leader != event)
4209 leader->pmu->read(leader);
4210
Peter Zijlstrab5e58792010-05-21 14:43:12 +02004211 values[n++] = perf_event_count(leader);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004212 if (read_format & PERF_FORMAT_ID)
4213 values[n++] = primary_event_id(leader);
4214
4215 perf_output_copy(handle, values, n * sizeof(u64));
4216
4217 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4218 n = 0;
4219
4220 if (sub != event)
4221 sub->pmu->read(sub);
4222
Peter Zijlstrab5e58792010-05-21 14:43:12 +02004223 values[n++] = perf_event_count(sub);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004224 if (read_format & PERF_FORMAT_ID)
4225 values[n++] = primary_event_id(sub);
4226
4227 perf_output_copy(handle, values, n * sizeof(u64));
4228 }
4229}
4230
Stephane Eranianeed01522010-10-26 16:08:01 +02004231#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
4232 PERF_FORMAT_TOTAL_TIME_RUNNING)
4233
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004234static void perf_output_read(struct perf_output_handle *handle,
4235 struct perf_event *event)
4236{
Stephane Eranianeed01522010-10-26 16:08:01 +02004237 u64 enabled = 0, running = 0, now, ctx_time;
4238 u64 read_format = event->attr.read_format;
4239
4240 /*
4241 * compute total_time_enabled, total_time_running
4242 * based on snapshot values taken when the event
4243 * was last scheduled in.
4244 *
4245	 * we cannot simply call update_context_time()
4246	 * because of locking issues, as we are called in
4247	 * NMI context
4248 */
4249 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
4250 now = perf_clock();
4251 ctx_time = event->shadow_ctx_time + now;
4252 enabled = ctx_time - event->tstamp_enabled;
4253 running = ctx_time - event->tstamp_running;
4254 }
4255
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004256 if (event->attr.read_format & PERF_FORMAT_GROUP)
Stephane Eranianeed01522010-10-26 16:08:01 +02004257 perf_output_read_group(handle, event, enabled, running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004258 else
Stephane Eranianeed01522010-10-26 16:08:01 +02004259 perf_output_read_one(handle, event, enabled, running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004260}
4261
4262void perf_output_sample(struct perf_output_handle *handle,
4263 struct perf_event_header *header,
4264 struct perf_sample_data *data,
4265 struct perf_event *event)
4266{
4267 u64 sample_type = data->type;
4268
4269 perf_output_put(handle, *header);
4270
4271 if (sample_type & PERF_SAMPLE_IP)
4272 perf_output_put(handle, data->ip);
4273
4274 if (sample_type & PERF_SAMPLE_TID)
4275 perf_output_put(handle, data->tid_entry);
4276
4277 if (sample_type & PERF_SAMPLE_TIME)
4278 perf_output_put(handle, data->time);
4279
4280 if (sample_type & PERF_SAMPLE_ADDR)
4281 perf_output_put(handle, data->addr);
4282
4283 if (sample_type & PERF_SAMPLE_ID)
4284 perf_output_put(handle, data->id);
4285
4286 if (sample_type & PERF_SAMPLE_STREAM_ID)
4287 perf_output_put(handle, data->stream_id);
4288
4289 if (sample_type & PERF_SAMPLE_CPU)
4290 perf_output_put(handle, data->cpu_entry);
4291
4292 if (sample_type & PERF_SAMPLE_PERIOD)
4293 perf_output_put(handle, data->period);
4294
4295 if (sample_type & PERF_SAMPLE_READ)
4296 perf_output_read(handle, event);
4297
4298 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4299 if (data->callchain) {
4300 int size = 1;
4301
4302 if (data->callchain)
4303 size += data->callchain->nr;
4304
4305 size *= sizeof(u64);
4306
4307 perf_output_copy(handle, data->callchain, size);
4308 } else {
4309 u64 nr = 0;
4310 perf_output_put(handle, nr);
4311 }
4312 }
4313
4314 if (sample_type & PERF_SAMPLE_RAW) {
4315 if (data->raw) {
4316 perf_output_put(handle, data->raw->size);
4317 perf_output_copy(handle, data->raw->data,
4318 data->raw->size);
4319 } else {
4320 struct {
4321 u32 size;
4322 u32 data;
4323 } raw = {
4324 .size = sizeof(u32),
4325 .data = 0,
4326 };
4327 perf_output_put(handle, raw);
4328 }
4329 }
4330}
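
/*
 * Illustrative userspace sketch (not part of this file): consuming the fixed,
 * u64-sized leading fields of a PERF_RECORD_SAMPLE in exactly the order
 * perf_output_sample() emits them above. PERF_SAMPLE_READ, _CALLCHAIN and
 * _RAW are variable length and are left out; the packed u32 pairs assume a
 * little-endian host. The struct and helper names are assumptions.
 */
#if 0	/* example only, not compiled as part of the kernel */
#include <stdint.h>
#include <linux/perf_event.h>

struct sample_head {
	uint64_t ip, time, addr, id, stream_id, period;
	uint32_t pid, tid, cpu;
};

static const uint64_t *decode_sample_head(const uint64_t *p, uint64_t sample_type,
					   struct sample_head *s)
{
	if (sample_type & PERF_SAMPLE_IP)
		s->ip = *p++;
	if (sample_type & PERF_SAMPLE_TID) {	/* u32 pid + u32 tid in one u64 */
		s->pid = (uint32_t)*p;
		s->tid = (uint32_t)(*p >> 32);
		p++;
	}
	if (sample_type & PERF_SAMPLE_TIME)
		s->time = *p++;
	if (sample_type & PERF_SAMPLE_ADDR)
		s->addr = *p++;
	if (sample_type & PERF_SAMPLE_ID)
		s->id = *p++;
	if (sample_type & PERF_SAMPLE_STREAM_ID)
		s->stream_id = *p++;
	if (sample_type & PERF_SAMPLE_CPU)	/* u32 cpu + u32 reserved */
		s->cpu = (uint32_t)*p++;
	if (sample_type & PERF_SAMPLE_PERIOD)
		s->period = *p++;
	return p;				/* points at the READ/CALLCHAIN/RAW data */
}
#endif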
4331
4332void perf_prepare_sample(struct perf_event_header *header,
4333 struct perf_sample_data *data,
4334 struct perf_event *event,
4335 struct pt_regs *regs)
4336{
4337 u64 sample_type = event->attr.sample_type;
4338
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004339 header->type = PERF_RECORD_SAMPLE;
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02004340 header->size = sizeof(*header) + event->header_size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004341
4342 header->misc = 0;
4343 header->misc |= perf_misc_flags(regs);
4344
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004345 __perf_event_header__init_id(header, data, event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02004346
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02004347 if (sample_type & PERF_SAMPLE_IP)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004348 data->ip = perf_instruction_pointer(regs);
4349
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004350 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4351 int size = 1;
4352
4353 data->callchain = perf_callchain(regs);
4354
4355 if (data->callchain)
4356 size += data->callchain->nr;
4357
4358 header->size += size * sizeof(u64);
4359 }
4360
4361 if (sample_type & PERF_SAMPLE_RAW) {
4362 int size = sizeof(u32);
4363
4364 if (data->raw)
4365 size += data->raw->size;
4366 else
4367 size += sizeof(u32);
4368
4369 WARN_ON_ONCE(size & (sizeof(u64)-1));
4370 header->size += size;
4371 }
4372}
4373
4374static void perf_event_output(struct perf_event *event, int nmi,
4375 struct perf_sample_data *data,
4376 struct pt_regs *regs)
4377{
4378 struct perf_output_handle handle;
4379 struct perf_event_header header;
4380
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02004381 /* protect the callchain buffers */
4382 rcu_read_lock();
4383
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004384 perf_prepare_sample(&header, data, event, regs);
4385
4386 if (perf_output_begin(&handle, event, header.size, nmi, 1))
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02004387 goto exit;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004388
4389 perf_output_sample(&handle, &header, data, event);
4390
4391 perf_output_end(&handle);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02004392
4393exit:
4394 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004395}
4396
4397/*
4398 * read event_id
4399 */
4400
4401struct perf_read_event {
4402 struct perf_event_header header;
4403
4404 u32 pid;
4405 u32 tid;
4406};
4407
4408static void
4409perf_event_read_event(struct perf_event *event,
4410 struct task_struct *task)
4411{
4412 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004413 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004414 struct perf_read_event read_event = {
4415 .header = {
4416 .type = PERF_RECORD_READ,
4417 .misc = 0,
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02004418 .size = sizeof(read_event) + event->read_size,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004419 },
4420 .pid = perf_event_pid(event, task),
4421 .tid = perf_event_tid(event, task),
4422 };
4423 int ret;
4424
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004425 perf_event_header__init_id(&read_event.header, &sample, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004426 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
4427 if (ret)
4428 return;
4429
4430 perf_output_put(&handle, read_event);
4431 perf_output_read(&handle, event);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004432 perf_event__output_id_sample(event, &handle, &sample);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004433
4434 perf_output_end(&handle);
4435}
4436
4437/*
4438 * task tracking -- fork/exit
4439 *
Eric B Munson3af9e852010-05-18 15:30:49 +01004440 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004441 */
4442
4443struct perf_task_event {
4444 struct task_struct *task;
4445 struct perf_event_context *task_ctx;
4446
4447 struct {
4448 struct perf_event_header header;
4449
4450 u32 pid;
4451 u32 ppid;
4452 u32 tid;
4453 u32 ptid;
4454 u64 time;
4455 } event_id;
4456};
4457
4458static void perf_event_task_output(struct perf_event *event,
4459 struct perf_task_event *task_event)
4460{
4461 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004462 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004463 struct task_struct *task = task_event->task;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004464 int ret, size = task_event->event_id.header.size;
Mike Galbraith8bb39f92010-03-26 11:11:33 +01004465
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004466 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004467
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004468 ret = perf_output_begin(&handle, event,
4469 task_event->event_id.header.size, 0, 0);
Peter Zijlstraef607772010-05-18 10:50:41 +02004470 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004471 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004472
4473 task_event->event_id.pid = perf_event_pid(event, task);
4474 task_event->event_id.ppid = perf_event_pid(event, current);
4475
4476 task_event->event_id.tid = perf_event_tid(event, task);
4477 task_event->event_id.ptid = perf_event_tid(event, current);
4478
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004479 perf_output_put(&handle, task_event->event_id);
4480
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004481 perf_event__output_id_sample(event, &handle, &sample);
4482
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004483 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004484out:
4485 task_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004486}
4487
4488static int perf_event_task_match(struct perf_event *event)
4489{
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004490 if (event->state < PERF_EVENT_STATE_INACTIVE)
Peter Zijlstra22e19082010-01-18 09:12:32 +01004491 return 0;
4492
Stephane Eranian5632ab12011-01-03 18:20:01 +02004493 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01004494 return 0;
4495
Eric B Munson3af9e852010-05-18 15:30:49 +01004496 if (event->attr.comm || event->attr.mmap ||
4497 event->attr.mmap_data || event->attr.task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004498 return 1;
4499
4500 return 0;
4501}
4502
4503static void perf_event_task_ctx(struct perf_event_context *ctx,
4504 struct perf_task_event *task_event)
4505{
4506 struct perf_event *event;
4507
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004508 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4509 if (perf_event_task_match(event))
4510 perf_event_task_output(event, task_event);
4511 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004512}
4513
4514static void perf_event_task_event(struct perf_task_event *task_event)
4515{
4516 struct perf_cpu_context *cpuctx;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02004517 struct perf_event_context *ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004518 struct pmu *pmu;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02004519 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004520
Peter Zijlstrad6ff86c2009-11-20 22:19:46 +01004521 rcu_read_lock();
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004522 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra41945f62010-09-16 19:17:24 +02004523 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra51676952010-12-07 14:18:20 +01004524 if (cpuctx->active_pmu != pmu)
4525 goto next;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004526 perf_event_task_ctx(&cpuctx->ctx, task_event);
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02004527
4528 ctx = task_event->task_ctx;
4529 if (!ctx) {
4530 ctxn = pmu->task_ctx_nr;
4531 if (ctxn < 0)
Peter Zijlstra41945f62010-09-16 19:17:24 +02004532 goto next;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02004533 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4534 }
4535 if (ctx)
4536 perf_event_task_ctx(ctx, task_event);
Peter Zijlstra41945f62010-09-16 19:17:24 +02004537next:
4538 put_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004539 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004540 rcu_read_unlock();
4541}
4542
4543static void perf_event_task(struct task_struct *task,
4544 struct perf_event_context *task_ctx,
4545 int new)
4546{
4547 struct perf_task_event task_event;
4548
4549 if (!atomic_read(&nr_comm_events) &&
4550 !atomic_read(&nr_mmap_events) &&
4551 !atomic_read(&nr_task_events))
4552 return;
4553
4554 task_event = (struct perf_task_event){
4555 .task = task,
4556 .task_ctx = task_ctx,
4557 .event_id = {
4558 .header = {
4559 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
4560 .misc = 0,
4561 .size = sizeof(task_event.event_id),
4562 },
4563 /* .pid */
4564 /* .ppid */
4565 /* .tid */
4566 /* .ptid */
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004567 .time = perf_clock(),
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004568 },
4569 };
4570
4571 perf_event_task_event(&task_event);
4572}
4573
4574void perf_event_fork(struct task_struct *task)
4575{
4576 perf_event_task(task, NULL, 1);
4577}
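
/*
 * Illustrative userspace sketch (not part of this file): the body of the
 * PERF_RECORD_FORK / PERF_RECORD_EXIT records built above, plus the attr bit
 * that requests them. The sample-id fields appended by
 * perf_event__output_id_sample() follow this fixed part. The struct and
 * helper names are assumptions.
 */
#if 0	/* example only, not compiled as part of the kernel */
#include <stdint.h>
#include <linux/perf_event.h>

struct task_record {
	struct perf_event_header header;	/* .type = PERF_RECORD_FORK or _EXIT */
	uint32_t pid, ppid;
	uint32_t tid, ptid;
	uint64_t time;
};

static void request_task_records(struct perf_event_attr *attr)
{
	/* also delivered to events that set attr.comm / attr.mmap / attr.mmap_data */
	attr->task = 1;
}
#endif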
4578
4579/*
4580 * comm tracking
4581 */
4582
4583struct perf_comm_event {
4584 struct task_struct *task;
4585 char *comm;
4586 int comm_size;
4587
4588 struct {
4589 struct perf_event_header header;
4590
4591 u32 pid;
4592 u32 tid;
4593 } event_id;
4594};
4595
4596static void perf_event_comm_output(struct perf_event *event,
4597 struct perf_comm_event *comm_event)
4598{
4599 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004600 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004601 int size = comm_event->event_id.header.size;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004602 int ret;
4603
4604 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4605 ret = perf_output_begin(&handle, event,
4606 comm_event->event_id.header.size, 0, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004607
4608 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004609 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004610
4611 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
4612 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4613
4614 perf_output_put(&handle, comm_event->event_id);
4615 perf_output_copy(&handle, comm_event->comm,
4616 comm_event->comm_size);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004617
4618 perf_event__output_id_sample(event, &handle, &sample);
4619
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004620 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004621out:
4622 comm_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004623}
4624
4625static int perf_event_comm_match(struct perf_event *event)
4626{
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004627 if (event->state < PERF_EVENT_STATE_INACTIVE)
Peter Zijlstra22e19082010-01-18 09:12:32 +01004628 return 0;
4629
Stephane Eranian5632ab12011-01-03 18:20:01 +02004630 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01004631 return 0;
4632
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004633 if (event->attr.comm)
4634 return 1;
4635
4636 return 0;
4637}
4638
4639static void perf_event_comm_ctx(struct perf_event_context *ctx,
4640 struct perf_comm_event *comm_event)
4641{
4642 struct perf_event *event;
4643
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004644 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4645 if (perf_event_comm_match(event))
4646 perf_event_comm_output(event, comm_event);
4647 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004648}
4649
4650static void perf_event_comm_event(struct perf_comm_event *comm_event)
4651{
4652 struct perf_cpu_context *cpuctx;
4653 struct perf_event_context *ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004654 char comm[TASK_COMM_LEN];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004655 unsigned int size;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004656 struct pmu *pmu;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02004657 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004658
4659 memset(comm, 0, sizeof(comm));
Márton Németh96b02d72009-11-21 23:10:15 +01004660 strlcpy(comm, comm_event->task->comm, sizeof(comm));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004661 size = ALIGN(strlen(comm)+1, sizeof(u64));
4662
4663 comm_event->comm = comm;
4664 comm_event->comm_size = size;
4665
4666 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
Peter Zijlstraf6595f32009-11-20 22:19:47 +01004667 rcu_read_lock();
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004668 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra41945f62010-09-16 19:17:24 +02004669 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra51676952010-12-07 14:18:20 +01004670 if (cpuctx->active_pmu != pmu)
4671 goto next;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004672 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02004673
4674 ctxn = pmu->task_ctx_nr;
4675 if (ctxn < 0)
Peter Zijlstra41945f62010-09-16 19:17:24 +02004676 goto next;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02004677
4678 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4679 if (ctx)
4680 perf_event_comm_ctx(ctx, comm_event);
Peter Zijlstra41945f62010-09-16 19:17:24 +02004681next:
4682 put_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004683 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004684 rcu_read_unlock();
4685}
4686
4687void perf_event_comm(struct task_struct *task)
4688{
4689 struct perf_comm_event comm_event;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02004690 struct perf_event_context *ctx;
4691 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004692
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02004693 for_each_task_context_nr(ctxn) {
4694 ctx = task->perf_event_ctxp[ctxn];
4695 if (!ctx)
4696 continue;
4697
4698 perf_event_enable_on_exec(ctx);
4699 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004700
4701 if (!atomic_read(&nr_comm_events))
4702 return;
4703
4704 comm_event = (struct perf_comm_event){
4705 .task = task,
4706 /* .comm */
4707 /* .comm_size */
4708 .event_id = {
4709 .header = {
4710 .type = PERF_RECORD_COMM,
4711 .misc = 0,
4712 /* .size */
4713 },
4714 /* .pid */
4715 /* .tid */
4716 },
4717 };
4718
4719 perf_event_comm_event(&comm_event);
4720}
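
/*
 * Illustrative userspace sketch (not part of this file): the fixed part of a
 * PERF_RECORD_COMM record as built above, requested with attr.comm = 1. The
 * comm string follows, NUL-padded to the next u64 boundary exactly as the
 * ALIGN(strlen(comm) + 1, sizeof(u64)) above computes, then the sample-id
 * fields. The struct name is an assumption.
 */
#if 0	/* example only, not compiled as part of the kernel */
#include <stdint.h>
#include <linux/perf_event.h>

struct comm_record {
	struct perf_event_header header;	/* .type = PERF_RECORD_COMM */
	uint32_t pid, tid;
	/* char comm[];  NUL-padded to a multiple of 8 bytes */
};
#endif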
4721
4722/*
4723 * mmap tracking
4724 */
4725
4726struct perf_mmap_event {
4727 struct vm_area_struct *vma;
4728
4729 const char *file_name;
4730 int file_size;
4731
4732 struct {
4733 struct perf_event_header header;
4734
4735 u32 pid;
4736 u32 tid;
4737 u64 start;
4738 u64 len;
4739 u64 pgoff;
4740 } event_id;
4741};
4742
4743static void perf_event_mmap_output(struct perf_event *event,
4744 struct perf_mmap_event *mmap_event)
4745{
4746 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004747 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004748 int size = mmap_event->event_id.header.size;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004749 int ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004750
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004751 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4752 ret = perf_output_begin(&handle, event,
4753 mmap_event->event_id.header.size, 0, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004754 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004755 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004756
4757 mmap_event->event_id.pid = perf_event_pid(event, current);
4758 mmap_event->event_id.tid = perf_event_tid(event, current);
4759
4760 perf_output_put(&handle, mmap_event->event_id);
4761 perf_output_copy(&handle, mmap_event->file_name,
4762 mmap_event->file_size);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004763
4764 perf_event__output_id_sample(event, &handle, &sample);
4765
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004766 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004767out:
4768 mmap_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004769}
4770
4771static int perf_event_mmap_match(struct perf_event *event,
Eric B Munson3af9e852010-05-18 15:30:49 +01004772 struct perf_mmap_event *mmap_event,
4773 int executable)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004774{
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004775 if (event->state < PERF_EVENT_STATE_INACTIVE)
Peter Zijlstra22e19082010-01-18 09:12:32 +01004776 return 0;
4777
Stephane Eranian5632ab12011-01-03 18:20:01 +02004778 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01004779 return 0;
4780
Eric B Munson3af9e852010-05-18 15:30:49 +01004781 if ((!executable && event->attr.mmap_data) ||
4782 (executable && event->attr.mmap))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004783 return 1;
4784
4785 return 0;
4786}
4787
4788static void perf_event_mmap_ctx(struct perf_event_context *ctx,
Eric B Munson3af9e852010-05-18 15:30:49 +01004789 struct perf_mmap_event *mmap_event,
4790 int executable)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004791{
4792 struct perf_event *event;
4793
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004794 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Eric B Munson3af9e852010-05-18 15:30:49 +01004795 if (perf_event_mmap_match(event, mmap_event, executable))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004796 perf_event_mmap_output(event, mmap_event);
4797 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004798}
4799
4800static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4801{
4802 struct perf_cpu_context *cpuctx;
4803 struct perf_event_context *ctx;
4804 struct vm_area_struct *vma = mmap_event->vma;
4805 struct file *file = vma->vm_file;
4806 unsigned int size;
4807 char tmp[16];
4808 char *buf = NULL;
4809 const char *name;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004810 struct pmu *pmu;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02004811 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004812
4813 memset(tmp, 0, sizeof(tmp));
4814
4815 if (file) {
4816 /*
4817 * d_path works from the end of the buffer backwards, so we
4818 * need to add enough zero bytes after the string to handle
4819 * the 64bit alignment we do later.
4820 */
4821 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
4822 if (!buf) {
4823 name = strncpy(tmp, "//enomem", sizeof(tmp));
4824 goto got_name;
4825 }
4826 name = d_path(&file->f_path, buf, PATH_MAX);
4827 if (IS_ERR(name)) {
4828 name = strncpy(tmp, "//toolong", sizeof(tmp));
4829 goto got_name;
4830 }
4831 } else {
4832 if (arch_vma_name(mmap_event->vma)) {
4833 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4834 sizeof(tmp));
4835 goto got_name;
4836 }
4837
4838 if (!vma->vm_mm) {
4839 name = strncpy(tmp, "[vdso]", sizeof(tmp));
4840 goto got_name;
Eric B Munson3af9e852010-05-18 15:30:49 +01004841 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
4842 vma->vm_end >= vma->vm_mm->brk) {
4843 name = strncpy(tmp, "[heap]", sizeof(tmp));
4844 goto got_name;
4845 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
4846 vma->vm_end >= vma->vm_mm->start_stack) {
4847 name = strncpy(tmp, "[stack]", sizeof(tmp));
4848 goto got_name;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004849 }
4850
4851 name = strncpy(tmp, "//anon", sizeof(tmp));
4852 goto got_name;
4853 }
4854
4855got_name:
4856 size = ALIGN(strlen(name)+1, sizeof(u64));
4857
4858 mmap_event->file_name = name;
4859 mmap_event->file_size = size;
4860
4861 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4862
Peter Zijlstraf6d9dd22009-11-20 22:19:48 +01004863 rcu_read_lock();
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004864 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra41945f62010-09-16 19:17:24 +02004865 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra51676952010-12-07 14:18:20 +01004866 if (cpuctx->active_pmu != pmu)
4867 goto next;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004868 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4869 vma->vm_flags & VM_EXEC);
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02004870
4871 ctxn = pmu->task_ctx_nr;
4872 if (ctxn < 0)
Peter Zijlstra41945f62010-09-16 19:17:24 +02004873 goto next;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02004874
4875 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4876 if (ctx) {
4877 perf_event_mmap_ctx(ctx, mmap_event,
4878 vma->vm_flags & VM_EXEC);
4879 }
Peter Zijlstra41945f62010-09-16 19:17:24 +02004880next:
4881 put_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004882 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004883 rcu_read_unlock();
4884
4885 kfree(buf);
4886}
4887
Eric B Munson3af9e852010-05-18 15:30:49 +01004888void perf_event_mmap(struct vm_area_struct *vma)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004889{
4890 struct perf_mmap_event mmap_event;
4891
4892 if (!atomic_read(&nr_mmap_events))
4893 return;
4894
4895 mmap_event = (struct perf_mmap_event){
4896 .vma = vma,
4897 /* .file_name */
4898 /* .file_size */
4899 .event_id = {
4900 .header = {
4901 .type = PERF_RECORD_MMAP,
Zhang, Yanmin39447b32010-04-19 13:32:41 +08004902 .misc = PERF_RECORD_MISC_USER,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004903 /* .size */
4904 },
4905 /* .pid */
4906 /* .tid */
4907 .start = vma->vm_start,
4908 .len = vma->vm_end - vma->vm_start,
Peter Zijlstra3a0304e2010-02-26 10:33:41 +01004909 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004910 },
4911 };
4912
4913 perf_event_mmap_event(&mmap_event);
4914}
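
/*
 * Illustrative userspace sketch (not part of this file): the fixed part of a
 * PERF_RECORD_MMAP record as built above. attr.mmap requests executable
 * mappings, attr.mmap_data the non-executable ones; the name ("[heap]",
 * "[stack]", "[vdso]", "//anon" or the d_path() result) follows, NUL-padded
 * to a u64 boundary, then the sample-id fields. The struct name is an
 * assumption.
 */
#if 0	/* example only, not compiled as part of the kernel */
#include <stdint.h>
#include <linux/perf_event.h>

struct mmap_record {
	struct perf_event_header header;	/* .type = PERF_RECORD_MMAP */
	uint32_t pid, tid;
	uint64_t start;				/* vma->vm_start */
	uint64_t len;				/* vma->vm_end - vma->vm_start */
	uint64_t pgoff;				/* file offset in bytes */
	/* char filename[];  NUL-padded to a multiple of 8 bytes */
};
#endif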
4915
4916/*
4917 * IRQ throttle logging
4918 */
4919
4920static void perf_log_throttle(struct perf_event *event, int enable)
4921{
4922 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004923 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004924 int ret;
4925
4926 struct {
4927 struct perf_event_header header;
4928 u64 time;
4929 u64 id;
4930 u64 stream_id;
4931 } throttle_event = {
4932 .header = {
4933 .type = PERF_RECORD_THROTTLE,
4934 .misc = 0,
4935 .size = sizeof(throttle_event),
4936 },
4937 .time = perf_clock(),
4938 .id = primary_event_id(event),
4939 .stream_id = event->id,
4940 };
4941
4942 if (enable)
4943 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4944
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004945 perf_event_header__init_id(&throttle_event.header, &sample, event);
4946
4947 ret = perf_output_begin(&handle, event,
4948 throttle_event.header.size, 1, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004949 if (ret)
4950 return;
4951
4952 perf_output_put(&handle, throttle_event);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004953 perf_event__output_id_sample(event, &handle, &sample);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004954 perf_output_end(&handle);
4955}
4956
4957/*
4958 * Generic event overflow handling, sampling.
4959 */
4960
4961static int __perf_event_overflow(struct perf_event *event, int nmi,
4962 int throttle, struct perf_sample_data *data,
4963 struct pt_regs *regs)
4964{
4965 int events = atomic_read(&event->event_limit);
4966 struct hw_perf_event *hwc = &event->hw;
4967 int ret = 0;
4968
Peter Zijlstra96398822010-11-24 18:55:29 +01004969 /*
4970 * Non-sampling counters might still use the PMI to fold short
4971 * hardware counters; ignore those.
4972 */
4973 if (unlikely(!is_sampling_event(event)))
4974 return 0;
4975
Peter Zijlstra163ec432011-02-16 11:22:34 +01004976 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4977 if (throttle) {
4978 hwc->interrupts = MAX_INTERRUPTS;
4979 perf_log_throttle(event, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004980 ret = 1;
4981 }
Peter Zijlstra163ec432011-02-16 11:22:34 +01004982 } else
4983 hwc->interrupts++;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004984
4985 if (event->attr.freq) {
4986 u64 now = perf_clock();
Peter Zijlstraabd50712010-01-26 18:50:16 +01004987 s64 delta = now - hwc->freq_time_stamp;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004988
Peter Zijlstraabd50712010-01-26 18:50:16 +01004989 hwc->freq_time_stamp = now;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004990
Peter Zijlstraabd50712010-01-26 18:50:16 +01004991 if (delta > 0 && delta < 2*TICK_NSEC)
4992 perf_adjust_period(event, delta, hwc->last_period);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004993 }
4994
4995 /*
4996 * XXX event_limit might not quite work as expected on inherited
4997 * events
4998 */
4999
5000 event->pending_kill = POLL_IN;
5001 if (events && atomic_dec_and_test(&event->event_limit)) {
5002 ret = 1;
5003 event->pending_kill = POLL_HUP;
5004 if (nmi) {
5005 event->pending_disable = 1;
Peter Zijlstrae360adb2010-10-14 14:01:34 +08005006 irq_work_queue(&event->pending);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005007 } else
5008 perf_event_disable(event);
5009 }
5010
Peter Zijlstra453f19e2009-11-20 22:19:43 +01005011 if (event->overflow_handler)
5012 event->overflow_handler(event, nmi, data, regs);
5013 else
5014 perf_event_output(event, nmi, data, regs);
5015
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005016 return ret;
5017}
5018
5019int perf_event_overflow(struct perf_event *event, int nmi,
5020 struct perf_sample_data *data,
5021 struct pt_regs *regs)
5022{
5023 return __perf_event_overflow(event, nmi, 1, data, regs);
5024}
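
/*
 * Illustrative userspace sketch (not part of this file): event_limit above is
 * what PERF_EVENT_IOC_REFRESH arms. Each overflow decrements it; when it hits
 * zero the event is disabled and the owner is notified with POLL_HUP, so a
 * profiler can re-arm from its poll or signal handler. The function name is
 * an assumption and error handling is omitted.
 */
#if 0	/* example only, not compiled as part of the kernel */
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static void rearm_after_hup(int perf_fd, int nr_overflows)
{
	/* allow nr_overflows more overflows before the event auto-disables again */
	ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, nr_overflows);
}
#endif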
5025
5026/*
5027 * Generic software event infrastructure
5028 */
5029
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005030struct swevent_htable {
5031 struct swevent_hlist *swevent_hlist;
5032 struct mutex hlist_mutex;
5033 int hlist_refcount;
5034
5035 /* Recursion avoidance in each contexts */
5036 int recursion[PERF_NR_CONTEXTS];
5037};
5038
5039static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
5040
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005041/*
5042 * We directly increment event->count and keep a second value in
5043 * event->hw.period_left to count intervals. This period event
5044 * is kept in the range [-sample_period, 0] so that we can use the
5045 * sign as trigger.
5046 */
5047
5048static u64 perf_swevent_set_period(struct perf_event *event)
5049{
5050 struct hw_perf_event *hwc = &event->hw;
5051 u64 period = hwc->last_period;
5052 u64 nr, offset;
5053 s64 old, val;
5054
5055 hwc->last_period = hwc->sample_period;
5056
5057again:
Peter Zijlstrae7850592010-05-21 14:43:08 +02005058 old = val = local64_read(&hwc->period_left);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005059 if (val < 0)
5060 return 0;
5061
5062 nr = div64_u64(period + val, period);
5063 offset = nr * period;
5064 val -= offset;
Peter Zijlstrae7850592010-05-21 14:43:08 +02005065 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005066 goto again;
5067
5068 return nr;
5069}
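
/*
 * Small standalone sketch (not part of this file) of the arithmetic used by
 * perf_swevent_set_period() above: the number of whole periods crossed is
 * returned as the overflow count and period_left is put back into
 * [-sample_period, 0]. The kernel version additionally retries with a
 * cmpxchg loop so concurrent updaters cannot lose counts.
 */
#if 0	/* example only, not compiled as part of the kernel */
#include <stdint.h>

static uint64_t periods_elapsed(int64_t *period_left, uint64_t period)
{
	int64_t val = *period_left;
	uint64_t nr;

	if (val < 0)
		return 0;			/* no period boundary crossed yet */

	nr = ((uint64_t)val + period) / period;	/* div64_u64(period + val, period) */
	*period_left = val - (int64_t)(nr * period);
	return nr;
}
#endif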
5070
Peter Zijlstra0cff7842009-11-20 22:19:44 +01005071static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005072 int nmi, struct perf_sample_data *data,
5073 struct pt_regs *regs)
5074{
5075 struct hw_perf_event *hwc = &event->hw;
5076 int throttle = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005077
5078 data->period = event->hw.last_period;
Peter Zijlstra0cff7842009-11-20 22:19:44 +01005079 if (!overflow)
5080 overflow = perf_swevent_set_period(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005081
5082 if (hwc->interrupts == MAX_INTERRUPTS)
5083 return;
5084
5085 for (; overflow; overflow--) {
5086 if (__perf_event_overflow(event, nmi, throttle,
5087 data, regs)) {
5088 /*
5089 * We inhibit the overflow from happening when
5090 * hwc->interrupts == MAX_INTERRUPTS.
5091 */
5092 break;
5093 }
5094 throttle = 1;
5095 }
5096}
5097
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005098static void perf_swevent_event(struct perf_event *event, u64 nr,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005099 int nmi, struct perf_sample_data *data,
5100 struct pt_regs *regs)
5101{
5102 struct hw_perf_event *hwc = &event->hw;
5103
Peter Zijlstrae7850592010-05-21 14:43:08 +02005104 local64_add(nr, &event->count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005105
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005106 if (!regs)
5107 return;
5108
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01005109 if (!is_sampling_event(event))
Peter Zijlstra0cff7842009-11-20 22:19:44 +01005110 return;
5111
5112 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5113 return perf_swevent_overflow(event, 1, nmi, data, regs);
5114
Peter Zijlstrae7850592010-05-21 14:43:08 +02005115 if (local64_add_negative(nr, &hwc->period_left))
Peter Zijlstra0cff7842009-11-20 22:19:44 +01005116 return;
5117
5118 perf_swevent_overflow(event, 0, nmi, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005119}
5120
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005121static int perf_exclude_event(struct perf_event *event,
5122 struct pt_regs *regs)
5123{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005124 if (event->hw.state & PERF_HES_STOPPED)
Frederic Weisbecker91b2f482011-03-07 21:27:08 +01005125 return 1;
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005126
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005127 if (regs) {
5128 if (event->attr.exclude_user && user_mode(regs))
5129 return 1;
5130
5131 if (event->attr.exclude_kernel && !user_mode(regs))
5132 return 1;
5133 }
5134
5135 return 0;
5136}
5137
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005138static int perf_swevent_match(struct perf_event *event,
5139 enum perf_type_id type,
Li Zefan6fb29152009-10-15 11:21:42 +08005140 u32 event_id,
5141 struct perf_sample_data *data,
5142 struct pt_regs *regs)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005143{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005144 if (event->attr.type != type)
5145 return 0;
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005146
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005147 if (event->attr.config != event_id)
5148 return 0;
5149
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005150 if (perf_exclude_event(event, regs))
5151 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005152
5153 return 1;
5154}
5155
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005156static inline u64 swevent_hash(u64 type, u32 event_id)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005157{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005158 u64 val = event_id | (type << 32);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005159
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005160 return hash_64(val, SWEVENT_HLIST_BITS);
5161}
5162
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005163static inline struct hlist_head *
5164__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005165{
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005166 u64 hash = swevent_hash(type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005167
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005168 return &hlist->heads[hash];
5169}
5170
5171/* For the read side: events when they trigger */
5172static inline struct hlist_head *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005173find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005174{
5175 struct swevent_hlist *hlist;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005176
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005177 hlist = rcu_dereference(swhash->swevent_hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005178 if (!hlist)
5179 return NULL;
5180
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005181 return __find_swevent_head(hlist, type, event_id);
5182}
5183
5184/* For the event head insertion and removal in the hlist */
5185static inline struct hlist_head *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005186find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005187{
5188 struct swevent_hlist *hlist;
5189 u32 event_id = event->attr.config;
5190 u64 type = event->attr.type;
5191
5192 /*
5193 * Event scheduling is always serialized against hlist allocation
5194 * and release, which makes the protected version suitable here.
5195 * The context lock guarantees that serialization.
5196 */
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005197 hlist = rcu_dereference_protected(swhash->swevent_hlist,
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005198 lockdep_is_held(&event->ctx->lock));
5199 if (!hlist)
5200 return NULL;
5201
5202 return __find_swevent_head(hlist, type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005203}
5204
5205static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5206 u64 nr, int nmi,
5207 struct perf_sample_data *data,
5208 struct pt_regs *regs)
5209{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005210 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005211 struct perf_event *event;
5212 struct hlist_node *node;
5213 struct hlist_head *head;
5214
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005215 rcu_read_lock();
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005216 head = find_swevent_head_rcu(swhash, type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005217 if (!head)
5218 goto end;
5219
5220 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
Li Zefan6fb29152009-10-15 11:21:42 +08005221 if (perf_swevent_match(event, type, event_id, data, regs))
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005222 perf_swevent_event(event, nr, nmi, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005223 }
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005224end:
5225 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005226}
5227
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01005228int perf_swevent_get_recursion_context(void)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005229{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005230 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01005231
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005232 return get_recursion_context(swhash->recursion);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005233}
Ingo Molnar645e8cc2009-11-22 12:20:19 +01005234EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005235
Jesper Juhlfa9f90b2010-11-28 21:39:34 +01005236inline void perf_swevent_put_recursion_context(int rctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005237{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005238 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02005239
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005240 put_recursion_context(swhash->recursion, rctx);
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01005241}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005242
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005243void __perf_sw_event(u32 event_id, u64 nr, int nmi,
5244 struct pt_regs *regs, u64 addr)
5245{
Ingo Molnara4234bf2009-11-23 10:57:59 +01005246 struct perf_sample_data data;
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01005247 int rctx;
5248
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005249 preempt_disable_notrace();
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01005250 rctx = perf_swevent_get_recursion_context();
5251 if (rctx < 0)
5252 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005253
Peter Zijlstradc1d6282010-03-03 15:55:04 +01005254 perf_sample_data_init(&data, addr);
Ingo Molnara4234bf2009-11-23 10:57:59 +01005255
5256 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01005257
5258 perf_swevent_put_recursion_context(rctx);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005259 preempt_enable_notrace();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005260}
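
/*
 * Illustrative userspace sketch (not part of this file): counting one of the
 * software events that arrive through __perf_sw_event(), here the current
 * task's page faults. Error handling is omitted.
 */
#if 0	/* example only, not compiled as part of the kernel */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_SOFTWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_SW_PAGE_FAULTS;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	/* ... run the workload to be measured ... */
	read(fd, &count, sizeof(count));
	printf("page faults: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}
#endif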
5261
5262static void perf_swevent_read(struct perf_event *event)
5263{
5264}
5265
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005266static int perf_swevent_add(struct perf_event *event, int flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005267{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005268 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005269 struct hw_perf_event *hwc = &event->hw;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005270 struct hlist_head *head;
5271
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01005272 if (is_sampling_event(event)) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005273 hwc->last_period = hwc->sample_period;
5274 perf_swevent_set_period(event);
5275 }
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005276
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005277 hwc->state = !(flags & PERF_EF_START);
5278
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005279 head = find_swevent_head(swhash, event);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005280 if (WARN_ON_ONCE(!head))
5281 return -EINVAL;
5282
5283 hlist_add_head_rcu(&event->hlist_entry, head);
5284
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005285 return 0;
5286}
5287
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005288static void perf_swevent_del(struct perf_event *event, int flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005289{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005290 hlist_del_rcu(&event->hlist_entry);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005291}
5292
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005293static void perf_swevent_start(struct perf_event *event, int flags)
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02005294{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005295 event->hw.state = 0;
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02005296}
5297
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005298static void perf_swevent_stop(struct perf_event *event, int flags)
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02005299{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005300 event->hw.state = PERF_HES_STOPPED;
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02005301}
5302
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005303/* Deref the hlist from the update side */
5304static inline struct swevent_hlist *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005305swevent_hlist_deref(struct swevent_htable *swhash)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005306{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005307 return rcu_dereference_protected(swhash->swevent_hlist,
5308 lockdep_is_held(&swhash->hlist_mutex));
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005309}
5310
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005311static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
5312{
5313 struct swevent_hlist *hlist;
5314
5315 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
5316 kfree(hlist);
5317}
5318
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005319static void swevent_hlist_release(struct swevent_htable *swhash)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005320{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005321 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005322
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005323 if (!hlist)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005324 return;
5325
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005326 rcu_assign_pointer(swhash->swevent_hlist, NULL);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005327 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
5328}
5329
5330static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
5331{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005332 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005333
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005334 mutex_lock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005335
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005336 if (!--swhash->hlist_refcount)
5337 swevent_hlist_release(swhash);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005338
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005339 mutex_unlock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005340}
5341
5342static void swevent_hlist_put(struct perf_event *event)
5343{
5344 int cpu;
5345
5346 if (event->cpu != -1) {
5347 swevent_hlist_put_cpu(event, event->cpu);
5348 return;
5349 }
5350
5351 for_each_possible_cpu(cpu)
5352 swevent_hlist_put_cpu(event, cpu);
5353}
5354
5355static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
5356{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005357 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005358 int err = 0;
5359
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005360 mutex_lock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005361
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005362 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005363 struct swevent_hlist *hlist;
5364
5365 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5366 if (!hlist) {
5367 err = -ENOMEM;
5368 goto exit;
5369 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005370 rcu_assign_pointer(swhash->swevent_hlist, hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005371 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005372 swhash->hlist_refcount++;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02005373exit:
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005374 mutex_unlock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005375
5376 return err;
5377}
5378
5379static int swevent_hlist_get(struct perf_event *event)
5380{
5381 int err;
5382 int cpu, failed_cpu;
5383
5384 if (event->cpu != -1)
5385 return swevent_hlist_get_cpu(event, event->cpu);
5386
5387 get_online_cpus();
5388 for_each_possible_cpu(cpu) {
5389 err = swevent_hlist_get_cpu(event, cpu);
5390 if (err) {
5391 failed_cpu = cpu;
5392 goto fail;
5393 }
5394 }
5395 put_online_cpus();
5396
5397 return 0;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02005398fail:
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005399 for_each_possible_cpu(cpu) {
5400 if (cpu == failed_cpu)
5401 break;
5402 swevent_hlist_put_cpu(event, cpu);
5403 }
5404
5405 put_online_cpus();
5406 return err;
5407}
5408
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005409atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
Frederic Weisbecker95476b62010-04-14 23:42:18 +02005410
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005411static void sw_perf_event_destroy(struct perf_event *event)
5412{
5413 u64 event_id = event->attr.config;
5414
5415 WARN_ON(event->parent);
5416
Peter Zijlstra7e54a5a2010-10-14 22:32:45 +02005417 jump_label_dec(&perf_swevent_enabled[event_id]);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005418 swevent_hlist_put(event);
5419}
5420
5421static int perf_swevent_init(struct perf_event *event)
5422{
5423 int event_id = event->attr.config;
5424
5425 if (event->attr.type != PERF_TYPE_SOFTWARE)
5426 return -ENOENT;
5427
5428 switch (event_id) {
5429 case PERF_COUNT_SW_CPU_CLOCK:
5430 case PERF_COUNT_SW_TASK_CLOCK:
5431 return -ENOENT;
5432
5433 default:
5434 break;
5435 }
5436
Dan Carpenterce677832010-10-24 21:50:42 +02005437 if (event_id >= PERF_COUNT_SW_MAX)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005438 return -ENOENT;
5439
5440 if (!event->parent) {
5441 int err;
5442
5443 err = swevent_hlist_get(event);
5444 if (err)
5445 return err;
5446
Peter Zijlstra7e54a5a2010-10-14 22:32:45 +02005447 jump_label_inc(&perf_swevent_enabled[event_id]);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005448 event->destroy = sw_perf_event_destroy;
5449 }
5450
5451 return 0;
5452}
5453
5454static struct pmu perf_swevent = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005455 .task_ctx_nr = perf_sw_context,
5456
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005457 .event_init = perf_swevent_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005458 .add = perf_swevent_add,
5459 .del = perf_swevent_del,
5460 .start = perf_swevent_start,
5461 .stop = perf_swevent_stop,
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005462 .read = perf_swevent_read,
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005463};
Frederic Weisbecker95476b62010-04-14 23:42:18 +02005464
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005465#ifdef CONFIG_EVENT_TRACING
5466
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005467static int perf_tp_filter_match(struct perf_event *event,
Frederic Weisbecker95476b62010-04-14 23:42:18 +02005468 struct perf_sample_data *data)
5469{
5470 void *record = data->raw->data;
5471
5472 if (likely(!event->filter) || filter_match_preds(event->filter, record))
5473 return 1;
5474 return 0;
5475}
5476
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005477static int perf_tp_event_match(struct perf_event *event,
5478 struct perf_sample_data *data,
5479 struct pt_regs *regs)
5480{
Peter Zijlstra580d6072010-05-20 20:54:31 +02005481 /*
5482 * All tracepoints are from kernel-space.
5483 */
5484 if (event->attr.exclude_kernel)
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005485 return 0;
5486
5487 if (!perf_tp_filter_match(event, data))
5488 return 0;
5489
5490 return 1;
5491}
5492
5493void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
Peter Zijlstraecc55f82010-05-21 15:11:34 +02005494 struct pt_regs *regs, struct hlist_head *head, int rctx)
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005495{
5496 struct perf_sample_data data;
5497 struct perf_event *event;
5498 struct hlist_node *node;
5499
5500 struct perf_raw_record raw = {
5501 .size = entry_size,
5502 .data = record,
5503 };
5504
5505 perf_sample_data_init(&data, addr);
5506 data.raw = &raw;
5507
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005508 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5509 if (perf_tp_event_match(event, &data, regs))
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005510 perf_swevent_event(event, count, 1, &data, regs);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005511 }
Peter Zijlstraecc55f82010-05-21 15:11:34 +02005512
5513 perf_swevent_put_recursion_context(rctx);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005514}
5515EXPORT_SYMBOL_GPL(perf_tp_event);
5516
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005517static void tp_perf_event_destroy(struct perf_event *event)
5518{
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005519 perf_trace_destroy(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005520}
5521
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005522static int perf_tp_event_init(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005523{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005524 int err;
5525
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005526 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5527 return -ENOENT;
5528
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005529 err = perf_trace_init(event);
5530 if (err)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005531 return err;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005532
5533 event->destroy = tp_perf_event_destroy;
5534
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005535 return 0;
5536}
5537
5538static struct pmu perf_tracepoint = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005539 .task_ctx_nr = perf_sw_context,
5540
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005541 .event_init = perf_tp_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005542 .add = perf_trace_add,
5543 .del = perf_trace_del,
5544 .start = perf_swevent_start,
5545 .stop = perf_swevent_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005546 .read = perf_swevent_read,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005547};
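
/*
 * Illustrative userspace sketch (not part of this file): opening an event on
 * this tracepoint PMU. The config value is the tracepoint id; the debugfs
 * path below is an assumption (it depends on where debugfs is mounted) and
 * error handling is omitted.
 */
#if 0	/* example only, not compiled as part of the kernel */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_tracepoint(void)
{
	struct perf_event_attr attr;
	unsigned long long id = 0;
	FILE *f;

	f = fopen("/sys/kernel/debug/tracing/events/sched/sched_switch/id", "r");
	fscanf(f, "%llu", &id);
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_TRACEPOINT;
	attr.size = sizeof(attr);
	attr.config = id;
	attr.sample_period = 1;

	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}
#endif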
5548
5549static inline void perf_tp_register(void)
5550{
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005551 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005552}
Li Zefan6fb29152009-10-15 11:21:42 +08005553
5554static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5555{
5556 char *filter_str;
5557 int ret;
5558
5559 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5560 return -EINVAL;
5561
5562 filter_str = strndup_user(arg, PAGE_SIZE);
5563 if (IS_ERR(filter_str))
5564 return PTR_ERR(filter_str);
5565
5566 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
5567
5568 kfree(filter_str);
5569 return ret;
5570}
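
/*
 * Illustrative userspace sketch (not part of this file): the ioctl that
 * reaches perf_event_set_filter() above. It only succeeds on
 * PERF_TYPE_TRACEPOINT events; the filter uses the ftrace event filter
 * syntax and the field name in the example expression is an assumption
 * about the chosen tracepoint.
 */
#if 0	/* example only, not compiled as part of the kernel */
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int set_tracepoint_filter(int perf_fd)
{
	/* keep only events whose bytes_req field exceeds 4096 */
	return ioctl(perf_fd, PERF_EVENT_IOC_SET_FILTER, "bytes_req > 4096");
}
#endif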
5571
5572static void perf_event_free_filter(struct perf_event *event)
5573{
5574 ftrace_profile_free_filter(event);
5575}
5576
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005577#else
Li Zefan6fb29152009-10-15 11:21:42 +08005578
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005579static inline void perf_tp_register(void)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005580{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005581}
Li Zefan6fb29152009-10-15 11:21:42 +08005582
5583static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5584{
5585 return -ENOENT;
5586}
5587
5588static void perf_event_free_filter(struct perf_event *event)
5589{
5590}
5591
Li Zefan07b139c2009-12-21 14:27:35 +08005592#endif /* CONFIG_EVENT_TRACING */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005593
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02005594#ifdef CONFIG_HAVE_HW_BREAKPOINT
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005595void perf_bp_event(struct perf_event *bp, void *data)
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02005596{
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005597 struct perf_sample_data sample;
5598 struct pt_regs *regs = data;
5599
Peter Zijlstradc1d6282010-03-03 15:55:04 +01005600 perf_sample_data_init(&sample, bp->attr.bp_addr);
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005601
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005602 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5603 perf_swevent_event(bp, 1, 1, &sample, regs);
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02005604}
5605#endif
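
/*
 * Illustrative userspace sketch (not part of this file): a hardware data
 * breakpoint opened through perf_event_open(); its hits are delivered to
 * perf_bp_event() above. Watching a 4-byte write is an arbitrary choice for
 * the example and error handling is omitted.
 */
#if 0	/* example only, not compiled as part of the kernel */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static int watch_writes(void *addr)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.size = sizeof(attr);
	attr.bp_type = HW_BREAKPOINT_W;
	attr.bp_addr = (unsigned long)addr;
	attr.bp_len = HW_BREAKPOINT_LEN_4;
	attr.sample_period = 1;

	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}
#endif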
5606
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005607/*
5608 * hrtimer based swevent callback
5609 */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005610
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005611static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005612{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005613 enum hrtimer_restart ret = HRTIMER_RESTART;
5614 struct perf_sample_data data;
5615 struct pt_regs *regs;
5616 struct perf_event *event;
5617 u64 period;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005618
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005619 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005620
5621 if (event->state != PERF_EVENT_STATE_ACTIVE)
5622 return HRTIMER_NORESTART;
5623
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005624 event->pmu->read(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005625
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005626 perf_sample_data_init(&data, 0);
5627 data.period = event->hw.last_period;
5628 regs = get_irq_regs();
5629
5630 if (regs && !perf_exclude_event(event, regs)) {
5631 if (!(event->attr.exclude_idle && current->pid == 0))
5632 if (perf_event_overflow(event, 0, &data, regs))
5633 ret = HRTIMER_NORESTART;
5634 }
5635
5636 period = max_t(u64, 10000, event->hw.sample_period);
5637 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
5638
5639 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005640}
5641
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005642static void perf_swevent_start_hrtimer(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005643{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005644 struct hw_perf_event *hwc = &event->hw;
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01005645 s64 period;
5646
5647 if (!is_sampling_event(event))
5648 return;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005649
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01005650 period = local64_read(&hwc->period_left);
5651 if (period) {
5652 if (period < 0)
5653 period = 10000;
Peter Zijlstrafa407f32010-06-24 12:35:12 +02005654
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01005655 local64_set(&hwc->period_left, 0);
5656 } else {
5657 period = max_t(u64, 10000, hwc->sample_period);
5658 }
5659 __hrtimer_start_range_ns(&hwc->hrtimer,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005660 ns_to_ktime(period), 0,
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02005661 HRTIMER_MODE_REL_PINNED, 0);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005662}
5663
5664static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5665{
5666 struct hw_perf_event *hwc = &event->hw;
5667
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01005668 if (is_sampling_event(event)) {
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005669 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
Peter Zijlstrafa407f32010-06-24 12:35:12 +02005670 local64_set(&hwc->period_left, ktime_to_ns(remaining));
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005671
5672 hrtimer_cancel(&hwc->hrtimer);
5673 }
5674}
5675
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005676static void perf_swevent_init_hrtimer(struct perf_event *event)
5677{
5678 struct hw_perf_event *hwc = &event->hw;
5679
5680 if (!is_sampling_event(event))
5681 return;
5682
5683 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5684 hwc->hrtimer.function = perf_swevent_hrtimer;
5685
5686 /*
5687 * Since hrtimers have a fixed rate, we can do a static freq->period
5688 * mapping and avoid the whole period adjust feedback stuff.
5689 */
5690 if (event->attr.freq) {
5691 long freq = event->attr.sample_freq;
5692
5693 event->attr.sample_period = NSEC_PER_SEC / freq;
5694 hwc->sample_period = event->attr.sample_period;
5695 local64_set(&hwc->period_left, hwc->sample_period);
5696 event->attr.freq = 0;
5697 }
5698}
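
/*
 * Small standalone sketch (not part of this file) of the static freq->period
 * mapping done in perf_swevent_init_hrtimer() above: because the hrtimer
 * ticks at a fixed rate, a requested sample_freq is converted once into a
 * period in nanoseconds instead of going through the adaptive period
 * adjustment used for hardware events.
 */
#if 0	/* example only, not compiled as part of the kernel */
#include <stdint.h>

#define EXAMPLE_NSEC_PER_SEC	1000000000ULL

static uint64_t freq_to_period_ns(uint64_t sample_freq)
{
	return EXAMPLE_NSEC_PER_SEC / sample_freq;	/* 1000 Hz -> 1000000 ns */
}
#endif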
5699
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005700/*
5701 * Software event: cpu wall time clock
5702 */
5703
5704static void cpu_clock_event_update(struct perf_event *event)
5705{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005706 s64 prev;
5707 u64 now;
5708
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005709 now = local_clock();
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005710 prev = local64_xchg(&event->hw.prev_count, now);
5711 local64_add(now - prev, &event->count);
5712}
5713
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005714static void cpu_clock_event_start(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005715{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005716 local64_set(&event->hw.prev_count, local_clock());
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005717 perf_swevent_start_hrtimer(event);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005718}
5719
5720static void cpu_clock_event_stop(struct perf_event *event, int flags)
5721{
5722 perf_swevent_cancel_hrtimer(event);
5723 cpu_clock_event_update(event);
5724}
5725
5726static int cpu_clock_event_add(struct perf_event *event, int flags)
5727{
5728 if (flags & PERF_EF_START)
5729 cpu_clock_event_start(event, flags);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005730
5731 return 0;
5732}
5733
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005734static void cpu_clock_event_del(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005735{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005736 cpu_clock_event_stop(event, flags);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005737}
5738
5739static void cpu_clock_event_read(struct perf_event *event)
5740{
5741 cpu_clock_event_update(event);
5742}
5743
5744static int cpu_clock_event_init(struct perf_event *event)
5745{
5746 if (event->attr.type != PERF_TYPE_SOFTWARE)
5747 return -ENOENT;
5748
5749 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5750 return -ENOENT;
5751
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005752 perf_swevent_init_hrtimer(event);
5753
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005754 return 0;
5755}
5756
5757static struct pmu perf_cpu_clock = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005758 .task_ctx_nr = perf_sw_context,
5759
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005760 .event_init = cpu_clock_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005761 .add = cpu_clock_event_add,
5762 .del = cpu_clock_event_del,
5763 .start = cpu_clock_event_start,
5764 .stop = cpu_clock_event_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005765 .read = cpu_clock_event_read,
5766};
5767
5768/*
5769 * Software event: task time clock
5770 */
5771
5772static void task_clock_event_update(struct perf_event *event, u64 now)
5773{
5774 u64 prev;
5775 s64 delta;
5776
5777 prev = local64_xchg(&event->hw.prev_count, now);
5778 delta = now - prev;
5779 local64_add(delta, &event->count);
5780}
5781
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005782static void task_clock_event_start(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005783{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005784 local64_set(&event->hw.prev_count, event->ctx->time);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005785 perf_swevent_start_hrtimer(event);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005786}
5787
5788static void task_clock_event_stop(struct perf_event *event, int flags)
5789{
5790 perf_swevent_cancel_hrtimer(event);
5791 task_clock_event_update(event, event->ctx->time);
5792}
5793
5794static int task_clock_event_add(struct perf_event *event, int flags)
5795{
5796 if (flags & PERF_EF_START)
5797 task_clock_event_start(event, flags);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005798
5799 return 0;
5800}
5801
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005802static void task_clock_event_del(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005803{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005804 task_clock_event_stop(event, PERF_EF_UPDATE);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005805}
5806
5807static void task_clock_event_read(struct perf_event *event)
5808{
Peter Zijlstra768a06e2011-02-22 16:52:24 +01005809 u64 now = perf_clock();
5810 u64 delta = now - event->ctx->timestamp;
5811 u64 time = event->ctx->time + delta;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005812
5813 task_clock_event_update(event, time);
5814}
5815
5816static int task_clock_event_init(struct perf_event *event)
5817{
5818 if (event->attr.type != PERF_TYPE_SOFTWARE)
5819 return -ENOENT;
5820
5821 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5822 return -ENOENT;
5823
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005824 perf_swevent_init_hrtimer(event);
5825
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005826 return 0;
5827}
5828
5829static struct pmu perf_task_clock = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005830 .task_ctx_nr = perf_sw_context,
5831
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005832 .event_init = task_clock_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005833 .add = task_clock_event_add,
5834 .del = task_clock_event_del,
5835 .start = task_clock_event_start,
5836 .stop = task_clock_event_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005837 .read = task_clock_event_read,
5838};
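
/*
 * Usage sketch (illustrative, not part of the original source): either
 * clock can be opened from user space through sys_perf_event_open(), e.g.
 *
 *	struct perf_event_attr attr = {
 *		.type	= PERF_TYPE_SOFTWARE,
 *		.config	= PERF_COUNT_SW_TASK_CLOCK,
 *		.size	= sizeof(attr),
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *
 * Each *_event_init() only accepts its own attr.config value and returns
 * -ENOENT otherwise, which lets perf_init_event() move on to the next pmu.
 */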
5839
Peter Zijlstraad5133b2010-06-15 12:22:39 +02005840static void perf_pmu_nop_void(struct pmu *pmu)
5841{
5842}
5843
5844static int perf_pmu_nop_int(struct pmu *pmu)
5845{
5846 return 0;
5847}
5848
5849static void perf_pmu_start_txn(struct pmu *pmu)
5850{
5851 perf_pmu_disable(pmu);
5852}
5853
5854static int perf_pmu_commit_txn(struct pmu *pmu)
5855{
5856 perf_pmu_enable(pmu);
5857 return 0;
5858}
5859
5860static void perf_pmu_cancel_txn(struct pmu *pmu)
5861{
5862 perf_pmu_enable(pmu);
5863}
5864
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02005865/*
5866 * Ensures all contexts with the same task_ctx_nr have the same
5867 * pmu_cpu_context too.
5868 */
5869static void *find_pmu_context(int ctxn)
5870{
5871 struct pmu *pmu;
5872
5873 if (ctxn < 0)
5874 return NULL;
5875
5876 list_for_each_entry(pmu, &pmus, entry) {
5877 if (pmu->task_ctx_nr == ctxn)
5878 return pmu->pmu_cpu_context;
5879 }
5880
5881 return NULL;
5882}
5883
Peter Zijlstra51676952010-12-07 14:18:20 +01005884static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02005885{
Peter Zijlstra51676952010-12-07 14:18:20 +01005886 int cpu;
5887
5888 for_each_possible_cpu(cpu) {
5889 struct perf_cpu_context *cpuctx;
5890
5891 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5892
5893 if (cpuctx->active_pmu == old_pmu)
5894 cpuctx->active_pmu = pmu;
5895 }
5896}
5897
5898static void free_pmu_context(struct pmu *pmu)
5899{
5900 struct pmu *i;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02005901
5902 mutex_lock(&pmus_lock);
5903 /*
5904	 * A crude refcount: only free the shared pmu_cpu_context once no
5904	 * other pmu still references it.
5905 */
Peter Zijlstra51676952010-12-07 14:18:20 +01005906 list_for_each_entry(i, &pmus, entry) {
5907 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5908 update_pmu_context(i, pmu);
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02005909 goto out;
Peter Zijlstra51676952010-12-07 14:18:20 +01005910 }
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02005911 }
5912
Peter Zijlstra51676952010-12-07 14:18:20 +01005913 free_percpu(pmu->pmu_cpu_context);
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02005914out:
5915 mutex_unlock(&pmus_lock);
5916}
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005917static struct idr pmu_idr;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02005918
Peter Zijlstraabe43402010-11-17 23:17:37 +01005919static ssize_t
5920type_show(struct device *dev, struct device_attribute *attr, char *page)
5921{
5922 struct pmu *pmu = dev_get_drvdata(dev);
5923
5924 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5925}
5926
5927static struct device_attribute pmu_dev_attrs[] = {
5928 __ATTR_RO(type),
5929 __ATTR_NULL,
5930};
5931
5932static int pmu_bus_running;
5933static struct bus_type pmu_bus = {
5934 .name = "event_source",
5935 .dev_attrs = pmu_dev_attrs,
5936};
5937
5938static void pmu_dev_release(struct device *dev)
5939{
5940 kfree(dev);
5941}
5942
5943static int pmu_dev_alloc(struct pmu *pmu)
5944{
5945 int ret = -ENOMEM;
5946
5947 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5948 if (!pmu->dev)
5949 goto out;
5950
5951 device_initialize(pmu->dev);
5952 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5953 if (ret)
5954 goto free_dev;
5955
5956 dev_set_drvdata(pmu->dev, pmu);
5957 pmu->dev->bus = &pmu_bus;
5958 pmu->dev->release = pmu_dev_release;
5959 ret = device_add(pmu->dev);
5960 if (ret)
5961 goto free_dev;
5962
5963out:
5964 return ret;
5965
5966free_dev:
5967 put_device(pmu->dev);
5968 goto out;
5969}
5970
Peter Zijlstra547e9fd2011-01-19 12:51:39 +01005971static struct lock_class_key cpuctx_mutex;
5972
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005973int perf_pmu_register(struct pmu *pmu, char *name, int type)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005974{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02005975 int cpu, ret;
Peter Zijlstra33696fc2010-06-14 08:49:00 +02005976
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005977 mutex_lock(&pmus_lock);
Peter Zijlstra33696fc2010-06-14 08:49:00 +02005978 ret = -ENOMEM;
5979 pmu->pmu_disable_count = alloc_percpu(int);
5980 if (!pmu->pmu_disable_count)
5981 goto unlock;
Peter Zijlstraad5133b2010-06-15 12:22:39 +02005982
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005983 pmu->type = -1;
5984 if (!name)
5985 goto skip_type;
5986 pmu->name = name;
5987
5988 if (type < 0) {
5989 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5990 if (!err)
5991 goto free_pdc;
5992
5993 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5994 if (err) {
5995 ret = err;
5996 goto free_pdc;
5997 }
5998 }
5999 pmu->type = type;
6000
Peter Zijlstraabe43402010-11-17 23:17:37 +01006001 if (pmu_bus_running) {
6002 ret = pmu_dev_alloc(pmu);
6003 if (ret)
6004 goto free_idr;
6005 }
6006
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006007skip_type:
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006008 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
6009 if (pmu->pmu_cpu_context)
6010 goto got_cpu_context;
6011
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006012 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
6013 if (!pmu->pmu_cpu_context)
Peter Zijlstraabe43402010-11-17 23:17:37 +01006014 goto free_dev;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006015
6016 for_each_possible_cpu(cpu) {
6017 struct perf_cpu_context *cpuctx;
6018
6019 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
Peter Zijlstraeb184472010-09-07 15:55:13 +02006020 __perf_event_init_context(&cpuctx->ctx);
Peter Zijlstra547e9fd2011-01-19 12:51:39 +01006021 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006022 cpuctx->ctx.type = cpu_context;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006023 cpuctx->ctx.pmu = pmu;
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02006024 cpuctx->jiffies_interval = 1;
6025 INIT_LIST_HEAD(&cpuctx->rotation_list);
Peter Zijlstra51676952010-12-07 14:18:20 +01006026 cpuctx->active_pmu = pmu;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006027 }
6028
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006029got_cpu_context:
Peter Zijlstraad5133b2010-06-15 12:22:39 +02006030 if (!pmu->start_txn) {
6031 if (pmu->pmu_enable) {
6032 /*
6033 * If we have pmu_enable/pmu_disable calls, install
6034 * transaction stubs that use that to try and batch
6035 * hardware accesses.
6036 */
6037 pmu->start_txn = perf_pmu_start_txn;
6038 pmu->commit_txn = perf_pmu_commit_txn;
6039 pmu->cancel_txn = perf_pmu_cancel_txn;
6040 } else {
6041 pmu->start_txn = perf_pmu_nop_void;
6042 pmu->commit_txn = perf_pmu_nop_int;
6043 pmu->cancel_txn = perf_pmu_nop_void;
6044 }
6045 }
6046
6047 if (!pmu->pmu_enable) {
6048 pmu->pmu_enable = perf_pmu_nop_void;
6049 pmu->pmu_disable = perf_pmu_nop_void;
6050 }
6051
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006052 list_add_rcu(&pmu->entry, &pmus);
Peter Zijlstra33696fc2010-06-14 08:49:00 +02006053 ret = 0;
6054unlock:
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006055 mutex_unlock(&pmus_lock);
6056
Peter Zijlstra33696fc2010-06-14 08:49:00 +02006057 return ret;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006058
Peter Zijlstraabe43402010-11-17 23:17:37 +01006059free_dev:
6060 device_del(pmu->dev);
6061 put_device(pmu->dev);
6062
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006063free_idr:
6064 if (pmu->type >= PERF_TYPE_MAX)
6065 idr_remove(&pmu_idr, pmu->type);
6066
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006067free_pdc:
6068 free_percpu(pmu->pmu_disable_count);
6069 goto unlock;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006070}
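
/*
 * Registration sketch (hypothetical driver, not from this file): a minimal
 * software-context PMU would typically be registered as
 *
 *	static struct pmu my_pmu = {
 *		.task_ctx_nr	= perf_sw_context,
 *		.event_init	= my_event_init,
 *		.add		= my_add,
 *		.del		= my_del,
 *		.start		= my_start,
 *		.stop		= my_stop,
 *		.read		= my_read,
 *	};
 *
 *	perf_pmu_register(&my_pmu, "my_pmu", -1);
 *
 * Passing type == -1 requests a dynamic id from pmu_idr; the transaction
 * and pmu_{enable,disable} callbacks are optional and are filled in with
 * the nop/txn stubs above when absent.
 */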
6071
6072void perf_pmu_unregister(struct pmu *pmu)
6073{
6074 mutex_lock(&pmus_lock);
6075 list_del_rcu(&pmu->entry);
6076 mutex_unlock(&pmus_lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006077
6078 /*
Peter Zijlstracde8e882010-09-13 11:06:55 +02006079 * We dereference the pmu list under both SRCU and regular RCU, so
6080 * synchronize against both of those.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006081 */
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006082 synchronize_srcu(&pmus_srcu);
Peter Zijlstracde8e882010-09-13 11:06:55 +02006083 synchronize_rcu();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006084
Peter Zijlstra33696fc2010-06-14 08:49:00 +02006085 free_percpu(pmu->pmu_disable_count);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006086 if (pmu->type >= PERF_TYPE_MAX)
6087 idr_remove(&pmu_idr, pmu->type);
Peter Zijlstraabe43402010-11-17 23:17:37 +01006088 device_del(pmu->dev);
6089 put_device(pmu->dev);
Peter Zijlstra51676952010-12-07 14:18:20 +01006090 free_pmu_context(pmu);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006091}
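
/*
 * Note (added for clarity, not in the original source): unregistering only
 * unlinks the pmu from the pmus list, waits for both SRCU and plain RCU
 * readers to drain (perf_init_event() uses both), and only then removes the
 * idr entry, the sysfs device and the per-cpu state.
 */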
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006092
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006093struct pmu *perf_init_event(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006094{
Peter Zijlstra51b0fe32010-06-11 13:35:57 +02006095 struct pmu *pmu = NULL;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006096 int idx;
Lin Ming940c5b22011-02-27 21:13:31 +08006097 int ret;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006098
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006099 idx = srcu_read_lock(&pmus_srcu);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006100
6101 rcu_read_lock();
6102 pmu = idr_find(&pmu_idr, event->attr.type);
6103 rcu_read_unlock();
Lin Ming940c5b22011-02-27 21:13:31 +08006104 if (pmu) {
6105 ret = pmu->event_init(event);
6106 if (ret)
6107 pmu = ERR_PTR(ret);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006108 goto unlock;
Lin Ming940c5b22011-02-27 21:13:31 +08006109 }
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006110
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006111 list_for_each_entry_rcu(pmu, &pmus, entry) {
Lin Ming940c5b22011-02-27 21:13:31 +08006112 ret = pmu->event_init(event);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006113 if (!ret)
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02006114 goto unlock;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006115
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006116 if (ret != -ENOENT) {
6117 pmu = ERR_PTR(ret);
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02006118 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006119 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006120 }
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02006121 pmu = ERR_PTR(-ENOENT);
6122unlock:
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006123 srcu_read_unlock(&pmus_srcu, idx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006124
6125 return pmu;
6126}
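
/*
 * Note (added for clarity, not in the original source): attr.type is first
 * looked up in pmu_idr, which only holds PMUs that were handed a dynamically
 * allocated id (above PERF_TYPE_MAX) at registration time.  If that misses,
 * every registered pmu is offered the event in turn; ->event_init()
 * returning -ENOENT means "not mine, keep looking", while any other error
 * aborts the search.
 */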
6127
6128/*
6129 * Allocate and initialize an event structure
6130 */
6131static struct perf_event *
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006132perf_event_alloc(struct perf_event_attr *attr, int cpu,
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006133 struct task_struct *task,
6134 struct perf_event *group_leader,
6135 struct perf_event *parent_event,
6136 perf_overflow_handler_t overflow_handler)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006137{
Peter Zijlstra51b0fe32010-06-11 13:35:57 +02006138 struct pmu *pmu;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006139 struct perf_event *event;
6140 struct hw_perf_event *hwc;
6141 long err;
6142
Oleg Nesterov66832eb2011-01-18 17:10:32 +01006143 if ((unsigned)cpu >= nr_cpu_ids) {
6144 if (!task || cpu != -1)
6145 return ERR_PTR(-EINVAL);
6146 }
6147
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006148 event = kzalloc(sizeof(*event), GFP_KERNEL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006149 if (!event)
6150 return ERR_PTR(-ENOMEM);
6151
6152 /*
6153 * Single events are their own group leaders, with an
6154 * empty sibling list:
6155 */
6156 if (!group_leader)
6157 group_leader = event;
6158
6159 mutex_init(&event->child_mutex);
6160 INIT_LIST_HEAD(&event->child_list);
6161
6162 INIT_LIST_HEAD(&event->group_entry);
6163 INIT_LIST_HEAD(&event->event_entry);
6164 INIT_LIST_HEAD(&event->sibling_list);
6165 init_waitqueue_head(&event->waitq);
Peter Zijlstrae360adb2010-10-14 14:01:34 +08006166 init_irq_work(&event->pending, perf_pending_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006167
6168 mutex_init(&event->mmap_mutex);
6169
6170 event->cpu = cpu;
6171 event->attr = *attr;
6172 event->group_leader = group_leader;
6173 event->pmu = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006174 event->oncpu = -1;
6175
6176 event->parent = parent_event;
6177
6178 event->ns = get_pid_ns(current->nsproxy->pid_ns);
6179 event->id = atomic64_inc_return(&perf_event_id);
6180
6181 event->state = PERF_EVENT_STATE_INACTIVE;
6182
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006183 if (task) {
6184 event->attach_state = PERF_ATTACH_TASK;
6185#ifdef CONFIG_HAVE_HW_BREAKPOINT
6186 /*
6187 * hw_breakpoint is a bit difficult here..
6188 */
6189 if (attr->type == PERF_TYPE_BREAKPOINT)
6190 event->hw.bp_target = task;
6191#endif
6192 }
6193
Frederic Weisbeckerb326e952009-12-05 09:44:31 +01006194 if (!overflow_handler && parent_event)
6195 overflow_handler = parent_event->overflow_handler;
Oleg Nesterov66832eb2011-01-18 17:10:32 +01006196
Frederic Weisbeckerb326e952009-12-05 09:44:31 +01006197 event->overflow_handler = overflow_handler;
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02006198
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006199 if (attr->disabled)
6200 event->state = PERF_EVENT_STATE_OFF;
6201
6202 pmu = NULL;
6203
6204 hwc = &event->hw;
6205 hwc->sample_period = attr->sample_period;
6206 if (attr->freq && attr->sample_freq)
6207 hwc->sample_period = 1;
6208 hwc->last_period = hwc->sample_period;
6209
Peter Zijlstrae7850592010-05-21 14:43:08 +02006210 local64_set(&hwc->period_left, hwc->sample_period);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006211
6212 /*
6213 * we currently do not support PERF_FORMAT_GROUP on inherited events
6214 */
6215 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
6216 goto done;
6217
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006218 pmu = perf_init_event(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006219
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006220done:
6221 err = 0;
6222 if (!pmu)
6223 err = -EINVAL;
6224 else if (IS_ERR(pmu))
6225 err = PTR_ERR(pmu);
6226
6227 if (err) {
6228 if (event->ns)
6229 put_pid_ns(event->ns);
6230 kfree(event);
6231 return ERR_PTR(err);
6232 }
6233
6234 event->pmu = pmu;
6235
6236 if (!event->parent) {
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02006237 if (event->attach_state & PERF_ATTACH_TASK)
Stephane Eraniane5d13672011-02-14 11:20:01 +02006238 jump_label_inc(&perf_sched_events);
Eric B Munson3af9e852010-05-18 15:30:49 +01006239 if (event->attr.mmap || event->attr.mmap_data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006240 atomic_inc(&nr_mmap_events);
6241 if (event->attr.comm)
6242 atomic_inc(&nr_comm_events);
6243 if (event->attr.task)
6244 atomic_inc(&nr_task_events);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02006245 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6246 err = get_callchain_buffers();
6247 if (err) {
6248 free_event(event);
6249 return ERR_PTR(err);
6250 }
6251 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006252 }
6253
6254 return event;
6255}
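
/*
 * Note (added for clarity, not in the original source): only top-level
 * events (event->parent == NULL) touch the global bookkeeping above - the
 * nr_mmap/comm/task_events counters, the perf_sched_events jump label for
 * per-task events and the callchain buffers - so inherited child events
 * never change those counts.
 */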
6256
6257static int perf_copy_attr(struct perf_event_attr __user *uattr,
6258 struct perf_event_attr *attr)
6259{
6260 u32 size;
6261 int ret;
6262
6263 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
6264 return -EFAULT;
6265
6266 /*
6267	 * zero the full structure, so that a short copy leaves the newer
6267	 * fields zeroed.
6268 */
6269 memset(attr, 0, sizeof(*attr));
6270
6271 ret = get_user(size, &uattr->size);
6272 if (ret)
6273 return ret;
6274
6275 if (size > PAGE_SIZE) /* silly large */
6276 goto err_size;
6277
6278 if (!size) /* abi compat */
6279 size = PERF_ATTR_SIZE_VER0;
6280
6281 if (size < PERF_ATTR_SIZE_VER0)
6282 goto err_size;
6283
6284 /*
6285 * If we're handed a bigger struct than we know of,
6286 * ensure all the unknown bits are 0 - i.e. new
6287 * user-space does not rely on any kernel feature
6288	 * extensions we don't know about yet.
6289 */
6290 if (size > sizeof(*attr)) {
6291 unsigned char __user *addr;
6292 unsigned char __user *end;
6293 unsigned char val;
6294
6295 addr = (void __user *)uattr + sizeof(*attr);
6296 end = (void __user *)uattr + size;
6297
6298 for (; addr < end; addr++) {
6299 ret = get_user(val, addr);
6300 if (ret)
6301 return ret;
6302 if (val)
6303 goto err_size;
6304 }
6305 size = sizeof(*attr);
6306 }
6307
6308 ret = copy_from_user(attr, uattr, size);
6309 if (ret)
6310 return -EFAULT;
6311
6312 /*
6313	 * If the type exists, the corresponding pmu's ->event_init()
6314	 * will verify the attr->config.
6315 */
6316 if (attr->type >= PERF_TYPE_MAX)
6317 return -EINVAL;
6318
Mahesh Salgaonkarcd757642010-01-30 10:25:18 +05306319 if (attr->__reserved_1)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006320 return -EINVAL;
6321
6322 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
6323 return -EINVAL;
6324
6325 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
6326 return -EINVAL;
6327
6328out:
6329 return ret;
6330
6331err_size:
6332 put_user(sizeof(*attr), &uattr->size);
6333 ret = -E2BIG;
6334 goto out;
6335}
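
/*
 * ABI example (illustrative, not part of the original source): an old
 * binary built against PERF_ATTR_SIZE_VER0 passes a smaller uattr and the
 * memset() above leaves all newer fields zeroed; a newer binary may pass a
 * larger struct, but every byte beyond sizeof(*attr) must be zero or we
 * fail with -E2BIG and report the size we do understand via uattr->size.
 */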
6336
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006337static int
6338perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006339{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02006340 struct perf_buffer *buffer = NULL, *old_buffer = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006341 int ret = -EINVAL;
6342
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006343 if (!output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006344 goto set;
6345
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006346 /* don't allow circular references */
6347 if (event == output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006348 goto out;
6349
Peter Zijlstra0f139302010-05-20 14:35:15 +02006350 /*
6351 * Don't allow cross-cpu buffers
6352 */
6353 if (output_event->cpu != event->cpu)
6354 goto out;
6355
6356 /*
6357	 * If it's not a per-cpu buffer, it must be the same task.
6358 */
6359 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6360 goto out;
6361
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006362set:
6363 mutex_lock(&event->mmap_mutex);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006364 /* Can't redirect output if we've got an active mmap() */
6365 if (atomic_read(&event->mmap_count))
6366 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006367
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006368 if (output_event) {
6369 /* get the buffer we want to redirect to */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02006370 buffer = perf_buffer_get(output_event);
6371 if (!buffer)
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006372 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006373 }
6374
Peter Zijlstraca5135e2010-05-28 19:33:23 +02006375 old_buffer = event->buffer;
6376 rcu_assign_pointer(event->buffer, buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006377 ret = 0;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006378unlock:
6379 mutex_unlock(&event->mmap_mutex);
6380
Peter Zijlstraca5135e2010-05-28 19:33:23 +02006381 if (old_buffer)
6382 perf_buffer_put(old_buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006383out:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006384 return ret;
6385}
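
/*
 * Illustrative note (not in the original source): this backs the
 * PERF_FLAG_FD_OUTPUT open flag - a second event can reuse another event's
 * ring buffer instead of mmap()ing its own, e.g. a user-side sketch:
 *
 *	fd2 = syscall(__NR_perf_event_open, &attr2, pid, cpu, fd1,
 *		      PERF_FLAG_FD_OUTPUT);
 *
 * The redirect is refused once the event already has an active mmap(), and
 * the two events must share the same CPU (or the same task context for
 * per-task buffers).
 */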
6386
6387/**
6388 * sys_perf_event_open - open a performance event, associate it with a task/cpu
6389 *
6390 * @attr_uptr: event_id type attributes for monitoring/sampling
6391 * @pid: target pid
6392 * @cpu: target cpu
6393 * @group_fd: group leader event fd
6394 */
6395SYSCALL_DEFINE5(perf_event_open,
6396 struct perf_event_attr __user *, attr_uptr,
6397 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
6398{
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006399 struct perf_event *group_leader = NULL, *output_event = NULL;
6400 struct perf_event *event, *sibling;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006401 struct perf_event_attr attr;
6402 struct perf_event_context *ctx;
6403 struct file *event_file = NULL;
6404 struct file *group_file = NULL;
Matt Helsley38a81da2010-09-13 13:01:20 -07006405 struct task_struct *task = NULL;
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006406 struct pmu *pmu;
Al Viroea635c62010-05-26 17:40:29 -04006407 int event_fd;
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006408 int move_group = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006409 int fput_needed = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006410 int err;
6411
6412 /* for future expandability... */
Stephane Eraniane5d13672011-02-14 11:20:01 +02006413 if (flags & ~PERF_FLAG_ALL)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006414 return -EINVAL;
6415
6416 err = perf_copy_attr(attr_uptr, &attr);
6417 if (err)
6418 return err;
6419
6420 if (!attr.exclude_kernel) {
6421 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6422 return -EACCES;
6423 }
6424
6425 if (attr.freq) {
6426 if (attr.sample_freq > sysctl_perf_event_sample_rate)
6427 return -EINVAL;
6428 }
6429
Stephane Eraniane5d13672011-02-14 11:20:01 +02006430 /*
6431 * In cgroup mode, the pid argument is used to pass the fd
6432 * opened to the cgroup directory in cgroupfs. The cpu argument
6433 * designates the cpu on which to monitor threads from that
6434 * cgroup.
6435 */
6436 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6437 return -EINVAL;
6438
Al Viroea635c62010-05-26 17:40:29 -04006439 event_fd = get_unused_fd_flags(O_RDWR);
6440 if (event_fd < 0)
6441 return event_fd;
6442
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006443 if (group_fd != -1) {
6444 group_leader = perf_fget_light(group_fd, &fput_needed);
6445 if (IS_ERR(group_leader)) {
6446 err = PTR_ERR(group_leader);
Stephane Eraniand14b12d2010-09-17 11:28:47 +02006447 goto err_fd;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006448 }
6449 group_file = group_leader->filp;
6450 if (flags & PERF_FLAG_FD_OUTPUT)
6451 output_event = group_leader;
6452 if (flags & PERF_FLAG_FD_NO_GROUP)
6453 group_leader = NULL;
6454 }
6455
Stephane Eraniane5d13672011-02-14 11:20:01 +02006456 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006457 task = find_lively_task_by_vpid(pid);
6458 if (IS_ERR(task)) {
6459 err = PTR_ERR(task);
6460 goto err_group_fd;
6461 }
6462 }
6463
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006464 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
Stephane Eraniand14b12d2010-09-17 11:28:47 +02006465 if (IS_ERR(event)) {
6466 err = PTR_ERR(event);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006467 goto err_task;
Stephane Eraniand14b12d2010-09-17 11:28:47 +02006468 }
6469
Stephane Eraniane5d13672011-02-14 11:20:01 +02006470 if (flags & PERF_FLAG_PID_CGROUP) {
6471 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6472 if (err)
6473 goto err_alloc;
Peter Zijlstra08309372011-03-03 11:31:20 +01006474 /*
6475 * one more event:
6476 * - that has cgroup constraint on event->cpu
6477 * - that may need work on context switch
6478 */
6479 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6480 jump_label_inc(&perf_sched_events);
Stephane Eraniane5d13672011-02-14 11:20:01 +02006481 }
6482
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006483 /*
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006484 * Special case software events and allow them to be part of
6485 * any hardware group.
6486 */
6487 pmu = event->pmu;
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006488
6489 if (group_leader &&
6490 (is_software_event(event) != is_software_event(group_leader))) {
6491 if (is_software_event(event)) {
6492 /*
6493			 * If event and group_leader are not both software
6494			 * events, and event is one, then the group leader is not.
6495			 *
6496			 * Allow the addition of software events to !software
6497			 * groups; this is safe because software events never
6498 * fail to schedule.
6499 */
6500 pmu = group_leader->pmu;
6501 } else if (is_software_event(group_leader) &&
6502 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
6503 /*
6504 * In case the group is a pure software group, and we
6505 * try to add a hardware event, move the whole group to
6506 * the hardware context.
6507 */
6508 move_group = 1;
6509 }
6510 }
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006511
6512 /*
6513 * Get the target context (task or percpu):
6514 */
Matt Helsley38a81da2010-09-13 13:01:20 -07006515 ctx = find_get_context(pmu, task, cpu);
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006516 if (IS_ERR(ctx)) {
6517 err = PTR_ERR(ctx);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006518 goto err_alloc;
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006519 }
6520
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006521 /*
6522 * Look up the group leader (we will attach this event to it):
6523 */
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006524 if (group_leader) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006525 err = -EINVAL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006526
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006527 /*
6528 * Do not allow a recursive hierarchy (this new sibling
6529 * becoming part of another group-sibling):
6530 */
6531 if (group_leader->group_leader != group_leader)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006532 goto err_context;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006533 /*
6534		 * Do not allow attaching to a group in a different
6535		 * task or CPU context:
6536 */
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006537 if (move_group) {
6538 if (group_leader->ctx->type != ctx->type)
6539 goto err_context;
6540 } else {
6541 if (group_leader->ctx != ctx)
6542 goto err_context;
6543 }
6544
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006545 /*
6546 * Only a group leader can be exclusive or pinned
6547 */
6548 if (attr.exclusive || attr.pinned)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006549 goto err_context;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006550 }
6551
6552 if (output_event) {
6553 err = perf_event_set_output(event, output_event);
6554 if (err)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006555 goto err_context;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006556 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006557
Al Viroea635c62010-05-26 17:40:29 -04006558 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
6559 if (IS_ERR(event_file)) {
6560 err = PTR_ERR(event_file);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006561 goto err_context;
Al Viroea635c62010-05-26 17:40:29 -04006562 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006563
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006564 if (move_group) {
6565 struct perf_event_context *gctx = group_leader->ctx;
6566
6567 mutex_lock(&gctx->mutex);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006568 perf_remove_from_context(group_leader);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006569 list_for_each_entry(sibling, &group_leader->sibling_list,
6570 group_entry) {
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006571 perf_remove_from_context(sibling);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006572 put_ctx(gctx);
6573 }
6574 mutex_unlock(&gctx->mutex);
6575 put_ctx(gctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006576 }
6577
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006578 event->filp = event_file;
6579 WARN_ON_ONCE(ctx->parent_ctx);
6580 mutex_lock(&ctx->mutex);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006581
6582 if (move_group) {
6583 perf_install_in_context(ctx, group_leader, cpu);
6584 get_ctx(ctx);
6585 list_for_each_entry(sibling, &group_leader->sibling_list,
6586 group_entry) {
6587 perf_install_in_context(ctx, sibling, cpu);
6588 get_ctx(ctx);
6589 }
6590 }
6591
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006592 perf_install_in_context(ctx, event, cpu);
6593 ++ctx->generation;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006594 perf_unpin_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006595 mutex_unlock(&ctx->mutex);
6596
6597 event->owner = current;
Peter Zijlstra88821352010-11-09 19:01:43 +01006598
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006599 mutex_lock(&current->perf_event_mutex);
6600 list_add_tail(&event->owner_entry, &current->perf_event_list);
6601 mutex_unlock(&current->perf_event_mutex);
6602
Peter Zijlstra8a495422010-05-27 15:47:49 +02006603 /*
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02006604 * Precalculate sample_data sizes
6605 */
6606 perf_event__header_size(event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02006607 perf_event__id_header_size(event);
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02006608
6609 /*
Peter Zijlstra8a495422010-05-27 15:47:49 +02006610 * Drop the reference on the group_event after placing the
6611 * new event on the sibling_list. This ensures destruction
6612 * of the group leader will find the pointer to itself in
6613 * perf_group_detach().
6614 */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006615 fput_light(group_file, fput_needed);
Al Viroea635c62010-05-26 17:40:29 -04006616 fd_install(event_fd, event_file);
6617 return event_fd;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006618
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006619err_context:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006620 perf_unpin_context(ctx);
Al Viroea635c62010-05-26 17:40:29 -04006621 put_ctx(ctx);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006622err_alloc:
6623 free_event(event);
Peter Zijlstrae7d0bc02010-10-14 16:54:51 +02006624err_task:
6625 if (task)
6626 put_task_struct(task);
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006627err_group_fd:
6628 fput_light(group_file, fput_needed);
Al Viroea635c62010-05-26 17:40:29 -04006629err_fd:
6630 put_unused_fd(event_fd);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006631 return err;
6632}
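
/*
 * Call sketch from user space (illustrative, not part of this file):
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.disabled	= 1,
 *	};
 *	int fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
 *
 * pid > 0 with cpu == -1 counts the given task on any CPU, pid == -1 with
 * cpu >= 0 counts everything on that CPU (subject to the perf_event_paranoid
 * policy), and group_fd == -1 makes the event its own group leader.
 */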
6633
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006634/**
6635 * perf_event_create_kernel_counter
6636 *
6637 * @attr: attributes of the counter to create
6638 * @cpu: cpu to which the counter is bound
Matt Helsley38a81da2010-09-13 13:01:20 -07006639 * @task: task to profile (NULL for percpu)
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006640 */
6641struct perf_event *
6642perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
Matt Helsley38a81da2010-09-13 13:01:20 -07006643 struct task_struct *task,
Frederic Weisbeckerb326e952009-12-05 09:44:31 +01006644 perf_overflow_handler_t overflow_handler)
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006645{
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006646 struct perf_event_context *ctx;
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006647 struct perf_event *event;
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006648 int err;
6649
6650 /*
6651 * Get the target context (task or percpu):
6652 */
6653
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006654 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01006655 if (IS_ERR(event)) {
6656 err = PTR_ERR(event);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006657 goto err;
6658 }
6659
Matt Helsley38a81da2010-09-13 13:01:20 -07006660 ctx = find_get_context(event->pmu, task, cpu);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006661 if (IS_ERR(ctx)) {
6662 err = PTR_ERR(ctx);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006663 goto err_free;
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01006664 }
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006665
6666 event->filp = NULL;
6667 WARN_ON_ONCE(ctx->parent_ctx);
6668 mutex_lock(&ctx->mutex);
6669 perf_install_in_context(ctx, event, cpu);
6670 ++ctx->generation;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006671 perf_unpin_context(ctx);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006672 mutex_unlock(&ctx->mutex);
6673
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006674 return event;
6675
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006676err_free:
6677 free_event(event);
6678err:
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01006679 return ERR_PTR(err);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006680}
6681EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
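
/*
 * In-kernel usage sketch (hypothetical caller, not from this file):
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_SOFTWARE,
 *		.config		= PERF_COUNT_SW_CPU_CLOCK,
 *		.size		= sizeof(attr),
 *		.sample_period	= NSEC_PER_SEC,
 *	};
 *	struct perf_event *ev;
 *
 *	ev = perf_event_create_kernel_counter(&attr, cpu, NULL,
 *					      my_overflow_handler);
 *	if (IS_ERR(ev))
 *		return PTR_ERR(ev);
 *
 * This is the path used by in-kernel users such as hw_breakpoint and the
 * watchdog; there is no fd, event->filp stays NULL, and the caller is
 * responsible for tearing the event down (typically via
 * perf_event_release_kernel()).
 */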
6682
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006683static void sync_child_event(struct perf_event *child_event,
6684 struct task_struct *child)
6685{
6686 struct perf_event *parent_event = child_event->parent;
6687 u64 child_val;
6688
6689 if (child_event->attr.inherit_stat)
6690 perf_event_read_event(child_event, child);
6691
Peter Zijlstrab5e58792010-05-21 14:43:12 +02006692 child_val = perf_event_count(child_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006693
6694 /*
6695 * Add back the child's count to the parent's count:
6696 */
Peter Zijlstraa6e6dea2010-05-21 14:27:58 +02006697 atomic64_add(child_val, &parent_event->child_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006698 atomic64_add(child_event->total_time_enabled,
6699 &parent_event->child_total_time_enabled);
6700 atomic64_add(child_event->total_time_running,
6701 &parent_event->child_total_time_running);
6702
6703 /*
6704 * Remove this event from the parent's list
6705 */
6706 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6707 mutex_lock(&parent_event->child_mutex);
6708 list_del_init(&child_event->child_list);
6709 mutex_unlock(&parent_event->child_mutex);
6710
6711 /*
6712 * Release the parent event, if this was the last
6713 * reference to it.
6714 */
6715 fput(parent_event->filp);
6716}
6717
6718static void
6719__perf_event_exit_task(struct perf_event *child_event,
6720 struct perf_event_context *child_ctx,
6721 struct task_struct *child)
6722{
6723 struct perf_event *parent_event;
6724
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006725 perf_remove_from_context(child_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006726
6727 parent_event = child_event->parent;
6728 /*
6729 * It can happen that parent exits first, and has events
6730 * that are still around due to the child reference. These
6731 * events need to be zapped - but otherwise linger.
6732 */
6733 if (parent_event) {
6734 sync_child_event(child_event, child);
6735 free_event(child_event);
6736 }
6737}
6738
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006739static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006740{
6741 struct perf_event *child_event, *tmp;
6742 struct perf_event_context *child_ctx;
6743 unsigned long flags;
6744
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006745 if (likely(!child->perf_event_ctxp[ctxn])) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006746 perf_event_task(child, NULL, 0);
6747 return;
6748 }
6749
6750 local_irq_save(flags);
6751 /*
6752 * We can't reschedule here because interrupts are disabled,
6753 * and either child is current or it is a task that can't be
6754 * scheduled, so we are now safe from rescheduling changing
6755 * our context.
6756 */
Oleg Nesterov806839b2011-01-21 18:45:47 +01006757 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02006758 task_ctx_sched_out(child_ctx, EVENT_ALL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006759
6760 /*
6761 * Take the context lock here so that if find_get_context is
6762 * reading child->perf_event_ctxp, we wait until it has
6763 * incremented the context's refcount before we do put_ctx below.
6764 */
Thomas Gleixnere625cce12009-11-17 18:02:06 +01006765 raw_spin_lock(&child_ctx->lock);
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006766 child->perf_event_ctxp[ctxn] = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006767 /*
6768 * If this context is a clone; unclone it so it can't get
6769 * swapped to another process while we're removing all
6770 * the events from it.
6771 */
6772 unclone_ctx(child_ctx);
Peter Zijlstra5e942bb2009-11-23 11:37:26 +01006773 update_context_time(child_ctx);
Thomas Gleixnere625cce12009-11-17 18:02:06 +01006774 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006775
6776 /*
6777 * Report the task dead after unscheduling the events so that we
6778 * won't get any samples after PERF_RECORD_EXIT. We can however still
6779 * get a few PERF_RECORD_READ events.
6780 */
6781 perf_event_task(child, child_ctx, 0);
6782
6783 /*
6784 * We can recurse on the same lock type through:
6785 *
6786 * __perf_event_exit_task()
6787 * sync_child_event()
6788 * fput(parent_event->filp)
6789 * perf_release()
6790 * mutex_lock(&ctx->mutex)
6791 *
6792 * But since it's the parent context, it won't be the same instance.
6793 */
Peter Zijlstraa0507c82010-05-06 15:42:53 +02006794 mutex_lock(&child_ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006795
6796again:
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006797 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
6798 group_entry)
6799 __perf_event_exit_task(child_event, child_ctx, child);
6800
6801 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006802 group_entry)
6803 __perf_event_exit_task(child_event, child_ctx, child);
6804
6805 /*
6806 * If the last event was a group event, it will have appended all
6807 * its siblings to the list, but we obtained 'tmp' before that which
6808 * will still point to the list head terminating the iteration.
6809 */
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006810 if (!list_empty(&child_ctx->pinned_groups) ||
6811 !list_empty(&child_ctx->flexible_groups))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006812 goto again;
6813
6814 mutex_unlock(&child_ctx->mutex);
6815
6816 put_ctx(child_ctx);
6817}
6818
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006819/*
6820 * When a child task exits, feed back event values to parent events.
6821 */
6822void perf_event_exit_task(struct task_struct *child)
6823{
Peter Zijlstra88821352010-11-09 19:01:43 +01006824 struct perf_event *event, *tmp;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006825 int ctxn;
6826
Peter Zijlstra88821352010-11-09 19:01:43 +01006827 mutex_lock(&child->perf_event_mutex);
6828 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
6829 owner_entry) {
6830 list_del_init(&event->owner_entry);
6831
6832 /*
6833 * Ensure the list deletion is visible before we clear
6834 * the owner, closes a race against perf_release() where
6835 * we need to serialize on the owner->perf_event_mutex.
6836 */
6837 smp_wmb();
6838 event->owner = NULL;
6839 }
6840 mutex_unlock(&child->perf_event_mutex);
6841
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006842 for_each_task_context_nr(ctxn)
6843 perf_event_exit_task_context(child, ctxn);
6844}
6845
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006846static void perf_free_event(struct perf_event *event,
6847 struct perf_event_context *ctx)
6848{
6849 struct perf_event *parent = event->parent;
6850
6851 if (WARN_ON_ONCE(!parent))
6852 return;
6853
6854 mutex_lock(&parent->child_mutex);
6855 list_del_init(&event->child_list);
6856 mutex_unlock(&parent->child_mutex);
6857
6858 fput(parent->filp);
6859
Peter Zijlstra8a495422010-05-27 15:47:49 +02006860 perf_group_detach(event);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006861 list_del_event(event, ctx);
6862 free_event(event);
6863}
6864
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006865/*
6866 * free an unexposed, unused context as created by inheritance by
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006867 * perf_event_init_task below, used by fork() in case of failure.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006868 */
6869void perf_event_free_task(struct task_struct *task)
6870{
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006871 struct perf_event_context *ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006872 struct perf_event *event, *tmp;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006873 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006874
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006875 for_each_task_context_nr(ctxn) {
6876 ctx = task->perf_event_ctxp[ctxn];
6877 if (!ctx)
6878 continue;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006879
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006880 mutex_lock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006881again:
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006882 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
6883 group_entry)
6884 perf_free_event(event, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006885
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006886 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
6887 group_entry)
6888 perf_free_event(event, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006889
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006890 if (!list_empty(&ctx->pinned_groups) ||
6891 !list_empty(&ctx->flexible_groups))
6892 goto again;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006893
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006894 mutex_unlock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006895
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02006896 put_ctx(ctx);
6897 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006898}
6899
Peter Zijlstra4e231c72010-09-09 21:01:59 +02006900void perf_event_delayed_put(struct task_struct *task)
6901{
6902 int ctxn;
6903
6904 for_each_task_context_nr(ctxn)
6905 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
6906}
6907
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006908/*
6909 * inherit an event from parent task to child task:
6910 */
6911static struct perf_event *
6912inherit_event(struct perf_event *parent_event,
6913 struct task_struct *parent,
6914 struct perf_event_context *parent_ctx,
6915 struct task_struct *child,
6916 struct perf_event *group_leader,
6917 struct perf_event_context *child_ctx)
6918{
6919 struct perf_event *child_event;
Peter Zijlstracee010e2010-09-10 12:51:54 +02006920 unsigned long flags;
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006921
6922 /*
6923 * Instead of creating recursive hierarchies of events,
6924 * we link inherited events back to the original parent,
6925	 * which is guaranteed to have a filp that we can use as the
6926	 * reference count:
6927 */
6928 if (parent_event->parent)
6929 parent_event = parent_event->parent;
6930
6931 child_event = perf_event_alloc(&parent_event->attr,
6932 parent_event->cpu,
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006933 child,
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006934 group_leader, parent_event,
6935 NULL);
6936 if (IS_ERR(child_event))
6937 return child_event;
6938 get_ctx(child_ctx);
6939
6940 /*
6941 * Make the child state follow the state of the parent event,
6942 * not its attr.disabled bit. We hold the parent's mutex,
6943 * so we won't race with perf_event_{en, dis}able_family.
6944 */
6945 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6946 child_event->state = PERF_EVENT_STATE_INACTIVE;
6947 else
6948 child_event->state = PERF_EVENT_STATE_OFF;
6949
6950 if (parent_event->attr.freq) {
6951 u64 sample_period = parent_event->hw.sample_period;
6952 struct hw_perf_event *hwc = &child_event->hw;
6953
6954 hwc->sample_period = sample_period;
6955 hwc->last_period = sample_period;
6956
6957 local64_set(&hwc->period_left, sample_period);
6958 }
6959
6960 child_event->ctx = child_ctx;
6961 child_event->overflow_handler = parent_event->overflow_handler;
6962
6963 /*
Thomas Gleixner614b6782010-12-03 16:24:32 -02006964 * Precalculate sample_data sizes
6965 */
6966 perf_event__header_size(child_event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02006967 perf_event__id_header_size(child_event);
Thomas Gleixner614b6782010-12-03 16:24:32 -02006968
6969 /*
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006970 * Link it up in the child's context:
6971 */
Peter Zijlstracee010e2010-09-10 12:51:54 +02006972 raw_spin_lock_irqsave(&child_ctx->lock, flags);
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006973 add_event_to_ctx(child_event, child_ctx);
Peter Zijlstracee010e2010-09-10 12:51:54 +02006974 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006975
6976 /*
6977 * Get a reference to the parent filp - we will fput it
6978 * when the child event exits. This is safe to do because
6979 * we are in the parent and we know that the filp still
6980 * exists and has a nonzero count:
6981 */
6982 atomic_long_inc(&parent_event->filp->f_count);
6983
6984 /*
6985 * Link this into the parent event's child list
6986 */
6987 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6988 mutex_lock(&parent_event->child_mutex);
6989 list_add_tail(&child_event->child_list, &parent_event->child_list);
6990 mutex_unlock(&parent_event->child_mutex);
6991
6992 return child_event;
6993}
6994
6995static int inherit_group(struct perf_event *parent_event,
6996 struct task_struct *parent,
6997 struct perf_event_context *parent_ctx,
6998 struct task_struct *child,
6999 struct perf_event_context *child_ctx)
7000{
7001 struct perf_event *leader;
7002 struct perf_event *sub;
7003 struct perf_event *child_ctr;
7004
7005 leader = inherit_event(parent_event, parent, parent_ctx,
7006 child, NULL, child_ctx);
7007 if (IS_ERR(leader))
7008 return PTR_ERR(leader);
7009 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
7010 child_ctr = inherit_event(sub, parent, parent_ctx,
7011 child, leader, child_ctx);
7012 if (IS_ERR(child_ctr))
7013 return PTR_ERR(child_ctr);
7014 }
7015 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007016}
7017
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007018static int
7019inherit_task_group(struct perf_event *event, struct task_struct *parent,
7020 struct perf_event_context *parent_ctx,
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02007021 struct task_struct *child, int ctxn,
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007022 int *inherited_all)
7023{
7024 int ret;
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02007025 struct perf_event_context *child_ctx;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007026
7027 if (!event->attr.inherit) {
7028 *inherited_all = 0;
7029 return 0;
7030 }
7031
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01007032 child_ctx = child->perf_event_ctxp[ctxn];
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007033 if (!child_ctx) {
7034 /*
7035 * This is executed from the parent task context, so
7036 * inherit events that have been marked for cloning.
7037 * First allocate and initialize a context for the
7038 * child.
7039 */
7040
Peter Zijlstraeb184472010-09-07 15:55:13 +02007041 child_ctx = alloc_perf_context(event->pmu, child);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007042 if (!child_ctx)
7043 return -ENOMEM;
7044
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02007045 child->perf_event_ctxp[ctxn] = child_ctx;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007046 }
7047
7048 ret = inherit_group(event, parent, parent_ctx,
7049 child, child_ctx);
7050
7051 if (ret)
7052 *inherited_all = 0;
7053
7054 return ret;
7055}
7056
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007057/*
7058 * Initialize the perf_event context in task_struct
7059 */
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02007060int perf_event_init_context(struct task_struct *child, int ctxn)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007061{
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007062 struct perf_event_context *child_ctx, *parent_ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007063 struct perf_event_context *cloned_ctx;
7064 struct perf_event *event;
7065 struct task_struct *parent = current;
7066 int inherited_all = 1;
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01007067 unsigned long flags;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007068 int ret = 0;
7069
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02007070 if (likely(!parent->perf_event_ctxp[ctxn]))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007071 return 0;
7072
7073 /*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007074 * If the parent's context is a clone, pin it so it won't get
7075 * swapped under us.
7076 */
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02007077 parent_ctx = perf_pin_task_context(parent, ctxn);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007078
7079 /*
7080 * No need to check if parent_ctx != NULL here; since we saw
7081 * it non-NULL earlier, the only reason for it to become NULL
7082 * is if we exit, and since we're currently in the middle of
7083 * a fork we can't be exiting at the same time.
7084 */
7085
7086 /*
7087 * Lock the parent list. No need to lock the child - not PID
7088 * hashed yet and not running, so nobody can access it.
7089 */
7090 mutex_lock(&parent_ctx->mutex);
7091
7092 /*
7093	 * We don't have to disable NMIs - we are only looking at
7094 * the list, not manipulating it:
7095 */
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007096 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02007097 ret = inherit_task_group(event, parent, parent_ctx,
7098 child, ctxn, &inherited_all);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007099 if (ret)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007100 break;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007101 }
7102
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01007103 /*
7104	 * We can't hold ctx->lock when iterating the ->flexible_groups list due
7105 * to allocations, but we need to prevent rotation because
7106 * rotate_ctx() will change the list from interrupt context.
7107 */
7108 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7109 parent_ctx->rotate_disable = 1;
7110 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
7111
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007112 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02007113 ret = inherit_task_group(event, parent, parent_ctx,
7114 child, ctxn, &inherited_all);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007115 if (ret)
7116 break;
7117 }
7118
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01007119 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7120 parent_ctx->rotate_disable = 0;
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01007121
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02007122 child_ctx = child->perf_event_ctxp[ctxn];
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007123
Peter Zijlstra05cbaa22009-12-30 16:00:35 +01007124 if (child_ctx && inherited_all) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007125 /*
7126 * Mark the child context as a clone of the parent
7127 * context, or of whatever the parent is a clone of.
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01007128 *
7129	 * Note that if the parent is a clone, holding
7130	 * parent_ctx->lock prevents it from being uncloned.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007131 */
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01007132 cloned_ctx = parent_ctx->parent_ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007133 if (cloned_ctx) {
7134 child_ctx->parent_ctx = cloned_ctx;
7135 child_ctx->parent_gen = parent_ctx->parent_gen;
7136 } else {
7137 child_ctx->parent_ctx = parent_ctx;
7138 child_ctx->parent_gen = parent_ctx->generation;
7139 }
7140 get_ctx(child_ctx->parent_ctx);
7141 }
7142
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01007143 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007144 mutex_unlock(&parent_ctx->mutex);
7145
7146 perf_unpin_context(parent_ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01007147 put_ctx(parent_ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007148
7149 return ret;
7150}
7151
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02007152/*
7153 * Initialize the perf_event context in task_struct
7154 */
7155int perf_event_init_task(struct task_struct *child)
7156{
7157 int ctxn, ret;
7158
Oleg Nesterov8550d7c2011-01-19 19:22:28 +01007159 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
7160 mutex_init(&child->perf_event_mutex);
7161 INIT_LIST_HEAD(&child->perf_event_list);
7162
Peter Zijlstra8dc85d5472010-09-02 16:50:03 +02007163 for_each_task_context_nr(ctxn) {
7164 ret = perf_event_init_context(child, ctxn);
7165 if (ret)
7166 return ret;
7167 }
7168
7169 return 0;
7170}
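
/*
 * Note (added for clarity, not in the original source): this runs from
 * copy_process() at fork time; each task context type is inherited
 * independently, and on error fork() cleans up through
 * perf_event_free_task() above.
 */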
7171
Paul Mackerras220b1402010-03-10 20:45:52 +11007172static void __init perf_event_init_all_cpus(void)
7173{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007174 struct swevent_htable *swhash;
Paul Mackerras220b1402010-03-10 20:45:52 +11007175 int cpu;
Paul Mackerras220b1402010-03-10 20:45:52 +11007176
7177 for_each_possible_cpu(cpu) {
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007178 swhash = &per_cpu(swevent_htable, cpu);
7179 mutex_init(&swhash->hlist_mutex);
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02007180 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
Paul Mackerras220b1402010-03-10 20:45:52 +11007181 }
7182}
7183
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007184static void __cpuinit perf_event_init_cpu(int cpu)
7185{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007186 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007187
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007188 mutex_lock(&swhash->hlist_mutex);
7189 if (swhash->hlist_refcount > 0) {
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02007190 struct swevent_hlist *hlist;
7191
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007192 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
7193 WARN_ON(!hlist);
7194 rcu_assign_pointer(swhash->swevent_hlist, hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02007195 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007196 mutex_unlock(&swhash->hlist_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007197}
7198
Peter Zijlstrac2774432010-12-08 15:29:02 +01007199#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
static void perf_pmu_rotate_stop(struct pmu *pmu)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);

	WARN_ON(!irqs_disabled());

	list_del_init(&cpuctx->rotation_list);
}

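/*
 * Runs on the CPU being torn down (via smp_call_function_single()):
 * stop context rotation and remove every pinned and flexible group
 * from the context.
 */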
static void __perf_event_exit_context(void *__info)
{
	struct perf_event_context *ctx = __info;
	struct perf_event *event, *tmp;

	perf_pmu_rotate_stop(ctx->pmu);

	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
		__perf_remove_from_context(event);
	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
		__perf_remove_from_context(event);
}

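/*
 * Empty the per-CPU context of every registered PMU for @cpu. The pmus
 * list is walked under SRCU and each context's mutex is held around the
 * cross-call that clears it out.
 */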
static void perf_event_exit_cpu_context(int cpu)
{
	struct perf_event_context *ctx;
	struct pmu *pmu;
	int idx;

	idx = srcu_read_lock(&pmus_srcu);
	list_for_each_entry_rcu(pmu, &pmus, entry) {
		ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;

		mutex_lock(&ctx->mutex);
		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
		mutex_unlock(&ctx->mutex);
	}
	srcu_read_unlock(&pmus_srcu, idx);
}

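/*
 * A CPU is going away (hot-unplug, reboot or kexec): release its
 * swevent hash list and tear down all of its per-CPU PMU contexts.
 */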
static void perf_event_exit_cpu(int cpu)
{
	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

	mutex_lock(&swhash->hlist_mutex);
	swevent_hlist_release(swhash);
	mutex_unlock(&swhash->hlist_mutex);

	perf_event_exit_cpu_context(cpu);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
#endif

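/*
 * Reboot/kexec notifier: take perf down on every online CPU before the
 * transition proceeds.
 */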
static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
	int cpu;

	for_each_online_cpu(cpu)
		perf_event_exit_cpu(cpu);

	return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 */
static struct notifier_block perf_reboot_notifier = {
	.notifier_call = perf_reboot,
	.priority = INT_MIN,
};

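/*
 * CPU hotplug callback: set up per-CPU state when a CPU comes up (or a
 * failed offline is rolled back) and tear it down when it goes away.
 * Frozen (suspend/resume) transitions are treated the same way.
 */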
static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {

	case CPU_UP_PREPARE:
	case CPU_DOWN_FAILED:
		perf_event_init_cpu(cpu);
		break;

	case CPU_UP_CANCELED:
	case CPU_DOWN_PREPARE:
		perf_event_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

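/*
 * Core boot-time initialization: set up per-CPU state, register the
 * built-in software, cpu-clock, task-clock and tracepoint PMUs, hook up
 * the CPU-hotplug and reboot notifiers and initialize hw_breakpoint
 * support.
 */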
void __init perf_event_init(void)
{
	int ret;

	idr_init(&pmu_idr);

	perf_event_init_all_cpus();
	init_srcu_struct(&pmus_srcu);
	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
	perf_pmu_register(&perf_cpu_clock, NULL, -1);
	perf_pmu_register(&perf_task_clock, NULL, -1);
	perf_tp_register();
	perf_cpu_notifier(perf_cpu_notify);
	register_reboot_notifier(&perf_reboot_notifier);

	ret = init_hw_breakpoint();
	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
}

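/*
 * Register the PMU bus with sysfs and create a device for every PMU
 * that registered before the bus existed; once pmu_bus_running is set,
 * later PMU registrations create their devices directly.
 */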
static int __init perf_event_sysfs_init(void)
{
	struct pmu *pmu;
	int ret;

	mutex_lock(&pmus_lock);

	ret = bus_register(&pmu_bus);
	if (ret)
		goto unlock;

	list_for_each_entry(pmu, &pmus, entry) {
		if (!pmu->name || pmu->type < 0)
			continue;

		ret = pmu_dev_alloc(pmu);
		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
	}
	pmu_bus_running = 1;
	ret = 0;

unlock:
	mutex_unlock(&pmus_lock);

	return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
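/*
 * cgroup 'create' callback: allocate a perf_cgroup along with its
 * per-CPU accounting info.
 */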
static struct cgroup_subsys_state *perf_cgroup_create(
	struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct perf_cgroup *jc;

	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
	if (!jc)
		return ERR_PTR(-ENOMEM);

	jc->info = alloc_percpu(struct perf_cgroup_info);
	if (!jc->info) {
		kfree(jc);
		return ERR_PTR(-ENOMEM);
	}

	return &jc->css;
}

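/* cgroup 'destroy' callback: free the per-CPU info and the perf_cgroup. */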
static void perf_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	struct perf_cgroup *jc;
	jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
			  struct perf_cgroup, css);
	free_percpu(jc->info);
	kfree(jc);
}

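/*
 * Switch @task's events out of the old cgroup context and into the new
 * one; the switch runs on the CPU the task is currently running on via
 * task_function_call().
 */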
static int __perf_cgroup_move(void *info)
{
	struct task_struct *task = info;
	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
	return 0;
}

static void perf_cgroup_move(struct task_struct *task)
{
	task_function_call(task, __perf_cgroup_move, task);
}

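/*
 * cgroup 'attach' callback: move the task (and, for a thread-group
 * attach, every thread in the group) onto the new cgroup.
 */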
static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
		struct cgroup *old_cgrp, struct task_struct *task,
		bool threadgroup)
{
	perf_cgroup_move(task);
	if (threadgroup) {
		struct task_struct *c;
		rcu_read_lock();
		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
			perf_cgroup_move(c);
		}
		rcu_read_unlock();
	}
}

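/*
 * cgroup 'exit' callback: re-evaluate the task's cgroup events now that
 * it is leaving the cgroup.
 */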
static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
		struct cgroup *old_cgrp, struct task_struct *task)
{
	/*
	 * cgroup_exit() is also called in the copy_process() failure path.
	 * Ignore that case, since the task hasn't run yet; this avoids
	 * trying to poke a half-freed task state from generic code.
	 */
	if (!(task->flags & PF_EXITING))
		return;

	perf_cgroup_move(task);
}

struct cgroup_subsys perf_subsys = {
	.name		= "perf_event",
	.subsys_id	= perf_subsys_id,
	.create		= perf_cgroup_create,
	.destroy	= perf_cgroup_destroy,
	.exit		= perf_cgroup_exit,
	.attach		= perf_cgroup_attach,
};
#endif /* CONFIG_CGROUP_PERF */