| /** |
| * @file cpu_buffer.c |
| * |
| * @remark Copyright 2002 OProfile authors |
| * @remark Read the file COPYING |
| * |
| * @author John Levon <levon@movementarian.org> |
| * @author Barry Kasindorf <barry.kasindorf@amd.com> |
| * |
| * Each CPU has a local buffer that stores PC value/event |
| * pairs. We also log context switches when we notice them. |
| * Eventually each CPU's buffer is processed into the global |
| * event buffer by sync_buffer(). |
| * |
| * We use a local buffer for two reasons: an NMI or similar |
| * interrupt cannot synchronise, and high sampling rates |
| * would lead to catastrophic global synchronisation if |
| * a global buffer was used. |
| */ |
| |
| #include <linux/sched.h> |
| #include <linux/oprofile.h> |
| #include <linux/vmalloc.h> |
| #include <linux/errno.h> |
| |
| #include "event_buffer.h" |
| #include "cpu_buffer.h" |
| #include "buffer_sync.h" |
| #include "oprof.h" |
| |
| #define OP_BUFFER_FLAGS 0 |
| |
/*
 * Read and write access uses spin locking. Thus, writing to the
 * buffer by the NMI handler (x86) could also occur during critical
 * sections when reading the buffer. To avoid this, there are 2
 * buffers for independent read and write access. Read access is in
 * process context only, write access only in the NMI handler. If the
 * read buffer runs empty, both buffers are swapped atomically. There
 * is potentially a small window during swapping where the buffers are
 * disabled and samples could be lost.
 *
 * Using 2 buffers adds a little overhead, but the solution is clear
 * and does not require changes in the ring buffer implementation. It
 * can be changed to a single buffer solution when the ring buffer
 * access is implemented as non-locking atomic code.
 */
| static struct ring_buffer *op_ring_buffer_read; |
| static struct ring_buffer *op_ring_buffer_write; |
| DEFINE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer); |
| |
| static void wq_sync_buffer(struct work_struct *work); |
| |
| #define DEFAULT_TIMER_EXPIRE (HZ / 10) |
| static int work_enabled; |
| |
| unsigned long oprofile_get_cpu_buffer_size(void) |
| { |
| return oprofile_cpu_buffer_size; |
| } |
| |
| void oprofile_cpu_buffer_inc_smpl_lost(void) |
| { |
| struct oprofile_cpu_buffer *cpu_buf |
| = &__get_cpu_var(cpu_buffer); |
| |
| cpu_buf->sample_lost_overflow++; |
| } |
| |
| void free_cpu_buffers(void) |
| { |
| if (op_ring_buffer_read) |
| ring_buffer_free(op_ring_buffer_read); |
| op_ring_buffer_read = NULL; |
| if (op_ring_buffer_write) |
| ring_buffer_free(op_ring_buffer_write); |
| op_ring_buffer_write = NULL; |
| } |
| |
| int alloc_cpu_buffers(void) |
| { |
| int i; |
| |
| unsigned long buffer_size = oprofile_cpu_buffer_size; |
| |
| op_ring_buffer_read = ring_buffer_alloc(buffer_size, OP_BUFFER_FLAGS); |
| if (!op_ring_buffer_read) |
| goto fail; |
| op_ring_buffer_write = ring_buffer_alloc(buffer_size, OP_BUFFER_FLAGS); |
| if (!op_ring_buffer_write) |
| goto fail; |
| |
| for_each_possible_cpu(i) { |
| struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i); |
| |
| b->last_task = NULL; |
| b->last_is_kernel = -1; |
| b->tracing = 0; |
| b->buffer_size = buffer_size; |
| b->tail_pos = 0; |
| b->head_pos = 0; |
| b->sample_received = 0; |
| b->sample_lost_overflow = 0; |
| b->backtrace_aborted = 0; |
| b->sample_invalid_eip = 0; |
| b->cpu = i; |
| INIT_DELAYED_WORK(&b->work, wq_sync_buffer); |
| } |
| return 0; |
| |
| fail: |
| free_cpu_buffers(); |
| return -ENOMEM; |
| } |
| |
| void start_cpu_work(void) |
| { |
| int i; |
| |
| work_enabled = 1; |
| |
| for_each_online_cpu(i) { |
| struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i); |
| |
| /* |
| * Spread the work by 1 jiffy per cpu so they dont all |
| * fire at once. |
| */ |
| schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i); |
| } |
| } |
| |
| void end_cpu_work(void) |
| { |
| int i; |
| |
| work_enabled = 0; |
| |
| for_each_online_cpu(i) { |
| struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i); |
| |
| cancel_delayed_work(&b->work); |
| } |
| |
| flush_scheduled_work(); |
| } |
| |
| int op_cpu_buffer_write_entry(struct op_entry *entry) |
| { |
| entry->event = ring_buffer_lock_reserve(op_ring_buffer_write, |
| sizeof(struct op_sample), |
| &entry->irq_flags); |
| if (entry->event) |
| entry->sample = ring_buffer_event_data(entry->event); |
| else |
| entry->sample = NULL; |
| |
| if (!entry->sample) |
| return -ENOMEM; |
| |
| return 0; |
| } |
| |
| int op_cpu_buffer_write_commit(struct op_entry *entry) |
| { |
| return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event, |
| entry->irq_flags); |
| } |
| |
| struct op_sample *op_cpu_buffer_read_entry(int cpu) |
| { |
| struct ring_buffer_event *e; |
| e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); |
| if (e) |
| return ring_buffer_event_data(e); |
| if (ring_buffer_swap_cpu(op_ring_buffer_read, |
| op_ring_buffer_write, |
| cpu)) |
| return NULL; |
| e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); |
| if (e) |
| return ring_buffer_event_data(e); |
| return NULL; |
| } |
| |
| unsigned long op_cpu_buffer_entries(int cpu) |
| { |
| return ring_buffer_entries_cpu(op_ring_buffer_read, cpu) |
| + ring_buffer_entries_cpu(op_ring_buffer_write, cpu); |
| } |
| |
| static inline int |
| add_sample(struct oprofile_cpu_buffer *cpu_buf, |
| unsigned long pc, unsigned long event) |
| { |
| struct op_entry entry; |
| int ret; |
| |
| ret = op_cpu_buffer_write_entry(&entry); |
| if (ret) |
| return ret; |
| |
| entry.sample->eip = pc; |
| entry.sample->event = event; |
| |
| ret = op_cpu_buffer_write_commit(&entry); |
| if (ret) |
| return ret; |
| |
| return 0; |
| } |
| |
| static inline int |
| add_code(struct oprofile_cpu_buffer *buffer, unsigned long value) |
| { |
| return add_sample(buffer, ESCAPE_CODE, value); |
| } |
| |
| /* This must be safe from any context. It's safe writing here |
| * because of the head/tail separation of the writer and reader |
| * of the CPU buffer. |
| * |
| * is_kernel is needed because on some architectures you cannot |
| * tell if you are in kernel or user space simply by looking at |
| * pc. We tag this in the buffer by generating kernel enter/exit |
| * events whenever is_kernel changes |
| */ |
| static int log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, |
| int is_kernel, unsigned long event) |
| { |
| struct task_struct *task; |
| |
| cpu_buf->sample_received++; |
| |
| if (pc == ESCAPE_CODE) { |
| cpu_buf->sample_invalid_eip++; |
| return 0; |
| } |
| |
| is_kernel = !!is_kernel; |
| |
| task = current; |
| |
| /* notice a switch from user->kernel or vice versa */ |
| if (cpu_buf->last_is_kernel != is_kernel) { |
| cpu_buf->last_is_kernel = is_kernel; |
| if (add_code(cpu_buf, is_kernel)) |
| goto fail; |
| } |
| |
| /* notice a task switch */ |
| if (cpu_buf->last_task != task) { |
| cpu_buf->last_task = task; |
| if (add_code(cpu_buf, (unsigned long)task)) |
| goto fail; |
| } |
| |
| if (add_sample(cpu_buf, pc, event)) |
| goto fail; |
| |
| return 1; |
| |
| fail: |
| cpu_buf->sample_lost_overflow++; |
| return 0; |
| } |
| |
| static int oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf) |
| { |
| add_code(cpu_buf, CPU_TRACE_BEGIN); |
| cpu_buf->tracing = 1; |
| return 1; |
| } |
| |
| static void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf) |
| { |
| cpu_buf->tracing = 0; |
| } |
| |
| void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs, |
| unsigned long event, int is_kernel) |
| { |
| struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); |
| |
| if (!oprofile_backtrace_depth) { |
| log_sample(cpu_buf, pc, is_kernel, event); |
| return; |
| } |
| |
| if (!oprofile_begin_trace(cpu_buf)) |
| return; |
| |
| /* |
| * if log_sample() fail we can't backtrace since we lost the |
| * source of this event |
| */ |
| if (log_sample(cpu_buf, pc, is_kernel, event)) |
| oprofile_ops.backtrace(regs, oprofile_backtrace_depth); |
| oprofile_end_trace(cpu_buf); |
| } |
| |
/*
 * Log a sample described by @regs: kernel/user mode comes from
 * user_mode(regs), the program counter from profile_pc(regs). The
 * actual logging is done by oprofile_add_ext_sample().
 */
void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
{
	const int in_kernel = !user_mode(regs);
	const unsigned long sample_pc = profile_pc(regs);

	oprofile_add_ext_sample(sample_pc, regs, event, in_kernel);
}
| |
| #ifdef CONFIG_OPROFILE_IBS |
| |
| #define MAX_IBS_SAMPLE_SIZE 14 |
| |
| void oprofile_add_ibs_sample(struct pt_regs * const regs, |
| unsigned int * const ibs_sample, int ibs_code) |
| { |
| int is_kernel = !user_mode(regs); |
| struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); |
| struct task_struct *task; |
| int fail = 0; |
| |
| cpu_buf->sample_received++; |
| |
| /* notice a switch from user->kernel or vice versa */ |
| if (cpu_buf->last_is_kernel != is_kernel) { |
| if (add_code(cpu_buf, is_kernel)) |
| goto fail; |
| cpu_buf->last_is_kernel = is_kernel; |
| } |
| |
| /* notice a task switch */ |
| if (!is_kernel) { |
| task = current; |
| if (cpu_buf->last_task != task) { |
| if (add_code(cpu_buf, (unsigned long)task)) |
| goto fail; |
| cpu_buf->last_task = task; |
| } |
| } |
| |
| fail = fail || add_code(cpu_buf, ibs_code); |
| fail = fail || add_sample(cpu_buf, ibs_sample[0], ibs_sample[1]); |
| fail = fail || add_sample(cpu_buf, ibs_sample[2], ibs_sample[3]); |
| fail = fail || add_sample(cpu_buf, ibs_sample[4], ibs_sample[5]); |
| |
| if (ibs_code == IBS_OP_BEGIN) { |
| fail = fail || add_sample(cpu_buf, ibs_sample[6], ibs_sample[7]); |
| fail = fail || add_sample(cpu_buf, ibs_sample[8], ibs_sample[9]); |
| fail = fail || add_sample(cpu_buf, ibs_sample[10], ibs_sample[11]); |
| } |
| |
| if (fail) |
| goto fail; |
| |
| if (oprofile_backtrace_depth) |
| oprofile_ops.backtrace(regs, oprofile_backtrace_depth); |
| |
| return; |
| |
| fail: |
| cpu_buf->sample_lost_overflow++; |
| return; |
| } |
| |
| #endif |
| |
| void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event) |
| { |
| struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); |
| log_sample(cpu_buf, pc, is_kernel, event); |
| } |
| |
| void oprofile_add_trace(unsigned long pc) |
| { |
| struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); |
| |
| if (!cpu_buf->tracing) |
| return; |
| |
| /* |
| * broken frame can give an eip with the same value as an |
| * escape code, abort the trace if we get it |
| */ |
| if (pc == ESCAPE_CODE) |
| goto fail; |
| |
| if (add_sample(cpu_buf, pc, 0)) |
| goto fail; |
| |
| return; |
| fail: |
| cpu_buf->tracing = 0; |
| cpu_buf->backtrace_aborted++; |
| return; |
| } |
| |
| /* |
| * This serves to avoid cpu buffer overflow, and makes sure |
| * the task mortuary progresses |
| * |
| * By using schedule_delayed_work_on and then schedule_delayed_work |
| * we guarantee this will stay on the correct cpu |
| */ |
| static void wq_sync_buffer(struct work_struct *work) |
| { |
| struct oprofile_cpu_buffer *b = |
| container_of(work, struct oprofile_cpu_buffer, work.work); |
| if (b->cpu != smp_processor_id()) { |
| printk(KERN_DEBUG "WQ on CPU%d, prefer CPU%d\n", |
| smp_processor_id(), b->cpu); |
| |
| if (!cpu_online(b->cpu)) { |
| cancel_delayed_work(&b->work); |
| return; |
| } |
| } |
| sync_buffer(b->cpu); |
| |
| /* don't re-add the work if we're shutting down */ |
| if (work_enabled) |
| schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE); |
| } |