Merge branch 'linus' into perf/core, to pick up fixes before merging new changes

Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 33787ee..929655d 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1622,6 +1622,29 @@
 }
 EXPORT_SYMBOL_GPL(events_sysfs_show);
 
+ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
+			  char *page)
+{
+	struct perf_pmu_events_ht_attr *pmu_attr =
+		container_of(attr, struct perf_pmu_events_ht_attr, attr);
+
+	/*
+	 * Report conditional events depending on Hyper-Threading.
+	 *
+	 * This is overly conservative as usually the HT special
+	 * handling is not needed if the other CPU thread is idle.
+	 *
+	 * Note this does not (and cannot) handle the case where thread
+	 * siblings are invisible, for example with virtualization
+	 * if they are owned by some other guest. The user tool
+	 * has to re-read the value when a thread sibling comes online later.
+	 */
+	return sprintf(page, "%s",
+			topology_max_smt_threads() > 1 ?
+			pmu_attr->event_str_ht :
+			pmu_attr->event_str_noht);
+}
+
 EVENT_ATTR(cpu-cycles,			CPU_CYCLES		);
 EVENT_ATTR(instructions,		INSTRUCTIONS		);
 EVENT_ATTR(cache-references,		CACHE_REFERENCES	);
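The sysfs handler above selects between two event strings at read time, so the exported encoding tracks the machine's current SMT state. A minimal user-space sketch of consuming it (the sysfs path is the standard perf events directory; error handling trimmed):

#include <stdio.h>

/*
 * Illustrative reader for the HT-conditional string produced by
 * events_ht_sysfs_show(). Tools should re-read it after CPU hotplug,
 * since the value changes when sibling threads go on/offline.
 */
static int read_topdown_event(char *buf, int len)
{
	FILE *f = fopen("/sys/bus/event_source/devices/cpu/events/topdown-total-slots", "r");

	if (!f)
		return -1;
	if (!fgets(buf, len, f)) {
		fclose(f);
		return -1;
	}
	fclose(f);
	return 0;
}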
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 7c66695..3ed528c 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -16,6 +16,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/hardirq.h>
+#include <asm/intel-family.h>
 #include <asm/apic.h>
 
 #include "../perf_event.h"
@@ -177,7 +178,7 @@
 	EVENT_CONSTRAINT_END
 };
 
-struct event_constraint intel_skl_event_constraints[] = {
+static struct event_constraint intel_skl_event_constraints[] = {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
 	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
 	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
@@ -186,10 +187,8 @@
 };
 
 static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
-	INTEL_UEVENT_EXTRA_REG(0x01b7,
-			       MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
-	INTEL_UEVENT_EXTRA_REG(0x02b7,
-			       MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
+	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x799ffbb6e7ull, RSP_0),
+	INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x399ffbffe7ull, RSP_1),
 	EVENT_EXTRA_END
 };
 
@@ -225,14 +224,51 @@
 EVENT_ATTR_STR(mem-loads,	mem_ld_snb,	"event=0xcd,umask=0x1,ldlat=3");
 EVENT_ATTR_STR(mem-stores,	mem_st_snb,	"event=0xcd,umask=0x2");
 
-struct attribute *nhm_events_attrs[] = {
+static struct attribute *nhm_events_attrs[] = {
 	EVENT_PTR(mem_ld_nhm),
 	NULL,
 };
 
-struct attribute *snb_events_attrs[] = {
+/*
+ * topdown events for Intel Core CPUs.
+ *
+ * The events are all counted in slots, where a slot is an issue
+ * opportunity in a 4-wide pipeline. Some events are already reported
+ * in slots; for cycle events we multiply by the pipeline width (4).
+ *
+ * With Hyper-Threading on, topdown metrics are either summed or averaged
+ * between the threads of a core: (count_t0 + count_t1).
+ *
+ * For the average case the metric is always scaled to pipeline width,
+ * so we use factor 2: (count_t0 + count_t1) / 2 * 4.
+ */
+
+EVENT_ATTR_STR_HT(topdown-total-slots, td_total_slots,
+	"event=0x3c,umask=0x0",			/* cpu_clk_unhalted.thread */
+	"event=0x3c,umask=0x0,any=1");		/* cpu_clk_unhalted.thread_any */
+EVENT_ATTR_STR_HT(topdown-total-slots.scale, td_total_slots_scale, "4", "2");
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued,
+	"event=0xe,umask=0x1");			/* uops_issued.any */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired,
+	"event=0xc2,umask=0x2");		/* uops_retired.retire_slots */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles,
+	"event=0x9c,umask=0x1");		/* idq_uops_not_delivered_core */
+EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles,
+	"event=0xd,umask=0x3,cmask=1",		/* int_misc.recovery_cycles */
+	"event=0xd,umask=0x3,cmask=1,any=1");	/* int_misc.recovery_cycles_any */
+EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale,
+	"4", "2");
+
+static struct attribute *snb_events_attrs[] = {
 	EVENT_PTR(mem_ld_snb),
 	EVENT_PTR(mem_st_snb),
+	EVENT_PTR(td_slots_issued),
+	EVENT_PTR(td_slots_retired),
+	EVENT_PTR(td_fetch_bubbles),
+	EVENT_PTR(td_total_slots),
+	EVENT_PTR(td_total_slots_scale),
+	EVENT_PTR(td_recovery_bubbles),
+	EVENT_PTR(td_recovery_bubbles_scale),
 	NULL,
 };
 
@@ -258,7 +294,7 @@
 	EVENT_CONSTRAINT_END
 };
 
-struct event_constraint intel_bdw_event_constraints[] = {
+static struct event_constraint intel_bdw_event_constraints[] = {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
 	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
 	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
@@ -1332,6 +1368,29 @@
  },
 };
 
+EVENT_ATTR_STR(topdown-total-slots, td_total_slots_slm, "event=0x3c");
+EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_slm, "2");
+/* no_alloc_cycles.not_delivered */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_slm,
+	       "event=0xca,umask=0x50");
+EVENT_ATTR_STR(topdown-fetch-bubbles.scale, td_fetch_bubbles_scale_slm, "2");
+/* uops_retired.all */
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_slm,
+	       "event=0xc2,umask=0x10");
+/* uops_retired.all */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_slm,
+	       "event=0xc2,umask=0x10");
+
+static struct attribute *slm_events_attrs[] = {
+	EVENT_PTR(td_total_slots_slm),
+	EVENT_PTR(td_total_slots_scale_slm),
+	EVENT_PTR(td_fetch_bubbles_slm),
+	EVENT_PTR(td_fetch_bubbles_scale_slm),
+	EVENT_PTR(td_slots_issued_slm),
+	EVENT_PTR(td_slots_retired_slm),
+	NULL
+};
+
 static struct extra_reg intel_slm_extra_regs[] __read_mostly =
 {
 	/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
@@ -3261,11 +3320,11 @@
 	u32 rev = UINT_MAX; /* default to broken for unknown models */
 
 	switch (cpu_data(cpu).x86_model) {
-	case 42: /* SNB */
+	case INTEL_FAM6_SANDYBRIDGE:
 		rev = 0x28;
 		break;
 
-	case 45: /* SNB-EP */
+	case INTEL_FAM6_SANDYBRIDGE_X:
 		switch (cpu_data(cpu).x86_mask) {
 		case 6: rev = 0x618; break;
 		case 7: rev = 0x70c; break;
@@ -3437,6 +3496,13 @@
 	EVENT_PTR(cycles_ct),
 	EVENT_PTR(mem_ld_hsw),
 	EVENT_PTR(mem_st_hsw),
+	EVENT_PTR(td_slots_issued),
+	EVENT_PTR(td_slots_retired),
+	EVENT_PTR(td_fetch_bubbles),
+	EVENT_PTR(td_total_slots),
+	EVENT_PTR(td_total_slots_scale),
+	EVENT_PTR(td_recovery_bubbles),
+	EVENT_PTR(td_recovery_bubbles_scale),
 	NULL
 };
 
@@ -3508,15 +3574,15 @@
 	 * Install the hw-cache-events table:
 	 */
 	switch (boot_cpu_data.x86_model) {
-	case 14: /* 65nm Core "Yonah" */
+	case INTEL_FAM6_CORE_YONAH:
 		pr_cont("Core events, ");
 		break;
 
-	case 15: /* 65nm Core2 "Merom"          */
+	case INTEL_FAM6_CORE2_MEROM:
 		x86_add_quirk(intel_clovertown_quirk);
-	case 22: /* 65nm Core2 "Merom-L"        */
-	case 23: /* 45nm Core2 "Penryn"         */
-	case 29: /* 45nm Core2 "Dunnington (MP) */
+	case INTEL_FAM6_CORE2_MEROM_L:
+	case INTEL_FAM6_CORE2_PENRYN:
+	case INTEL_FAM6_CORE2_DUNNINGTON:
 		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
@@ -3527,9 +3593,9 @@
 		pr_cont("Core2 events, ");
 		break;
 
-	case 30: /* 45nm Nehalem    */
-	case 26: /* 45nm Nehalem-EP */
-	case 46: /* 45nm Nehalem-EX */
+	case INTEL_FAM6_NEHALEM:
+	case INTEL_FAM6_NEHALEM_EP:
+	case INTEL_FAM6_NEHALEM_EX:
 		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -3557,11 +3623,11 @@
 		pr_cont("Nehalem events, ");
 		break;
 
-	case 28: /* 45nm Atom "Pineview"   */
-	case 38: /* 45nm Atom "Lincroft"   */
-	case 39: /* 32nm Atom "Penwell"    */
-	case 53: /* 32nm Atom "Cloverview" */
-	case 54: /* 32nm Atom "Cedarview"  */
+	case INTEL_FAM6_ATOM_PINEVIEW:
+	case INTEL_FAM6_ATOM_LINCROFT:
+	case INTEL_FAM6_ATOM_PENWELL:
+	case INTEL_FAM6_ATOM_CLOVERVIEW:
+	case INTEL_FAM6_ATOM_CEDARVIEW:
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
@@ -3573,9 +3639,9 @@
 		pr_cont("Atom events, ");
 		break;
 
-	case 55: /* 22nm Atom "Silvermont"                */
-	case 76: /* 14nm Atom "Airmont"                   */
-	case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+	case INTEL_FAM6_ATOM_SILVERMONT1:
+	case INTEL_FAM6_ATOM_SILVERMONT2:
+	case INTEL_FAM6_ATOM_AIRMONT:
 		memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
 			sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
@@ -3587,11 +3653,12 @@
 		x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_slm_extra_regs;
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.cpu_events = slm_events_attrs;
 		pr_cont("Silvermont events, ");
 		break;
 
-	case 92: /* 14nm Atom "Goldmont" */
-	case 95: /* 14nm Atom "Goldmont Denverton" */
+	case INTEL_FAM6_ATOM_GOLDMONT:
+	case INTEL_FAM6_ATOM_DENVERTON:
 		memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
@@ -3614,9 +3681,9 @@
 		pr_cont("Goldmont events, ");
 		break;
 
-	case 37: /* 32nm Westmere    */
-	case 44: /* 32nm Westmere-EP */
-	case 47: /* 32nm Westmere-EX */
+	case INTEL_FAM6_WESTMERE:
+	case INTEL_FAM6_WESTMERE_EP:
+	case INTEL_FAM6_WESTMERE_EX:
 		memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -3643,8 +3710,8 @@
 		pr_cont("Westmere events, ");
 		break;
 
-	case 42: /* 32nm SandyBridge         */
-	case 45: /* 32nm SandyBridge-E/EN/EP */
+	case INTEL_FAM6_SANDYBRIDGE:
+	case INTEL_FAM6_SANDYBRIDGE_X:
 		x86_add_quirk(intel_sandybridge_quirk);
 		x86_add_quirk(intel_ht_bug);
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
@@ -3657,7 +3724,7 @@
 		x86_pmu.event_constraints = intel_snb_event_constraints;
 		x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
 		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
-		if (boot_cpu_data.x86_model == 45)
+		if (boot_cpu_data.x86_model == INTEL_FAM6_SANDYBRIDGE_X)
 			x86_pmu.extra_regs = intel_snbep_extra_regs;
 		else
 			x86_pmu.extra_regs = intel_snb_extra_regs;
@@ -3679,8 +3746,8 @@
 		pr_cont("SandyBridge events, ");
 		break;
 
-	case 58: /* 22nm IvyBridge       */
-	case 62: /* 22nm IvyBridge-EP/EX */
+	case INTEL_FAM6_IVYBRIDGE:
+	case INTEL_FAM6_IVYBRIDGE_X:
 		x86_add_quirk(intel_ht_bug);
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
@@ -3696,7 +3763,7 @@
 		x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
 		x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
 		x86_pmu.pebs_prec_dist = true;
-		if (boot_cpu_data.x86_model == 62)
+		if (boot_cpu_data.x86_model == INTEL_FAM6_IVYBRIDGE_X)
 			x86_pmu.extra_regs = intel_snbep_extra_regs;
 		else
 			x86_pmu.extra_regs = intel_snb_extra_regs;
@@ -3714,10 +3781,10 @@
 		break;
 
 
-	case 60: /* 22nm Haswell Core */
-	case 63: /* 22nm Haswell Server */
-	case 69: /* 22nm Haswell ULT */
-	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+	case INTEL_FAM6_HASWELL_CORE:
+	case INTEL_FAM6_HASWELL_X:
+	case INTEL_FAM6_HASWELL_ULT:
+	case INTEL_FAM6_HASWELL_GT3E:
 		x86_add_quirk(intel_ht_bug);
 		x86_pmu.late_ack = true;
 		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
@@ -3741,10 +3808,10 @@
 		pr_cont("Haswell events, ");
 		break;
 
-	case 61: /* 14nm Broadwell Core-M */
-	case 86: /* 14nm Broadwell Xeon D */
-	case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-	case 79: /* 14nm Broadwell Server */
+	case INTEL_FAM6_BROADWELL_CORE:
+	case INTEL_FAM6_BROADWELL_XEON_D:
+	case INTEL_FAM6_BROADWELL_GT3E:
+	case INTEL_FAM6_BROADWELL_X:
 		x86_pmu.late_ack = true;
 		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -3777,7 +3844,7 @@
 		pr_cont("Broadwell events, ");
 		break;
 
-	case 87: /* Knights Landing Xeon Phi */
+	case INTEL_FAM6_XEON_PHI_KNL:
 		memcpy(hw_cache_event_ids,
 		       slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs,
@@ -3795,16 +3862,22 @@
 		pr_cont("Knights Landing events, ");
 		break;
 
-	case 142: /* 14nm Kabylake Mobile */
-	case 158: /* 14nm Kabylake Desktop */
-	case 78: /* 14nm Skylake Mobile */
-	case 94: /* 14nm Skylake Desktop */
-	case 85: /* 14nm Skylake Server */
+	case INTEL_FAM6_SKYLAKE_MOBILE:
+	case INTEL_FAM6_SKYLAKE_DESKTOP:
+	case INTEL_FAM6_SKYLAKE_X:
+	case INTEL_FAM6_KABYLAKE_MOBILE:
+	case INTEL_FAM6_KABYLAKE_DESKTOP:
 		x86_pmu.late_ack = true;
 		memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
 		intel_pmu_lbr_init_skl();
 
+		/* INT_MISC.RECOVERY_CYCLES has umask 1 in Skylake */
+		event_attr_td_recovery_bubbles.event_str_noht =
+			"event=0xd,umask=0x1,cmask=1";
+		event_attr_td_recovery_bubbles.event_str_ht =
+			"event=0xd,umask=0x1,cmask=1,any=1";
+
 		x86_pmu.event_constraints = intel_skl_event_constraints;
 		x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_skl_extra_regs;
@@ -3917,16 +3990,14 @@
  */
 static __init int fixup_ht_bug(void)
 {
-	int cpu = smp_processor_id();
-	int w, c;
+	int c;
 	/*
 	 * problem not present on this CPU model, nothing to do
 	 */
 	if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
 		return 0;
 
-	w = cpumask_weight(topology_sibling_cpumask(cpu));
-	if (w > 1) {
+	if (topology_max_smt_threads() > 1) {
 		pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
 		return 0;
 	}
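For reference, the counters wired up above feed the standard top-down level-1 breakdown. A hedged sketch of the arithmetic (the formulas are the published level-1 ones; the struct and helper names are illustrative, and the .scale attributes defined above are assumed to have been applied already):

struct topdown_counts {
	double total_slots;		/* already scaled by pipeline width */
	double slots_issued;
	double slots_retired;
	double fetch_bubbles;
	double recovery_bubbles;	/* already scaled */
};

/* Standard top-down level-1 fractions; each result lies in [0, 1]. */
static double td_frontend_bound(const struct topdown_counts *c)
{
	return c->fetch_bubbles / c->total_slots;
}

static double td_bad_speculation(const struct topdown_counts *c)
{
	return (c->slots_issued - c->slots_retired + c->recovery_bubbles) /
		c->total_slots;
}

static double td_retiring(const struct topdown_counts *c)
{
	return c->slots_retired / c->total_slots;
}

static double td_backend_bound(const struct topdown_counts *c)
{
	/* Whatever slots the other three categories do not account for. */
	return 1.0 - td_frontend_bound(c) - td_bad_speculation(c) -
		td_retiring(c);
}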
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 9ba4e41..4c7638b 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -89,6 +89,7 @@
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
 #include "../perf_event.h"
 
 MODULE_LICENSE("GPL");
@@ -511,37 +512,37 @@
 	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) }
 
 static const struct x86_cpu_id intel_cstates_match[] __initconst = {
-	X86_CSTATES_MODEL(30, nhm_cstates),    /* 45nm Nehalem              */
-	X86_CSTATES_MODEL(26, nhm_cstates),    /* 45nm Nehalem-EP           */
-	X86_CSTATES_MODEL(46, nhm_cstates),    /* 45nm Nehalem-EX           */
+	X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM,    nhm_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM_EP, nhm_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM_EX, nhm_cstates),
 
-	X86_CSTATES_MODEL(37, nhm_cstates),    /* 32nm Westmere             */
-	X86_CSTATES_MODEL(44, nhm_cstates),    /* 32nm Westmere-EP          */
-	X86_CSTATES_MODEL(47, nhm_cstates),    /* 32nm Westmere-EX          */
+	X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE,    nhm_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE_EP, nhm_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE_EX, nhm_cstates),
 
-	X86_CSTATES_MODEL(42, snb_cstates),    /* 32nm SandyBridge          */
-	X86_CSTATES_MODEL(45, snb_cstates),    /* 32nm SandyBridge-E/EN/EP  */
+	X86_CSTATES_MODEL(INTEL_FAM6_SANDYBRIDGE,   snb_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_SANDYBRIDGE_X, snb_cstates),
 
-	X86_CSTATES_MODEL(58, snb_cstates),    /* 22nm IvyBridge            */
-	X86_CSTATES_MODEL(62, snb_cstates),    /* 22nm IvyBridge-EP/EX      */
+	X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE,   snb_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE_X, snb_cstates),
 
-	X86_CSTATES_MODEL(60, snb_cstates),    /* 22nm Haswell Core         */
-	X86_CSTATES_MODEL(63, snb_cstates),    /* 22nm Haswell Server       */
-	X86_CSTATES_MODEL(70, snb_cstates),    /* 22nm Haswell + GT3e       */
+	X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_CORE, snb_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_X,	   snb_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_GT3E, snb_cstates),
 
-	X86_CSTATES_MODEL(69, hswult_cstates), /* 22nm Haswell ULT          */
+	X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates),
 
-	X86_CSTATES_MODEL(55, slm_cstates),    /* 22nm Atom Silvermont      */
-	X86_CSTATES_MODEL(77, slm_cstates),    /* 22nm Atom Avoton/Rangely  */
-	X86_CSTATES_MODEL(76, slm_cstates),    /* 22nm Atom Airmont         */
+	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT,     slm_cstates),
 
-	X86_CSTATES_MODEL(61, snb_cstates),    /* 14nm Broadwell Core-M     */
-	X86_CSTATES_MODEL(86, snb_cstates),    /* 14nm Broadwell Xeon D     */
-	X86_CSTATES_MODEL(71, snb_cstates),    /* 14nm Broadwell + GT3e     */
-	X86_CSTATES_MODEL(79, snb_cstates),    /* 14nm Broadwell Server     */
+	X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE,   snb_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_XEON_D, snb_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_GT3E,   snb_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_X,      snb_cstates),
 
-	X86_CSTATES_MODEL(78, snb_cstates),    /* 14nm Skylake Mobile       */
-	X86_CSTATES_MODEL(94, snb_cstates),    /* 14nm Skylake Desktop      */
+	X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE,  snb_cstates),
+	X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates),
 	{ },
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index e30eef4..d0c58b3 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -55,6 +55,7 @@
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
 #include "../perf_event.h"
 
 MODULE_LICENSE("GPL");
@@ -786,26 +787,27 @@
 };
 
 static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
-	X86_RAPL_MODEL_MATCH(42, snb_rapl_init),	/* Sandy Bridge */
-	X86_RAPL_MODEL_MATCH(45, snbep_rapl_init),	/* Sandy Bridge-EP */
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE,   snb_rapl_init),
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_rapl_init),
 
-	X86_RAPL_MODEL_MATCH(58, snb_rapl_init),	/* Ivy Bridge */
-	X86_RAPL_MODEL_MATCH(62, snbep_rapl_init),	/* IvyTown */
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE,   snb_rapl_init),
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, snbep_rapl_init),
 
-	X86_RAPL_MODEL_MATCH(60, hsw_rapl_init),	/* Haswell */
-	X86_RAPL_MODEL_MATCH(63, hsx_rapl_init),	/* Haswell-Server */
-	X86_RAPL_MODEL_MATCH(69, hsw_rapl_init),	/* Haswell-Celeron */
-	X86_RAPL_MODEL_MATCH(70, hsw_rapl_init),	/* Haswell GT3e */
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_rapl_init),
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X,    hsw_rapl_init),
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT,  hsw_rapl_init),
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_rapl_init),
 
-	X86_RAPL_MODEL_MATCH(61, hsw_rapl_init),	/* Broadwell */
-	X86_RAPL_MODEL_MATCH(71, hsw_rapl_init),	/* Broadwell-H */
-	X86_RAPL_MODEL_MATCH(79, hsx_rapl_init),	/* Broadwell-Server */
-	X86_RAPL_MODEL_MATCH(86, hsx_rapl_init),	/* Broadwell Xeon D */
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE,   hsw_rapl_init),
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E,   hsw_rapl_init),
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X,	  hsw_rapl_init),
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsw_rapl_init),
 
-	X86_RAPL_MODEL_MATCH(87, knl_rapl_init),	/* Knights Landing */
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init),
 
-	X86_RAPL_MODEL_MATCH(78, skl_rapl_init),	/* Skylake */
-	X86_RAPL_MODEL_MATCH(94, skl_rapl_init),	/* Skylake H/S */
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE,  skl_rapl_init),
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init),
+	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X,	 hsx_rapl_init),
 	{},
 };
 
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index fce7406..4e70d27 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1,4 +1,5 @@
 #include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
 #include "uncore.h"
 
 static struct intel_uncore_type *empty_uncore[] = { NULL, };
@@ -882,7 +883,7 @@
 static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct intel_uncore_type *type;
-	struct intel_uncore_pmu *pmu;
+	struct intel_uncore_pmu *pmu = NULL;
 	struct intel_uncore_box *box;
 	int phys_id, pkg, ret;
 
@@ -903,20 +904,37 @@
 	}
 
 	type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
+
 	/*
-	 * for performance monitoring unit with multiple boxes,
-	 * each box has a different function id.
+	 * Some platforms, e.g. Knights Landing, use a common PCI device ID
+	 * for multiple instances of an uncore PMU device type. In that case
+	 * the PCI slot and function identify the individual uncore box.
 	 */
-	pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
-	/* Knights Landing uses a common PCI device ID for multiple instances of
-	 * an uncore PMU device type. There is only one entry per device type in
-	 * the knl_uncore_pci_ids table inspite of multiple devices present for
-	 * some device types. Hence PCI device idx would be 0 for all devices.
-	 * So increment pmu pointer to point to an unused array element.
-	 */
-	if (boot_cpu_data.x86_model == 87) {
-		while (pmu->func_id >= 0)
-			pmu++;
+	if (id->driver_data & ~0xffff) {
+		struct pci_driver *pci_drv = pdev->driver;
+		const struct pci_device_id *ids = pci_drv->id_table;
+		unsigned int devfn;
+
+		while (ids && ids->vendor) {
+			if ((ids->vendor == pdev->vendor) &&
+			    (ids->device == pdev->device)) {
+				devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data),
+						  UNCORE_PCI_DEV_FUNC(ids->driver_data));
+				if (devfn == pdev->devfn) {
+					pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)];
+					break;
+				}
+			}
+			ids++;
+		}
+		if (pmu == NULL)
+			return -ENODEV;
+	} else {
+		/*
+		 * for performance monitoring unit with multiple boxes,
+		 * each box has a different function id.
+		 */
+		pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
 	}
 
 	if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL))
@@ -1365,26 +1383,26 @@
 };
 
 static const struct x86_cpu_id intel_uncore_match[] __initconst = {
-	X86_UNCORE_MODEL_MATCH(26, nhm_uncore_init),	/* Nehalem */
-	X86_UNCORE_MODEL_MATCH(30, nhm_uncore_init),
-	X86_UNCORE_MODEL_MATCH(37, nhm_uncore_init),	/* Westmere */
-	X86_UNCORE_MODEL_MATCH(44, nhm_uncore_init),
-	X86_UNCORE_MODEL_MATCH(42, snb_uncore_init),	/* Sandy Bridge */
-	X86_UNCORE_MODEL_MATCH(58, ivb_uncore_init),	/* Ivy Bridge */
-	X86_UNCORE_MODEL_MATCH(60, hsw_uncore_init),	/* Haswell */
-	X86_UNCORE_MODEL_MATCH(69, hsw_uncore_init),	/* Haswell Celeron */
-	X86_UNCORE_MODEL_MATCH(70, hsw_uncore_init),	/* Haswell */
-	X86_UNCORE_MODEL_MATCH(61, bdw_uncore_init),	/* Broadwell */
-	X86_UNCORE_MODEL_MATCH(71, bdw_uncore_init),	/* Broadwell */
-	X86_UNCORE_MODEL_MATCH(45, snbep_uncore_init),	/* Sandy Bridge-EP */
-	X86_UNCORE_MODEL_MATCH(46, nhmex_uncore_init),	/* Nehalem-EX */
-	X86_UNCORE_MODEL_MATCH(47, nhmex_uncore_init),	/* Westmere-EX aka. Xeon E7 */
-	X86_UNCORE_MODEL_MATCH(62, ivbep_uncore_init),	/* Ivy Bridge-EP */
-	X86_UNCORE_MODEL_MATCH(63, hswep_uncore_init),	/* Haswell-EP */
-	X86_UNCORE_MODEL_MATCH(79, bdx_uncore_init),	/* BDX-EP */
-	X86_UNCORE_MODEL_MATCH(86, bdx_uncore_init),	/* BDX-DE */
-	X86_UNCORE_MODEL_MATCH(87, knl_uncore_init),	/* Knights Landing */
-	X86_UNCORE_MODEL_MATCH(94, skl_uncore_init),	/* SkyLake */
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP,	  nhm_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM,	  nhm_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE,	  nhm_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EP,	  nhm_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE,	  snb_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE,	  ivb_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE,	  hsw_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT,	  hsw_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E,	  hsw_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, bdw_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, bdw_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X,  snbep_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EX,	  nhmex_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EX,	  nhmex_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X,	  ivbep_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_X,	  hswep_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_X,	  bdx_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, bdx_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL,	  knl_uncore_init),
+	X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init),
 	{},
 };
 
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 79766b9..66c3a36 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -15,7 +15,11 @@
 #define UNCORE_PMC_IDX_FIXED		UNCORE_PMC_IDX_MAX_GENERIC
 #define UNCORE_PMC_IDX_MAX		(UNCORE_PMC_IDX_FIXED + 1)
 
+#define UNCORE_PCI_DEV_FULL_DATA(dev, func, type, idx)	\
+		((dev << 24) | (func << 16) | (type << 8) | idx)
 #define UNCORE_PCI_DEV_DATA(type, idx)	((type << 8) | idx)
+#define UNCORE_PCI_DEV_DEV(data)	((data >> 24) & 0xff)
+#define UNCORE_PCI_DEV_FUNC(data)	((data >> 16) & 0xff)
 #define UNCORE_PCI_DEV_TYPE(data)	((data >> 8) & 0xff)
 #define UNCORE_PCI_DEV_IDX(data)	(data & 0xff)
 #define UNCORE_EXTRA_PCI_DEV		0xff
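A quick round-trip through the new packing helpers makes the bit layout concrete (assert-based sketch; assumes the macros above are in scope):

#include <assert.h>

/* dev in bits 31-24, func in 23-16, type in 15-8, idx in 7-0. */
static void uncore_pci_dev_data_selftest(void)
{
	unsigned long data = UNCORE_PCI_DEV_FULL_DATA(10, 0, 3, 1);

	assert(UNCORE_PCI_DEV_DEV(data)  == 10);
	assert(UNCORE_PCI_DEV_FUNC(data) == 0);
	assert(UNCORE_PCI_DEV_TYPE(data) == 3);
	assert(UNCORE_PCI_DEV_IDX(data)  == 1);
}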
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 874e8bd..824e540 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -2164,21 +2164,101 @@
 */
 
 static const struct pci_device_id knl_uncore_pci_ids[] = {
-	{ /* MC UClk */
+	{ /* MC0 UClk */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
-		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 0, KNL_PCI_UNCORE_MC_UCLK, 0),
 	},
-	{ /* MC DClk Channel */
+	{ /* MC1 UClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(11, 0, KNL_PCI_UNCORE_MC_UCLK, 1),
+	},
+	{ /* MC0 DClk CH 0 */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
-		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 2, KNL_PCI_UNCORE_MC_DCLK, 0),
 	},
-	{ /* EDC UClk */
+	{ /* MC0 DClk CH 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 3, KNL_PCI_UNCORE_MC_DCLK, 1),
+	},
+	{ /* MC0 DClk CH 2 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 4, KNL_PCI_UNCORE_MC_DCLK, 2),
+	},
+	{ /* MC1 DClk CH 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 2, KNL_PCI_UNCORE_MC_DCLK, 3),
+	},
+	{ /* MC1 DClk CH 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 3, KNL_PCI_UNCORE_MC_DCLK, 4),
+	},
+	{ /* MC1 DClk CH 2 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 4, KNL_PCI_UNCORE_MC_DCLK, 5),
+	},
+	{ /* EDC0 UClk */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
-		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, KNL_PCI_UNCORE_EDC_UCLK, 0),
 	},
-	{ /* EDC EClk */
+	{ /* EDC1 UClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(16, 0, KNL_PCI_UNCORE_EDC_UCLK, 1),
+	},
+	{ /* EDC2 UClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(17, 0, KNL_PCI_UNCORE_EDC_UCLK, 2),
+	},
+	{ /* EDC3 UClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, KNL_PCI_UNCORE_EDC_UCLK, 3),
+	},
+	{ /* EDC4 UClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(19, 0, KNL_PCI_UNCORE_EDC_UCLK, 4),
+	},
+	{ /* EDC5 UClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(20, 0, KNL_PCI_UNCORE_EDC_UCLK, 5),
+	},
+	{ /* EDC6 UClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 0, KNL_PCI_UNCORE_EDC_UCLK, 6),
+	},
+	{ /* EDC7 UClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(22, 0, KNL_PCI_UNCORE_EDC_UCLK, 7),
+	},
+	{ /* EDC0 EClk */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
-		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(24, 2, KNL_PCI_UNCORE_EDC_ECLK, 0),
+	},
+	{ /* EDC1 EClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(25, 2, KNL_PCI_UNCORE_EDC_ECLK, 1),
+	},
+	{ /* EDC2 EClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(26, 2, KNL_PCI_UNCORE_EDC_ECLK, 2),
+	},
+	{ /* EDC3 EClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(27, 2, KNL_PCI_UNCORE_EDC_ECLK, 3),
+	},
+	{ /* EDC4 EClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(28, 2, KNL_PCI_UNCORE_EDC_ECLK, 4),
+	},
+	{ /* EDC5 EClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(29, 2, KNL_PCI_UNCORE_EDC_ECLK, 5),
+	},
+	{ /* EDC6 EClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(30, 2, KNL_PCI_UNCORE_EDC_ECLK, 6),
+	},
+	{ /* EDC7 EClk */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+		.driver_data = UNCORE_PCI_DEV_FULL_DATA(31, 2, KNL_PCI_UNCORE_EDC_ECLK, 7),
 	},
 	{ /* M2PCIe */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 85ef3c2..50b3a05 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -1,4 +1,5 @@
 #include <linux/perf_event.h>
+#include <asm/intel-family.h>
 
 enum perf_msr_id {
 	PERF_MSR_TSC			= 0,
@@ -34,39 +35,43 @@
 		return false;
 
 	switch (boot_cpu_data.x86_model) {
-	case 30: /* 45nm Nehalem    */
-	case 26: /* 45nm Nehalem-EP */
-	case 46: /* 45nm Nehalem-EX */
+	case INTEL_FAM6_NEHALEM:
+	case INTEL_FAM6_NEHALEM_EP:
+	case INTEL_FAM6_NEHALEM_EX:
 
-	case 37: /* 32nm Westmere    */
-	case 44: /* 32nm Westmere-EP */
-	case 47: /* 32nm Westmere-EX */
+	case INTEL_FAM6_WESTMERE:
+	case INTEL_FAM6_WESTMERE2:
+	case INTEL_FAM6_WESTMERE_EP:
+	case INTEL_FAM6_WESTMERE_EX:
 
-	case 42: /* 32nm SandyBridge         */
-	case 45: /* 32nm SandyBridge-E/EN/EP */
+	case INTEL_FAM6_SANDYBRIDGE:
+	case INTEL_FAM6_SANDYBRIDGE_X:
 
-	case 58: /* 22nm IvyBridge       */
-	case 62: /* 22nm IvyBridge-EP/EX */
+	case INTEL_FAM6_IVYBRIDGE:
+	case INTEL_FAM6_IVYBRIDGE_X:
 
-	case 60: /* 22nm Haswell Core */
-	case 63: /* 22nm Haswell Server */
-	case 69: /* 22nm Haswell ULT */
-	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+	case INTEL_FAM6_HASWELL_CORE:
+	case INTEL_FAM6_HASWELL_X:
+	case INTEL_FAM6_HASWELL_ULT:
+	case INTEL_FAM6_HASWELL_GT3E:
 
-	case 61: /* 14nm Broadwell Core-M */
-	case 86: /* 14nm Broadwell Xeon D */
-	case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
-	case 79: /* 14nm Broadwell Server */
+	case INTEL_FAM6_BROADWELL_CORE:
+	case INTEL_FAM6_BROADWELL_XEON_D:
+	case INTEL_FAM6_BROADWELL_GT3E:
+	case INTEL_FAM6_BROADWELL_X:
 
-	case 55: /* 22nm Atom "Silvermont"                */
-	case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
-	case 76: /* 14nm Atom "Airmont"                   */
+	case INTEL_FAM6_ATOM_SILVERMONT1:
+	case INTEL_FAM6_ATOM_SILVERMONT2:
+	case INTEL_FAM6_ATOM_AIRMONT:
 		if (idx == PERF_MSR_SMI)
 			return true;
 		break;
 
-	case 78: /* 14nm Skylake Mobile */
-	case 94: /* 14nm Skylake Desktop */
+	case INTEL_FAM6_SKYLAKE_MOBILE:
+	case INTEL_FAM6_SKYLAKE_DESKTOP:
+	case INTEL_FAM6_SKYLAKE_X:
+	case INTEL_FAM6_KABYLAKE_MOBILE:
+	case INTEL_FAM6_KABYLAKE_DESKTOP:
 		if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
 			return true;
 		break;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 8bd764d..e2d7285 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -668,6 +668,14 @@
 	.event_str	= str,						\
 };
 
+#define EVENT_ATTR_STR_HT(_name, v, noht, ht)				\
+static struct perf_pmu_events_ht_attr event_attr_##v = {		\
+	.attr		= __ATTR(_name, 0444, events_ht_sysfs_show, NULL),\
+	.id		= 0,						\
+	.event_str_noht	= noht,						\
+	.event_str_ht	= ht,						\
+}
+
 extern struct x86_pmu x86_pmu __read_mostly;
 
 static inline bool x86_pmu_has_lbr_callstack(void)
@@ -803,6 +811,8 @@
 
 ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
 			  char *page);
+ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
+			  char *page);
 
 #ifdef CONFIG_CPU_SUP_AMD
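For concreteness, the first use in this series, EVENT_ATTR_STR_HT(topdown-total-slots.scale, td_total_slots_scale, "4", "2"), expands to approximately:

static struct perf_pmu_events_ht_attr event_attr_td_total_slots_scale = {
	.attr		= __ATTR(topdown-total-slots.scale, 0444,
				 events_ht_sysfs_show, NULL),
	.id		= 0,
	.event_str_noht	= "4",
	.event_str_ht	= "2",
};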
 
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 7f991bd5..e346572 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -129,6 +129,14 @@
 
 extern unsigned int __max_logical_packages;
 #define topology_max_packages()			(__max_logical_packages)
+
+extern int __max_smt_threads;
+
+static inline int topology_max_smt_threads(void)
+{
+	return __max_smt_threads;
+}
+
 int topology_update_package_map(unsigned int apicid, unsigned int cpu);
 extern int topology_phys_to_logical_pkg(unsigned int pkg);
 #else
@@ -136,6 +144,7 @@
 static inline int
 topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; }
 static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
+static inline int topology_max_smt_threads(void) { return 1; }
 #endif
 
 static inline void arch_fix_phys_package_id(int num, u32 slot)
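The helper is intended for cheap checks like the fixup_ht_bug() change above; a minimal sketch (apply_smt_workaround() is a hypothetical caller, not part of this patch):

	/* Hypothetical use: gate an SMT-sensitive quirk on live SMT state. */
	if (topology_max_smt_threads() > 1)
		apply_smt_workaround();	/* hypothetical helper */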
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fafe8b9..2ed0ec1 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -105,6 +105,9 @@
 unsigned int __max_logical_packages __read_mostly;
 EXPORT_SYMBOL(__max_logical_packages);
 
+/* Maximum number of SMT threads on any online core */
+int __max_smt_threads __read_mostly;
+
 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 {
 	unsigned long flags;
@@ -493,7 +496,7 @@
 	bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	struct cpuinfo_x86 *o;
-	int i;
+	int i, threads;
 
 	cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
 
@@ -550,6 +553,10 @@
 		if (match_die(c, o) && !topology_same_node(c, o))
 			primarily_use_numa_for_topology();
 	}
+
+	threads = cpumask_weight(topology_sibling_cpumask(cpu));
+	if (threads > __max_smt_threads)
+		__max_smt_threads = threads;
 }
 
 /* maps the cpu to the sched domain representing multi-core */
@@ -1441,6 +1448,21 @@
 
 #ifdef CONFIG_HOTPLUG_CPU
 
+/* Recompute SMT state for all CPUs when a CPU goes offline */
+static void recompute_smt_state(void)
+{
+	int max_threads, cpu;
+
+	max_threads = 0;
+	for_each_online_cpu(cpu) {
+		int threads = cpumask_weight(topology_sibling_cpumask(cpu));
+
+		if (threads > max_threads)
+			max_threads = threads;
+	}
+	__max_smt_threads = max_threads;
+}
+
 static void remove_siblinginfo(int cpu)
 {
 	int sibling;
@@ -1465,6 +1487,7 @@
 	c->phys_proc_id = 0;
 	c->cpu_core_id = 0;
 	cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
+	recompute_smt_state();
 }
 
 static void remove_cpu_from_maps(int cpu)
diff --git a/drivers/platform/x86/intel_pmc_core.c b/drivers/platform/x86/intel_pmc_core.c
index 2776bec..e57f923 100644
--- a/drivers/platform/x86/intel_pmc_core.c
+++ b/drivers/platform/x86/intel_pmc_core.c
@@ -26,6 +26,7 @@
 #include <linux/seq_file.h>
 
 #include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
 #include <asm/pmc_core.h>
 
 #include "intel_pmc_core.h"
@@ -138,10 +139,10 @@
 #endif /* CONFIG_DEBUG_FS */
 
 static const struct x86_cpu_id intel_pmc_core_ids[] = {
-	{ X86_VENDOR_INTEL, 6, 0x4e, X86_FEATURE_MWAIT,
-		(kernel_ulong_t)NULL}, /* Skylake CPUID Signature */
-	{ X86_VENDOR_INTEL, 6, 0x5e, X86_FEATURE_MWAIT,
-		(kernel_ulong_t)NULL}, /* Skylake CPUID Signature */
+	{ X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_MOBILE, X86_FEATURE_MWAIT,
+		(kernel_ulong_t)NULL},
+	{ X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_DESKTOP, X86_FEATURE_MWAIT,
+		(kernel_ulong_t)NULL},
 	{}
 };
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1a827ce..7921f4f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -517,6 +517,11 @@
 struct perf_cgroup;
 struct ring_buffer;
 
+struct pmu_event_list {
+	raw_spinlock_t		lock;
+	struct list_head	list;
+};
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -675,6 +680,7 @@
 	int				cgrp_defer_enabled;
 #endif
 
+	struct list_head		sb_list;
 #endif /* CONFIG_PERF_EVENTS */
 };
 
@@ -1074,7 +1080,7 @@
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 		   u32 max_stack, bool crosstask, bool add_mark);
-extern int get_callchain_buffers(void);
+extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
 
 extern int sysctl_perf_event_max_stack;
@@ -1326,6 +1332,13 @@
 	const char *event_str;
 };
 
+struct perf_pmu_events_ht_attr {
+	struct device_attribute			attr;
+	u64					id;
+	const char				*event_str_ht;
+	const char				*event_str_noht;
+};
+
 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
 			      char *page);
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 36ce552..c66a485 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -276,6 +276,9 @@
 
 /*
  * Hardware event_id to monitor via a performance monitoring event:
+ *
+ * @sample_max_stack: Max number of frame pointers in a callchain,
+ *		      should be <= /proc/sys/kernel/perf_event_max_stack
  */
 struct perf_event_attr {
 
@@ -385,7 +388,8 @@
 	 * Wakeup watermark for AUX area
 	 */
 	__u32	aux_watermark;
-	__u32	__reserved_2;	/* align to __u64 */
+	__u16	sample_max_stack;
+	__u16	__reserved_2;	/* align to __u64 */
 };
 
 #define perf_flags(attr)	(*(&(attr)->read_format + 1))
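From user space, the new field slots into the usual perf_event_open() setup; a hedged sketch (raw syscall wrapper, minimal attr, error handling trimmed):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_callchain_event(int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_CALLCHAIN;
	attr.sample_max_stack = 64;	/* new field; 0 means use the sysctl cap */

	/*
	 * On failure, errno == EOVERFLOW now specifically means that
	 * sample_max_stack exceeded /proc/sys/kernel/perf_event_max_stack.
	 */
	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}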
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 080a2df..bf4495f 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -99,7 +99,7 @@
 	if (err)
 		goto free_smap;
 
-	err = get_callchain_buffers();
+	err = get_callchain_buffers(sysctl_perf_event_max_stack);
 	if (err)
 		goto free_smap;
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 179ef46..e9fdb52 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -104,7 +104,7 @@
 	return -ENOMEM;
 }
 
-int get_callchain_buffers(void)
+int get_callchain_buffers(int event_max_stack)
 {
 	int err = 0;
 	int count;
@@ -121,6 +121,15 @@
 		/* If the allocation failed, give up */
 		if (!callchain_cpus_entries)
 			err = -ENOMEM;
+		/*
+		 * If a per-event request exceeds the global cap,
+		 * return a distinct error to help userspace tell the
+		 * failure modes apart.
+		 *
+		 * Also do the check here, while callchain_mutex is held.
+		 */
+		if (event_max_stack > sysctl_perf_event_max_stack)
+			err = -EOVERFLOW;
 		goto exit;
 	}
 
@@ -174,11 +183,12 @@
 	bool user   = !event->attr.exclude_callchain_user;
 	/* Disallow cross-task user callchains. */
 	bool crosstask = event->ctx->task && event->ctx->task != current;
+	const u32 max_stack = event->attr.sample_max_stack;
 
 	if (!kernel && !user)
 		return NULL;
 
-	return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true);
+	return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);
 }
 
 struct perf_callchain_entry *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9c51ec3..9345028 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -335,6 +335,7 @@
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -396,6 +397,13 @@
 	if (ret || !write)
 		return ret;
 
+	/*
+	 * If throttling is disabled don't allow the write:
+	 */
+	if (sysctl_perf_cpu_time_max_percent == 100 ||
+	    sysctl_perf_cpu_time_max_percent == 0)
+		return -EINVAL;
+
 	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 	update_perf_cpu_limits();
@@ -3665,6 +3673,39 @@
 static void ring_buffer_attach(struct perf_event *event,
 			       struct ring_buffer *rb);
 
+static void detach_sb_event(struct perf_event *event)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_del_rcu(&event->sb_list);
+	raw_spin_unlock(&pel->lock);
+}
+
+static bool is_sb_event(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+
+	if (event->parent)
+		return false;
+
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return false;
+
+	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+	    attr->comm || attr->comm_exec ||
+	    attr->task ||
+	    attr->context_switch)
+		return true;
+	return false;
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+	if (is_sb_event(event))
+		detach_sb_event(event);
+}
+
 static void unaccount_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -3728,6 +3769,8 @@
 	}
 
 	unaccount_event_cpu(event, event->cpu);
+
+	unaccount_pmu_sb_event(event);
 }
 
 static void perf_sched_delayed(struct work_struct *work)
@@ -5854,11 +5897,11 @@
 	perf_output_end(&handle);
 }
 
-typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+typedef void (perf_iterate_f)(struct perf_event *event, void *data);
 
 static void
-perf_event_aux_ctx(struct perf_event_context *ctx,
-		   perf_event_aux_output_cb output,
+perf_iterate_ctx(struct perf_event_context *ctx,
+		   perf_iterate_f output,
 		   void *data, bool all)
 {
 	struct perf_event *event;
@@ -5875,52 +5918,55 @@
 	}
 }
 
-static void
-perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
-			struct perf_event_context *task_ctx)
+static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
 {
-	rcu_read_lock();
-	preempt_disable();
-	perf_event_aux_ctx(task_ctx, output, data, false);
-	preempt_enable();
-	rcu_read_unlock();
+	struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
+	struct perf_event *event;
+
+	list_for_each_entry_rcu(event, &pel->list, sb_list) {
+		if (event->state < PERF_EVENT_STATE_INACTIVE)
+			continue;
+		if (!event_filter_match(event))
+			continue;
+		output(event, data);
+	}
 }
 
+/*
+ * Iterate all events that need to receive side-band events.
+ *
+ * For new callers, ensure that account_pmu_sb_event() includes
+ * your event; otherwise it might not get delivered.
+ */
 static void
-perf_event_aux(perf_event_aux_output_cb output, void *data,
+perf_iterate_sb(perf_iterate_f output, void *data,
 	       struct perf_event_context *task_ctx)
 {
-	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
-	struct pmu *pmu;
 	int ctxn;
 
+	rcu_read_lock();
+	preempt_disable();
+
 	/*
-	 * If we have task_ctx != NULL we only notify
-	 * the task context itself. The task_ctx is set
-	 * only for EXIT events before releasing task
+	 * If we have task_ctx != NULL we only notify the task context itself.
+	 * The task_ctx is set only for EXIT events before releasing task
 	 * context.
 	 */
 	if (task_ctx) {
-		perf_event_aux_task_ctx(output, data, task_ctx);
-		return;
+		perf_iterate_ctx(task_ctx, output, data, false);
+		goto done;
 	}
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-		if (cpuctx->unique_pmu != pmu)
-			goto next;
-		perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
-		ctxn = pmu->task_ctx_nr;
-		if (ctxn < 0)
-			goto next;
+	perf_iterate_sb_cpu(output, data);
+
+	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 		if (ctx)
-			perf_event_aux_ctx(ctx, output, data, false);
-next:
-		put_cpu_ptr(pmu->pmu_cpu_context);
+			perf_iterate_ctx(ctx, output, data, false);
 	}
+done:
+	preempt_enable();
 	rcu_read_unlock();
 }
 
@@ -5969,7 +6015,7 @@
 
 		perf_event_enable_on_exec(ctxn);
 
-		perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL,
+		perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
 				   true);
 	}
 	rcu_read_unlock();
@@ -6013,9 +6059,9 @@
 	};
 
 	rcu_read_lock();
-	perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
+	perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
 	if (cpuctx->task_ctx)
-		perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+		perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
 				   &ro, false);
 	rcu_read_unlock();
 
@@ -6144,7 +6190,7 @@
 		},
 	};
 
-	perf_event_aux(perf_event_task_output,
+	perf_iterate_sb(perf_event_task_output,
 		       &task_event,
 		       task_ctx);
 }
@@ -6223,7 +6269,7 @@
 
 	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
 
-	perf_event_aux(perf_event_comm_output,
+	perf_iterate_sb(perf_event_comm_output,
 		       comm_event,
 		       NULL);
 }
@@ -6454,7 +6500,7 @@
 
 	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
-	perf_event_aux(perf_event_mmap_output,
+	perf_iterate_sb(perf_event_mmap_output,
 		       mmap_event,
 		       NULL);
 
@@ -6537,7 +6583,7 @@
 		if (!ctx)
 			continue;
 
-		perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true);
+		perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
 	}
 	rcu_read_unlock();
 }
@@ -6724,7 +6770,7 @@
 		},
 	};
 
-	perf_event_aux(perf_event_switch_output,
+	perf_iterate_sb(perf_event_switch_output,
 		       &switch_event,
 		       NULL);
 }
@@ -8646,6 +8692,28 @@
 	return pmu;
 }
 
+static void attach_sb_event(struct perf_event *event)
+{
+	struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+	raw_spin_lock(&pel->lock);
+	list_add_rcu(&event->sb_list, &pel->list);
+	raw_spin_unlock(&pel->lock);
+}
+
+/*
+ * We keep a list of all !task (and therefore per-cpu) events
+ * that need to receive side-band records.
+ *
+ * This avoids having to scan all the various PMU per-cpu contexts
+ * looking for them.
+ */
+static void account_pmu_sb_event(struct perf_event *event)
+{
+	if (is_sb_event(event))
+		attach_sb_event(event);
+}
+
 static void account_event_cpu(struct perf_event *event, int cpu)
 {
 	if (event->parent)
@@ -8726,6 +8794,8 @@
 enabled:
 
 	account_event_cpu(event, event->cpu);
+
+	account_pmu_sb_event(event);
 }
 
 /*
@@ -8874,7 +8944,7 @@
 
 	if (!event->parent) {
 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
-			err = get_callchain_buffers();
+			err = get_callchain_buffers(attr->sample_max_stack);
 			if (err)
 				goto err_addr_filters;
 		}
@@ -9196,6 +9266,9 @@
 			return -EINVAL;
 	}
 
+	if (!attr.sample_max_stack)
+		attr.sample_max_stack = sysctl_perf_event_max_stack;
+
 	/*
 	 * In cgroup mode, the pid argument is used to pass the fd
 	 * opened to the cgroup directory in cgroupfs. The cpu argument
@@ -9269,7 +9342,7 @@
 
 	if (is_sampling_event(event)) {
 		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
-			err = -ENOTSUPP;
+			err = -EOPNOTSUPP;
 			goto err_alloc;
 		}
 	}
@@ -10231,6 +10304,9 @@
 		swhash = &per_cpu(swevent_htable, cpu);
 		mutex_init(&swhash->hlist_mutex);
 		INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+
+		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
+		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
 	}
 }
 
diff --git a/tools/lib/api/Makefile b/tools/lib/api/Makefile
index 316f308..67ff93e 100644
--- a/tools/lib/api/Makefile
+++ b/tools/lib/api/Makefile
@@ -10,6 +10,7 @@
 
 CC = $(CROSS_COMPILE)gcc
 AR = $(CROSS_COMPILE)ar
+LD = $(CROSS_COMPILE)ld
 
 MAKEFLAGS += --no-print-directory
 
diff --git a/tools/lib/api/fd/array.c b/tools/lib/api/fd/array.c
index 0e636c4..b0a035f 100644
--- a/tools/lib/api/fd/array.c
+++ b/tools/lib/api/fd/array.c
@@ -85,7 +85,8 @@
 }
 
 int fdarray__filter(struct fdarray *fda, short revents,
-		    void (*entry_destructor)(struct fdarray *fda, int fd))
+		    void (*entry_destructor)(struct fdarray *fda, int fd, void *arg),
+		    void *arg)
 {
 	int fd, nr = 0;
 
@@ -95,7 +96,7 @@
 	for (fd = 0; fd < fda->nr; ++fd) {
 		if (fda->entries[fd].revents & revents) {
 			if (entry_destructor)
-				entry_destructor(fda, fd);
+				entry_destructor(fda, fd, arg);
 
 			continue;
 		}
diff --git a/tools/lib/api/fd/array.h b/tools/lib/api/fd/array.h
index 45db018..e87fd80 100644
--- a/tools/lib/api/fd/array.h
+++ b/tools/lib/api/fd/array.h
@@ -34,7 +34,8 @@
 int fdarray__add(struct fdarray *fda, int fd, short revents);
 int fdarray__poll(struct fdarray *fda, int timeout);
 int fdarray__filter(struct fdarray *fda, short revents,
-		    void (*entry_destructor)(struct fdarray *fda, int fd));
+		    void (*entry_destructor)(struct fdarray *fda, int fd, void *arg),
+		    void *arg);
 int fdarray__grow(struct fdarray *fda, int extra);
 int fdarray__fprintf(struct fdarray *fda, FILE *fp);
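Call sites gain a context pointer; a minimal sketch of the new callback shape (the destructor body and counter are illustrative):

#include <poll.h>
#include <unistd.h>
#include <api/fd/array.h>

/*
 * Illustrative destructor: 'arg' now carries caller state that would
 * previously have needed a global variable.
 */
static void close_entry(struct fdarray *fda, int fd, void *arg)
{
	int *nr_closed = arg;

	close(fda->entries[fd].fd);
	(*nr_closed)++;
}

/* Usage: drop all hung-up entries, counting how many were closed. */
static int drop_hung_up(struct fdarray *fda)
{
	int nr_closed = 0;

	fdarray__filter(fda, POLLHUP, close_entry, &nr_closed);
	return nr_closed;
}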
 
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 7e543c3..462e526 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1186,20 +1186,14 @@
 	return next;
 }
 
-const char *
-bpf_object__get_name(struct bpf_object *obj)
+const char *bpf_object__name(struct bpf_object *obj)
 {
-	if (!obj)
-		return ERR_PTR(-EINVAL);
-	return obj->path;
+	return obj ? obj->path : ERR_PTR(-EINVAL);
 }
 
-unsigned int
-bpf_object__get_kversion(struct bpf_object *obj)
+unsigned int bpf_object__kversion(struct bpf_object *obj)
 {
-	if (!obj)
-		return 0;
-	return obj->kern_version;
+	return obj ? obj->kern_version : 0;
 }
 
 struct bpf_program *
@@ -1224,9 +1218,8 @@
 	return &obj->programs[idx];
 }
 
-int bpf_program__set_private(struct bpf_program *prog,
-			     void *priv,
-			     bpf_program_clear_priv_t clear_priv)
+int bpf_program__set_priv(struct bpf_program *prog, void *priv,
+			  bpf_program_clear_priv_t clear_priv)
 {
 	if (prog->priv && prog->clear_priv)
 		prog->clear_priv(prog, prog->priv);
@@ -1236,10 +1229,9 @@
 	return 0;
 }
 
-int bpf_program__get_private(struct bpf_program *prog, void **ppriv)
+void *bpf_program__priv(struct bpf_program *prog)
 {
-	*ppriv = prog->priv;
-	return 0;
+	return prog ? prog->priv : ERR_PTR(-EINVAL);
 }
 
 const char *bpf_program__title(struct bpf_program *prog, bool needs_copy)
@@ -1311,32 +1303,23 @@
 	return fd;
 }
 
-int bpf_map__get_fd(struct bpf_map *map)
+int bpf_map__fd(struct bpf_map *map)
 {
-	if (!map)
-		return -EINVAL;
-
-	return map->fd;
+	return map ? map->fd : -EINVAL;
 }
 
-int bpf_map__get_def(struct bpf_map *map, struct bpf_map_def *pdef)
+const struct bpf_map_def *bpf_map__def(struct bpf_map *map)
 {
-	if (!map || !pdef)
-		return -EINVAL;
-
-	*pdef = map->def;
-	return 0;
+	return map ? &map->def : ERR_PTR(-EINVAL);
 }
 
-const char *bpf_map__get_name(struct bpf_map *map)
+const char *bpf_map__name(struct bpf_map *map)
 {
-	if (!map)
-		return NULL;
-	return map->name;
+	return map ? map->name : NULL;
 }
 
-int bpf_map__set_private(struct bpf_map *map, void *priv,
-			 bpf_map_clear_priv_t clear_priv)
+int bpf_map__set_priv(struct bpf_map *map, void *priv,
+		     bpf_map_clear_priv_t clear_priv)
 {
 	if (!map)
 		return -EINVAL;
@@ -1351,14 +1334,9 @@
 	return 0;
 }
 
-int bpf_map__get_private(struct bpf_map *map, void **ppriv)
+void *bpf_map__priv(struct bpf_map *map)
 {
-	if (!map)
-		return -EINVAL;
-
-	if (ppriv)
-		*ppriv = map->priv;
-	return 0;
+	return map ? map->priv : ERR_PTR(-EINVAL);
 }
 
 struct bpf_map *
@@ -1389,7 +1367,7 @@
 }
 
 struct bpf_map *
-bpf_object__get_map_by_name(struct bpf_object *obj, const char *name)
+bpf_object__find_map_by_name(struct bpf_object *obj, const char *name)
 {
 	struct bpf_map *pos;
 
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index a51594c..722f46b 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -55,8 +55,8 @@
 /* Load/unload object into/from kernel */
 int bpf_object__load(struct bpf_object *obj);
 int bpf_object__unload(struct bpf_object *obj);
-const char *bpf_object__get_name(struct bpf_object *obj);
-unsigned int bpf_object__get_kversion(struct bpf_object *obj);
+const char *bpf_object__name(struct bpf_object *obj);
+unsigned int bpf_object__kversion(struct bpf_object *obj);
 
 struct bpf_object *bpf_object__next(struct bpf_object *prev);
 #define bpf_object__for_each_safe(pos, tmp)			\
@@ -78,11 +78,10 @@
 typedef void (*bpf_program_clear_priv_t)(struct bpf_program *,
 					 void *);
 
-int bpf_program__set_private(struct bpf_program *prog, void *priv,
-			     bpf_program_clear_priv_t clear_priv);
+int bpf_program__set_priv(struct bpf_program *prog, void *priv,
+			  bpf_program_clear_priv_t clear_priv);
 
-int bpf_program__get_private(struct bpf_program *prog,
-			     void **ppriv);
+void *bpf_program__priv(struct bpf_program *prog);
 
 const char *bpf_program__title(struct bpf_program *prog, bool needs_copy);
 
@@ -171,7 +170,7 @@
  */
 struct bpf_map;
 struct bpf_map *
-bpf_object__get_map_by_name(struct bpf_object *obj, const char *name);
+bpf_object__find_map_by_name(struct bpf_object *obj, const char *name);
 
 struct bpf_map *
 bpf_map__next(struct bpf_map *map, struct bpf_object *obj);
@@ -180,13 +179,13 @@
 	     (pos) != NULL;				\
 	     (pos) = bpf_map__next((pos), (obj)))
 
-int bpf_map__get_fd(struct bpf_map *map);
-int bpf_map__get_def(struct bpf_map *map, struct bpf_map_def *pdef);
-const char *bpf_map__get_name(struct bpf_map *map);
+int bpf_map__fd(struct bpf_map *map);
+const struct bpf_map_def *bpf_map__def(struct bpf_map *map);
+const char *bpf_map__name(struct bpf_map *map);
 
 typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *);
-int bpf_map__set_private(struct bpf_map *map, void *priv,
-			 bpf_map_clear_priv_t clear_priv);
-int bpf_map__get_private(struct bpf_map *map, void **ppriv);
+int bpf_map__set_priv(struct bpf_map *map, void *priv,
+		      bpf_map_clear_priv_t clear_priv);
+void *bpf_map__priv(struct bpf_map *map);
 
 #endif
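Callers migrate mechanically to the shorter accessors; a hedged sketch (map name, header location and error handling are illustrative):

#include "libbpf.h"	/* header location assumed */

static int lookup_map_fd(struct bpf_object *obj)
{
	struct bpf_map *map;

	map = bpf_object__find_map_by_name(obj, "my_map"); /* name illustrative */
	if (!map)
		return -1;

	return bpf_map__fd(map);	/* returns -EINVAL for a NULL map */
}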
diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index 3d1bb80..3db3db9 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -30,3 +30,4 @@
 *.pyo
 .config-detected
 util/intel-pt-decoder/inat-tables.c
+arch/*/include/generated/
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 04f23b4..d96ccd4 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -204,6 +204,38 @@
 --no-aggr::
 Do not aggregate counts across all monitored CPUs.
 
+--topdown::
+Print top-down level 1 metrics if supported by the CPU. This allows
+determining bottlenecks in the CPU pipeline for CPU-bound workloads
+by breaking down the consumed cycles into frontend bound, backend bound,
+bad speculation and retiring.
+
+Frontend bound means that the CPU cannot fetch and decode instructions fast
+enough. Backend bound means that computation or memory access is the
+bottleneck. Bad speculation means that the CPU wasted cycles due to branch
+mispredictions and similar issues. Retiring means that the CPU computed
+without an apparent bottleneck. The bottleneck is only the real bottleneck
+if the workload is actually bound by the CPU and not by something else.
+
+For best results it is usually a good idea to use it with interval
+mode like -I 1000, as the bottleneck of workloads can change often.
+
+The top-down metrics are collected per core instead of per
+CPU thread. Per-core mode is automatically enabled
+and -a (global monitoring) is needed, requiring root rights or
+kernel.perf_event_paranoid=-1.
+
+Topdown uses the full Performance Monitoring Unit and, for best
+results, needs the NMI watchdog disabled (as root):
+echo 0 > /proc/sys/kernel/nmi_watchdog
+Otherwise the bottlenecks may be inconsistent on
+workloads with changing phases.
+
+This enables --metric-only, unless overridden with --no-metric-only.
+
+To interpret the results it is usually necessary to know which
+CPUs the workload runs on. If needed, the CPUs can be forced using
+taskset.
 
 EXAMPLES
 --------
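
Aside (not part of the patch): a usage sketch following the documentation
above; the workload and interval are arbitrary:

	# echo 0 > /proc/sys/kernel/nmi_watchdog
	# perf stat --topdown -a -I 1000 -- sleep 10

Root rights (or kernel.perf_event_paranoid=-1) are needed since --topdown
implies system-wide, per-core collection.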
diff --git a/tools/perf/arch/arm/util/Build b/tools/perf/arch/arm/util/Build
index d22e3d0..f98da17 100644
--- a/tools/perf/arch/arm/util/Build
+++ b/tools/perf/arch/arm/util/Build
@@ -1,4 +1,4 @@
 libperf-$(CONFIG_DWARF) += dwarf-regs.o
 
-libperf-$(CONFIG_LIBUNWIND)          += unwind-libunwind.o
+libperf-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind.o
 libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
index e58123a8..02f41db 100644
--- a/tools/perf/arch/arm64/util/Build
+++ b/tools/perf/arch/arm64/util/Build
@@ -1,2 +1,2 @@
 libperf-$(CONFIG_DWARF)     += dwarf-regs.o
-libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o
+libperf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
diff --git a/tools/perf/arch/arm64/util/unwind-libunwind.c b/tools/perf/arch/arm64/util/unwind-libunwind.c
index a87afa9..c116b71 100644
--- a/tools/perf/arch/arm64/util/unwind-libunwind.c
+++ b/tools/perf/arch/arm64/util/unwind-libunwind.c
@@ -1,11 +1,13 @@
 
+#ifndef REMOTE_UNWIND_LIBUNWIND
 #include <errno.h>
 #include <libunwind.h>
 #include "perf_regs.h"
 #include "../../util/unwind.h"
 #include "../../util/debug.h"
+#endif
 
-int libunwind__arch_reg_id(int regnum)
+int LIBUNWIND__ARCH_REG_ID(int regnum)
 {
 	switch (regnum) {
 	case UNW_AARCH64_X0:
diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c
index e83c8ce..fa090a9 100644
--- a/tools/perf/arch/common.c
+++ b/tools/perf/arch/common.c
@@ -102,7 +102,7 @@
  * Return architecture name in a normalized form.
  * The conversion logic comes from the Makefile.
  */
-static const char *normalize_arch(char *arch)
+const char *normalize_arch(char *arch)
 {
 	if (!strcmp(arch, "x86_64"))
 		return "x86";
diff --git a/tools/perf/arch/common.h b/tools/perf/arch/common.h
index 7529cfb..6b01c73 100644
--- a/tools/perf/arch/common.h
+++ b/tools/perf/arch/common.h
@@ -6,5 +6,6 @@
 extern const char *objdump_path;
 
 int perf_env__lookup_objdump(struct perf_env *env);
+const char *normalize_arch(char *arch);
 
 #endif /* ARCH_PERF_COMMON_H */
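
Aside (not part of the patch): the now-exported helper maps "uname -m"
style strings to perf's normalized arch names, per the table in common.c
above, e.g.:

	const char *arch = normalize_arch("x86_64");	/* returns "x86" */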
diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
index 4659703..f95e6f4 100644
--- a/tools/perf/arch/x86/util/Build
+++ b/tools/perf/arch/x86/util/Build
@@ -3,11 +3,12 @@
 libperf-y += pmu.o
 libperf-y += kvm-stat.o
 libperf-y += perf_regs.o
+libperf-y += group.o
 
 libperf-$(CONFIG_DWARF) += dwarf-regs.o
 libperf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o
 
-libperf-$(CONFIG_LIBUNWIND)          += unwind-libunwind.o
+libperf-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind.o
 libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
 
 libperf-$(CONFIG_AUXTRACE) += auxtrace.o
diff --git a/tools/perf/arch/x86/util/group.c b/tools/perf/arch/x86/util/group.c
new file mode 100644
index 0000000..37f92aa
--- /dev/null
+++ b/tools/perf/arch/x86/util/group.c
@@ -0,0 +1,27 @@
+#include <stdio.h>
+#include "api/fs/fs.h"
+#include "util/group.h"
+
+/*
+ * Check whether we can use a group for top down.
+ * Without a group we may get bad results due to multiplexing.
+ */
+bool arch_topdown_check_group(bool *warn)
+{
+	int n;
+
+	if (sysctl__read_int("kernel/nmi_watchdog", &n) < 0)
+		return false;
+	if (n > 0) {
+		*warn = true;
+		return false;
+	}
+	return true;
+}
+
+void arch_topdown_group_warn(void)
+{
+	fprintf(stderr,
+		"nmi_watchdog enabled with topdown. May give wrong results.\n"
+		"Disable with echo 0 > /proc/sys/kernel/nmi_watchdog\n");
+}
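
Aside (not part of the patch): this file provides the strong definitions
of the arch hooks; the generic builtin-stat.c code (further down in this
patch) declares __weak fallbacks, so the linker picks this x86 version
when it is built in. A caller-side sketch:

	bool warn = false;

	/* resolves to the x86 version above when present,
	 * otherwise to the __weak fallback in builtin-stat.c */
	if (!arch_topdown_check_group(&warn) && warn)
		arch_topdown_group_warn();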
diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c
index 357f1b1..2e5567c 100644
--- a/tools/perf/arch/x86/util/tsc.c
+++ b/tools/perf/arch/x86/util/tsc.c
@@ -62,6 +62,8 @@
 	struct perf_tsc_conversion tc;
 	int err;
 
+	if (!pc)
+		return 0;
 	err = perf_read_tsc_conversion(pc, &tc);
 	if (err == -EOPNOTSUPP)
 		return 0;
diff --git a/tools/perf/arch/x86/util/unwind-libunwind.c b/tools/perf/arch/x86/util/unwind-libunwind.c
index db25e93..4f16661 100644
--- a/tools/perf/arch/x86/util/unwind-libunwind.c
+++ b/tools/perf/arch/x86/util/unwind-libunwind.c
@@ -1,12 +1,14 @@
 
+#ifndef REMOTE_UNWIND_LIBUNWIND
 #include <errno.h>
 #include <libunwind.h>
 #include "perf_regs.h"
 #include "../../util/unwind.h"
 #include "../../util/debug.h"
+#endif
 
 #ifdef HAVE_ARCH_X86_64_SUPPORT
-int libunwind__arch_reg_id(int regnum)
+int LIBUNWIND__ARCH_REG_ID(int regnum)
 {
 	int id;
 
@@ -70,7 +72,7 @@
 	return id;
 }
 #else
-int libunwind__arch_reg_id(int regnum)
+int LIBUNWIND__ARCH_REG_ID(int regnum)
 {
 	int id;
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index dc3fcb5..d4cf1b0 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -655,6 +655,13 @@
 	return 0;
 }
 
+static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
+{
+	if (rec->evlist && rec->evlist->mmap && rec->evlist->mmap[0].base)
+		return rec->evlist->mmap[0].base;
+	return NULL;
+}
+
 static int record__synthesize(struct record *rec)
 {
 	struct perf_session *session = rec->session;
@@ -692,7 +699,7 @@
 		}
 	}
 
-	err = perf_event__synth_time_conv(rec->evlist->mmap[0].base, tool,
+	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
 					  process_synthesized_event, machine);
 	if (err)
 		goto out;
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index e3ce2f3..4601123 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -339,7 +339,7 @@
  */
 static int perf_session__check_output_opt(struct perf_session *session)
 {
-	int j;
+	unsigned int j;
 	struct perf_evsel *evsel;
 
 	for (j = 0; j < PERF_TYPE_MAX; ++j) {
@@ -388,17 +388,20 @@
 		struct perf_event_attr *attr;
 
 		j = PERF_TYPE_TRACEPOINT;
-		evsel = perf_session__find_first_evtype(session, j);
-		if (evsel == NULL)
-			goto out;
 
-		attr = &evsel->attr;
+		evlist__for_each(session->evlist, evsel) {
+			if (evsel->attr.type != j)
+				continue;
 
-		if (attr->sample_type & PERF_SAMPLE_CALLCHAIN) {
-			output[j].fields |= PERF_OUTPUT_IP;
-			output[j].fields |= PERF_OUTPUT_SYM;
-			output[j].fields |= PERF_OUTPUT_DSO;
-			set_print_ip_opts(attr);
+			attr = &evsel->attr;
+
+			if (attr->sample_type & PERF_SAMPLE_CALLCHAIN) {
+				output[j].fields |= PERF_OUTPUT_IP;
+				output[j].fields |= PERF_OUTPUT_SYM;
+				output[j].fields |= PERF_OUTPUT_DSO;
+				set_print_ip_opts(attr);
+				goto out;
+			}
 		}
 	}
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index ee7ada7..dff6373 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -59,10 +59,13 @@
 #include "util/thread.h"
 #include "util/thread_map.h"
 #include "util/counts.h"
+#include "util/group.h"
 #include "util/session.h"
 #include "util/tool.h"
+#include "util/group.h"
 #include "asm/bug.h"
 
+#include <api/fs/fs.h>
 #include <stdlib.h>
 #include <sys/prctl.h>
 #include <locale.h>
@@ -98,6 +101,15 @@
 	"}"
 };
 
+static const char *topdown_attrs[] = {
+	"topdown-total-slots",
+	"topdown-slots-retired",
+	"topdown-recovery-bubbles",
+	"topdown-fetch-bubbles",
+	"topdown-slots-issued",
+	NULL,
+};
+
 static struct perf_evlist	*evsel_list;
 
 static struct target target = {
@@ -112,6 +124,7 @@
 static bool			null_run			=  false;
 static int			detailed_run			=  0;
 static bool			transaction_run;
+static bool			topdown_run			= false;
 static bool			big_num				=  true;
 static int			big_num_opt			=  -1;
 static const char		*csv_sep			= NULL;
@@ -124,6 +137,7 @@
 static unsigned int		unit_width			= 4; /* strlen("unit") */
 static bool			forever				= false;
 static bool			metric_only			= false;
+static bool			force_metric_only		= false;
 static struct timespec		ref_time;
 static struct cpu_map		*aggr_map;
 static aggr_get_id_t		aggr_get_id;
@@ -1302,7 +1316,15 @@
 	[AGGR_GLOBAL] = 0,
 };
 
-static void print_metric_headers(char *prefix)
+static const char *aggr_header_csv[] = {
+	[AGGR_CORE] 	= 	"core,cpus,",
+	[AGGR_SOCKET] 	= 	"socket,cpus,",
+	[AGGR_NONE] 	= 	"cpu,",
+	[AGGR_THREAD] 	= 	"comm-pid,",
+	[AGGR_GLOBAL] 	=	""
+};
+
+static void print_metric_headers(const char *prefix, bool no_indent)
 {
 	struct perf_stat_output_ctx out;
 	struct perf_evsel *counter;
@@ -1313,9 +1335,15 @@
 	if (prefix)
 		fprintf(stat_config.output, "%s", prefix);
 
-	if (!csv_output)
+	if (!csv_output && !no_indent)
 		fprintf(stat_config.output, "%*s",
 			aggr_header_lens[stat_config.aggr_mode], "");
+	if (csv_output) {
+		if (stat_config.interval)
+			fputs("time,", stat_config.output);
+		fputs(aggr_header_csv[stat_config.aggr_mode],
+			stat_config.output);
+	}
 
 	/* Print metrics headers only */
 	evlist__for_each(evsel_list, counter) {
@@ -1338,28 +1366,40 @@
 
 	sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep);
 
-	if (num_print_interval == 0 && !csv_output && !metric_only) {
+	if (num_print_interval == 0 && !csv_output) {
 		switch (stat_config.aggr_mode) {
 		case AGGR_SOCKET:
-			fprintf(output, "#           time socket cpus             counts %*s events\n", unit_width, "unit");
+			fprintf(output, "#           time socket cpus");
+			if (!metric_only)
+				fprintf(output, "             counts %*s events\n", unit_width, "unit");
 			break;
 		case AGGR_CORE:
-			fprintf(output, "#           time core         cpus             counts %*s events\n", unit_width, "unit");
+			fprintf(output, "#           time core         cpus");
+			if (!metric_only)
+				fprintf(output, "             counts %*s events\n", unit_width, "unit");
 			break;
 		case AGGR_NONE:
-			fprintf(output, "#           time CPU                counts %*s events\n", unit_width, "unit");
+			fprintf(output, "#           time CPU");
+			if (!metric_only)
+				fprintf(output, "                counts %*s events\n", unit_width, "unit");
 			break;
 		case AGGR_THREAD:
-			fprintf(output, "#           time             comm-pid                  counts %*s events\n", unit_width, "unit");
+			fprintf(output, "#           time             comm-pid");
+			if (!metric_only)
+				fprintf(output, "                  counts %*s events\n", unit_width, "unit");
 			break;
 		case AGGR_GLOBAL:
 		default:
-			fprintf(output, "#           time             counts %*s events\n", unit_width, "unit");
+			fprintf(output, "#           time");
+			if (!metric_only)
+				fprintf(output, "             counts %*s events\n", unit_width, "unit");
 		case AGGR_UNSET:
 			break;
 		}
 	}
 
+	if (num_print_interval == 0 && metric_only)
+		print_metric_headers(" ", true);
 	if (++num_print_interval == 25)
 		num_print_interval = 0;
 }
@@ -1428,8 +1468,8 @@
 	if (metric_only) {
 		static int num_print_iv;
 
-		if (num_print_iv == 0)
-			print_metric_headers(prefix);
+		if (num_print_iv == 0 && !interval)
+			print_metric_headers(prefix, false);
 		if (num_print_iv++ == 25)
 			num_print_iv = 0;
 		if (stat_config.aggr_mode == AGGR_GLOBAL && prefix)
@@ -1520,6 +1560,14 @@
 	return 0;
 }
 
+static int enable_metric_only(const struct option *opt __maybe_unused,
+			      const char *s __maybe_unused, int unset)
+{
+	force_metric_only = true;
+	metric_only = !unset;
+	return 0;
+}
+
 static const struct option stat_options[] = {
 	OPT_BOOLEAN('T', "transaction", &transaction_run,
 		    "hardware transaction statistics"),
@@ -1578,8 +1626,10 @@
 		     "aggregate counts per thread", AGGR_THREAD),
 	OPT_UINTEGER('D', "delay", &initial_delay,
 		     "ms to wait before starting measurement after program start"),
-	OPT_BOOLEAN(0, "metric-only", &metric_only,
-			"Only print computed metrics. No raw values"),
+	OPT_CALLBACK_NOOPT(0, "metric-only", &metric_only, NULL,
+			"Only print computed metrics. No raw values", enable_metric_only),
+	OPT_BOOLEAN(0, "topdown", &topdown_run,
+			"measure topdown level 1 statistics"),
 	OPT_END()
 };
 
@@ -1772,12 +1822,62 @@
 	return 0;
 }
 
+static int topdown_filter_events(const char **attr, char **str, bool use_group)
+{
+	int off = 0;
+	int i;
+	int len = 0;
+	char *s;
+
+	for (i = 0; attr[i]; i++) {
+		if (pmu_have_event("cpu", attr[i])) {
+			len += strlen(attr[i]) + 1;
+			attr[i - off] = attr[i];
+		} else
+			off++;
+	}
+	attr[i - off] = NULL;
+
+	*str = malloc(len + 1 + 2);
+	if (!*str)
+		return -1;
+	s = *str;
+	if (i - off == 0) {
+		*s = 0;
+		return 0;
+	}
+	if (use_group)
+		*s++ = '{';
+	for (i = 0; attr[i]; i++) {
+		strcpy(s, attr[i]);
+		s += strlen(s);
+		*s++ = ',';
+	}
+	if (use_group) {
+		s[-1] = '}';
+		*s = 0;
+	} else
+		s[-1] = 0;
+	return 0;
+}
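
Aside (not part of the patch): with all five topdown_attrs present and
use_group true, the function yields the single-group event string

	{topdown-total-slots,topdown-slots-retired,topdown-recovery-bubbles,topdown-fetch-bubbles,topdown-slots-issued}

which parse_events() then sets up as one group; with use_group false the
braces are omitted and the events are parsed individually.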
+
+__weak bool arch_topdown_check_group(bool *warn)
+{
+	*warn = false;
+	return false;
+}
+
+__weak void arch_topdown_group_warn(void)
+{
+}
+
 /*
  * Add default attributes, if there were no attributes specified or
  * if -d/--detailed, -d -d or -d -d -d is used:
  */
 static int add_default_attributes(void)
 {
+	int err;
 	struct perf_event_attr default_attrs0[] = {
 
   { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK		},
@@ -1896,7 +1996,6 @@
 		return 0;
 
 	if (transaction_run) {
-		int err;
 		if (pmu_have_event("cpu", "cycles-ct") &&
 		    pmu_have_event("cpu", "el-start"))
 			err = parse_events(evsel_list, transaction_attrs, NULL);
@@ -1909,6 +2008,46 @@
 		return 0;
 	}
 
+	if (topdown_run) {
+		char *str = NULL;
+		bool warn = false;
+
+		if (stat_config.aggr_mode != AGGR_GLOBAL &&
+		    stat_config.aggr_mode != AGGR_CORE) {
+			pr_err("top down event configuration requires --per-core mode\n");
+			return -1;
+		}
+		stat_config.aggr_mode = AGGR_CORE;
+		if (nr_cgroups || !target__has_cpu(&target)) {
+			pr_err("top down event configuration requires system-wide mode (-a)\n");
+			return -1;
+		}
+
+		if (!force_metric_only)
+			metric_only = true;
+		if (topdown_filter_events(topdown_attrs, &str,
+				arch_topdown_check_group(&warn)) < 0) {
+			pr_err("Out of memory\n");
+			return -1;
+		}
+		if (topdown_attrs[0] && str) {
+			if (warn)
+				arch_topdown_group_warn();
+			err = parse_events(evsel_list, str, NULL);
+			if (err) {
+				fprintf(stderr,
+					"Cannot set up top down events %s: %d\n",
+					str, err);
+				free(str);
+				return -1;
+			}
+		} else {
+			fprintf(stderr, "System does not support topdown\n");
+			return -1;
+		}
+		free(str);
+	}
+
 	if (!evsel_list->nr_entries) {
 		if (target__has_cpu(&target))
 			default_attrs0[0].config = PERF_COUNT_SW_CPU_CLOCK;
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index 5ad0255..098874b 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -73,17 +73,25 @@
 #
 #   make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/
 #
+
+libunwind_arch_set_flags = $(eval $(libunwind_arch_set_flags_code))
+define libunwind_arch_set_flags_code
+  FEATURE_CHECK_CFLAGS-libunwind-$(1)  = -I$(LIBUNWIND_DIR)/include
+  FEATURE_CHECK_LDFLAGS-libunwind-$(1) = -L$(LIBUNWIND_DIR)/lib
+endef
+
 ifdef LIBUNWIND_DIR
   LIBUNWIND_CFLAGS  = -I$(LIBUNWIND_DIR)/include
   LIBUNWIND_LDFLAGS = -L$(LIBUNWIND_DIR)/lib
+  LIBUNWIND_ARCHS = x86 x86_64 arm aarch64 debug-frame-arm debug-frame-aarch64
+  $(foreach libunwind_arch,$(LIBUNWIND_ARCHS),$(call libunwind_arch_set_flags,$(libunwind_arch)))
 endif
-LIBUNWIND_LDFLAGS += $(LIBUNWIND_LIBS)
 
 # Set per-feature check compilation flags
 FEATURE_CHECK_CFLAGS-libunwind = $(LIBUNWIND_CFLAGS)
-FEATURE_CHECK_LDFLAGS-libunwind = $(LIBUNWIND_LDFLAGS)
+FEATURE_CHECK_LDFLAGS-libunwind = $(LIBUNWIND_LDFLAGS) $(LIBUNWIND_LIBS)
 FEATURE_CHECK_CFLAGS-libunwind-debug-frame = $(LIBUNWIND_CFLAGS)
-FEATURE_CHECK_LDFLAGS-libunwind-debug-frame = $(LIBUNWIND_LDFLAGS)
+FEATURE_CHECK_LDFLAGS-libunwind-debug-frame = $(LIBUNWIND_LDFLAGS) $(LIBUNWIND_LIBS)
 
 ifeq ($(NO_PERF_REGS),0)
   CFLAGS += -DHAVE_PERF_REGS_SUPPORT
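
Aside (not part of the patch): each $(call libunwind_arch_set_flags,<arch>)
is $(eval)ed into per-feature check flags, e.g. for x86:

	FEATURE_CHECK_CFLAGS-libunwind-x86  = -I$(LIBUNWIND_DIR)/include
	FEATURE_CHECK_LDFLAGS-libunwind-x86 = -L$(LIBUNWIND_DIR)/lib

so every entry in LIBUNWIND_ARCHS gets its own include and library path
for the feature detection below.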
@@ -351,10 +359,40 @@
 endif
 
 ifndef NO_LIBUNWIND
+  have_libunwind :=
+
+  ifeq ($(feature-libunwind-x86), 1)
+    $(call detected,CONFIG_LIBUNWIND_X86)
+    CFLAGS += -DHAVE_LIBUNWIND_X86_SUPPORT
+    LDFLAGS += -lunwind-x86
+    have_libunwind = 1
+  endif
+
+  ifeq ($(feature-libunwind-aarch64), 1)
+    $(call detected,CONFIG_LIBUNWIND_AARCH64)
+    CFLAGS += -DHAVE_LIBUNWIND_AARCH64_SUPPORT
+    LDFLAGS += -lunwind-aarch64
+    have_libunwind = 1
+    $(call feature_check,libunwind-debug-frame-aarch64)
+    ifneq ($(feature-libunwind-debug-frame-aarch64), 1)
+      msg := $(warning No debug_frame support found in libunwind-aarch64);
+      CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME_AARCH64
+    endif
+  endif
+
   ifneq ($(feature-libunwind), 1)
     msg := $(warning No libunwind found. Please install libunwind-dev[el] >= 1.1 and/or set LIBUNWIND_DIR);
+    NO_LOCAL_LIBUNWIND := 1
+  else
+    have_libunwind := 1
+    $(call detected,CONFIG_LOCAL_LIBUNWIND)
+  endif
+
+  ifneq ($(have_libunwind), 1)
     NO_LIBUNWIND := 1
   endif
+else
+  NO_LOCAL_LIBUNWIND := 1
 endif
 
 ifndef NO_LIBBPF
@@ -392,7 +430,7 @@
   NO_DWARF_UNWIND := 1
 endif
 
-ifndef NO_LIBUNWIND
+ifndef NO_LOCAL_LIBUNWIND
   ifeq ($(ARCH),$(filter $(ARCH),arm arm64))
     $(call feature_check,libunwind-debug-frame)
     ifneq ($(feature-libunwind-debug-frame), 1)
@@ -403,8 +441,12 @@
     # non-ARM has no dwarf_find_debug_frame() function:
     CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME
   endif
-  CFLAGS  += -DHAVE_LIBUNWIND_SUPPORT
   EXTLIBS += $(LIBUNWIND_LIBS)
+  LDFLAGS += $(LIBUNWIND_LIBS)
+endif
+
+ifndef NO_LIBUNWIND
+  CFLAGS  += -DHAVE_LIBUNWIND_SUPPORT
   CFLAGS  += $(LIBUNWIND_CFLAGS)
   LDFLAGS += $(LIBUNWIND_LDFLAGS)
 endif
diff --git a/tools/perf/tests/fdarray.c b/tools/perf/tests/fdarray.c
index c809463..59dbd05 100644
--- a/tools/perf/tests/fdarray.c
+++ b/tools/perf/tests/fdarray.c
@@ -36,7 +36,7 @@
 	}
 
 	fdarray__init_revents(fda, POLLIN);
-	nr_fds = fdarray__filter(fda, POLLHUP, NULL);
+	nr_fds = fdarray__filter(fda, POLLHUP, NULL, NULL);
 	if (nr_fds != fda->nr_alloc) {
 		pr_debug("\nfdarray__filter()=%d != %d shouldn't have filtered anything",
 			 nr_fds, fda->nr_alloc);
@@ -44,7 +44,7 @@
 	}
 
 	fdarray__init_revents(fda, POLLHUP);
-	nr_fds = fdarray__filter(fda, POLLHUP, NULL);
+	nr_fds = fdarray__filter(fda, POLLHUP, NULL, NULL);
 	if (nr_fds != 0) {
 		pr_debug("\nfdarray__filter()=%d != %d, should have filtered all fds",
 			 nr_fds, fda->nr_alloc);
@@ -57,7 +57,7 @@
 
 	pr_debug("\nfiltering all but fda->entries[2]:");
 	fdarray__fprintf_prefix(fda, "before", stderr);
-	nr_fds = fdarray__filter(fda, POLLHUP, NULL);
+	nr_fds = fdarray__filter(fda, POLLHUP, NULL, NULL);
 	fdarray__fprintf_prefix(fda, " after", stderr);
 	if (nr_fds != 1) {
 		pr_debug("\nfdarray__filter()=%d != 1, should have left just one event", nr_fds);
@@ -78,7 +78,7 @@
 
 	pr_debug("\nfiltering all but (fda->entries[0], fda->entries[3]):");
 	fdarray__fprintf_prefix(fda, "before", stderr);
-	nr_fds = fdarray__filter(fda, POLLHUP, NULL);
+	nr_fds = fdarray__filter(fda, POLLHUP, NULL, NULL);
 	fdarray__fprintf_prefix(fda, " after", stderr);
 	if (nr_fds != 2) {
 		pr_debug("\nfdarray__filter()=%d != 2, should have left just two events",
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 7865f68..b2a2c74 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -1783,8 +1783,8 @@
 		struct evlist_test e;
 		char name[MAX_NAME];
 
-		if (!strcmp(ent->d_name, ".") ||
-		    !strcmp(ent->d_name, ".."))
+		/* Names containing . are special and cannot be used directly */
+		if (strchr(ent->d_name, '.'))
 			continue;
 
 		snprintf(name, MAX_NAME, "cpu/event=%s/u", ent->d_name);
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 8c6c8a0..fced833 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -99,7 +99,10 @@
 libperf-$(CONFIG_DWARF) += dwarf-aux.o
 
 libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
+libperf-$(CONFIG_LOCAL_LIBUNWIND)    += unwind-libunwind-local.o
 libperf-$(CONFIG_LIBUNWIND)          += unwind-libunwind.o
+libperf-$(CONFIG_LIBUNWIND_X86)      += libunwind/x86_32.o
+libperf-$(CONFIG_LIBUNWIND_AARCH64)  += libunwind/arm64.o
 
 libperf-$(CONFIG_LIBBABELTRACE) += data-convert-bt.o
 
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index 493307d..dcc8845 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -339,7 +339,7 @@
 	}
 	pr_debug("bpf: config '%s' is ok\n", config_str);
 
-	err = bpf_program__set_private(prog, priv, clear_prog_priv);
+	err = bpf_program__set_priv(prog, priv, clear_prog_priv);
 	if (err) {
 		pr_debug("Failed to set priv for program '%s'\n", config_str);
 		goto errout;
@@ -380,15 +380,14 @@
 		     struct bpf_insn *orig_insns, int orig_insns_cnt,
 		     struct bpf_prog_prep_result *res)
 {
+	struct bpf_prog_priv *priv = bpf_program__priv(prog);
 	struct probe_trace_event *tev;
 	struct perf_probe_event *pev;
-	struct bpf_prog_priv *priv;
 	struct bpf_insn *buf;
 	size_t prologue_cnt = 0;
 	int i, err;
 
-	err = bpf_program__get_private(prog, (void **)&priv);
-	if (err || !priv)
+	if (IS_ERR(priv) || !priv)
 		goto errout;
 
 	pev = &priv->pev;
@@ -535,13 +534,12 @@
 
 static int hook_load_preprocessor(struct bpf_program *prog)
 {
+	struct bpf_prog_priv *priv = bpf_program__priv(prog);
 	struct perf_probe_event *pev;
-	struct bpf_prog_priv *priv;
 	bool need_prologue = false;
 	int err, i;
 
-	err = bpf_program__get_private(prog, (void **)&priv);
-	if (err || !priv) {
+	if (IS_ERR(priv) || !priv) {
 		pr_debug("Internal error when hook preprocessor\n");
 		return -BPF_LOADER_ERRNO__INTERNAL;
 	}
@@ -607,9 +605,11 @@
 		if (err)
 			goto out;
 
-		err = bpf_program__get_private(prog, (void **)&priv);
-		if (err || !priv)
+		priv = bpf_program__priv(prog);
+		if (IS_ERR(priv) || !priv) {
+			err = PTR_ERR(priv);
 			goto out;
+		}
 		pev = &priv->pev;
 
 		err = convert_perf_probe_events(pev, 1);
@@ -645,13 +645,12 @@
 {
 	int err, ret = 0;
 	struct bpf_program *prog;
-	struct bpf_prog_priv *priv;
 
 	bpf_object__for_each_program(prog, obj) {
+		struct bpf_prog_priv *priv = bpf_program__priv(prog);
 		int i;
 
-		err = bpf_program__get_private(prog, (void **)&priv);
-		if (err || !priv)
+		if (IS_ERR(priv) || !priv)
 			continue;
 
 		for (i = 0; i < priv->pev.ntevs; i++) {
@@ -702,14 +701,12 @@
 	int err;
 
 	bpf_object__for_each_program(prog, obj) {
+		struct bpf_prog_priv *priv = bpf_program__priv(prog);
 		struct probe_trace_event *tev;
 		struct perf_probe_event *pev;
-		struct bpf_prog_priv *priv;
 		int i, fd;
 
-		err = bpf_program__get_private(prog,
-				(void **)&priv);
-		if (err || !priv) {
+		if (IS_ERR(priv) || !priv) {
 			pr_debug("bpf: failed to get private field\n");
 			return -BPF_LOADER_ERRNO__INTERNAL;
 		}
@@ -897,15 +894,12 @@
 static int
 bpf_map__add_op(struct bpf_map *map, struct bpf_map_op *op)
 {
-	struct bpf_map_priv *priv;
-	const char *map_name;
-	int err;
+	const char *map_name = bpf_map__name(map);
+	struct bpf_map_priv *priv = bpf_map__priv(map);
 
-	map_name = bpf_map__get_name(map);
-	err = bpf_map__get_private(map, (void **)&priv);
-	if (err) {
+	if (IS_ERR(priv)) {
 		pr_debug("Failed to get private from map %s\n", map_name);
-		return err;
+		return PTR_ERR(priv);
 	}
 
 	if (!priv) {
@@ -916,7 +910,7 @@
 		}
 		INIT_LIST_HEAD(&priv->ops_list);
 
-		if (bpf_map__set_private(map, priv, bpf_map_priv__clear)) {
+		if (bpf_map__set_priv(map, priv, bpf_map_priv__clear)) {
 			free(priv);
 			return -BPF_LOADER_ERRNO__INTERNAL;
 		}
@@ -948,30 +942,26 @@
 __bpf_map__config_value(struct bpf_map *map,
 			struct parse_events_term *term)
 {
-	struct bpf_map_def def;
 	struct bpf_map_op *op;
-	const char *map_name;
-	int err;
+	const char *map_name = bpf_map__name(map);
+	const struct bpf_map_def *def = bpf_map__def(map);
 
-	map_name = bpf_map__get_name(map);
-
-	err = bpf_map__get_def(map, &def);
-	if (err) {
+	if (IS_ERR(def)) {
 		pr_debug("Unable to get map definition from '%s'\n",
 			 map_name);
 		return -BPF_LOADER_ERRNO__INTERNAL;
 	}
 
-	if (def.type != BPF_MAP_TYPE_ARRAY) {
+	if (def->type != BPF_MAP_TYPE_ARRAY) {
 		pr_debug("Map %s type is not BPF_MAP_TYPE_ARRAY\n",
 			 map_name);
 		return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE;
 	}
-	if (def.key_size < sizeof(unsigned int)) {
+	if (def->key_size < sizeof(unsigned int)) {
 		pr_debug("Map %s has incorrect key size\n", map_name);
 		return -BPF_LOADER_ERRNO__OBJCONF_MAP_KEYSIZE;
 	}
-	switch (def.value_size) {
+	switch (def->value_size) {
 	case 1:
 	case 2:
 	case 4:
@@ -1014,12 +1004,10 @@
 			struct perf_evlist *evlist)
 {
 	struct perf_evsel *evsel;
-	struct bpf_map_def def;
+	const struct bpf_map_def *def;
 	struct bpf_map_op *op;
-	const char *map_name;
-	int err;
+	const char *map_name = bpf_map__name(map);
 
-	map_name = bpf_map__get_name(map);
 	evsel = perf_evlist__find_evsel_by_str(evlist, term->val.str);
 	if (!evsel) {
 		pr_debug("Event (for '%s') '%s' doesn't exist\n",
@@ -1027,18 +1015,18 @@
 		return -BPF_LOADER_ERRNO__OBJCONF_MAP_NOEVT;
 	}
 
-	err = bpf_map__get_def(map, &def);
-	if (err) {
+	def = bpf_map__def(map);
+	if (IS_ERR(def)) {
 		pr_debug("Unable to get map definition from '%s'\n",
 			 map_name);
-		return err;
+		return PTR_ERR(def);
 	}
 
 	/*
 	 * No need to check key_size and value_size:
 	 * kernel has already checked them.
 	 */
-	if (def.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
+	if (def->type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
 		pr_debug("Map %s type is not BPF_MAP_TYPE_PERF_EVENT_ARRAY\n",
 			 map_name);
 		return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE;
@@ -1087,9 +1075,8 @@
 			       const char *map_name)
 {
 	struct parse_events_array *array = &term->array;
-	struct bpf_map_def def;
+	const struct bpf_map_def *def;
 	unsigned int i;
-	int err;
 
 	if (!array->nr_ranges)
 		return 0;
@@ -1099,8 +1086,8 @@
 		return -BPF_LOADER_ERRNO__INTERNAL;
 	}
 
-	err = bpf_map__get_def(map, &def);
-	if (err) {
+	def = bpf_map__def(map);
+	if (IS_ERR(def)) {
 		pr_debug("ERROR: Unable to get map definition from '%s'\n",
 			 map_name);
 		return -BPF_LOADER_ERRNO__INTERNAL;
@@ -1111,7 +1098,7 @@
 		size_t length = array->ranges[i].length;
 		unsigned int idx = start + length - 1;
 
-		if (idx >= def.max_entries) {
+		if (idx >= def->max_entries) {
 			pr_debug("ERROR: index %d too large\n", idx);
 			return -BPF_LOADER_ERRNO__OBJCONF_MAP_IDX2BIG;
 		}
@@ -1147,7 +1134,7 @@
 		goto out;
 	}
 
-	map = bpf_object__get_map_by_name(obj, map_name);
+	map = bpf_object__find_map_by_name(obj, map_name);
 	if (!map) {
 		pr_debug("ERROR: Map %s doesn't exist\n", map_name);
 		err = -BPF_LOADER_ERRNO__OBJCONF_MAP_NOTEXIST;
@@ -1204,14 +1191,14 @@
 }
 
 typedef int (*map_config_func_t)(const char *name, int map_fd,
-				 struct bpf_map_def *pdef,
+				 const struct bpf_map_def *pdef,
 				 struct bpf_map_op *op,
 				 void *pkey, void *arg);
 
 static int
 foreach_key_array_all(map_config_func_t func,
 		      void *arg, const char *name,
-		      int map_fd, struct bpf_map_def *pdef,
+		      int map_fd, const struct bpf_map_def *pdef,
 		      struct bpf_map_op *op)
 {
 	unsigned int i;
@@ -1231,7 +1218,7 @@
 static int
 foreach_key_array_ranges(map_config_func_t func, void *arg,
 			 const char *name, int map_fd,
-			 struct bpf_map_def *pdef,
+			 const struct bpf_map_def *pdef,
 			 struct bpf_map_op *op)
 {
 	unsigned int i, j;
@@ -1261,15 +1248,12 @@
 			   void *arg)
 {
 	int err, map_fd;
-	const char *name;
 	struct bpf_map_op *op;
-	struct bpf_map_def def;
-	struct bpf_map_priv *priv;
+	const struct bpf_map_def *def;
+	const char *name = bpf_map__name(map);
+	struct bpf_map_priv *priv = bpf_map__priv(map);
 
-	name = bpf_map__get_name(map);
-
-	err = bpf_map__get_private(map, (void **)&priv);
-	if (err) {
+	if (IS_ERR(priv)) {
 		pr_debug("ERROR: failed to get private from map %s\n", name);
 		return -BPF_LOADER_ERRNO__INTERNAL;
 	}
@@ -1278,29 +1262,29 @@
 		return 0;
 	}
 
-	err = bpf_map__get_def(map, &def);
-	if (err) {
+	def = bpf_map__def(map);
+	if (IS_ERR(def)) {
 		pr_debug("ERROR: failed to get definition from map %s\n", name);
 		return -BPF_LOADER_ERRNO__INTERNAL;
 	}
-	map_fd = bpf_map__get_fd(map);
+	map_fd = bpf_map__fd(map);
 	if (map_fd < 0) {
 		pr_debug("ERROR: failed to get fd from map %s\n", name);
 		return map_fd;
 	}
 
 	list_for_each_entry(op, &priv->ops_list, list) {
-		switch (def.type) {
+		switch (def->type) {
 		case BPF_MAP_TYPE_ARRAY:
 		case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
 			switch (op->key_type) {
 			case BPF_MAP_KEY_ALL:
 				err = foreach_key_array_all(func, arg, name,
-							    map_fd, &def, op);
+							    map_fd, def, op);
 				break;
 			case BPF_MAP_KEY_RANGES:
 				err = foreach_key_array_ranges(func, arg, name,
-							       map_fd, &def,
+							       map_fd, def,
 							       op);
 				break;
 			default:
@@ -1410,7 +1394,7 @@
 
 static int
 apply_obj_config_map_for_key(const char *name, int map_fd,
-			     struct bpf_map_def *pdef __maybe_unused,
+			     const struct bpf_map_def *pdef,
 			     struct bpf_map_op *op,
 			     void *pkey, void *arg __maybe_unused)
 {
@@ -1475,9 +1459,9 @@
 
 #define bpf__for_each_stdout_map(pos, obj, objtmp)	\
 	bpf__for_each_map(pos, obj, objtmp) 		\
-		if (bpf_map__get_name(pos) && 		\
+		if (bpf_map__name(pos) && 		\
 			(strcmp("__bpf_stdout__", 	\
-				bpf_map__get_name(pos)) == 0))
+				bpf_map__name(pos)) == 0))
 
 int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused)
 {
@@ -1489,10 +1473,9 @@
 	bool need_init = false;
 
 	bpf__for_each_stdout_map(map, obj, tmp) {
-		struct bpf_map_priv *priv;
+		struct bpf_map_priv *priv = bpf_map__priv(map);
 
-		err = bpf_map__get_private(map, (void **)&priv);
-		if (err)
+		if (IS_ERR(priv))
 			return -BPF_LOADER_ERRNO__INTERNAL;
 
 		/*
@@ -1520,10 +1503,9 @@
 	}
 
 	bpf__for_each_stdout_map(map, obj, tmp) {
-		struct bpf_map_priv *priv;
+		struct bpf_map_priv *priv = bpf_map__priv(map);
 
-		err = bpf_map__get_private(map, (void **)&priv);
-		if (err)
+		if (IS_ERR(priv))
 			return -BPF_LOADER_ERRNO__INTERNAL;
 		if (priv)
 			continue;
@@ -1533,7 +1515,7 @@
 			if (!priv)
 				return -ENOMEM;
 
-			err = bpf_map__set_private(map, priv, bpf_map_priv__clear);
+			err = bpf_map__set_priv(map, priv, bpf_map_priv__clear);
 			if (err) {
 				bpf_map_priv__clear(map, priv);
 				return err;
@@ -1677,7 +1659,7 @@
 {
 	bpf__strerror_head(err, buf, size);
 	case LIBBPF_ERRNO__KVER: {
-		unsigned int obj_kver = bpf_object__get_kversion(obj);
+		unsigned int obj_kver = bpf_object__kversion(obj);
 		unsigned int real_kver;
 
 		if (fetch_kernel_version(&real_kver, NULL, 0)) {
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index 67e5966..20aef90 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -144,7 +144,29 @@
 	return ret;
 }
 
-static char *build_id__filename(const char *sbuild_id, char *bf, size_t size)
+char *build_id_cache__kallsyms_path(const char *sbuild_id, char *bf,
+				    size_t size)
+{
+	bool retry_old = true;
+
+	snprintf(bf, size, "%s/%s/%s/kallsyms",
+		 buildid_dir, DSO__NAME_KALLSYMS, sbuild_id);
+retry:
+	if (!access(bf, F_OK))
+		return bf;
+	if (retry_old) {
+		/* Try old style kallsyms cache */
+		snprintf(bf, size, "%s/%s/%s",
+			 buildid_dir, DSO__NAME_KALLSYMS, sbuild_id);
+		retry_old = false;
+		goto retry;
+	}
+
+	return NULL;
+}
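
Aside (not part of the patch): assuming the default ~/.debug buildid
directory and DSO__NAME_KALLSYMS being "[kernel.kallsyms]", the probe
order is:

	~/.debug/[kernel.kallsyms]/<sbuild_id>/kallsyms		new style, tried first
	~/.debug/[kernel.kallsyms]/<sbuild_id>			old style, retried once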
+
+static char *build_id_cache__linkname(const char *sbuild_id, char *bf,
+				      size_t size)
 {
 	char *tmp = bf;
 	int ret = asnprintf(&bf, size, "%s/.build-id/%.2s/%s", buildid_dir,
@@ -154,23 +176,52 @@
 	return bf;
 }
 
+static const char *build_id_cache__basename(bool is_kallsyms, bool is_vdso)
+{
+	return is_kallsyms ? "kallsyms" : (is_vdso ? "vdso" : "elf");
+}
+
 char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size)
 {
-	char build_id_hex[SBUILD_ID_SIZE];
+	bool is_kallsyms = dso__is_kallsyms((struct dso *)dso);
+	bool is_vdso = dso__is_vdso((struct dso *)dso);
+	char sbuild_id[SBUILD_ID_SIZE];
+	char *linkname;
+	bool alloc = (bf == NULL);
+	int ret;
 
 	if (!dso->has_build_id)
 		return NULL;
 
-	build_id__sprintf(dso->build_id, sizeof(dso->build_id), build_id_hex);
-	return build_id__filename(build_id_hex, bf, size);
+	build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id);
+	linkname = build_id_cache__linkname(sbuild_id, NULL, 0);
+	if (!linkname)
+		return NULL;
+
+	/* Check if old style build_id cache */
+	if (is_regular_file(linkname))
+		ret = asnprintf(&bf, size, "%s", linkname);
+	else
+		ret = asnprintf(&bf, size, "%s/%s", linkname,
+			 build_id_cache__basename(is_kallsyms, is_vdso));
+	if (ret < 0 || (!alloc && size < (unsigned int)ret))
+		bf = NULL;
+	free(linkname);
+
+	return bf;
 }
 
 bool dso__build_id_is_kmod(const struct dso *dso, char *bf, size_t size)
 {
-	char *id_name, *ch;
+	char *id_name = NULL, *ch;
 	struct stat sb;
+	char sbuild_id[SBUILD_ID_SIZE];
 
-	id_name = dso__build_id_filename(dso, bf, size);
+	if (!dso->has_build_id)
+		goto err;
+
+	build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id);
+	id_name = build_id_cache__linkname(sbuild_id, NULL, 0);
 	if (!id_name)
 		goto err;
 	if (access(id_name, F_OK))
@@ -194,18 +245,14 @@
 	if (ch - 3 < bf)
 		goto err;
 
+	free(id_name);
 	return strncmp(".ko", ch - 3, 3) == 0;
 err:
-	/*
-	 * If dso__build_id_filename work, get id_name again,
-	 * because id_name points to bf and is broken.
-	 */
-	if (id_name)
-		id_name = dso__build_id_filename(dso, bf, size);
 	pr_err("Invalid build id: %s\n", id_name ? :
 					 dso->long_name ? :
 					 dso->short_name ? :
 					 "[unknown]");
+	free(id_name);
 	return false;
 }
 
@@ -341,7 +388,8 @@
 }
 
 static char *build_id_cache__dirname_from_path(const char *name,
-					       bool is_kallsyms, bool is_vdso)
+					       bool is_kallsyms, bool is_vdso,
+					       const char *sbuild_id)
 {
 	char *realname = (char *)name, *filename;
 	bool slash = is_kallsyms || is_vdso;
@@ -352,8 +400,9 @@
 			return NULL;
 	}
 
-	if (asprintf(&filename, "%s%s%s", buildid_dir, slash ? "/" : "",
-		     is_vdso ? DSO__NAME_VDSO : realname) < 0)
+	if (asprintf(&filename, "%s%s%s%s%s", buildid_dir, slash ? "/" : "",
+		     is_vdso ? DSO__NAME_VDSO : realname,
+		     sbuild_id ? "/" : "", sbuild_id ?: "") < 0)
 		filename = NULL;
 
 	if (!slash)
@@ -368,7 +417,8 @@
 	char *dir_name;
 	int ret = 0;
 
-	dir_name = build_id_cache__dirname_from_path(pathname, false, false);
+	dir_name = build_id_cache__dirname_from_path(pathname, false, false,
+						     NULL);
 	if (!dir_name)
 		return -ENOMEM;
 
@@ -385,7 +435,7 @@
 {
 	const size_t size = PATH_MAX;
 	char *realname = NULL, *filename = NULL, *dir_name = NULL,
-	     *linkname = zalloc(size), *targetname, *tmp;
+	     *linkname = zalloc(size), *tmp;
 	int err = -1;
 
 	if (!is_kallsyms) {
@@ -394,14 +444,22 @@
 			goto out_free;
 	}
 
-	dir_name = build_id_cache__dirname_from_path(name, is_kallsyms, is_vdso);
+	dir_name = build_id_cache__dirname_from_path(name, is_kallsyms,
+						     is_vdso, sbuild_id);
 	if (!dir_name)
 		goto out_free;
 
+	/* Remove old style build-id cache */
+	if (is_regular_file(dir_name))
+		if (unlink(dir_name))
+			goto out_free;
+
 	if (mkdir_p(dir_name, 0755))
 		goto out_free;
 
-	if (asprintf(&filename, "%s/%s", dir_name, sbuild_id) < 0) {
+	/* Save the allocated buildid dirname */
+	if (asprintf(&filename, "%s/%s", dir_name,
+		     build_id_cache__basename(is_kallsyms, is_vdso)) < 0) {
 		filename = NULL;
 		goto out_free;
 	}
@@ -415,7 +473,7 @@
 			goto out_free;
 	}
 
-	if (!build_id__filename(sbuild_id, linkname, size))
+	if (!build_id_cache__linkname(sbuild_id, linkname, size))
 		goto out_free;
 	tmp = strrchr(linkname, '/');
 	*tmp = '\0';
@@ -424,10 +482,10 @@
 		goto out_free;
 
 	*tmp = '/';
-	targetname = filename + strlen(buildid_dir) - 5;
-	memcpy(targetname, "../..", 5);
+	tmp = dir_name + strlen(buildid_dir) - 5;
+	memcpy(tmp, "../..", 5);
 
-	if (symlink(targetname, linkname) == 0)
+	if (symlink(tmp, linkname) == 0)
 		err = 0;
 out_free:
 	if (!is_kallsyms)
@@ -452,7 +510,7 @@
 bool build_id_cache__cached(const char *sbuild_id)
 {
 	bool ret = false;
-	char *filename = build_id__filename(sbuild_id, NULL, 0);
+	char *filename = build_id_cache__linkname(sbuild_id, NULL, 0);
 
 	if (filename && !access(filename, F_OK))
 		ret = true;
@@ -471,7 +529,7 @@
 	if (filename == NULL || linkname == NULL)
 		goto out_free;
 
-	if (!build_id__filename(sbuild_id, linkname, size))
+	if (!build_id_cache__linkname(sbuild_id, linkname, size))
 		goto out_free;
 
 	if (access(linkname, F_OK))
@@ -489,7 +547,7 @@
 	tmp = strrchr(linkname, '/') + 1;
 	snprintf(tmp, size - (tmp - linkname), "%s", filename);
 
-	if (unlink(linkname))
+	if (rm_rf(linkname))
 		goto out_free;
 
 	err = 0;
@@ -501,7 +559,7 @@
 
 static int dso__cache_build_id(struct dso *dso, struct machine *machine)
 {
-	bool is_kallsyms = dso->kernel && dso->long_name[0] != '/';
+	bool is_kallsyms = dso__is_kallsyms(dso);
 	bool is_vdso = dso__is_vdso(dso);
 	const char *name = dso->long_name;
 	char nm[PATH_MAX];
diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h
index 64af3e2..e5435f4 100644
--- a/tools/perf/util/build-id.h
+++ b/tools/perf/util/build-id.h
@@ -14,6 +14,8 @@
 int build_id__sprintf(const u8 *build_id, int len, char *bf);
 int sysfs__sprintf_build_id(const char *root_dir, char *sbuild_id);
 int filename__sprintf_build_id(const char *pathname, char *sbuild_id);
+char *build_id_cache__kallsyms_path(const char *sbuild_id, char *bf,
+				    size_t size);
 
 char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size);
 bool dso__build_id_is_kmod(const struct dso *dso, char *bf, size_t size);
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 65e2a4f..a70f6b5 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -94,6 +94,7 @@
 	enum perf_call_graph_mode record_mode;
 	u32			dump_size;
 	enum chain_mode 	mode;
+	u16			max_stack;
 	u32			print_limit;
 	double			min_percent;
 	sort_chain_func_t	sort;
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index dad7d82..8749eca 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -275,7 +275,8 @@
 			break;
 		}
 	}
-	die("bad config file line %d in %s", config_linenr, config_file_name);
+	pr_err("bad config file line %d in %s\n", config_linenr, config_file_name);
+	return -1;
 }
 
 static int parse_unit_factor(const char *end, unsigned long *val)
@@ -479,16 +480,15 @@
 
 int perf_config(config_fn_t fn, void *data)
 {
-	int ret = 0, found = 0;
+	int ret = -1;
 	const char *home = NULL;
 
 	/* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
 	if (config_exclusive_filename)
 		return perf_config_from_file(fn, config_exclusive_filename, data);
 	if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
-		ret += perf_config_from_file(fn, perf_etc_perfconfig(),
-					    data);
-		found += 1;
+		if (perf_config_from_file(fn, perf_etc_perfconfig(), data) < 0)
+			goto out;
 	}
 
 	home = getenv("HOME");
@@ -514,14 +514,12 @@
 		if (!st.st_size)
 			goto out_free;
 
-		ret += perf_config_from_file(fn, user_config, data);
-		found += 1;
+		ret = perf_config_from_file(fn, user_config, data);
+
 out_free:
 		free(user_config);
 	}
 out:
-	if (found == 0)
-		return -1;
 	return ret;
 }
 
@@ -609,8 +607,12 @@
 	struct perf_config_section *section = NULL;
 	struct perf_config_item *item = NULL;
 	struct perf_config_set *set = perf_config_set;
-	struct list_head *sections = &set->sections;
+	struct list_head *sections;
 
+	if (set == NULL)
+		return -1;
+
+	sections = &set->sections;
 	key = ptr = strdup(var);
 	if (!key) {
 		pr_debug("%s: strdup failed\n", __func__);
@@ -641,17 +643,64 @@
 
 out_free:
 	free(key);
-	perf_config_set__delete(set);
 	return -1;
 }
 
+static int perf_config_set__init(struct perf_config_set *set)
+{
+	int ret = -1;
+	const char *home = NULL;
+
+	/* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
+	if (config_exclusive_filename)
+		return perf_config_from_file(collect_config, config_exclusive_filename, set);
+	if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
+		if (perf_config_from_file(collect_config, perf_etc_perfconfig(), set) < 0)
+			goto out;
+	}
+
+	home = getenv("HOME");
+	if (perf_config_global() && home) {
+		char *user_config = strdup(mkpath("%s/.perfconfig", home));
+		struct stat st;
+
+		if (user_config == NULL) {
+			warning("Not enough memory to process %s/.perfconfig, "
+				"ignoring it.", home);
+			goto out;
+		}
+
+		if (stat(user_config, &st) < 0)
+			goto out_free;
+
+		if (st.st_uid && (st.st_uid != geteuid())) {
+			warning("File %s not owned by current user or root, "
+				"ignoring it.", user_config);
+			goto out_free;
+		}
+
+		if (!st.st_size)
+			goto out_free;
+
+		ret = perf_config_from_file(collect_config, user_config, set);
+
+out_free:
+		free(user_config);
+	}
+out:
+	return ret;
+}
+
 struct perf_config_set *perf_config_set__new(void)
 {
 	struct perf_config_set *set = zalloc(sizeof(*set));
 
 	if (set) {
 		INIT_LIST_HEAD(&set->sections);
-		perf_config(collect_config, set);
+		if (perf_config_set__init(set) < 0) {
+			perf_config_set__delete(set);
+			set = NULL;
+		}
 	}
 
 	return set;
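
Aside (not part of the patch): perf_config_set__new() now owns the
initialization and returns NULL when parsing any config file fails, so a
caller-side sketch reduces to:

	struct perf_config_set *set = perf_config_set__new();

	if (set == NULL)
		return -1;	/* bad or unreadable config, already reported */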
diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c
index c9a6dc1..b0c2b5c 100644
--- a/tools/perf/util/db-export.c
+++ b/tools/perf/util/db-export.c
@@ -233,17 +233,6 @@
 	return 0;
 }
 
-static struct thread *get_main_thread(struct machine *machine, struct thread *thread)
-{
-	if (thread->pid_ == thread->tid)
-		return thread__get(thread);
-
-	if (thread->pid_ == -1)
-		return NULL;
-
-	return machine__find_thread(machine, thread->pid_, thread->pid_);
-}
-
 static int db_ids_from_al(struct db_export *dbe, struct addr_location *al,
 			  u64 *dso_db_id, u64 *sym_db_id, u64 *offset)
 {
@@ -382,7 +371,7 @@
 	if (err)
 		return err;
 
-	main_thread = get_main_thread(al->machine, thread);
+	main_thread = thread__main_thread(al->machine, thread);
 	if (main_thread)
 		comm = machine__thread_exec_comm(al->machine, main_thread);
 
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index 0953280..76d79d0 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -349,6 +349,11 @@
 	       dso->binary_type == DSO_BINARY_TYPE__GUEST_KCORE;
 }
 
+static inline bool dso__is_kallsyms(struct dso *dso)
+{
+	return dso->kernel && dso->long_name[0] != '/';
+}
+
 void dso__free_a2l(struct dso *dso);
 
 enum dso_type dso__type(struct dso *dso, struct machine *machine);
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index e82ba90..1b918aa0 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -462,9 +462,9 @@
 	return 0;
 }
 
-static int __perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd, int idx)
+static int __perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd, int idx, short revent)
 {
-	int pos = fdarray__add(&evlist->pollfd, fd, POLLIN | POLLERR | POLLHUP);
+	int pos = fdarray__add(&evlist->pollfd, fd, revent | POLLERR | POLLHUP);
 	/*
 	 * Save the idx so that when we filter out fds POLLHUP'ed we can
 	 * close the associated evlist->mmap[] entry.
@@ -480,10 +480,11 @@
 
 int perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd)
 {
-	return __perf_evlist__add_pollfd(evlist, fd, -1);
+	return __perf_evlist__add_pollfd(evlist, fd, -1, POLLIN);
 }
 
-static void perf_evlist__munmap_filtered(struct fdarray *fda, int fd)
+static void perf_evlist__munmap_filtered(struct fdarray *fda, int fd,
+					 void *arg __maybe_unused)
 {
 	struct perf_evlist *evlist = container_of(fda, struct perf_evlist, pollfd);
 
@@ -493,7 +494,7 @@
 int perf_evlist__filter_pollfd(struct perf_evlist *evlist, short revents_and_mask)
 {
 	return fdarray__filter(&evlist->pollfd, revents_and_mask,
-			       perf_evlist__munmap_filtered);
+			       perf_evlist__munmap_filtered, NULL);
 }
 
 int perf_evlist__poll(struct perf_evlist *evlist, int timeout)
@@ -777,7 +778,7 @@
 	return event;
 }
 
-union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
+union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist, int idx)
 {
 	struct perf_mmap *md = &evlist->mmap[idx];
 	u64 head;
@@ -832,6 +833,13 @@
 	return perf_mmap__read(md, false, start, end, &md->prev);
 }
 
+union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
+{
+	if (!evlist->backward)
+		return perf_evlist__mmap_read_forward(evlist, idx);
+	return perf_evlist__mmap_read_backward(evlist, idx);
+}
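
Aside (not part of the patch): a minimal consumer sketch using the
existing evlist API; the dispatch to the forward or backward reader is
driven by evlist->backward:

	union perf_event *event;

	while ((event = perf_evlist__mmap_read(evlist, idx)) != NULL) {
		/* ... process event ... */
		perf_evlist__mmap_consume(evlist, idx);
	}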
+
 void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx)
 {
 	struct perf_mmap *md = &evlist->mmap[idx];
@@ -856,9 +864,11 @@
 
 static void perf_evlist__mmap_put(struct perf_evlist *evlist, int idx)
 {
-	BUG_ON(atomic_read(&evlist->mmap[idx].refcnt) == 0);
+	struct perf_mmap *md = &evlist->mmap[idx];
 
-	if (atomic_dec_and_test(&evlist->mmap[idx].refcnt))
+	BUG_ON(md->base && atomic_read(&md->refcnt) == 0);
+
+	if (atomic_dec_and_test(&md->refcnt))
 		__perf_evlist__munmap(evlist, idx);
 }
 
@@ -936,9 +946,12 @@
 	if (cpu_map__empty(evlist->cpus))
 		evlist->nr_mmaps = thread_map__nr(evlist->threads);
 	evlist->mmap = zalloc(evlist->nr_mmaps * sizeof(struct perf_mmap));
+	if (!evlist->mmap)
+		return -ENOMEM;
+
 	for (i = 0; i < evlist->nr_mmaps; i++)
 		evlist->mmap[i].fd = -1;
-	return evlist->mmap != NULL ? 0 : -ENOMEM;
+	return 0;
 }
 
 struct mmap_params {
@@ -983,15 +996,28 @@
 	return 0;
 }
 
+static bool
+perf_evlist__should_poll(struct perf_evlist *evlist __maybe_unused,
+			 struct perf_evsel *evsel)
+{
+	if (evsel->overwrite)
+		return false;
+	return true;
+}
+
 static int perf_evlist__mmap_per_evsel(struct perf_evlist *evlist, int idx,
 				       struct mmap_params *mp, int cpu,
 				       int thread, int *output)
 {
 	struct perf_evsel *evsel;
+	int revent;
 
 	evlist__for_each(evlist, evsel) {
 		int fd;
 
+		if (evsel->overwrite != (evlist->overwrite && evlist->backward))
+			continue;
+
 		if (evsel->system_wide && thread)
 			continue;
 
@@ -1008,6 +1034,8 @@
 			perf_evlist__mmap_get(evlist, idx);
 		}
 
+		revent = perf_evlist__should_poll(evlist, evsel) ? POLLIN : 0;
+
 		/*
 		 * The system_wide flag causes a selected event to be opened
 		 * always without a pid.  Consequently it will never get a
@@ -1016,7 +1044,7 @@
 		 * Therefore don't add it for polling.
 		 */
 		if (!evsel->system_wide &&
-		    __perf_evlist__add_pollfd(evlist, fd, idx) < 0) {
+		    __perf_evlist__add_pollfd(evlist, fd, idx, revent) < 0) {
 			perf_evlist__mmap_put(evlist, idx);
 			return -1;
 		}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index d740fb8..68cb136 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -131,6 +131,8 @@
 
 union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx);
 
+union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist,
+						 int idx);
 union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist,
 						  int idx);
 void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 5d7037e..9b2e3e6 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -572,6 +572,8 @@
 
 	perf_evsel__set_sample_bit(evsel, CALLCHAIN);
 
+	attr->sample_max_stack = param->max_stack;
+
 	if (param->record_mode == CALLCHAIN_LBR) {
 		if (!opts->branch_stack) {
 			if (attr->exclude_user) {
@@ -635,7 +637,8 @@
 	struct perf_event_attr *attr = &evsel->attr;
 	struct callchain_param param;
 	u32 dump_size = 0;
-	char *callgraph_buf = NULL;
+	int max_stack = 0;
+	const char *callgraph_buf = NULL;
 
 	/* callgraph default */
 	param.record_mode = callchain_param.record_mode;
@@ -662,6 +665,9 @@
 		case PERF_EVSEL__CONFIG_TERM_STACK_USER:
 			dump_size = term->val.stack_user;
 			break;
+		case PERF_EVSEL__CONFIG_TERM_MAX_STACK:
+			max_stack = term->val.max_stack;
+			break;
 		case PERF_EVSEL__CONFIG_TERM_INHERIT:
 			/*
 			 * attr->inherit should have already been set by
@@ -677,7 +683,12 @@
 	}
 
 	/* User explicitly set per-event callgraph, clear the old setting and reset. */
-	if ((callgraph_buf != NULL) || (dump_size > 0)) {
+	if ((callgraph_buf != NULL) || (dump_size > 0) || max_stack) {
+		if (max_stack) {
+			param.max_stack = max_stack;
+			if (callgraph_buf == NULL)
+				callgraph_buf = "fp";
+		}
 
 		/* parse callgraph parameters */
 		if (callgraph_buf != NULL) {
@@ -1329,6 +1340,7 @@
 	PRINT_ATTRf(clockid, p_signed);
 	PRINT_ATTRf(sample_regs_intr, p_hex);
 	PRINT_ATTRf(aux_watermark, p_unsigned);
+	PRINT_ATTRf(sample_max_stack, p_unsigned);
 
 	return ret;
 }
@@ -2239,17 +2251,11 @@
 	return sample->raw_data + offset;
 }
 
-u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample,
-		       const char *name)
+u64 format_field__intval(struct format_field *field, struct perf_sample *sample,
+			 bool needs_swap)
 {
-	struct format_field *field = perf_evsel__field(evsel, name);
-	void *ptr;
 	u64 value;
-
-	if (!field)
-		return 0;
-
-	ptr = sample->raw_data + field->offset;
+	void *ptr = sample->raw_data + field->offset;
 
 	switch (field->size) {
 	case 1:
@@ -2267,7 +2273,7 @@
 		return 0;
 	}
 
-	if (!evsel->needs_swap)
+	if (!needs_swap)
 		return value;
 
 	switch (field->size) {
@@ -2284,6 +2290,17 @@
 	return 0;
 }
 
+u64 perf_evsel__intval(struct perf_evsel *evsel, struct perf_sample *sample,
+		       const char *name)
+{
+	struct format_field *field = perf_evsel__field(evsel, name);
+
+	if (!field)
+		return 0;
+
+	return format_field__intval(field, sample, evsel->needs_swap);
+}
+
 bool perf_evsel__fallback(struct perf_evsel *evsel, int err,
 			  char *msg, size_t msgsize)
 {
@@ -2372,6 +2389,9 @@
 	 "No such device - did you specify an out-of-range profile CPU?");
 		break;
 	case EOPNOTSUPP:
+		if (evsel->attr.sample_period != 0)
+			return scnprintf(msg, size, "%s",
+	"PMU Hardware doesn't support sampling/overflow-interrupts.");
 		if (evsel->attr.precise_ip)
 			return scnprintf(msg, size, "%s",
 	"\'precise\' request may not be supported. Try removing 'p' modifier.");
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index c1f1015..828ddd1 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -44,6 +44,7 @@
 	PERF_EVSEL__CONFIG_TERM_CALLGRAPH,
 	PERF_EVSEL__CONFIG_TERM_STACK_USER,
 	PERF_EVSEL__CONFIG_TERM_INHERIT,
+	PERF_EVSEL__CONFIG_TERM_MAX_STACK,
 	PERF_EVSEL__CONFIG_TERM_MAX,
 };
 
@@ -56,6 +57,7 @@
 		bool	time;
 		char	*callgraph;
 		u64	stack_user;
+		int	max_stack;
 		bool	inherit;
 	} val;
 };
@@ -259,6 +261,8 @@
 
 struct format_field;
 
+u64 format_field__intval(struct format_field *field, struct perf_sample *sample, bool needs_swap);
+
 struct format_field *perf_evsel__field(struct perf_evsel *evsel, const char *name);
 
 #define perf_evsel__match(evsel, t, c)		\
diff --git a/tools/perf/util/group.h b/tools/perf/util/group.h
new file mode 100644
index 0000000..116debe
--- /dev/null
+++ b/tools/perf/util/group.h
@@ -0,0 +1,7 @@
+#ifndef GROUP_H
+#define GROUP_H 1
+
+bool arch_topdown_check_group(bool *warn);
+void arch_topdown_group_warn(void);
+
+#endif
diff --git a/tools/perf/util/libunwind/arm64.c b/tools/perf/util/libunwind/arm64.c
new file mode 100644
index 0000000..4fb5395
--- /dev/null
+++ b/tools/perf/util/libunwind/arm64.c
@@ -0,0 +1,35 @@
+/*
+ * This file sets up defines to compile the arch-specific binary from the
+ * generic one.
+ *
+ * The function name 'LIBUNWIND__ARCH_REG_ID' is set according to the arch
+ * name, and the definition of this function is included directly from
+ * 'arch/arm64/util/unwind-libunwind.c', to make sure that this function
+ * is defined no matter what arch the host is.
+ *
+ * Finally, the arch specific unwind methods are exported which will
+ * be assigned to each arm64 thread.
+ */
+
+#define REMOTE_UNWIND_LIBUNWIND
+
+#define LIBUNWIND__ARCH_REG_ID(regnum) libunwind__arm64_reg_id(regnum)
+
+#include "unwind.h"
+#include "debug.h"
+#include "libunwind-aarch64.h"
+#include <../../../../arch/arm64/include/uapi/asm/perf_regs.h>
+#include "../../arch/arm64/util/unwind-libunwind.c"
+
+/* NO_LIBUNWIND_DEBUG_FRAME is a feature flag for local libunwind;
+ * assign NO_LIBUNWIND_DEBUG_FRAME_AARCH64 to it when compiling the
+ * arm64 unwind methods.
+ */
+#undef NO_LIBUNWIND_DEBUG_FRAME
+#ifdef NO_LIBUNWIND_DEBUG_FRAME_AARCH64
+#define NO_LIBUNWIND_DEBUG_FRAME
+#endif
+#include "util/unwind-libunwind-local.c"
+
+struct unwind_libunwind_ops *
+arm64_unwind_libunwind_ops = &_unwind_libunwind_ops;
diff --git a/tools/perf/util/libunwind/x86_32.c b/tools/perf/util/libunwind/x86_32.c
new file mode 100644
index 0000000..d98c17e
--- /dev/null
+++ b/tools/perf/util/libunwind/x86_32.c
@@ -0,0 +1,37 @@
+/*
+ * This file sets up defines to compile the arch-specific binary from the
+ * generic one.
+ *
+ * The function name 'LIBUNWIND__ARCH_REG_ID' is set according to the arch
+ * name, and the definition of this function is included directly from
+ * 'arch/x86/util/unwind-libunwind.c', to make sure that this function
+ * is defined no matter what arch the host is.
+ *
+ * Finally, the arch specific unwind methods are exported which will
+ * be assigned to each x86 thread.
+ */
+
+#define REMOTE_UNWIND_LIBUNWIND
+#define LIBUNWIND__ARCH_REG_ID(regnum) libunwind__x86_reg_id(regnum)
+
+#include "unwind.h"
+#include "debug.h"
+#include "libunwind-x86.h"
+#include <../../../../arch/x86/include/uapi/asm/perf_regs.h>
+
+/* HAVE_ARCH_X86_64_SUPPORT is used in 'arch/x86/util/unwind-libunwind.c';
+ * for x86_32, we undef it to compile code for x86_32 only.
+ */
+#undef HAVE_ARCH_X86_64_SUPPORT
+#include "../../arch/x86/util/unwind-libunwind.c"
+
+/* Explicitly define NO_LIBUNWIND_DEBUG_FRAME, because non-ARM has no
+ * dwarf_find_debug_frame() function.
+ */
+#ifndef NO_LIBUNWIND_DEBUG_FRAME
+#define NO_LIBUNWIND_DEBUG_FRAME
+#endif
+#include "util/unwind-libunwind-local.c"
+
+struct unwind_libunwind_ops *
+x86_32_unwind_libunwind_ops = &_unwind_libunwind_ops;
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index b177218..a0c186a 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1353,11 +1353,16 @@
 	if (map == NULL)
 		goto out_problem_map;
 
-	thread__insert_map(thread, map);
+	ret = thread__insert_map(thread, map);
+	if (ret)
+		goto out_problem_insert;
+
 	thread__put(thread);
 	map__put(map);
 	return 0;
 
+out_problem_insert:
+	map__put(map);
 out_problem_map:
 	thread__put(thread);
 out_problem:
@@ -1403,11 +1408,16 @@
 	if (map == NULL)
 		goto out_problem_map;
 
-	thread__insert_map(thread, map);
+	ret = thread__insert_map(thread, map);
+	if (ret)
+		goto out_problem_insert;
+
 	thread__put(thread);
 	map__put(map);
 	return 0;
 
+out_problem_insert:
+	map__put(map);
 out_problem_map:
 	thread__put(thread);
 out_problem:
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index c6fd047..d15e335 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -900,6 +900,7 @@
 	[PARSE_EVENTS__TERM_TYPE_STACKSIZE]		= "stack-size",
 	[PARSE_EVENTS__TERM_TYPE_NOINHERIT]		= "no-inherit",
 	[PARSE_EVENTS__TERM_TYPE_INHERIT]		= "inherit",
+	[PARSE_EVENTS__TERM_TYPE_MAX_STACK]		= "max-stack",
 };
 
 static bool config_term_shrinked;
@@ -995,6 +996,9 @@
 	case PARSE_EVENTS__TERM_TYPE_NAME:
 		CHECK_TYPE_VAL(STR);
 		break;
+	case PARSE_EVENTS__TERM_TYPE_MAX_STACK:
+		CHECK_TYPE_VAL(NUM);
+		break;
 	default:
 		err->str = strdup("unknown term");
 		err->idx = term->err_term;
@@ -1040,6 +1044,7 @@
 	case PARSE_EVENTS__TERM_TYPE_STACKSIZE:
 	case PARSE_EVENTS__TERM_TYPE_INHERIT:
 	case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
+	case PARSE_EVENTS__TERM_TYPE_MAX_STACK:
 		return config_term_common(attr, term, err);
 	default:
 		if (err) {
@@ -1109,6 +1114,9 @@
 		case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
 			ADD_CONFIG_TERM(INHERIT, inherit, term->val.num ? 0 : 1);
 			break;
+		case PARSE_EVENTS__TERM_TYPE_MAX_STACK:
+			ADD_CONFIG_TERM(MAX_STACK, max_stack, term->val.num);
+			break;
 		default:
 			break;
 		}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index d740c3c..46c05cc 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -68,6 +68,7 @@
 	PARSE_EVENTS__TERM_TYPE_STACKSIZE,
 	PARSE_EVENTS__TERM_TYPE_NOINHERIT,
 	PARSE_EVENTS__TERM_TYPE_INHERIT,
+	PARSE_EVENTS__TERM_TYPE_MAX_STACK,
 	__PARSE_EVENTS__TERM_TYPE_NR,
 };
 
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 1477fbc..3c15b33 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -199,6 +199,7 @@
 time			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_TIME); }
 call-graph		{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CALLGRAPH); }
 stack-size		{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_STACKSIZE); }
+max-stack		{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_MAX_STACK); }
 inherit			{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_INHERIT); }
 no-inherit		{ return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOINHERIT); }
 ,			{ return ','; }
@@ -259,6 +260,7 @@
 cycles-t					{ return str(yyscanner, PE_KERNEL_PMU_EVENT); }
 mem-loads					{ return str(yyscanner, PE_KERNEL_PMU_EVENT); }
 mem-stores					{ return str(yyscanner, PE_KERNEL_PMU_EVENT); }
+topdown-[a-z-]+					{ return str(yyscanner, PE_KERNEL_PMU_EVENT); }
 
 L1-dcache|l1-d|l1d|L1-data		|
 L1-icache|l1-i|l1i|L1-instruction	|
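Together, the lexer token, the term-table entry, and the CHECK_TYPE_VAL(NUM) case above let users give a per-event callchain depth such as 'cycles/max-stack=3/'. The end of that pipeline, sketched in simplified form (structures reduced; the real code goes through ADD_CONFIG_TERM(), and a uapi perf_event.h new enough to carry sample_max_stack is assumed), copies the numeric value into the attribute field that the session byteswap change below also handles:

    #include <linux/perf_event.h>

    /* Simplified stand-in for perf's parsed-term representation. */
    struct config_term {
        int type;          /* e.g. TERM_MAX_STACK */
        unsigned long num; /* numeric terms only, per CHECK_TYPE_VAL(NUM) */
    };

    enum { TERM_MAX_STACK = 1 };

    static void apply_term(struct perf_event_attr *attr,
                           const struct config_term *term)
    {
        switch (term->type) {
        case TERM_MAX_STACK:
            /* sample_max_stack is a __u16 in struct perf_event_attr */
            attr->sample_max_stack = term->num;
            break;
        default:
            break;
        }
    }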
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 5214974..dfedf09 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -593,6 +593,7 @@
 	if (bswap_safe(f, 0))			\
 		attr->f = bswap_##sz(attr->f);	\
 } while(0)
+#define bswap_field_16(f) bswap_field(f, 16)
 #define bswap_field_32(f) bswap_field(f, 32)
 #define bswap_field_64(f) bswap_field(f, 64)
 
@@ -608,6 +609,7 @@
 	bswap_field_64(sample_regs_user);
 	bswap_field_32(sample_stack_user);
 	bswap_field_32(aux_watermark);
+	bswap_field_16(sample_max_stack);
 
 	/*
 	 * After read_format are bitfields. Check read_format because
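bswap_field_16 simply extends the width-dispatched macro family to the new 16-bit attribute member. The pattern in isolation (plain glibc byteswap.h, with an illustrative struct in place of perf_event_attr):

    #include <byteswap.h>
    #include <stdint.h>

    struct attr_like {
        uint16_t sample_max_stack;
        uint32_t aux_watermark;
        uint64_t sample_regs_user;
    };

    /* Width-dispatched swap: the token-pasted suffix picks bswap_16/32/64. */
    #define swap_field(obj, f, sz) ((obj)->f = bswap_##sz((obj)->f))

    static void swap_attr(struct attr_like *a)
    {
        swap_field(a, sample_max_stack, 16);
        swap_field(a, aux_watermark, 32);
        swap_field(a, sample_regs_user, 64);
    }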
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index aa9efe0..8a2bbd2 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -36,6 +36,11 @@
 static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_total_slots[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
 static bool have_frontend_stalled;
 
 struct stats walltime_nsecs_stats;
@@ -82,6 +87,11 @@
 		sizeof(runtime_transaction_stats));
 	memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
 	memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
+	memset(runtime_topdown_total_slots, 0, sizeof(runtime_topdown_total_slots));
+	memset(runtime_topdown_slots_retired, 0, sizeof(runtime_topdown_slots_retired));
+	memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
+	memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
+	memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
 }
 
 /*
@@ -105,6 +115,16 @@
 		update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
 	else if (perf_stat_evsel__is(counter, ELISION_START))
 		update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
+	else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
+		update_stats(&runtime_topdown_total_slots[ctx][cpu], count[0]);
+	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
+		update_stats(&runtime_topdown_slots_issued[ctx][cpu], count[0]);
+	else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
+		update_stats(&runtime_topdown_slots_retired[ctx][cpu], count[0]);
+	else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
+		update_stats(&runtime_topdown_fetch_bubbles[ctx][cpu], count[0]);
+	else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
+		update_stats(&runtime_topdown_recovery_bubbles[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
 		update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
@@ -302,6 +322,107 @@
 	out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
 }
 
+/*
+ * High level "TopDown" CPU core pipe line bottleneck break down.
+ *
+ * The basic concept follows
+ * Yasin, "A Top-Down Method for Performance Analysis and Counters
+ * Architecture", ISPASS 2014.
+ *
+ * The CPU pipeline is divided into 4 areas that can be bottlenecks:
+ *
+ * Frontend -> Backend -> Retiring
+ * BadSpeculation in addition covers speculatively executed work that is
+ * thrown away (for example due to branch mispredictions).
+ * Frontend is instruction decoding.
+ * Backend is execution, like computation and accessing data in memory.
+ * Retiring is good execution that is not directly bottlenecked.
+ *
+ * The formulas are computed in slots.
+ * A slot is one issue opportunity per cycle of pipeline width
+ * (for example a 4-wide pipeline has 4 slots per cycle).
+ *
+ * Formulas:
+ * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) /
+ *			TotalSlots
+ * Retiring = SlotsRetired / TotalSlots
+ * FrontendBound = FetchBubbles / TotalSlots
+ * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
+ *
+ * The kernel provides the mapping to the low-level CPU events and any scaling
+ * needed for the CPU pipeline width, for example:
+ *
+ * TotalSlots = Cycles * 4
+ *
+ * The scaling factor is communicated in the sysfs unit.
+ *
+ * In some cases the CPU may not be able to measure all the formulas
+ * due to missing events. In that case multiple formulas are combined
+ * where possible.
+ *
+ * Full TopDown supports more levels to sub-divide each area: for example
+ * BackendBound into compute-bound and memory-bound. For now we only
+ * support Level 1 TopDown.
+ */
+
+static double sanitize_val(double x)
+{
+	if (x < 0 && x >= -0.02)
+		return 0.0;
+	return x;
+}
+
+static double td_total_slots(int ctx, int cpu)
+{
+	return avg_stats(&runtime_topdown_total_slots[ctx][cpu]);
+}
+
+static double td_bad_spec(int ctx, int cpu)
+{
+	double bad_spec = 0;
+	double total_slots;
+	double total;
+
+	total = avg_stats(&runtime_topdown_slots_issued[ctx][cpu]) -
+		avg_stats(&runtime_topdown_slots_retired[ctx][cpu]) +
+		avg_stats(&runtime_topdown_recovery_bubbles[ctx][cpu]);
+	total_slots = td_total_slots(ctx, cpu);
+	if (total_slots)
+		bad_spec = total / total_slots;
+	return sanitize_val(bad_spec);
+}
+
+static double td_retiring(int ctx, int cpu)
+{
+	double retiring = 0;
+	double total_slots = td_total_slots(ctx, cpu);
+	double ret_slots = avg_stats(&runtime_topdown_slots_retired[ctx][cpu]);
+
+	if (total_slots)
+		retiring = ret_slots / total_slots;
+	return retiring;
+}
+
+static double td_fe_bound(int ctx, int cpu)
+{
+	double fe_bound = 0;
+	double total_slots = td_total_slots(ctx, cpu);
+	double fetch_bub = avg_stats(&runtime_topdown_fetch_bubbles[ctx][cpu]);
+
+	if (total_slots)
+		fe_bound = fetch_bub / total_slots;
+	return fe_bound;
+}
+
+static double td_be_bound(int ctx, int cpu)
+{
+	double sum = (td_fe_bound(ctx, cpu) +
+		      td_bad_spec(ctx, cpu) +
+		      td_retiring(ctx, cpu));
+	if (sum == 0)
+		return 0;
+	return sanitize_val(1.0 - sum);
+}
+
 void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 				   double avg, int cpu,
 				   struct perf_stat_output_ctx *out)
@@ -309,6 +430,7 @@
 	void *ctxp = out->ctx;
 	print_metric_t print_metric = out->print_metric;
 	double total, ratio = 0.0, total2;
+	const char *color = NULL;
 	int ctx = evsel_context(evsel);
 
 	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
@@ -452,6 +574,46 @@
 				     avg / ratio);
 		else
 			print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
+	} else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
+		double fe_bound = td_fe_bound(ctx, cpu);
+
+		if (fe_bound > 0.2)
+			color = PERF_COLOR_RED;
+		print_metric(ctxp, color, "%8.1f%%", "frontend bound",
+				fe_bound * 100.);
+	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
+		double retiring = td_retiring(ctx, cpu);
+
+		if (retiring > 0.7)
+			color = PERF_COLOR_GREEN;
+		print_metric(ctxp, color, "%8.1f%%", "retiring",
+				retiring * 100.);
+	} else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
+		double bad_spec = td_bad_spec(ctx, cpu);
+
+		if (bad_spec > 0.1)
+			color = PERF_COLOR_RED;
+		print_metric(ctxp, color, "%8.1f%%", "bad speculation",
+				bad_spec * 100.);
+	} else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
+		double be_bound = td_be_bound(ctx, cpu);
+		const char *name = "backend bound";
+		static int have_recovery_bubbles = -1;
+
+		/* In case the CPU does not support topdown-recovery-bubbles */
+		if (have_recovery_bubbles < 0)
+			have_recovery_bubbles = pmu_have_event("cpu",
+					"topdown-recovery-bubbles");
+		if (!have_recovery_bubbles)
+			name = "backend bound/bad spec";
+
+		if (be_bound > 0.2)
+			color = PERF_COLOR_RED;
+		if (td_total_slots(ctx, cpu) > 0)
+			print_metric(ctxp, color, "%8.1f%%", name,
+					be_bound * 100.);
+		else
+			print_metric(ctxp, NULL, NULL, name, 0);
 	} else if (runtime_nsecs_stats[cpu].n != 0) {
 		char unit = 'M';
 		char unit_buf[10];
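The four level-1 metrics printed above can be checked by hand. A self-contained computation with made-up counts (the numbers are purely illustrative):

    #include <stdio.h>

    /* Level 1 TopDown, following the formulas documented above. */
    int main(void)
    {
        double total_slots      = 4000.0; /* e.g. 1000 cycles * width 4 */
        double slots_issued     = 2600.0;
        double slots_retired    = 2200.0;
        double recovery_bubbles =  200.0;
        double fetch_bubbles    =  600.0;

        double bad_spec = (slots_issued - slots_retired + recovery_bubbles)
                              / total_slots;              /* 0.15 */
        double retiring = slots_retired / total_slots;    /* 0.55 */
        double fe_bound = fetch_bubbles / total_slots;    /* 0.15 */
        double be_bound = 1.0 - bad_spec - retiring - fe_bound; /* 0.15 */

        printf("retiring %.0f%% bad-spec %.0f%% fe %.0f%% be %.0f%%\n",
               retiring * 100, bad_spec * 100,
               fe_bound * 100, be_bound * 100);
        return 0;
    }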
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index ffa1d06..c1ba255 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -79,6 +79,11 @@
 	ID(TRANSACTION_START,	cpu/tx-start/),
 	ID(ELISION_START,	cpu/el-start/),
 	ID(CYCLES_IN_TX_CP,	cpu/cycles-ct/),
+	ID(TOPDOWN_TOTAL_SLOTS, topdown-total-slots),
+	ID(TOPDOWN_SLOTS_ISSUED, topdown-slots-issued),
+	ID(TOPDOWN_SLOTS_RETIRED, topdown-slots-retired),
+	ID(TOPDOWN_FETCH_BUBBLES, topdown-fetch-bubbles),
+	ID(TOPDOWN_RECOVERY_BUBBLES, topdown-recovery-bubbles),
 };
 #undef ID
 
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 0150e78..c29bb94 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -17,6 +17,11 @@
 	PERF_STAT_EVSEL_ID__TRANSACTION_START,
 	PERF_STAT_EVSEL_ID__ELISION_START,
 	PERF_STAT_EVSEL_ID__CYCLES_IN_TX_CP,
+	PERF_STAT_EVSEL_ID__TOPDOWN_TOTAL_SLOTS,
+	PERF_STAT_EVSEL_ID__TOPDOWN_SLOTS_ISSUED,
+	PERF_STAT_EVSEL_ID__TOPDOWN_SLOTS_RETIRED,
+	PERF_STAT_EVSEL_ID__TOPDOWN_FETCH_BUBBLES,
+	PERF_STAT_EVSEL_ID__TOPDOWN_RECOVERY_BUBBLES,
 	PERF_STAT_EVSEL_ID__MAX,
 };
 
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 54c4ff2..09c5c34 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1641,6 +1641,20 @@
 	return ret;
 }
 
+/*
+ * Use open(O_RDONLY) to check readability directly instead of access(R_OK),
+ * since access(R_OK) only checks against the real UID/GID, while open() uses
+ * the effective UID/GID and the actual capabilities (e.g. /proc/kcore
+ * requires CAP_SYS_RAWIO).
+ */
+static bool filename__readable(const char *file)
+{
+	int fd = open(file, O_RDONLY);
+	if (fd < 0)
+		return false;
+	close(fd);
+	return true;
+}
+
 static char *dso__find_kallsyms(struct dso *dso, struct map *map)
 {
 	u8 host_build_id[BUILD_ID_SIZE];
@@ -1660,58 +1674,43 @@
 				 sizeof(host_build_id)) == 0)
 		is_host = dso__build_id_equal(dso, host_build_id);
 
-	build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id);
-
-	scnprintf(path, sizeof(path), "%s/%s/%s", buildid_dir,
-		  DSO__NAME_KCORE, sbuild_id);
-
-	/* Use /proc/kallsyms if possible */
+	/* Try a fast path for /proc/kallsyms if possible */
 	if (is_host) {
-		DIR *d;
-		int fd;
-
-		/* If no cached kcore go with /proc/kallsyms */
-		d = opendir(path);
-		if (!d)
-			goto proc_kallsyms;
-		closedir(d);
-
 		/*
-		 * Do not check the build-id cache, until we know we cannot use
-		 * /proc/kcore.
+		 * Do not check the build-id cache, unless we know we cannot use
+		 * /proc/kcore or the module maps don't match /proc/kallsyms.
+		 * To check readability of /proc/kcore, do not use access(R_OK),
+		 * since reading /proc/kcore requires CAP_SYS_RAWIO, which
+		 * access() cannot check.
 		 */
-		fd = open("/proc/kcore", O_RDONLY);
-		if (fd != -1) {
-			close(fd);
-			/* If module maps match go with /proc/kallsyms */
-			if (!validate_kcore_addresses("/proc/kallsyms", map))
-				goto proc_kallsyms;
-		}
-
-		/* Find kallsyms in build-id cache with kcore */
-		if (!find_matching_kcore(map, path, sizeof(path)))
-			return strdup(path);
-
-		goto proc_kallsyms;
+		if (filename__readable("/proc/kcore") &&
+		    !validate_kcore_addresses("/proc/kallsyms", map))
+			goto proc_kallsyms;
 	}
 
+	build_id__sprintf(dso->build_id, sizeof(dso->build_id), sbuild_id);
+
 	/* Find kallsyms in build-id cache with kcore */
+	scnprintf(path, sizeof(path), "%s/%s/%s",
+		  buildid_dir, DSO__NAME_KCORE, sbuild_id);
+
 	if (!find_matching_kcore(map, path, sizeof(path)))
 		return strdup(path);
 
-	scnprintf(path, sizeof(path), "%s/%s/%s",
-		  buildid_dir, DSO__NAME_KALLSYMS, sbuild_id);
+	/* Use current /proc/kallsyms if possible */
+	if (is_host) {
+proc_kallsyms:
+		return strdup("/proc/kallsyms");
+	}
 
-	if (access(path, F_OK)) {
+	/* Finally, find a cache of kallsyms */
+	if (!build_id_cache__kallsyms_path(sbuild_id, path, sizeof(path))) {
 		pr_err("No kallsyms or vmlinux with build-id %s was found\n",
 		       sbuild_id);
 		return NULL;
 	}
 
 	return strdup(path);
-
-proc_kallsyms:
-	return strdup("/proc/kallsyms");
 }
 
 static int dso__load_kernel_sym(struct dso *dso, struct map *map,
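filename__readable()'s rationale is easy to demonstrate: access(R_OK) consults only the real UID/GID, while open() performs the full permission and capability check. A small standalone comparison (output depends on the running system's privileges):

    #include <fcntl.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    static bool readable_by_open(const char *file)
    {
        int fd = open(file, O_RDONLY);

        if (fd < 0)
            return false;
        close(fd);
        return true;
    }

    int main(void)
    {
        const char *file = "/proc/kcore";

        /* On a capability-granted but non-root process these can differ:
         * access() checks the real UID/GID only. */
        printf("access(R_OK): %s\n",
               access(file, R_OK) == 0 ? "yes" : "no");
        printf("open(O_RDONLY): %s\n",
               readable_by_open(file) ? "yes" : "no");
        return 0;
    }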
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 45fcb71..f30f956 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -43,9 +43,6 @@
 		thread->cpu = -1;
 		INIT_LIST_HEAD(&thread->comm_list);
 
-		if (unwind__prepare_access(thread) < 0)
-			goto err_thread;
-
 		comm_str = malloc(32);
 		if (!comm_str)
 			goto err_thread;
@@ -201,10 +198,18 @@
 	       map_groups__fprintf(thread->mg, fp);
 }
 
-void thread__insert_map(struct thread *thread, struct map *map)
+int thread__insert_map(struct thread *thread, struct map *map)
 {
+	int ret;
+
+	ret = unwind__prepare_access(thread, map);
+	if (ret)
+		return ret;
+
 	map_groups__fixup_overlappings(thread->mg, map, stderr);
 	map_groups__insert(thread->mg, map);
+
+	return 0;
 }
 
 static int thread__clone_map_groups(struct thread *thread,
@@ -265,3 +270,14 @@
 			break;
 	}
 }
+
+struct thread *thread__main_thread(struct machine *machine, struct thread *thread)
+{
+	if (thread->pid_ == thread->tid)
+		return thread__get(thread);
+
+	if (thread->pid_ == -1)
+		return NULL;
+
+	return machine__find_thread(machine, thread->pid_, thread->pid_);
+}
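thread__main_thread() maps a thread to its group leader (pid_ == tid), which is handy wherever per-process state such as the unwinder address space should live on one canonical thread. A hedged usage sketch against the APIs in this patch; the caller and its error handling are hypothetical:

    #include "thread.h" /* perf-internal header; sketch lives in the perf tree */

    /* Hypothetical caller: push per-process work onto the group leader. */
    static int with_main_thread(struct machine *machine, struct thread *thread)
    {
        struct thread *leader = thread__main_thread(machine, thread);
        int ret = 0;

        if (leader == NULL)          /* parent pid unknown (pid_ == -1) */
            return -1;

        /* ... operate on leader->mg, leader->unwind_libunwind_ops ... */

        thread__put(leader);         /* drop the reference the helper took */
        return ret;
    }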
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 45fba13..99263cb 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -9,11 +9,9 @@
 #include "symbol.h"
 #include <strlist.h>
 #include <intlist.h>
-#ifdef HAVE_LIBUNWIND_SUPPORT
-#include <libunwind.h>
-#endif
 
 struct thread_stack;
+struct unwind_libunwind_ops;
 
 struct thread {
 	union {
@@ -36,7 +34,8 @@
 	void			*priv;
 	struct thread_stack	*ts;
 #ifdef HAVE_LIBUNWIND_SUPPORT
-	unw_addr_space_t	addr_space;
+	void				*addr_space;
+	struct unwind_libunwind_ops	*unwind_libunwind_ops;
 #endif
 };
 
@@ -77,10 +76,12 @@
 struct comm *thread__comm(const struct thread *thread);
 struct comm *thread__exec_comm(const struct thread *thread);
 const char *thread__comm_str(const struct thread *thread);
-void thread__insert_map(struct thread *thread, struct map *map);
+int thread__insert_map(struct thread *thread, struct map *map);
 int thread__fork(struct thread *thread, struct thread *parent, u64 timestamp);
 size_t thread__fprintf(struct thread *thread, FILE *fp);
 
+struct thread *thread__main_thread(struct machine *machine, struct thread *thread);
+
 void thread__find_addr_map(struct thread *thread,
 			   u8 cpumode, enum map_type type, u64 addr,
 			   struct addr_location *al);
diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
new file mode 100644
index 0000000..01c2e86
--- /dev/null
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -0,0 +1,697 @@
+/*
+ * Post mortem Dwarf CFI based unwinding on top of regs and stack dumps.
+ *
+ * Lots of this code has been borrowed or heavily inspired by parts of
+ * the libunwind 0.99 code which are (amongst other contributors I may have
+ * forgotten):
+ *
+ * Copyright (C) 2002-2007 Hewlett-Packard Co
+ *	Contributed by David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * And the bugs have been added by:
+ *
+ * Copyright (C) 2010, Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (C) 2012, Jiri Olsa <jolsa@redhat.com>
+ *
+ */
+
+#include <elf.h>
+#include <gelf.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <linux/list.h>
+#ifndef REMOTE_UNWIND_LIBUNWIND
+#include <libunwind.h>
+#include <libunwind-ptrace.h>
+#endif
+#include "callchain.h"
+#include "thread.h"
+#include "session.h"
+#include "perf_regs.h"
+#include "unwind.h"
+#include "symbol.h"
+#include "util.h"
+#include "debug.h"
+#include "asm/bug.h"
+
+extern int
+UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
+				    unw_word_t ip,
+				    unw_dyn_info_t *di,
+				    unw_proc_info_t *pi,
+				    int need_unwind_info, void *arg);
+
+#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
+
+extern int
+UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
+				 unw_word_t ip,
+				 unw_word_t segbase,
+				 const char *obj_name, unw_word_t start,
+				 unw_word_t end);
+
+#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
+
+#define DW_EH_PE_FORMAT_MASK	0x0f	/* format of the encoded value */
+#define DW_EH_PE_APPL_MASK	0x70	/* how the value is to be applied */
+
+/* Pointer-encoding formats: */
+#define DW_EH_PE_omit		0xff
+#define DW_EH_PE_ptr		0x00	/* pointer-sized unsigned value */
+#define DW_EH_PE_udata4		0x03	/* unsigned 32-bit value */
+#define DW_EH_PE_udata8		0x04	/* unsigned 64-bit value */
+#define DW_EH_PE_sdata4		0x0b	/* signed 32-bit value */
+#define DW_EH_PE_sdata8		0x0c	/* signed 64-bit value */
+
+/* Pointer-encoding application: */
+#define DW_EH_PE_absptr		0x00	/* absolute value */
+#define DW_EH_PE_pcrel		0x10	/* rel. to addr. of encoded value */
+
+/*
+ * The following are not documented by LSB v1.3, yet they are used by
+ * GCC, presumably they aren't documented by LSB since they aren't
+ * used on Linux:
+ */
+#define DW_EH_PE_funcrel	0x40	/* start-of-procedure-relative */
+#define DW_EH_PE_aligned	0x50	/* aligned pointer */
+
+/* Flags intentionally not handled, since they're not needed:
+ * #define DW_EH_PE_indirect      0x80
+ * #define DW_EH_PE_uleb128       0x01
+ * #define DW_EH_PE_udata2        0x02
+ * #define DW_EH_PE_sleb128       0x09
+ * #define DW_EH_PE_sdata2        0x0a
+ * #define DW_EH_PE_textrel       0x20
+ * #define DW_EH_PE_datarel       0x30
+ */
+
+struct unwind_info {
+	struct perf_sample	*sample;
+	struct machine		*machine;
+	struct thread		*thread;
+};
+
+#define dw_read(ptr, type, end) ({	\
+	type *__p = (type *) ptr;	\
+	type  __v;			\
+	if ((__p + 1) > (type *) end)	\
+		return -EINVAL;		\
+	__v = *__p++;			\
+	ptr = (typeof(ptr)) __p;	\
+	__v;				\
+	})
+
+static int __dw_read_encoded_value(u8 **p, u8 *end, u64 *val,
+				   u8 encoding)
+{
+	u8 *cur = *p;
+	*val = 0;
+
+	switch (encoding) {
+	case DW_EH_PE_omit:
+		*val = 0;
+		goto out;
+	case DW_EH_PE_ptr:
+		*val = dw_read(cur, unsigned long, end);
+		goto out;
+	default:
+		break;
+	}
+
+	switch (encoding & DW_EH_PE_APPL_MASK) {
+	case DW_EH_PE_absptr:
+		break;
+	case DW_EH_PE_pcrel:
+		*val = (unsigned long) cur;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if ((encoding & 0x07) == 0x00)
+		encoding |= DW_EH_PE_udata4;
+
+	switch (encoding & DW_EH_PE_FORMAT_MASK) {
+	case DW_EH_PE_sdata4:
+		*val += dw_read(cur, s32, end);
+		break;
+	case DW_EH_PE_udata4:
+		*val += dw_read(cur, u32, end);
+		break;
+	case DW_EH_PE_sdata8:
+		*val += dw_read(cur, s64, end);
+		break;
+	case DW_EH_PE_udata8:
+		*val += dw_read(cur, u64, end);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+ out:
+	*p = cur;
+	return 0;
+}
+
+#define dw_read_encoded_value(ptr, end, enc) ({			\
+	u64 __v;						\
+	if (__dw_read_encoded_value(&ptr, end, &__v, enc)) {	\
+		return -EINVAL;                                 \
+	}                                                       \
+	__v;                                                    \
+	})
+
+static u64 elf_section_offset(int fd, const char *name)
+{
+	Elf *elf;
+	GElf_Ehdr ehdr;
+	GElf_Shdr shdr;
+	u64 offset = 0;
+
+	elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
+	if (elf == NULL)
+		return 0;
+
+	do {
+		if (gelf_getehdr(elf, &ehdr) == NULL)
+			break;
+
+		if (!elf_section_by_name(elf, &ehdr, &shdr, name, NULL))
+			break;
+
+		offset = shdr.sh_offset;
+	} while (0);
+
+	elf_end(elf);
+	return offset;
+}
+
+#ifndef NO_LIBUNWIND_DEBUG_FRAME
+static int elf_is_exec(int fd, const char *name)
+{
+	Elf *elf;
+	GElf_Ehdr ehdr;
+	int retval = 0;
+
+	elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
+	if (elf == NULL)
+		return 0;
+	if (gelf_getehdr(elf, &ehdr) == NULL)
+		goto out;
+
+	retval = (ehdr.e_type == ET_EXEC);
+
+out:
+	elf_end(elf);
+	pr_debug("unwind: elf_is_exec(%s): %d\n", name, retval);
+	return retval;
+}
+#endif
+
+struct table_entry {
+	u32 start_ip_offset;
+	u32 fde_offset;
+};
+
+struct eh_frame_hdr {
+	unsigned char version;
+	unsigned char eh_frame_ptr_enc;
+	unsigned char fde_count_enc;
+	unsigned char table_enc;
+
+	/*
+	 * The rest of the header is variable-length and consists of the
+	 * following members:
+	 *
+	 *	encoded_t eh_frame_ptr;
+	 *	encoded_t fde_count;
+	 */
+
+	/* A single encoded pointer should not be more than 8 bytes. */
+	u64 enc[2];
+
+	/*
+	 * struct {
+	 *    encoded_t start_ip;
+	 *    encoded_t fde_addr;
+	 * } binary_search_table[fde_count];
+	 */
+	char data[0];
+} __packed;
+
+static int unwind_spec_ehframe(struct dso *dso, struct machine *machine,
+			       u64 offset, u64 *table_data, u64 *segbase,
+			       u64 *fde_count)
+{
+	struct eh_frame_hdr hdr;
+	u8 *enc = (u8 *) &hdr.enc;
+	u8 *end = (u8 *) &hdr.data;
+	ssize_t r;
+
+	r = dso__data_read_offset(dso, machine, offset,
+				  (u8 *) &hdr, sizeof(hdr));
+	if (r != sizeof(hdr))
+		return -EINVAL;
+
+	/* We don't need eh_frame_ptr, just skip it. */
+	dw_read_encoded_value(enc, end, hdr.eh_frame_ptr_enc);
+
+	*fde_count  = dw_read_encoded_value(enc, end, hdr.fde_count_enc);
+	*segbase    = offset;
+	*table_data = (enc - (u8 *) &hdr) + offset;
+	return 0;
+}
+
+static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine,
+				     u64 *table_data, u64 *segbase,
+				     u64 *fde_count)
+{
+	int ret = -EINVAL, fd;
+	u64 offset = dso->data.eh_frame_hdr_offset;
+
+	if (offset == 0) {
+		fd = dso__data_get_fd(dso, machine);
+		if (fd < 0)
+			return -EINVAL;
+
+		/* Check the .eh_frame section for unwinding info */
+		offset = elf_section_offset(fd, ".eh_frame_hdr");
+		dso->data.eh_frame_hdr_offset = offset;
+		dso__data_put_fd(dso);
+	}
+
+	if (offset)
+		ret = unwind_spec_ehframe(dso, machine, offset,
+					  table_data, segbase,
+					  fde_count);
+
+	return ret;
+}
+
+#ifndef NO_LIBUNWIND_DEBUG_FRAME
+static int read_unwind_spec_debug_frame(struct dso *dso,
+					struct machine *machine, u64 *offset)
+{
+	int fd;
+	u64 ofs = dso->data.debug_frame_offset;
+
+	if (ofs == 0) {
+		fd = dso__data_get_fd(dso, machine);
+		if (fd < 0)
+			return -EINVAL;
+
+		/* Check the .debug_frame section for unwinding info */
+		ofs = elf_section_offset(fd, ".debug_frame");
+		dso->data.debug_frame_offset = ofs;
+		dso__data_put_fd(dso);
+	}
+
+	*offset = ofs;
+	if (*offset)
+		return 0;
+
+	return -EINVAL;
+}
+#endif
+
+static struct map *find_map(unw_word_t ip, struct unwind_info *ui)
+{
+	struct addr_location al;
+
+	thread__find_addr_map(ui->thread, PERF_RECORD_MISC_USER,
+			      MAP__FUNCTION, ip, &al);
+	if (!al.map) {
+		/*
+		 * We've seen cases (softice) where the DWARF unwinder went
+		 * through non-executable mmaps, which we need to look up
+		 * in the MAP__VARIABLE tree.
+		 */
+		thread__find_addr_map(ui->thread, PERF_RECORD_MISC_USER,
+				      MAP__VARIABLE, ip, &al);
+	}
+	return al.map;
+}
+
+static int
+find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi,
+	       int need_unwind_info, void *arg)
+{
+	struct unwind_info *ui = arg;
+	struct map *map;
+	unw_dyn_info_t di;
+	u64 table_data, segbase, fde_count;
+	int ret = -EINVAL;
+
+	map = find_map(ip, ui);
+	if (!map || !map->dso)
+		return -EINVAL;
+
+	pr_debug("unwind: find_proc_info dso %s\n", map->dso->name);
+
+	/* Check the .eh_frame section for unwinding info */
+	if (!read_unwind_spec_eh_frame(map->dso, ui->machine,
+				       &table_data, &segbase, &fde_count)) {
+		memset(&di, 0, sizeof(di));
+		di.format   = UNW_INFO_FORMAT_REMOTE_TABLE;
+		di.start_ip = map->start;
+		di.end_ip   = map->end;
+		di.u.rti.segbase    = map->start + segbase;
+		di.u.rti.table_data = map->start + table_data;
+		di.u.rti.table_len  = fde_count * sizeof(struct table_entry)
+				      / sizeof(unw_word_t);
+		ret = dwarf_search_unwind_table(as, ip, &di, pi,
+						need_unwind_info, arg);
+	}
+
+#ifndef NO_LIBUNWIND_DEBUG_FRAME
+	/* Check the .debug_frame section for unwinding info */
+	if (ret < 0 &&
+	    !read_unwind_spec_debug_frame(map->dso, ui->machine, &segbase)) {
+		int fd = dso__data_get_fd(map->dso, ui->machine);
+		int is_exec = elf_is_exec(fd, map->dso->name);
+		unw_word_t base = is_exec ? 0 : map->start;
+		const char *symfile;
+
+		if (fd >= 0)
+			dso__data_put_fd(map->dso);
+
+		symfile = map->dso->symsrc_filename ?: map->dso->name;
+
+		memset(&di, 0, sizeof(di));
+		if (dwarf_find_debug_frame(0, &di, ip, base, symfile,
+					   map->start, map->end))
+			return dwarf_search_unwind_table(as, ip, &di, pi,
+							 need_unwind_info, arg);
+	}
+#endif
+
+	return ret;
+}
+
+static int access_fpreg(unw_addr_space_t __maybe_unused as,
+			unw_regnum_t __maybe_unused num,
+			unw_fpreg_t __maybe_unused *val,
+			int __maybe_unused __write,
+			void __maybe_unused *arg)
+{
+	pr_err("unwind: access_fpreg unsupported\n");
+	return -UNW_EINVAL;
+}
+
+static int get_dyn_info_list_addr(unw_addr_space_t __maybe_unused as,
+				  unw_word_t __maybe_unused *dil_addr,
+				  void __maybe_unused *arg)
+{
+	return -UNW_ENOINFO;
+}
+
+static int resume(unw_addr_space_t __maybe_unused as,
+		  unw_cursor_t __maybe_unused *cu,
+		  void __maybe_unused *arg)
+{
+	pr_err("unwind: resume unsupported\n");
+	return -UNW_EINVAL;
+}
+
+static int
+get_proc_name(unw_addr_space_t __maybe_unused as,
+	      unw_word_t __maybe_unused addr,
+		char __maybe_unused *bufp, size_t __maybe_unused buf_len,
+		unw_word_t __maybe_unused *offp, void __maybe_unused *arg)
+{
+	pr_err("unwind: get_proc_name unsupported\n");
+	return -UNW_EINVAL;
+}
+
+static int access_dso_mem(struct unwind_info *ui, unw_word_t addr,
+			  unw_word_t *data)
+{
+	struct map *map;
+	ssize_t size;
+
+	map = find_map(addr, ui);
+	if (!map) {
+		pr_debug("unwind: no map for %lx\n", (unsigned long)addr);
+		return -1;
+	}
+
+	if (!map->dso)
+		return -1;
+
+	size = dso__data_read_addr(map->dso, map, ui->machine,
+				   addr, (u8 *) data, sizeof(*data));
+
+	return !(size == sizeof(*data));
+}
+
+static int access_mem(unw_addr_space_t __maybe_unused as,
+		      unw_word_t addr, unw_word_t *valp,
+		      int __write, void *arg)
+{
+	struct unwind_info *ui = arg;
+	struct stack_dump *stack = &ui->sample->user_stack;
+	u64 start, end;
+	int offset;
+	int ret;
+
+	/* Don't support write, probably not needed. */
+	if (__write || !stack || !ui->sample->user_regs.regs) {
+		*valp = 0;
+		return 0;
+	}
+
+	ret = perf_reg_value(&start, &ui->sample->user_regs, PERF_REG_SP);
+	if (ret)
+		return ret;
+
+	end = start + stack->size;
+
+	/* Check overflow. */
+	if (addr + sizeof(unw_word_t) < addr)
+		return -EINVAL;
+
+	if (addr < start || addr + sizeof(unw_word_t) >= end) {
+		ret = access_dso_mem(ui, addr, valp);
+		if (ret) {
+			pr_debug("unwind: access_mem %p not inside range"
+				 " 0x%" PRIx64 "-0x%" PRIx64 "\n",
+				 (void *) (uintptr_t) addr, start, end);
+			*valp = 0;
+			return ret;
+		}
+		return 0;
+	}
+
+	offset = addr - start;
+	*valp  = *(unw_word_t *)&stack->data[offset];
+	pr_debug("unwind: access_mem addr %p val %lx, offset %d\n",
+		 (void *) (uintptr_t) addr, (unsigned long)*valp, offset);
+	return 0;
+}
+
+static int access_reg(unw_addr_space_t __maybe_unused as,
+		      unw_regnum_t regnum, unw_word_t *valp,
+		      int __write, void *arg)
+{
+	struct unwind_info *ui = arg;
+	int id, ret;
+	u64 val;
+
+	/* Don't support write, I suspect we don't need it. */
+	if (__write) {
+		pr_err("unwind: access_reg w %d\n", regnum);
+		return 0;
+	}
+
+	if (!ui->sample->user_regs.regs) {
+		*valp = 0;
+		return 0;
+	}
+
+	id = LIBUNWIND__ARCH_REG_ID(regnum);
+	if (id < 0)
+		return -EINVAL;
+
+	ret = perf_reg_value(&val, &ui->sample->user_regs, id);
+	if (ret) {
+		pr_err("unwind: can't read reg %d\n", regnum);
+		return ret;
+	}
+
+	*valp = (unw_word_t) val;
+	pr_debug("unwind: reg %d, val %lx\n", regnum, (unsigned long)*valp);
+	return 0;
+}
+
+static void put_unwind_info(unw_addr_space_t __maybe_unused as,
+			    unw_proc_info_t *pi __maybe_unused,
+			    void *arg __maybe_unused)
+{
+	pr_debug("unwind: put_unwind_info called\n");
+}
+
+static int entry(u64 ip, struct thread *thread,
+		 unwind_entry_cb_t cb, void *arg)
+{
+	struct unwind_entry e;
+	struct addr_location al;
+
+	thread__find_addr_location(thread, PERF_RECORD_MISC_USER,
+				   MAP__FUNCTION, ip, &al);
+
+	e.ip = ip;
+	e.map = al.map;
+	e.sym = al.sym;
+
+	pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n",
+		 al.sym ? al.sym->name : "''",
+		 ip,
+		 al.map ? al.map->map_ip(al.map, ip) : (u64) 0);
+
+	return cb(&e, arg);
+}
+
+static void display_error(int err)
+{
+	switch (err) {
+	case UNW_EINVAL:
+		pr_err("unwind: Only supports local.\n");
+		break;
+	case UNW_EUNSPEC:
+		pr_err("unwind: Unspecified error.\n");
+		break;
+	case UNW_EBADREG:
+		pr_err("unwind: Register unavailable.\n");
+		break;
+	default:
+		break;
+	}
+}
+
+static unw_accessors_t accessors = {
+	.find_proc_info		= find_proc_info,
+	.put_unwind_info	= put_unwind_info,
+	.get_dyn_info_list_addr	= get_dyn_info_list_addr,
+	.access_mem		= access_mem,
+	.access_reg		= access_reg,
+	.access_fpreg		= access_fpreg,
+	.resume			= resume,
+	.get_proc_name		= get_proc_name,
+};
+
+static int _unwind__prepare_access(struct thread *thread)
+{
+	if (callchain_param.record_mode != CALLCHAIN_DWARF)
+		return 0;
+
+	thread->addr_space = unw_create_addr_space(&accessors, 0);
+	if (!thread->addr_space) {
+		pr_err("unwind: Can't create unwind address space.\n");
+		return -ENOMEM;
+	}
+
+	unw_set_caching_policy(thread->addr_space, UNW_CACHE_GLOBAL);
+	return 0;
+}
+
+static void _unwind__flush_access(struct thread *thread)
+{
+	if (callchain_param.record_mode != CALLCHAIN_DWARF)
+		return;
+
+	unw_flush_cache(thread->addr_space, 0, 0);
+}
+
+static void _unwind__finish_access(struct thread *thread)
+{
+	if (callchain_param.record_mode != CALLCHAIN_DWARF)
+		return;
+
+	unw_destroy_addr_space(thread->addr_space);
+}
+
+static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
+		       void *arg, int max_stack)
+{
+	u64 val;
+	unw_word_t ips[max_stack];
+	unw_addr_space_t addr_space;
+	unw_cursor_t c;
+	int ret, i = 0;
+
+	ret = perf_reg_value(&val, &ui->sample->user_regs, PERF_REG_IP);
+	if (ret)
+		return ret;
+
+	ips[i++] = (unw_word_t) val;
+
+	/*
+	 * If we need more than one entry, do the DWARF
+	 * unwind itself.
+	 */
+	if (max_stack - 1 > 0) {
+		WARN_ONCE(!ui->thread, "WARNING: ui->thread is NULL");
+		addr_space = ui->thread->addr_space;
+
+		if (addr_space == NULL)
+			return -1;
+
+		ret = unw_init_remote(&c, addr_space, ui);
+		if (ret)
+			display_error(ret);
+
+		while (!ret && (unw_step(&c) > 0) && i < max_stack) {
+			unw_get_reg(&c, UNW_REG_IP, &ips[i]);
+			++i;
+		}
+
+		max_stack = i;
+	}
+
+	/*
+	 * Display what we got, based on the configured callchain order.
+	 */
+	for (i = 0; i < max_stack && !ret; i++) {
+		int j = i;
+
+		if (callchain_param.order == ORDER_CALLER)
+			j = max_stack - i - 1;
+		ret = ips[j] ? entry(ips[j], ui->thread, cb, arg) : 0;
+	}
+
+	return ret;
+}
+
+static int _unwind__get_entries(unwind_entry_cb_t cb, void *arg,
+			struct thread *thread,
+			struct perf_sample *data, int max_stack)
+{
+	struct unwind_info ui = {
+		.sample       = data,
+		.thread       = thread,
+		.machine      = thread->mg->machine,
+	};
+
+	if (!data->user_regs.regs)
+		return -EINVAL;
+
+	if (max_stack <= 0)
+		return -EINVAL;
+
+	return get_entries(&ui, cb, arg, max_stack);
+}
+
+static struct unwind_libunwind_ops
+_unwind_libunwind_ops = {
+	.prepare_access = _unwind__prepare_access,
+	.flush_access   = _unwind__flush_access,
+	.finish_access  = _unwind__finish_access,
+	.get_entries    = _unwind__get_entries,
+};
+
+#ifndef REMOTE_UNWIND_LIBUNWIND
+struct unwind_libunwind_ops *
+local_unwind_libunwind_ops = &_unwind_libunwind_ops;
+#endif
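The DW_EH_PE_* values above combine a format nibble with an application nibble. A stripped-down standalone decoder for the 4-byte cases, following the same semantics as __dw_read_encoded_value() (host byte order assumed, as for same-endian data files):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define DW_EH_PE_pcrel  0x10
    #define DW_EH_PE_sdata4 0x0b

    /* Decode a 4-byte encoded value at *p: the low nibble selects the
     * format, the high nibble how it is applied (pcrel = relative to
     * the encoded value's own address). */
    static uint64_t decode4(uint8_t **p, uint8_t enc)
    {
        uint64_t base = (enc & 0x70) == DW_EH_PE_pcrel ?
                        (uintptr_t)*p : 0;
        uint64_t v;

        if ((enc & 0x0f) == DW_EH_PE_sdata4) {
            int32_t s;
            memcpy(&s, *p, 4);
            v = base + (int64_t)s;
        } else {                      /* udata4 */
            uint32_t u;
            memcpy(&u, *p, 4);
            v = base + u;
        }
        *p += 4;
        return v;
    }

    int main(void)
    {
        uint8_t buf[4] = { 0x10, 0, 0, 0 };   /* 16, little-endian */
        uint8_t *cur = buf;

        printf("%llu\n",
               (unsigned long long)decode4(&cur, 0x03 /* udata4 */));
        return 0;
    }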
diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c
index 63687d3..8547119 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -1,682 +1,76 @@
-/*
- * Post mortem Dwarf CFI based unwinding on top of regs and stack dumps.
- *
- * Lots of this code have been borrowed or heavily inspired from parts of
- * the libunwind 0.99 code which are (amongst other contributors I may have
- * forgotten):
- *
- * Copyright (C) 2002-2007 Hewlett-Packard Co
- *	Contributed by David Mosberger-Tang <davidm@hpl.hp.com>
- *
- * And the bugs have been added by:
- *
- * Copyright (C) 2010, Frederic Weisbecker <fweisbec@gmail.com>
- * Copyright (C) 2012, Jiri Olsa <jolsa@redhat.com>
- *
- */
-
-#include <elf.h>
-#include <gelf.h>
-#include <fcntl.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <linux/list.h>
-#include <libunwind.h>
-#include <libunwind-ptrace.h>
-#include "callchain.h"
+#include "unwind.h"
 #include "thread.h"
 #include "session.h"
-#include "perf_regs.h"
-#include "unwind.h"
-#include "symbol.h"
-#include "util.h"
 #include "debug.h"
-#include "asm/bug.h"
+#include "arch/common.h"
 
-extern int
-UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
-				    unw_word_t ip,
-				    unw_dyn_info_t *di,
-				    unw_proc_info_t *pi,
-				    int need_unwind_info, void *arg);
+struct unwind_libunwind_ops __weak *local_unwind_libunwind_ops;
+struct unwind_libunwind_ops __weak *x86_32_unwind_libunwind_ops;
+struct unwind_libunwind_ops __weak *arm64_unwind_libunwind_ops;
 
-#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
-
-extern int
-UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
-				 unw_word_t ip,
-				 unw_word_t segbase,
-				 const char *obj_name, unw_word_t start,
-				 unw_word_t end);
-
-#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
-
-#define DW_EH_PE_FORMAT_MASK	0x0f	/* format of the encoded value */
-#define DW_EH_PE_APPL_MASK	0x70	/* how the value is to be applied */
-
-/* Pointer-encoding formats: */
-#define DW_EH_PE_omit		0xff
-#define DW_EH_PE_ptr		0x00	/* pointer-sized unsigned value */
-#define DW_EH_PE_udata4		0x03	/* unsigned 32-bit value */
-#define DW_EH_PE_udata8		0x04	/* unsigned 64-bit value */
-#define DW_EH_PE_sdata4		0x0b	/* signed 32-bit value */
-#define DW_EH_PE_sdata8		0x0c	/* signed 64-bit value */
-
-/* Pointer-encoding application: */
-#define DW_EH_PE_absptr		0x00	/* absolute value */
-#define DW_EH_PE_pcrel		0x10	/* rel. to addr. of encoded value */
-
-/*
- * The following are not documented by LSB v1.3, yet they are used by
- * GCC, presumably they aren't documented by LSB since they aren't
- * used on Linux:
- */
-#define DW_EH_PE_funcrel	0x40	/* start-of-procedure-relative */
-#define DW_EH_PE_aligned	0x50	/* aligned pointer */
-
-/* Flags intentionaly not handled, since they're not needed:
- * #define DW_EH_PE_indirect      0x80
- * #define DW_EH_PE_uleb128       0x01
- * #define DW_EH_PE_udata2        0x02
- * #define DW_EH_PE_sleb128       0x09
- * #define DW_EH_PE_sdata2        0x0a
- * #define DW_EH_PE_textrel       0x20
- * #define DW_EH_PE_datarel       0x30
- */
-
-struct unwind_info {
-	struct perf_sample	*sample;
-	struct machine		*machine;
-	struct thread		*thread;
-};
-
-#define dw_read(ptr, type, end) ({	\
-	type *__p = (type *) ptr;	\
-	type  __v;			\
-	if ((__p + 1) > (type *) end)	\
-		return -EINVAL;		\
-	__v = *__p++;			\
-	ptr = (typeof(ptr)) __p;	\
-	__v;				\
-	})
-
-static int __dw_read_encoded_value(u8 **p, u8 *end, u64 *val,
-				   u8 encoding)
+static void unwind__register_ops(struct thread *thread,
+			  struct unwind_libunwind_ops *ops)
 {
-	u8 *cur = *p;
-	*val = 0;
-
-	switch (encoding) {
-	case DW_EH_PE_omit:
-		*val = 0;
-		goto out;
-	case DW_EH_PE_ptr:
-		*val = dw_read(cur, unsigned long, end);
-		goto out;
-	default:
-		break;
-	}
-
-	switch (encoding & DW_EH_PE_APPL_MASK) {
-	case DW_EH_PE_absptr:
-		break;
-	case DW_EH_PE_pcrel:
-		*val = (unsigned long) cur;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	if ((encoding & 0x07) == 0x00)
-		encoding |= DW_EH_PE_udata4;
-
-	switch (encoding & DW_EH_PE_FORMAT_MASK) {
-	case DW_EH_PE_sdata4:
-		*val += dw_read(cur, s32, end);
-		break;
-	case DW_EH_PE_udata4:
-		*val += dw_read(cur, u32, end);
-		break;
-	case DW_EH_PE_sdata8:
-		*val += dw_read(cur, s64, end);
-		break;
-	case DW_EH_PE_udata8:
-		*val += dw_read(cur, u64, end);
-		break;
-	default:
-		return -EINVAL;
-	}
-
- out:
-	*p = cur;
-	return 0;
+	thread->unwind_libunwind_ops = ops;
 }
 
-#define dw_read_encoded_value(ptr, end, enc) ({			\
-	u64 __v;						\
-	if (__dw_read_encoded_value(&ptr, end, &__v, enc)) {	\
-		return -EINVAL;                                 \
-	}                                                       \
-	__v;                                                    \
-	})
-
-static u64 elf_section_offset(int fd, const char *name)
+int unwind__prepare_access(struct thread *thread, struct map *map)
 {
-	Elf *elf;
-	GElf_Ehdr ehdr;
-	GElf_Shdr shdr;
-	u64 offset = 0;
+	const char *arch;
+	enum dso_type dso_type;
+	struct unwind_libunwind_ops *ops = local_unwind_libunwind_ops;
 
-	elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
-	if (elf == NULL)
+	if (thread->addr_space) {
+		pr_debug("unwind: thread map already set, dso=%s\n",
+			 map->dso->name);
+		return 0;
+	}
+
+	/* env->arch is NULL for live-mode (i.e. perf top) */
+	if (!thread->mg->machine->env || !thread->mg->machine->env->arch)
+		goto out_register;
+
+	dso_type = dso__type(map->dso, thread->mg->machine);
+	if (dso_type == DSO__TYPE_UNKNOWN)
 		return 0;
 
-	do {
-		if (gelf_getehdr(elf, &ehdr) == NULL)
-			break;
+	arch = normalize_arch(thread->mg->machine->env->arch);
 
-		if (!elf_section_by_name(elf, &ehdr, &shdr, name, NULL))
-			break;
-
-		offset = shdr.sh_offset;
-	} while (0);
-
-	elf_end(elf);
-	return offset;
-}
-
-#ifndef NO_LIBUNWIND_DEBUG_FRAME
-static int elf_is_exec(int fd, const char *name)
-{
-	Elf *elf;
-	GElf_Ehdr ehdr;
-	int retval = 0;
-
-	elf = elf_begin(fd, PERF_ELF_C_READ_MMAP, NULL);
-	if (elf == NULL)
-		return 0;
-	if (gelf_getehdr(elf, &ehdr) == NULL)
-		goto out;
-
-	retval = (ehdr.e_type == ET_EXEC);
-
-out:
-	elf_end(elf);
-	pr_debug("unwind: elf_is_exec(%s): %d\n", name, retval);
-	return retval;
-}
-#endif
-
-struct table_entry {
-	u32 start_ip_offset;
-	u32 fde_offset;
-};
-
-struct eh_frame_hdr {
-	unsigned char version;
-	unsigned char eh_frame_ptr_enc;
-	unsigned char fde_count_enc;
-	unsigned char table_enc;
-
-	/*
-	 * The rest of the header is variable-length and consists of the
-	 * following members:
-	 *
-	 *	encoded_t eh_frame_ptr;
-	 *	encoded_t fde_count;
-	 */
-
-	/* A single encoded pointer should not be more than 8 bytes. */
-	u64 enc[2];
-
-	/*
-	 * struct {
-	 *    encoded_t start_ip;
-	 *    encoded_t fde_addr;
-	 * } binary_search_table[fde_count];
-	 */
-	char data[0];
-} __packed;
-
-static int unwind_spec_ehframe(struct dso *dso, struct machine *machine,
-			       u64 offset, u64 *table_data, u64 *segbase,
-			       u64 *fde_count)
-{
-	struct eh_frame_hdr hdr;
-	u8 *enc = (u8 *) &hdr.enc;
-	u8 *end = (u8 *) &hdr.data;
-	ssize_t r;
-
-	r = dso__data_read_offset(dso, machine, offset,
-				  (u8 *) &hdr, sizeof(hdr));
-	if (r != sizeof(hdr))
-		return -EINVAL;
-
-	/* We dont need eh_frame_ptr, just skip it. */
-	dw_read_encoded_value(enc, end, hdr.eh_frame_ptr_enc);
-
-	*fde_count  = dw_read_encoded_value(enc, end, hdr.fde_count_enc);
-	*segbase    = offset;
-	*table_data = (enc - (u8 *) &hdr) + offset;
-	return 0;
-}
-
-static int read_unwind_spec_eh_frame(struct dso *dso, struct machine *machine,
-				     u64 *table_data, u64 *segbase,
-				     u64 *fde_count)
-{
-	int ret = -EINVAL, fd;
-	u64 offset = dso->data.eh_frame_hdr_offset;
-
-	if (offset == 0) {
-		fd = dso__data_get_fd(dso, machine);
-		if (fd < 0)
-			return -EINVAL;
-
-		/* Check the .eh_frame section for unwinding info */
-		offset = elf_section_offset(fd, ".eh_frame_hdr");
-		dso->data.eh_frame_hdr_offset = offset;
-		dso__data_put_fd(dso);
+	if (!strcmp(arch, "x86")) {
+		if (dso_type != DSO__TYPE_64BIT)
+			ops = x86_32_unwind_libunwind_ops;
+	} else if (!strcmp(arch, "arm64") || !strcmp(arch, "arm")) {
+		if (dso_type == DSO__TYPE_64BIT)
+			ops = arm64_unwind_libunwind_ops;
 	}
 
-	if (offset)
-		ret = unwind_spec_ehframe(dso, machine, offset,
-					  table_data, segbase,
-					  fde_count);
-
-	return ret;
-}
-
-#ifndef NO_LIBUNWIND_DEBUG_FRAME
-static int read_unwind_spec_debug_frame(struct dso *dso,
-					struct machine *machine, u64 *offset)
-{
-	int fd;
-	u64 ofs = dso->data.debug_frame_offset;
-
-	if (ofs == 0) {
-		fd = dso__data_get_fd(dso, machine);
-		if (fd < 0)
-			return -EINVAL;
-
-		/* Check the .debug_frame section for unwinding info */
-		ofs = elf_section_offset(fd, ".debug_frame");
-		dso->data.debug_frame_offset = ofs;
-		dso__data_put_fd(dso);
-	}
-
-	*offset = ofs;
-	if (*offset)
-		return 0;
-
-	return -EINVAL;
-}
-#endif
-
-static struct map *find_map(unw_word_t ip, struct unwind_info *ui)
-{
-	struct addr_location al;
-
-	thread__find_addr_map(ui->thread, PERF_RECORD_MISC_USER,
-			      MAP__FUNCTION, ip, &al);
-	if (!al.map) {
-		/*
-		 * We've seen cases (softice) where DWARF unwinder went
-		 * through non executable mmaps, which we need to lookup
-		 * in MAP__VARIABLE tree.
-		 */
-		thread__find_addr_map(ui->thread, PERF_RECORD_MISC_USER,
-				      MAP__VARIABLE, ip, &al);
-	}
-	return al.map;
-}
-
-static int
-find_proc_info(unw_addr_space_t as, unw_word_t ip, unw_proc_info_t *pi,
-	       int need_unwind_info, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct map *map;
-	unw_dyn_info_t di;
-	u64 table_data, segbase, fde_count;
-	int ret = -EINVAL;
-
-	map = find_map(ip, ui);
-	if (!map || !map->dso)
-		return -EINVAL;
-
-	pr_debug("unwind: find_proc_info dso %s\n", map->dso->name);
-
-	/* Check the .eh_frame section for unwinding info */
-	if (!read_unwind_spec_eh_frame(map->dso, ui->machine,
-				       &table_data, &segbase, &fde_count)) {
-		memset(&di, 0, sizeof(di));
-		di.format   = UNW_INFO_FORMAT_REMOTE_TABLE;
-		di.start_ip = map->start;
-		di.end_ip   = map->end;
-		di.u.rti.segbase    = map->start + segbase;
-		di.u.rti.table_data = map->start + table_data;
-		di.u.rti.table_len  = fde_count * sizeof(struct table_entry)
-				      / sizeof(unw_word_t);
-		ret = dwarf_search_unwind_table(as, ip, &di, pi,
-						need_unwind_info, arg);
-	}
-
-#ifndef NO_LIBUNWIND_DEBUG_FRAME
-	/* Check the .debug_frame section for unwinding info */
-	if (ret < 0 &&
-	    !read_unwind_spec_debug_frame(map->dso, ui->machine, &segbase)) {
-		int fd = dso__data_get_fd(map->dso, ui->machine);
-		int is_exec = elf_is_exec(fd, map->dso->name);
-		unw_word_t base = is_exec ? 0 : map->start;
-		const char *symfile;
-
-		if (fd >= 0)
-			dso__data_put_fd(map->dso);
-
-		symfile = map->dso->symsrc_filename ?: map->dso->name;
-
-		memset(&di, 0, sizeof(di));
-		if (dwarf_find_debug_frame(0, &di, ip, base, symfile,
-					   map->start, map->end))
-			return dwarf_search_unwind_table(as, ip, &di, pi,
-							 need_unwind_info, arg);
-	}
-#endif
-
-	return ret;
-}
-
-static int access_fpreg(unw_addr_space_t __maybe_unused as,
-			unw_regnum_t __maybe_unused num,
-			unw_fpreg_t __maybe_unused *val,
-			int __maybe_unused __write,
-			void __maybe_unused *arg)
-{
-	pr_err("unwind: access_fpreg unsupported\n");
-	return -UNW_EINVAL;
-}
-
-static int get_dyn_info_list_addr(unw_addr_space_t __maybe_unused as,
-				  unw_word_t __maybe_unused *dil_addr,
-				  void __maybe_unused *arg)
-{
-	return -UNW_ENOINFO;
-}
-
-static int resume(unw_addr_space_t __maybe_unused as,
-		  unw_cursor_t __maybe_unused *cu,
-		  void __maybe_unused *arg)
-{
-	pr_err("unwind: resume unsupported\n");
-	return -UNW_EINVAL;
-}
-
-static int
-get_proc_name(unw_addr_space_t __maybe_unused as,
-	      unw_word_t __maybe_unused addr,
-		char __maybe_unused *bufp, size_t __maybe_unused buf_len,
-		unw_word_t __maybe_unused *offp, void __maybe_unused *arg)
-{
-	pr_err("unwind: get_proc_name unsupported\n");
-	return -UNW_EINVAL;
-}
-
-static int access_dso_mem(struct unwind_info *ui, unw_word_t addr,
-			  unw_word_t *data)
-{
-	struct map *map;
-	ssize_t size;
-
-	map = find_map(addr, ui);
-	if (!map) {
-		pr_debug("unwind: no map for %lx\n", (unsigned long)addr);
+	if (!ops) {
+		pr_err("unwind: target platform=%s is not supported\n", arch);
 		return -1;
 	}
+out_register:
+	unwind__register_ops(thread, ops);
 
-	if (!map->dso)
-		return -1;
-
-	size = dso__data_read_addr(map->dso, map, ui->machine,
-				   addr, (u8 *) data, sizeof(*data));
-
-	return !(size == sizeof(*data));
-}
-
-static int access_mem(unw_addr_space_t __maybe_unused as,
-		      unw_word_t addr, unw_word_t *valp,
-		      int __write, void *arg)
-{
-	struct unwind_info *ui = arg;
-	struct stack_dump *stack = &ui->sample->user_stack;
-	u64 start, end;
-	int offset;
-	int ret;
-
-	/* Don't support write, probably not needed. */
-	if (__write || !stack || !ui->sample->user_regs.regs) {
-		*valp = 0;
-		return 0;
-	}
-
-	ret = perf_reg_value(&start, &ui->sample->user_regs, PERF_REG_SP);
-	if (ret)
-		return ret;
-
-	end = start + stack->size;
-
-	/* Check overflow. */
-	if (addr + sizeof(unw_word_t) < addr)
-		return -EINVAL;
-
-	if (addr < start || addr + sizeof(unw_word_t) >= end) {
-		ret = access_dso_mem(ui, addr, valp);
-		if (ret) {
-			pr_debug("unwind: access_mem %p not inside range"
-				 " 0x%" PRIx64 "-0x%" PRIx64 "\n",
-				 (void *) (uintptr_t) addr, start, end);
-			*valp = 0;
-			return ret;
-		}
-		return 0;
-	}
-
-	offset = addr - start;
-	*valp  = *(unw_word_t *)&stack->data[offset];
-	pr_debug("unwind: access_mem addr %p val %lx, offset %d\n",
-		 (void *) (uintptr_t) addr, (unsigned long)*valp, offset);
-	return 0;
-}
-
-static int access_reg(unw_addr_space_t __maybe_unused as,
-		      unw_regnum_t regnum, unw_word_t *valp,
-		      int __write, void *arg)
-{
-	struct unwind_info *ui = arg;
-	int id, ret;
-	u64 val;
-
-	/* Don't support write, I suspect we don't need it. */
-	if (__write) {
-		pr_err("unwind: access_reg w %d\n", regnum);
-		return 0;
-	}
-
-	if (!ui->sample->user_regs.regs) {
-		*valp = 0;
-		return 0;
-	}
-
-	id = libunwind__arch_reg_id(regnum);
-	if (id < 0)
-		return -EINVAL;
-
-	ret = perf_reg_value(&val, &ui->sample->user_regs, id);
-	if (ret) {
-		pr_err("unwind: can't read reg %d\n", regnum);
-		return ret;
-	}
-
-	*valp = (unw_word_t) val;
-	pr_debug("unwind: reg %d, val %lx\n", regnum, (unsigned long)*valp);
-	return 0;
-}
-
-static void put_unwind_info(unw_addr_space_t __maybe_unused as,
-			    unw_proc_info_t *pi __maybe_unused,
-			    void *arg __maybe_unused)
-{
-	pr_debug("unwind: put_unwind_info called\n");
-}
-
-static int entry(u64 ip, struct thread *thread,
-		 unwind_entry_cb_t cb, void *arg)
-{
-	struct unwind_entry e;
-	struct addr_location al;
-
-	thread__find_addr_location(thread, PERF_RECORD_MISC_USER,
-				   MAP__FUNCTION, ip, &al);
-
-	e.ip = ip;
-	e.map = al.map;
-	e.sym = al.sym;
-
-	pr_debug("unwind: %s:ip = 0x%" PRIx64 " (0x%" PRIx64 ")\n",
-		 al.sym ? al.sym->name : "''",
-		 ip,
-		 al.map ? al.map->map_ip(al.map, ip) : (u64) 0);
-
-	return cb(&e, arg);
-}
-
-static void display_error(int err)
-{
-	switch (err) {
-	case UNW_EINVAL:
-		pr_err("unwind: Only supports local.\n");
-		break;
-	case UNW_EUNSPEC:
-		pr_err("unwind: Unspecified error.\n");
-		break;
-	case UNW_EBADREG:
-		pr_err("unwind: Register unavailable.\n");
-		break;
-	default:
-		break;
-	}
-}
-
-static unw_accessors_t accessors = {
-	.find_proc_info		= find_proc_info,
-	.put_unwind_info	= put_unwind_info,
-	.get_dyn_info_list_addr	= get_dyn_info_list_addr,
-	.access_mem		= access_mem,
-	.access_reg		= access_reg,
-	.access_fpreg		= access_fpreg,
-	.resume			= resume,
-	.get_proc_name		= get_proc_name,
-};
-
-int unwind__prepare_access(struct thread *thread)
-{
-	if (callchain_param.record_mode != CALLCHAIN_DWARF)
-		return 0;
-
-	thread->addr_space = unw_create_addr_space(&accessors, 0);
-	if (!thread->addr_space) {
-		pr_err("unwind: Can't create unwind address space.\n");
-		return -ENOMEM;
-	}
-
-	unw_set_caching_policy(thread->addr_space, UNW_CACHE_GLOBAL);
-	return 0;
+	return thread->unwind_libunwind_ops->prepare_access(thread);
 }
 
 void unwind__flush_access(struct thread *thread)
 {
-	if (callchain_param.record_mode != CALLCHAIN_DWARF)
-		return;
-
-	unw_flush_cache(thread->addr_space, 0, 0);
+	if (thread->unwind_libunwind_ops)
+		thread->unwind_libunwind_ops->flush_access(thread);
 }
 
 void unwind__finish_access(struct thread *thread)
 {
-	if (callchain_param.record_mode != CALLCHAIN_DWARF)
-		return;
-
-	unw_destroy_addr_space(thread->addr_space);
-}
-
-static int get_entries(struct unwind_info *ui, unwind_entry_cb_t cb,
-		       void *arg, int max_stack)
-{
-	u64 val;
-	unw_word_t ips[max_stack];
-	unw_addr_space_t addr_space;
-	unw_cursor_t c;
-	int ret, i = 0;
-
-	ret = perf_reg_value(&val, &ui->sample->user_regs, PERF_REG_IP);
-	if (ret)
-		return ret;
-
-	ips[i++] = (unw_word_t) val;
-
-	/*
-	 * If we need more than one entry, do the DWARF
-	 * unwind itself.
-	 */
-	if (max_stack - 1 > 0) {
-		WARN_ONCE(!ui->thread, "WARNING: ui->thread is NULL");
-		addr_space = ui->thread->addr_space;
-
-		if (addr_space == NULL)
-			return -1;
-
-		ret = unw_init_remote(&c, addr_space, ui);
-		if (ret)
-			display_error(ret);
-
-		while (!ret && (unw_step(&c) > 0) && i < max_stack) {
-			unw_get_reg(&c, UNW_REG_IP, &ips[i]);
-			++i;
-		}
-
-		max_stack = i;
-	}
-
-	/*
-	 * Display what we got based on the order setup.
-	 */
-	for (i = 0; i < max_stack && !ret; i++) {
-		int j = i;
-
-		if (callchain_param.order == ORDER_CALLER)
-			j = max_stack - i - 1;
-		ret = ips[j] ? entry(ips[j], ui->thread, cb, arg) : 0;
-	}
-
-	return ret;
+	if (thread->unwind_libunwind_ops)
+		thread->unwind_libunwind_ops->finish_access(thread);
 }
 
 int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
-			struct thread *thread,
-			struct perf_sample *data, int max_stack)
+			 struct thread *thread,
+			 struct perf_sample *data, int max_stack)
 {
-	struct unwind_info ui = {
-		.sample       = data,
-		.thread       = thread,
-		.machine      = thread->mg->machine,
-	};
-
-	if (!data->user_regs.regs)
-		return -EINVAL;
-
-	if (max_stack <= 0)
-		return -EINVAL;
-
-	return get_entries(&ui, cb, arg, max_stack);
+	if (thread->unwind_libunwind_ops)
+		return thread->unwind_libunwind_ops->get_entries(cb, arg, thread, data, max_stack);
+	return 0;
 }
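The rewritten file is now only a dispatcher: a per-thread ops pointer, populated from __weak defaults that arch object files may override, so unsupported remote targets resolve to NULL and are rejected at prepare time. A reduced sketch of that registration/dispatch shape (names illustrative, not perf's):

    #include <stdio.h>

    struct ops {
        int (*prepare)(void);
    };

    static int local_prepare(void) { return 0; }
    static struct ops local_ops = { .prepare = local_prepare };

    /* Weak defaults: an arch object file can override these with strong
     * definitions; unsupported targets stay NULL, like the __weak
     * pointers in unwind-libunwind.c above. */
    struct ops *x86_32_ops __attribute__((weak));
    struct ops *default_ops __attribute__((weak)) = &local_ops;

    static int prepare_access(struct ops **slot, int is_x86_32)
    {
        struct ops *ops = is_x86_32 ? x86_32_ops : default_ops;

        if (!ops) {
            fprintf(stderr, "unwind: target not supported\n");
            return -1;
        }
        *slot = ops;                  /* "register" on the thread */
        return (*slot)->prepare();
    }

    int main(void)
    {
        struct ops *thread_ops = NULL;

        return prepare_access(&thread_ops, 0);
    }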
diff --git a/tools/perf/util/unwind.h b/tools/perf/util/unwind.h
index 12790cf..b074662 100644
--- a/tools/perf/util/unwind.h
+++ b/tools/perf/util/unwind.h
@@ -14,18 +14,31 @@
 
 typedef int (*unwind_entry_cb_t)(struct unwind_entry *entry, void *arg);
 
+struct unwind_libunwind_ops {
+	int (*prepare_access)(struct thread *thread);
+	void (*flush_access)(struct thread *thread);
+	void (*finish_access)(struct thread *thread);
+	int (*get_entries)(unwind_entry_cb_t cb, void *arg,
+			   struct thread *thread,
+			   struct perf_sample *data, int max_stack);
+};
+
 #ifdef HAVE_DWARF_UNWIND_SUPPORT
 int unwind__get_entries(unwind_entry_cb_t cb, void *arg,
 			struct thread *thread,
 			struct perf_sample *data, int max_stack);
 /* libunwind specific */
 #ifdef HAVE_LIBUNWIND_SUPPORT
-int libunwind__arch_reg_id(int regnum);
-int unwind__prepare_access(struct thread *thread);
+#ifndef LIBUNWIND__ARCH_REG_ID
+#define LIBUNWIND__ARCH_REG_ID(regnum) libunwind__arch_reg_id(regnum)
+#endif
+int LIBUNWIND__ARCH_REG_ID(int regnum);
+int unwind__prepare_access(struct thread *thread, struct map *map);
 void unwind__flush_access(struct thread *thread);
 void unwind__finish_access(struct thread *thread);
 #else
-static inline int unwind__prepare_access(struct thread *thread __maybe_unused)
+static inline int unwind__prepare_access(struct thread *thread __maybe_unused,
+					 struct map *map __maybe_unused)
 {
 	return 0;
 }
@@ -44,7 +57,8 @@
 	return 0;
 }
 
-static inline int unwind__prepare_access(struct thread *thread __maybe_unused)
+static inline int unwind__prepare_access(struct thread *thread __maybe_unused,
+					 struct map *map __maybe_unused)
 {
 	return 0;
 }
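The stub branches in unwind.h keep every call site free of #ifdefs: with unwinding support compiled out, the same symbols resolve to static inline no-ops. The pattern at its smallest (hypothetical feature name):

    struct map;   /* opaque; only a pointer is passed around */

    #ifdef HAVE_FEATURE
    int feature_prepare(struct map *map);
    #else
    static inline int feature_prepare(struct map *map __attribute__((unused)))
    {
        return 0;   /* compiled-out stub: callers need no #ifdef */
    }
    #endif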