perf/x86/intel: Add Intel Skylake PMU support
Add perf core PMU support for future Intel Skylake CPU cores. The code
is based on Haswell/Broadwell.

There is a new cache event list, based on the updated Haswell event
list.

Skylake has removed most counter constraints on basic events, so the
basic constraints table now only has a single entry (plus the fixed
counters).

TSX support and various other setups are all shared with Haswell.

Skylake has 32 LBR entries. Add a new LBR init function to set this
up. The filters are all the same as Haswell. It also has a new LBR
format with a separate LBR_INFO_* MSR, but that has already been added
earlier.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1431285767-27027-7-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
4 changed files with 279 additions and 1 deletion
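As a quick illustration (not part of this patch), the new skl_hw_cache_event_ids table added below is what a PERF_TYPE_HW_CACHE counter resolves against. A minimal userspace sketch, assuming a Linux box with perf_event_open(2); error handling kept minimal:

/*
 * Illustrative sketch only: count L1D read misses, which
 * skl_hw_cache_event_ids resolves to L1D.REPLACEMENT on Skylake.
 */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HW_CACHE;
	/* config = cache-id | (op-id << 8) | (result-id << 16) */
	attr.config = PERF_COUNT_HW_CACHE_L1D |
		      (PERF_COUNT_HW_CACHE_OP_READ << 8) |
		      (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
	attr.disabled = 1;
	attr.exclude_kernel = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... workload to measure goes here ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	read(fd, &count, sizeof(count));
	printf("L1D read misses: %lld\n", count);
	close(fd);
	return 0;
}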
@@ -165,7 +165,7 @@ struct intel_excl_cntrs {
 	unsigned	core_id;	/* per-core: core id */
 };
 
-#define MAX_LBR_ENTRIES		16
+#define MAX_LBR_ENTRIES		32
 
 enum {
 	X86_PERF_KFREE_SHARED = 0,
@@ -861,6 +861,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[];
 
 extern struct event_constraint intel_hsw_pebs_event_constraints[];
 
+extern struct event_constraint intel_skl_pebs_event_constraints[];
+
 struct event_constraint *intel_pebs_constraints(struct perf_event *event);
 
 void intel_pmu_pebs_enable(struct perf_event *event);
@@ -899,6 +901,8 @@ void intel_pmu_lbr_init_snb(void);
 
 void intel_pmu_lbr_init_hsw(void);
 
+void intel_pmu_lbr_init_skl(void);
+
 int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
 void intel_pt_interrupt(void);
@@ -177,6 +177,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =
 	EVENT_CONSTRAINT_END
 };
 
+struct event_constraint intel_skl_event_constraints[] = {
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
+	INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2),	/* INST_RETIRED.PREC_DIST */
+	EVENT_CONSTRAINT_END
+};
+
 static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
 	/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
 	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
@@ -193,6 +201,13 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
 	EVENT_EXTRA_END
 };
 
+static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
+	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+	INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+	EVENT_EXTRA_END
+};
+
 EVENT_ATTR_STR(mem-loads,	mem_ld_nhm,	"event=0x0b,umask=0x10,ldlat=3");
 EVENT_ATTR_STR(mem-loads,	mem_ld_snb,	"event=0xcd,umask=0x1,ldlat=3");
 EVENT_ATTR_STR(mem-stores,	mem_st_snb,	"event=0xcd,umask=0x2");
@@ -244,6 +259,200 @@ static u64 intel_pmu_event_map(int hw_event)
 	return intel_perfmon_event_map[hw_event];
 }
 
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * - icache miss does not include decoded icache
+ */
+
+#define SKL_DEMAND_DATA_RD		BIT_ULL(0)
+#define SKL_DEMAND_RFO			BIT_ULL(1)
+#define SKL_ANY_RESPONSE		BIT_ULL(16)
+#define SKL_SUPPLIER_NONE		BIT_ULL(17)
+#define SKL_L3_MISS_LOCAL_DRAM		BIT_ULL(26)
+#define SKL_L3_MISS_REMOTE_HOP0_DRAM	BIT_ULL(27)
+#define SKL_L3_MISS_REMOTE_HOP1_DRAM	BIT_ULL(28)
+#define SKL_L3_MISS_REMOTE_HOP2P_DRAM	BIT_ULL(29)
+#define SKL_L3_MISS			(SKL_L3_MISS_LOCAL_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+#define SKL_SPL_HIT			BIT_ULL(30)
+#define SKL_SNOOP_NONE			BIT_ULL(31)
+#define SKL_SNOOP_NOT_NEEDED		BIT_ULL(32)
+#define SKL_SNOOP_MISS			BIT_ULL(33)
+#define SKL_SNOOP_HIT_NO_FWD		BIT_ULL(34)
+#define SKL_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
+#define SKL_SNOOP_HITM			BIT_ULL(36)
+#define SKL_SNOOP_NON_DRAM		BIT_ULL(37)
+#define SKL_ANY_SNOOP			(SKL_SPL_HIT|SKL_SNOOP_NONE| \
+					 SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+					 SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+					 SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
+#define SKL_DEMAND_READ			SKL_DEMAND_DATA_RD
+#define SKL_SNOOP_DRAM			(SKL_SNOOP_NONE| \
+					 SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+					 SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+					 SKL_SNOOP_HITM|SKL_SPL_HIT)
+#define SKL_DEMAND_WRITE		SKL_DEMAND_RFO
+#define SKL_LLC_ACCESS			SKL_ANY_RESPONSE
+#define SKL_L3_MISS_REMOTE		(SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+
+static __initconst const u64 skl_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_INST_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x151,	/* L1D.REPLACEMENT */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_INST_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x283,	/* ICACHE_64B.MISS */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_INST_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x608,	/* DTLB_LOAD_MISSES.WALK_COMPLETED */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_INST_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x649,	/* DTLB_STORE_MISSES.WALK_COMPLETED */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2085,	/* ITLB_MISSES.STLB_HIT */
+		[ C(RESULT_MISS)   ] = 0xe85,	/* ITLB_MISSES.WALK_COMPLETED */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc4,	/* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0xc5,	/* BR_MISP_RETIRED.ALL_BRANCHES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
+static __initconst const u64 skl_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+				       SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+		[ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+				       SKL_L3_MISS|SKL_ANY_SNOOP|
+				       SKL_SUPPLIER_NONE,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+				       SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+		[ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+				       SKL_L3_MISS|SKL_ANY_SNOOP|
+				       SKL_SUPPLIER_NONE,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+				       SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+				       SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+				       SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+				       SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
 #define SNB_DMND_DATA_RD	(1ULL << 0)
 #define SNB_DMND_RFO		(1ULL << 1)
 #define SNB_DMND_IFETCH		(1ULL << 2)
@@ -3278,6 +3487,29 @@ __init int intel_pmu_init(void)
 		pr_cont("Broadwell events, ");
 		break;
 
+	case 78: /* 14nm Skylake Mobile */
+	case 94: /* 14nm Skylake Desktop */
+		x86_pmu.late_ack = true;
+		memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		intel_pmu_lbr_init_skl();
+
+		x86_pmu.event_constraints = intel_skl_event_constraints;
+		x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_skl_extra_regs;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+		/* all extra regs are per-cpu when HT is on */
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+		x86_pmu.hw_config = hsw_hw_config;
+		x86_pmu.get_event_constraints = hsw_get_event_constraints;
+		x86_pmu.cpu_events = hsw_events_attrs;
+		WARN_ON(!x86_pmu.format_attrs);
+		x86_pmu.cpu_events = hsw_events_attrs;
+		pr_cont("Skylake events, ");
+		break;
+
 	default:
 		switch (x86_pmu.version) {
 		case 1:
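Worked example (illustration only, not part of the patch): the OFFCORE_RESPONSE values in skl_hw_cache_extra_regs are plain ORs of the SKL_* bits defined above. The standalone sketch below reproduces the LL/OP_READ/RESULT_MISS value written to MSR_OFFCORE_RSP_0; the result, 0x3ffc020001, fits inside the 0x3fffff8fff valid-bits mask declared in intel_skl_extra_regs.

/*
 * Reconstruct the LL read-miss OFFCORE_RSP value from the SKL_* bits.
 * Prints 0x3ffc020001.
 */
#include <stdio.h>
#include <stdint.h>

#define BIT_ULL(n)			(1ULL << (n))
#define SKL_DEMAND_DATA_RD		BIT_ULL(0)
#define SKL_SUPPLIER_NONE		BIT_ULL(17)
#define SKL_L3_MISS_LOCAL_DRAM		BIT_ULL(26)
#define SKL_L3_MISS_REMOTE_HOP0_DRAM	BIT_ULL(27)
#define SKL_L3_MISS_REMOTE_HOP1_DRAM	BIT_ULL(28)
#define SKL_L3_MISS_REMOTE_HOP2P_DRAM	BIT_ULL(29)
#define SKL_L3_MISS			(SKL_L3_MISS_LOCAL_DRAM | \
					 SKL_L3_MISS_REMOTE_HOP0_DRAM | \
					 SKL_L3_MISS_REMOTE_HOP1_DRAM | \
					 SKL_L3_MISS_REMOTE_HOP2P_DRAM)
#define SKL_SPL_HIT			BIT_ULL(30)
#define SKL_SNOOP_NONE			BIT_ULL(31)
#define SKL_SNOOP_NOT_NEEDED		BIT_ULL(32)
#define SKL_SNOOP_MISS			BIT_ULL(33)
#define SKL_SNOOP_HIT_NO_FWD		BIT_ULL(34)
#define SKL_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
#define SKL_SNOOP_HITM			BIT_ULL(36)
#define SKL_SNOOP_NON_DRAM		BIT_ULL(37)
#define SKL_ANY_SNOOP			(SKL_SPL_HIT | SKL_SNOOP_NONE | \
					 SKL_SNOOP_NOT_NEEDED | SKL_SNOOP_MISS | \
					 SKL_SNOOP_HIT_NO_FWD | SKL_SNOOP_HIT_WITH_FWD | \
					 SKL_SNOOP_HITM | SKL_SNOOP_NON_DRAM)

int main(void)
{
	/* demand data read, missing L3, any snoop response, no supplier */
	uint64_t rsp = SKL_DEMAND_DATA_RD | SKL_L3_MISS |
		       SKL_ANY_SNOOP | SKL_SUPPLIER_NONE;

	printf("MSR_OFFCORE_RSP_0 value: 0x%llx\n", (unsigned long long)rsp);
	return 0;
}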
@@ -688,6 +688,28 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
 	EVENT_CONSTRAINT_END
 };
 
+struct event_constraint intel_skl_pebs_event_constraints[] = {
+	INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),	/* INST_RETIRED.PREC_DIST */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+	INTEL_PLD_CONSTRAINT(0x1cd, 0xf),		      /* MEM_TRANS_RETIRED.* */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf),	/* MEM_INST_RETIRED.STLB_MISS_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf),	/* MEM_INST_RETIRED.STLB_MISS_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf),	/* MEM_INST_RETIRED.LOCK_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf),	/* MEM_INST_RETIRED.LOCK_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf),	/* MEM_INST_RETIRED.SPLIT_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf),	/* MEM_INST_RETIRED.SPLIT_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf),	/* MEM_INST_RETIRED.ALL_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf),	/* MEM_INST_RETIRED.ALL_STORES */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),	/* MEM_LOAD_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),	/* MEM_LOAD_L3_HIT_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),	/* MEM_LOAD_L3_MISS_RETIRED.* */
+	/* Allow all events as PEBS with no flags */
+	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+	EVENT_CONSTRAINT_END
+};
+
 struct event_constraint *intel_pebs_constraints(struct perf_event *event)
 {
 	struct event_constraint *c;
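Illustrative sketch only (not part of the patch): the constraint table above is what lets userspace request precise_ip on these events. A hedged example opening MEM_INST_RETIRED.ALL_LOADS (raw event 0xd0, umask 0x81) as a PEBS sampling event; the DATALA_LD flag on the 0x81d0 entry is what allows the load-address capture requested via PERF_SAMPLE_ADDR:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_RAW;
	attr.config = 0x81d0;			/* (umask << 8) | event */
	attr.precise_ip = 2;			/* request PEBS sampling */
	attr.sample_period = 100003;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR;
	attr.exclude_kernel = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open (precise)");
		return 1;
	}
	/* mmap() a ring buffer and consume PERF_RECORD_SAMPLEs here */
	close(fd);
	return 0;
}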
@@ -973,6 +973,26 @@ void intel_pmu_lbr_init_hsw(void)
 	pr_cont("16-deep LBR, ");
 }
 
+/* skylake */
+__init void intel_pmu_lbr_init_skl(void)
+{
+	x86_pmu.lbr_nr	 = 32;
+	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
+	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+	x86_pmu.lbr_to	 = MSR_LBR_NHM_TO;
+
+	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+	x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
+
+	/*
+	 * SW branch filter usage:
+	 * - support syscall, sysret capture.
+	 *   That requires LBR_FAR but that means far
+	 *   jmp need to be filtered out
+	 */
+	pr_cont("32-deep LBR, ");
+}
+
 /* atom */
 void __init intel_pmu_lbr_init_atom(void)
 {
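For illustration only, assuming root and the msr driver (CONFIG_X86_MSR): the register layout intel_pmu_lbr_init_skl() programs (MSR_LBR_TOS at 0x1c9, the FROM/TO banks at 0x680/0x6c0) can be inspected from userspace. Values are only meaningful while LBR has been enabled via DEBUGCTL, and on Skylake the top bits of the FROM MSRs carry extra flag/sign bits:

/* Dump the 32-deep LBR stack of CPU 0 via /dev/cpu/0/msr. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_LBR_TOS		0x1c9
#define MSR_LBR_NHM_FROM	0x680
#define MSR_LBR_NHM_TO		0x6c0
#define LBR_ENTRIES		32

static uint64_t rdmsr_fd(int fd, uint32_t msr)
{
	uint64_t val = 0;

	pread(fd, &val, sizeof(val), msr);	/* file offset selects the MSR */
	return val;
}

int main(void)
{
	int fd = open("/dev/cpu/0/msr", O_RDONLY);
	uint64_t tos;
	int i;

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}

	tos = rdmsr_fd(fd, MSR_LBR_TOS) & (LBR_ENTRIES - 1);
	for (i = 0; i < LBR_ENTRIES; i++) {
		unsigned int idx = (tos - i) & (LBR_ENTRIES - 1);

		printf("LBR[%2d]: %016llx -> %016llx\n", i,
		       (unsigned long long)rdmsr_fd(fd, MSR_LBR_NHM_FROM + idx),
		       (unsigned long long)rdmsr_fd(fd, MSR_LBR_NHM_TO + idx));
	}
	close(fd);
	return 0;
}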