diff --git a/wperf-common/iorequest.h b/wperf-common/iorequest.h index 75a376f..5ea1d3c 100644 --- a/wperf-common/iorequest.h +++ b/wperf-common/iorequest.h @@ -267,6 +267,7 @@ struct spe_ctl_hdr #define SPE_CTL_FLAG_VAL_MASK 0xFFFF // PMSLATFR_EL1.MINLAT is 16-bit wide #define SPE_CTL_FLAG_VAL_12_BIT_MASK 0x0FFF // PMSLATFR_EL1.MINLAT is 12-bit wide if CountSize == 0b0010 UINT32 interval; +#define SPE_CTL_INTERVAL_VAL_MASK 0xFFFFFF // PMSIRR_EL1.INTERVAL, bits [31:8] (24-bit wide), is interval counter reload value }; // diff --git a/wperf-driver/spe.c b/wperf-driver/spe.c index 13de278..5821482 100644 --- a/wperf-driver/spe.c +++ b/wperf-driver/spe.c @@ -74,22 +74,35 @@ VOID SPEWorkItemFunc(WDFWORKITEM WorkItem) START_WORK_ON_CORE(context->core_idx); _WriteStatusReg(PMBPTR_EL1, (UINT64)SpeMemoryBuffer); - + __isb(_ARM64_BARRIER_SY); + /* - * Writing to PMSIRR_EL1 and PMSICR_EL1 seems to be innefective for some reason. - * When PMSIRR_EL1 is written to its value just goes to 0 and PMSICR_EL1 seems to be unchanged. - * At least this is what can be seen in the logs. - * This looks like an unexpected behaviour as the documentation seems to imply that - * PMSCIR_EL1 needs to be zeroed before sampling starts and PMSIRR_EL1.Interval needs to be set. 
- _WriteStatusReg(PMSICR_EL1, 0); - if (context->config_flags & SPE_CTL_FLAG_RND) + * Setup `Sampling Interval Reload Register` + */ + UINT64 pmsirr = 0x00; + UINT32 interval = (context->interval & SPE_CTL_INTERVAL_VAL_MASK); // Controlled with period= { - _WriteStatusReg(PMSIRR_EL1, PMSIRR_EL1_RND | ((UINT64)context->interval << 8)); - } - else { - _WriteStatusReg(PMSIRR_EL1, (UINT64)context->interval << 8); + UINT32 min_interval = spe_recommended_min_sampling_interval(_ReadStatusReg(PMSIDR_EL1)); + if (interval < min_interval) + { + // Software should set this to a value GREATER + // than the minimum indicated by PMSIDR_EL1.Interval + interval = min_interval + 1; + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_INFO_LEVEL, "SPE: jitter=1, interval=%u is below recommended min sampling interval, new interval=%u \n", context->interval, interval)); + } } + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_INFO_LEVEL, "SPE: interval=%u \n", interval)); + pmsirr |= (UINT64)interval << 8; + + /* + * Setup `jitter`, Controls randomization of the sampling interval */ + if (context->config_flags & SPE_CTL_FLAG_RND) + pmsirr |= PMSIRR_EL1_RND; + + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_INFO_LEVEL, "SPE: pmsirr=0x%llX \n", pmsirr)); + _WriteStatusReg(PMSIRR_EL1, pmsirr); + __isb(_ARM64_BARRIER_SY); /* * Setup `Sampling Filter Control Register` @@ -123,6 +136,7 @@ VOID SPEWorkItemFunc(WDFWORKITEM WorkItem) KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_INFO_LEVEL, "SPE: min_latency=%u is 12-bit, min_latency is trimmed! 
\n", min_latency)); } _WriteStatusReg(PMSLATFR_EL1, min_latency); // Configure PMSLATFR_EL1.MINLAT + __isb(_ARM64_BARRIER_SY); pmsfcr |= PMSFCR_EL1_FL; // Enable Filter by latency KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_INFO_LEVEL, "SPE: min_latency=%u PMSFCR_EL1=0x%llX\n", min_latency, pmsfcr)); @@ -130,6 +144,7 @@ VOID SPEWorkItemFunc(WDFWORKITEM WorkItem) KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_INFO_LEVEL, "SPE: pmsfcr=0x%llX \n", pmsfcr)); _WriteStatusReg(PMSFCR_EL1, pmsfcr); + __isb(_ARM64_BARRIER_SY); /* * Configure PMSCR_EL1 settings based on user-space flags. By default all settings are disabled @@ -137,6 +152,7 @@ VOID SPEWorkItemFunc(WDFWORKITEM WorkItem) * (e.g. TS bit) to "ON" in this register. */ _WriteStatusReg(PMSCR_EL1, 0x00); + __isb(_ARM64_BARRIER_SY); if (context->config_flags & SPE_CTL_FLAG_TS) { // Enable timestamps with ts_enable filter: @@ -146,9 +162,12 @@ VOID SPEWorkItemFunc(WDFWORKITEM WorkItem) } _WriteStatusReg(PMBSR_EL1, _ReadStatusReg(PMBSR_EL1) & (~PMBSR_EL1_S)); // Clear PMBSR_EL1.S + __isb(_ARM64_BARRIER_SY); //PMBPTR_EL1[63:56] must equal PMBLIMITR_EL1.LIMIT[63:56] _WriteStatusReg(PMBLIMITR_EL1, (UINT64)SpeMemoryBufferLimit | PMBLIMITR_EL1_E); // Enable PMBLIMITR_ELI1.E + __isb(_ARM64_BARRIER_SY); _WriteStatusReg(PMSCR_EL1, _ReadStatusReg(PMSCR_EL1) | PMSCR_EL1_E0SPE_E1SPE); // Enable PMSCR_EL1.{E0SPE,E1SPE} + __isb(_ARM64_BARRIER_SY); KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_INFO_LEVEL, "Statistical Profiling Extension: memory buffer 0x%llX\n", _ReadStatusReg(PMBPTR_EL1))); KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_INFO_LEVEL, "Statistical Profiling Extension: memory buffer limit address %llX\n", _ReadStatusReg(PMBLIMITR_EL1))); @@ -180,10 +199,12 @@ VOID SPEWorkItemFunc(WDFWORKITEM WorkItem) START_WORK_ON_CORE(context->core_idx); _WriteStatusReg(PMBLIMITR_EL1, 0); // Disable PMBLIMITR_ELI1.E + __isb(_ARM64_BARRIER_SY); _WriteStatusReg(PMSCR_EL1, _ReadStatusReg(PMSCR_EL1) & (~PMSCR_EL1_E0SPE_E1SPE)); // Disable PMSCR_EL1.{E0SPE,E1SPE} - 
+ __isb(_ARM64_BARRIER_SY); _WriteStatusReg(PMBSR_EL1, _ReadStatusReg(PMBSR_EL1) & (~PMBSR_EL1_S)); // Clear PMBSR_EL1.S - + __isb(_ARM64_BARRIER_SY); + STOP_WORK_ON_CORE(); KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_INFO_LEVEL, "Statistical Profiling Extension: memory buffer 0x%llX\n", _ReadStatusReg(PMBPTR_EL1))); @@ -214,8 +235,11 @@ static VOID dpc_spe_overflow(struct _KDPC* dpc, PVOID ctx, PVOID sys_arg1, PVOID KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_INFO_LEVEL, "SPE_DPC profiling buffer full\n")); //Disable sampling _WriteStatusReg(PMBLIMITR_EL1, 0); // Disable PMBLIMITR_ELI1.E + __isb(_ARM64_BARRIER_SY); _WriteStatusReg(PMBSR_EL1, _ReadStatusReg(PMBSR_EL1) & (~PMBSR_EL1_S)); // Clear PMBSR_EL1.S + __isb(_ARM64_BARRIER_SY); _WriteStatusReg(PMSCR_EL1, _ReadStatusReg(PMSCR_EL1) & (~PMSCR_EL1_E0SPE_E1SPE)); // Disable PMSCR_EL1.{E0SPE,E1SPE} + __isb(_ARM64_BARRIER_SY); spu->profiling_running = FALSE; } } @@ -384,4 +408,22 @@ void spe_stop(WDFWORKITEM* workItem, UINT32 core_idx) UNREFERENCED_PARAMETER(workItem); UNREFERENCED_PARAMETER(core_idx); #endif -} \ No newline at end of file +} + +UINT32 spe_recommended_min_sampling_interval(UINT64 pmsidr_el1_value) +{ + const UINT64 interval = (pmsidr_el1_value & PMSIDR_EL1_Interval_MASK) >> 8; + switch (interval) + { + // All other values are reserved. + case 0b0000: return 256; + case 0b0010: return 512; + case 0b0011: return 768; + case 0b0100: return 1024; + case 0b0101: return 1536; + case 0b0110: return 2048; + case 0b0111: return 3072; + default: + case 0b1000: return 4096; + } +} diff --git a/wperf-driver/spe.h b/wperf-driver/spe.h index 921619e..7bbe7b2 100644 --- a/wperf-driver/spe.h +++ b/wperf-driver/spe.h @@ -47,8 +47,9 @@ #define PMSIDR_EL1_CountSize_MASK (0x0F << 16) // CountSize, bits [19:16] #define PMSIDR_EL1_CountSize_12Bit 0b0010 // 12-bit saturating counters. #define PMSIDR_EL1_CountSize_16Bit 0b0011 // 16-bit saturating counters. 
+#define PMSIDR_EL1_Interval_MASK (0x0F << 8) // Recommended minimum sampling interval. -#define PMSIRR_EL1_RND BIT(0) +#define PMSIRR_EL1_RND BIT(0) // Add (pseudo-)random jitter to sampling interval. #define PMBLIMITR_EL1_LIMIT_MASK (~((UINT64)0xFFF)) // PMBLIMITR.LIMIT, bits [63:12] #define SPE_MEMORY_BUFFER_SIZE (PAGE_SIZE*128) // PAGE_SIZE is defined in WDM.h @@ -111,4 +112,5 @@ void spe_stop(WDFWORKITEM* workItem, UINT32 core_idx); void spe_destroy(); NTSTATUS spe_setup(ULONG numCores); -void spe_destroy(); \ No newline at end of file +void spe_destroy(); +UINT32 spe_recommended_min_sampling_interval(UINT64 pmsidr_el1_value); \ No newline at end of file diff --git a/wperf/pmu_device.cpp b/wperf/pmu_device.cpp index d616a9c..c60e9e9 100644 --- a/wperf/pmu_device.cpp +++ b/wperf/pmu_device.cpp @@ -348,6 +348,7 @@ void pmu_device::spe_start(const std::map& flags) ctl.event_filter = 0; UINT8 opfilter = 0; UINT64 config_flags = 0; + UINT32 interval = 0; // 0 will force minimum indicated by PMSIDR_EL1.Interval. 
/* * `config_flags` stores multiple values: * @@ -364,6 +365,8 @@ void pmu_device::spe_start(const std::map& flags) if (spe_device::get_filter_name(key) == L"store_filter" && val) opfilter |= SPE_OPERATON_FILTER_ST; if (spe_device::get_filter_name(key) == L"branch_filter" && val) opfilter |= SPE_OPERATON_FILTER_B; if (spe_device::get_filter_name(key) == L"ts_enable" && val) config_flags |= SPE_CTL_FLAG_TS; + if (spe_device::get_filter_name(key) == L"jitter" && val) config_flags |= SPE_CTL_FLAG_RND; + if (spe_device::get_filter_name(key) == L"period" && val) interval = val & SPE_CTL_INTERVAL_VAL_MASK; if (spe_device::get_filter_name(key) == L"min_latency" && val) { UINT64 minlat = val & SPE_CTL_FLAG_VAL_MASK; // PMSLATFR_EL1.MINLAT is 16 - bit value @@ -372,7 +375,7 @@ void pmu_device::spe_start(const std::map& flags) } } ctl.operation_filter = opfilter; - ctl.interval = 1024; + ctl.interval = interval; ctl.config_flags = config_flags; BOOL status = DeviceAsyncIoControl(m_device_handle, PMU_CTL_SPE_START, &ctl, sizeof(struct spe_ctl_hdr), NULL, 0, &res_len); diff --git a/wperf/spe_device.cpp b/wperf/spe_device.cpp index d07f89b..46aa7a7 100644 --- a/wperf/spe_device.cpp +++ b/wperf/spe_device.cpp @@ -409,7 +409,9 @@ const std::vector spe_device::m_filter_names = { L"store_filter", L"branch_filter", L"ts_enable", - L"min_latency" + L"min_latency", + L"jitter", + L"period" }; // Filters also have aliases, this structure helps to translate alias to filter name @@ -418,7 +420,9 @@ const std::map spe_device::m_filter_names_aliases = { L"st", L"store_filter" }, { L"b" , L"branch_filter" }, { L"ts", L"ts_enable" }, - { L"min", L"min_latency" } + { L"min", L"min_latency" }, + { L"j", L"jitter" }, + { L"per", L"period" } }; // Filters also have aliases, this structure helps to translate alias to filter name @@ -427,7 +431,9 @@ const std::map spe_device::m_filter_names_descriptio { L"store_filter", L"Enables collection of store sampled operations, including all atomic 
operations." }, { L"branch_filter", L"Enables collection of branch sampled operations, including direct and indirect branches and exception returns." }, { L"ts_enable", L"Enables timestamping with value of generic timer." }, - { L"min_latency", L"Collect only samples with this latency or higher." } + { L"min_latency", L"Collect only samples with this latency or higher." }, + { L"jitter", L"Use jitter to avoid resonance when sampling." }, + { L"period", L"Use period to set interval counter reload value. The minimum interval is used by default." }, }; spe_device::spe_device() {} diff --git a/wperf/spe_device.h b/wperf/spe_device.h index b9e68dc..da10120 100644 --- a/wperf/spe_device.h +++ b/wperf/spe_device.h @@ -77,6 +77,8 @@ class spe_device { if (get_filter_name(fname) == L"min_latency") return SPE_CTL_FLAG_VAL_MASK; // PMSLATFR_EL1, Sampling Latency Filter Register, MINLAT, bits [15:0] + else if (get_filter_name(fname) == L"period") + return SPE_CTL_INTERVAL_VAL_MASK; // PMSIRR_EL1, Sampling Interval Reload Register, INTERVAL, bits [31:8] return 1; }