Skip to content

Commit 2f8d560

Browse files
committed
Merge: arm64: AMU-based CPU frequency
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-10/-/merge_requests/462 JIRA: https://issues.redhat.com/browse/RHEL-80968 This is a backport of the patch series "Add support for AArch64 AMUv1-based average freq" (https://lore.kernel.org/all/20250131162439.3843071-1-beata.michalska@arm.com/) and its dependencies. The first patch in this MR is upstream in v6.14 (since rc3). The rest are now in Linus's master branch and slated for 6.15. Signed-off-by: Jennifer Berringer <jberring@redhat.com> Approved-by: David Arcari <darcari@redhat.com> Approved-by: Mark Langsdorf <mlangsdo@redhat.com> Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com> Merged-by: Julio Faracco <jfaracco@redhat.com>
2 parents 5311f19 + a8a2d67 commit 2f8d560

File tree

9 files changed

+202
-35
lines changed

9 files changed

+202
-35
lines changed

Documentation/admin-guide/pm/cpufreq.rst

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,20 @@ are the following:
248248
If that frequency cannot be determined, this attribute should not
249249
be present.
250250

251+
``cpuinfo_avg_freq``
252+
An average frequency (in kHz) of all CPUs belonging to a given policy,
253+
derived from hardware-provided feedback and reported on a time frame
254+
spanning at most a few milliseconds.
255+
256+
This is expected to be based on the frequency the hardware actually runs
257+
at and, as such, might require specialised hardware support (such as AMU
258+
extension on ARM). If one cannot be determined, this attribute should
259+
not be present.
260+
261+
Note that a failed attempt to retrieve the current frequency for a given
262+
CPU(s) will result in an appropriate error, e.g. EAGAIN for a CPU that
263+
remains idle (raised on ARM).
264+
251265
``cpuinfo_max_freq``
252266
Maximum possible operating frequency the CPUs belonging to this policy
253267
can run at (in kHz).
@@ -293,7 +307,8 @@ are the following:
293307
Some architectures (e.g. ``x86``) may attempt to provide information
294308
more precisely reflecting the current CPU frequency through this
295309
attribute, but that still may not be the exact current CPU frequency as
296-
seen by the hardware at the moment.
310+
seen by the hardware at the moment. This behavior, though, is only
311+
available via the :c:macro:`CPUFREQ_ARCH_CUR_FREQ` option.
297312

298313
``scaling_driver``
299314
The scaling driver currently in use.

arch/arm64/kernel/topology.c

Lines changed: 128 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <linux/cpufreq.h>
1818
#include <linux/init.h>
1919
#include <linux/percpu.h>
20+
#include <linux/sched/isolation.h>
2021

2122
#include <asm/cpu.h>
2223
#include <asm/cputype.h>
@@ -88,18 +89,28 @@ int __init parse_acpi_topology(void)
8889
* initialized.
8990
*/
9091
static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT);
91-
static DEFINE_PER_CPU(u64, arch_const_cycles_prev);
92-
static DEFINE_PER_CPU(u64, arch_core_cycles_prev);
9392
static cpumask_var_t amu_fie_cpus;
9493

94+
struct amu_cntr_sample {
95+
u64 arch_const_cycles_prev;
96+
u64 arch_core_cycles_prev;
97+
unsigned long last_scale_update;
98+
};
99+
100+
static DEFINE_PER_CPU_SHARED_ALIGNED(struct amu_cntr_sample, cpu_amu_samples);
101+
95102
void update_freq_counters_refs(void)
96103
{
97-
this_cpu_write(arch_core_cycles_prev, read_corecnt());
98-
this_cpu_write(arch_const_cycles_prev, read_constcnt());
104+
struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
105+
106+
amu_sample->arch_core_cycles_prev = read_corecnt();
107+
amu_sample->arch_const_cycles_prev = read_constcnt();
99108
}
100109

101110
static inline bool freq_counters_valid(int cpu)
102111
{
112+
struct amu_cntr_sample *amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
113+
103114
if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask))
104115
return false;
105116

@@ -108,8 +119,8 @@ static inline bool freq_counters_valid(int cpu)
108119
return false;
109120
}
110121

111-
if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) ||
112-
!per_cpu(arch_core_cycles_prev, cpu))) {
122+
if (unlikely(!amu_sample->arch_const_cycles_prev ||
123+
!amu_sample->arch_core_cycles_prev)) {
113124
pr_debug("CPU%d: cycle counters are not enabled.\n", cpu);
114125
return false;
115126
}
@@ -152,17 +163,22 @@ void freq_inv_set_max_ratio(int cpu, u64 max_rate)
152163

153164
static void amu_scale_freq_tick(void)
154165
{
166+
struct amu_cntr_sample *amu_sample = this_cpu_ptr(&cpu_amu_samples);
155167
u64 prev_core_cnt, prev_const_cnt;
156168
u64 core_cnt, const_cnt, scale;
157169

158-
prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
159-
prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
170+
prev_const_cnt = amu_sample->arch_const_cycles_prev;
171+
prev_core_cnt = amu_sample->arch_core_cycles_prev;
160172

161173
update_freq_counters_refs();
162174

163-
const_cnt = this_cpu_read(arch_const_cycles_prev);
164-
core_cnt = this_cpu_read(arch_core_cycles_prev);
175+
const_cnt = amu_sample->arch_const_cycles_prev;
176+
core_cnt = amu_sample->arch_core_cycles_prev;
165177

178+
/*
179+
* This should not happen unless the AMUs have been reset and the
180+
* counter values have not been restored - unlikely
181+
*/
166182
if (unlikely(core_cnt <= prev_core_cnt ||
167183
const_cnt <= prev_const_cnt))
168184
return;
@@ -182,24 +198,123 @@ static void amu_scale_freq_tick(void)
182198

183199
scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
184200
this_cpu_write(arch_freq_scale, (unsigned long)scale);
201+
202+
amu_sample->last_scale_update = jiffies;
185203
}
186204

187205
static struct scale_freq_data amu_sfd = {
188206
.source = SCALE_FREQ_SOURCE_ARCH,
189207
.set_freq_scale = amu_scale_freq_tick,
190208
};
191209

210+
static __always_inline bool amu_fie_cpu_supported(unsigned int cpu)
211+
{
212+
return cpumask_available(amu_fie_cpus) &&
213+
cpumask_test_cpu(cpu, amu_fie_cpus);
214+
}
215+
216+
void arch_cpu_idle_enter(void)
217+
{
218+
unsigned int cpu = smp_processor_id();
219+
220+
if (!amu_fie_cpu_supported(cpu))
221+
return;
222+
223+
/* Kick in AMU update but only if one has not happened already */
224+
if (housekeeping_cpu(cpu, HK_TYPE_TICK) &&
225+
time_is_before_jiffies(per_cpu(cpu_amu_samples.last_scale_update, cpu)))
226+
amu_scale_freq_tick();
227+
}
228+
229+
#define AMU_SAMPLE_EXP_MS 20
230+
231+
int arch_freq_get_on_cpu(int cpu)
232+
{
233+
struct amu_cntr_sample *amu_sample;
234+
unsigned int start_cpu = cpu;
235+
unsigned long last_update;
236+
unsigned int freq = 0;
237+
u64 scale;
238+
239+
if (!amu_fie_cpu_supported(cpu) || !arch_scale_freq_ref(cpu))
240+
return -EOPNOTSUPP;
241+
242+
while (1) {
243+
244+
amu_sample = per_cpu_ptr(&cpu_amu_samples, cpu);
245+
246+
last_update = amu_sample->last_scale_update;
247+
248+
/*
249+
* For those CPUs that are in full dynticks mode, or those that have
250+
* not seen tick for a while, try an alternative source for the counters
251+
* (and thus freq scale), if available, for given policy: this boils
252+
* down to identifying an active cpu within the same freq domain, if any.
253+
*/
254+
if (!housekeeping_cpu(cpu, HK_TYPE_TICK) ||
255+
time_is_before_jiffies(last_update + msecs_to_jiffies(AMU_SAMPLE_EXP_MS))) {
256+
struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
257+
int ref_cpu;
258+
259+
if (!policy)
260+
return -EINVAL;
261+
262+
if (!cpumask_intersects(policy->related_cpus,
263+
housekeeping_cpumask(HK_TYPE_TICK))) {
264+
cpufreq_cpu_put(policy);
265+
return -EOPNOTSUPP;
266+
}
267+
268+
for_each_cpu_wrap(ref_cpu, policy->cpus, cpu + 1) {
269+
if (ref_cpu == start_cpu) {
270+
/* Prevent verifying same CPU twice */
271+
ref_cpu = nr_cpu_ids;
272+
break;
273+
}
274+
if (!idle_cpu(ref_cpu))
275+
break;
276+
}
277+
278+
cpufreq_cpu_put(policy);
279+
280+
if (ref_cpu >= nr_cpu_ids)
281+
/* No alternative to pull info from */
282+
return -EAGAIN;
283+
284+
cpu = ref_cpu;
285+
} else {
286+
break;
287+
}
288+
}
289+
/*
290+
* Reversed computation to the one used to determine
291+
* the arch_freq_scale value
292+
* (see amu_scale_freq_tick for details)
293+
*/
294+
scale = arch_scale_freq_capacity(cpu);
295+
freq = scale * arch_scale_freq_ref(cpu);
296+
freq >>= SCHED_CAPACITY_SHIFT;
297+
return freq;
298+
}
299+
192300
static void amu_fie_setup(const struct cpumask *cpus)
193301
{
194302
int cpu;
195303

196304
/* We are already set since the last insmod of cpufreq driver */
197-
if (unlikely(cpumask_subset(cpus, amu_fie_cpus)))
305+
if (cpumask_available(amu_fie_cpus) &&
306+
unlikely(cpumask_subset(cpus, amu_fie_cpus)))
198307
return;
199308

200-
for_each_cpu(cpu, cpus) {
309+
for_each_cpu(cpu, cpus)
201310
if (!freq_counters_valid(cpu))
202311
return;
312+
313+
if (!cpumask_available(amu_fie_cpus) &&
314+
!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) {
315+
WARN_ONCE(1, "Failed to allocate FIE cpumask for CPUs[%*pbl]\n",
316+
cpumask_pr_args(cpus));
317+
return;
203318
}
204319

205320
cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus);
@@ -237,17 +352,8 @@ static struct notifier_block init_amu_fie_notifier = {
237352

238353
static int __init init_amu_fie(void)
239354
{
240-
int ret;
241-
242-
if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL))
243-
return -ENOMEM;
244-
245-
ret = cpufreq_register_notifier(&init_amu_fie_notifier,
355+
return cpufreq_register_notifier(&init_amu_fie_notifier,
246356
CPUFREQ_POLICY_NOTIFIER);
247-
if (ret)
248-
free_cpumask_var(amu_fie_cpus);
249-
250-
return ret;
251357
}
252358
core_initcall(init_amu_fie);
253359

arch/x86/kernel/cpu/aperfmperf.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -498,7 +498,7 @@ void arch_scale_freq_tick(void)
498498
*/
499499
#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
500500

501-
unsigned int arch_freq_get_on_cpu(int cpu)
501+
int arch_freq_get_on_cpu(int cpu)
502502
{
503503
struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
504504
unsigned int seq, freq;

arch/x86/kernel/cpu/proc.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
8686
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
8787

8888
if (cpu_has(c, X86_FEATURE_TSC)) {
89-
unsigned int freq = arch_freq_get_on_cpu(cpu);
89+
int freq = arch_freq_get_on_cpu(cpu);
9090

91-
seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
91+
if (freq < 0)
92+
seq_puts(m, "cpu MHz\t\t: Unknown\n");
93+
else
94+
seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
9295
}
9396

9497
/* Cache size */

drivers/base/arch_topology.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
2929
static struct cpumask scale_freq_counters_mask;
3030
static bool scale_freq_invariant;
31-
DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 1;
31+
DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 0;
3232
EXPORT_PER_CPU_SYMBOL_GPL(capacity_freq_ref);
3333

3434
static bool supports_scale_freq_counters(const struct cpumask *cpus)
@@ -293,13 +293,15 @@ void topology_normalize_cpu_scale(void)
293293

294294
capacity_scale = 1;
295295
for_each_possible_cpu(cpu) {
296-
capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu);
296+
capacity = raw_capacity[cpu] *
297+
(per_cpu(capacity_freq_ref, cpu) ?: 1);
297298
capacity_scale = max(capacity, capacity_scale);
298299
}
299300

300301
pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale);
301302
for_each_possible_cpu(cpu) {
302-
capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu);
303+
capacity = raw_capacity[cpu] *
304+
(per_cpu(capacity_freq_ref, cpu) ?: 1);
303305
capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT,
304306
capacity_scale);
305307
topology_set_cpu_scale(cpu, capacity);

drivers/cpufreq/Kconfig.x86

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,3 +340,15 @@ config X86_SPEEDSTEP_RELAXED_CAP_CHECK
340340
option lets the probing code bypass some of those checks if the
341341
parameter "relaxed_check=1" is passed to the module.
342342

343+
config CPUFREQ_ARCH_CUR_FREQ
344+
default y
345+
bool "Current frequency derived from HW provided feedback"
346+
help
347+
This determines whether the scaling_cur_freq sysfs attribute returns
348+
the last requested frequency or a more precise value based on hardware
349+
provided feedback (as architected counters).
350+
Given that a more precise frequency can now be provided via the
351+
cpuinfo_avg_freq attribute, by enabling this option,
352+
scaling_cur_freq maintains the provision of a counter based frequency,
353+
for compatibility reasons.
354+

drivers/cpufreq/cpufreq.c

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -728,18 +728,26 @@ show_one(cpuinfo_transition_latency, cpuinfo.transition_latency);
728728
show_one(scaling_min_freq, min);
729729
show_one(scaling_max_freq, max);
730730

731-
__weak unsigned int arch_freq_get_on_cpu(int cpu)
731+
__weak int arch_freq_get_on_cpu(int cpu)
732732
{
733-
return 0;
733+
return -EOPNOTSUPP;
734+
}
735+
736+
static inline bool cpufreq_avg_freq_supported(struct cpufreq_policy *policy)
737+
{
738+
return arch_freq_get_on_cpu(policy->cpu) != -EOPNOTSUPP;
734739
}
735740

736741
static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf)
737742
{
738743
ssize_t ret;
739-
unsigned int freq;
744+
int freq;
745+
746+
freq = IS_ENABLED(CONFIG_CPUFREQ_ARCH_CUR_FREQ)
747+
? arch_freq_get_on_cpu(policy->cpu)
748+
: 0;
740749

741-
freq = arch_freq_get_on_cpu(policy->cpu);
742-
if (freq)
750+
if (freq > 0)
743751
ret = sysfs_emit(buf, "%u\n", freq);
744752
else if (cpufreq_driver->setpolicy && cpufreq_driver->get)
745753
ret = sysfs_emit(buf, "%u\n", cpufreq_driver->get(policy->cpu));
@@ -783,6 +791,19 @@ static ssize_t show_cpuinfo_cur_freq(struct cpufreq_policy *policy,
783791
return sysfs_emit(buf, "<unknown>\n");
784792
}
785793

794+
/*
795+
* show_cpuinfo_avg_freq - average CPU frequency as detected by hardware
796+
*/
797+
static ssize_t show_cpuinfo_avg_freq(struct cpufreq_policy *policy,
798+
char *buf)
799+
{
800+
int avg_freq = arch_freq_get_on_cpu(policy->cpu);
801+
802+
if (avg_freq > 0)
803+
return sysfs_emit(buf, "%u\n", avg_freq);
804+
return avg_freq != 0 ? avg_freq : -EINVAL;
805+
}
806+
786807
/*
787808
* show_scaling_governor - show the current policy for the specified CPU
788809
*/
@@ -945,6 +966,7 @@ static ssize_t show_bios_limit(struct cpufreq_policy *policy, char *buf)
945966
}
946967

947968
cpufreq_freq_attr_ro_perm(cpuinfo_cur_freq, 0400);
969+
cpufreq_freq_attr_ro(cpuinfo_avg_freq);
948970
cpufreq_freq_attr_ro(cpuinfo_min_freq);
949971
cpufreq_freq_attr_ro(cpuinfo_max_freq);
950972
cpufreq_freq_attr_ro(cpuinfo_transition_latency);
@@ -1072,6 +1094,12 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy)
10721094
return ret;
10731095
}
10741096

1097+
if (cpufreq_avg_freq_supported(policy)) {
1098+
ret = sysfs_create_file(&policy->kobj, &cpuinfo_avg_freq.attr);
1099+
if (ret)
1100+
return ret;
1101+
}
1102+
10751103
ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr);
10761104
if (ret)
10771105
return ret;

0 commit comments

Comments
 (0)