From 245bd6f6af8a62a2e2e14976e5ef3dc2b82ec153 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 6 Nov 2014 15:51:00 +0200 Subject: [PATCH 01/12] PM / clock_ops: Add pm_clk_add_clk() The existing pm_clk_add() allows to pass a clock by con_id. However, when referring to a specific clock from DT, no con_id is available. Add pm_clk_add_clk(), which allows to specify the struct clk * directly. The will will increment refcount on clock pointer, so the caller has to use clk_put() on clock pointer when done. Reviewed-by: Santosh Shilimkar Reviewed-by: Kevin Hilman Signed-off-by: Geert Uytterhoeven Signed-off-by: Grygorii Strashko Reviewed-by: Dmitry Torokhov Signed-off-by: Rafael J. Wysocki --- drivers/base/power/clock_ops.c | 47 ++++++++++++++++++++++++++-------- include/linux/pm_clock.h | 8 ++++++ 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c index 78369305e069..f061eaa48bb5 100644 --- a/drivers/base/power/clock_ops.c +++ b/drivers/base/power/clock_ops.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -53,7 +54,8 @@ static inline int __pm_clk_enable(struct device *dev, struct clk *clk) */ static void pm_clk_acquire(struct device *dev, struct pm_clock_entry *ce) { - ce->clk = clk_get(dev, ce->con_id); + if (!ce->clk) + ce->clk = clk_get(dev, ce->con_id); if (IS_ERR(ce->clk)) { ce->status = PCE_STATUS_ERROR; } else { @@ -63,15 +65,8 @@ static void pm_clk_acquire(struct device *dev, struct pm_clock_entry *ce) } } -/** - * pm_clk_add - Start using a device clock for power management. - * @dev: Device whose clock is going to be used for power management. - * @con_id: Connection ID of the clock. - * - * Add the clock represented by @con_id to the list of clocks used for - * the power management of @dev. - */ -int pm_clk_add(struct device *dev, const char *con_id) +static int __pm_clk_add(struct device *dev, const char *con_id, + struct clk *clk) { struct pm_subsys_data *psd = dev_to_psd(dev); struct pm_clock_entry *ce; @@ -93,6 +88,12 @@ int pm_clk_add(struct device *dev, const char *con_id) kfree(ce); return -ENOMEM; } + } else { + if (IS_ERR(ce->clk) || !__clk_get(clk)) { + kfree(ce); + return -ENOENT; + } + ce->clk = clk; } pm_clk_acquire(dev, ce); @@ -103,6 +104,32 @@ int pm_clk_add(struct device *dev, const char *con_id) return 0; } +/** + * pm_clk_add - Start using a device clock for power management. + * @dev: Device whose clock is going to be used for power management. + * @con_id: Connection ID of the clock. + * + * Add the clock represented by @con_id to the list of clocks used for + * the power management of @dev. + */ +int pm_clk_add(struct device *dev, const char *con_id) +{ + return __pm_clk_add(dev, con_id, NULL); +} + +/** + * pm_clk_add_clk - Start using a device clock for power management. + * @dev: Device whose clock is going to be used for power management. + * @clk: Clock pointer + * + * Add the clock to the list of clocks used for the power management of @dev. + * It will increment refcount on clock pointer, use clk_put() on it when done. + */ +int pm_clk_add_clk(struct device *dev, struct clk *clk) +{ + return __pm_clk_add(dev, NULL, clk); +} + /** * __pm_clk_remove - Destroy PM clock entry. * @ce: PM clock entry to destroy. diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h index 8348866e7b05..0b0039634410 100644 --- a/include/linux/pm_clock.h +++ b/include/linux/pm_clock.h @@ -18,6 +18,8 @@ struct pm_clk_notifier_block { char *con_ids[]; }; +struct clk; + #ifdef CONFIG_PM_CLK static inline bool pm_clk_no_clocks(struct device *dev) { @@ -29,6 +31,7 @@ extern void pm_clk_init(struct device *dev); extern int pm_clk_create(struct device *dev); extern void pm_clk_destroy(struct device *dev); extern int pm_clk_add(struct device *dev, const char *con_id); +extern int pm_clk_add_clk(struct device *dev, struct clk *clk); extern void pm_clk_remove(struct device *dev, const char *con_id); extern int pm_clk_suspend(struct device *dev); extern int pm_clk_resume(struct device *dev); @@ -51,6 +54,11 @@ static inline int pm_clk_add(struct device *dev, const char *con_id) { return -EINVAL; } + +static inline int pm_clk_add_clk(struct device *dev, struct clk *clk) +{ + return -EINVAL; +} static inline void pm_clk_remove(struct device *dev, const char *con_id) { } From 471f7707b6f0b18b6aa81119ed01525d9e712427 Mon Sep 17 00:00:00 2001 From: "Strashko, Grygorii" Date: Thu, 6 Nov 2014 15:51:01 +0200 Subject: [PATCH 02/12] PM / clock_ops: make __pm_clk_enable more generic Now there are two places in code which do the same things, so allow __pm_clk_enable() to accept pointer on pm_clock_entry structure as second parameter instead of pointer on clock and remove duplicated code. Also, updated function intended to be used by following patch. Signed-off-by: Grygorii Strashko Reviewed-by: Dmitry Torokhov Signed-off-by: Rafael J. Wysocki --- drivers/base/power/clock_ops.c | 38 ++++++++++++++-------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c index f061eaa48bb5..b32b5d47b3c5 100644 --- a/drivers/base/power/clock_ops.c +++ b/drivers/base/power/clock_ops.c @@ -35,14 +35,20 @@ struct pm_clock_entry { /** * pm_clk_enable - Enable a clock, reporting any errors * @dev: The device for the given clock - * @clk: The clock being enabled. + * @ce: PM clock entry corresponding to the clock. */ -static inline int __pm_clk_enable(struct device *dev, struct clk *clk) +static inline int __pm_clk_enable(struct device *dev, struct pm_clock_entry *ce) { - int ret = clk_enable(clk); - if (ret) - dev_err(dev, "%s: failed to enable clk %p, error %d\n", - __func__, clk, ret); + int ret; + + if (ce->status < PCE_STATUS_ERROR) { + ret = clk_enable(ce->clk); + if (!ret) + ce->status = PCE_STATUS_ENABLED; + else + dev_err(dev, "%s: failed to enable clk %p, error %d\n", + __func__, ce->clk, ret); + } return ret; } @@ -293,7 +299,6 @@ int pm_clk_resume(struct device *dev) struct pm_subsys_data *psd = dev_to_psd(dev); struct pm_clock_entry *ce; unsigned long flags; - int ret; dev_dbg(dev, "%s()\n", __func__); @@ -302,13 +307,8 @@ int pm_clk_resume(struct device *dev) spin_lock_irqsave(&psd->lock, flags); - list_for_each_entry(ce, &psd->clock_list, node) { - if (ce->status < PCE_STATUS_ERROR) { - ret = __pm_clk_enable(dev, ce->clk); - if (!ret) - ce->status = PCE_STATUS_ENABLED; - } - } + list_for_each_entry(ce, &psd->clock_list, node) + __pm_clk_enable(dev, ce); spin_unlock_irqrestore(&psd->lock, flags); @@ -417,7 +417,6 @@ int pm_clk_resume(struct device *dev) struct pm_subsys_data *psd = dev_to_psd(dev); struct pm_clock_entry *ce; unsigned long flags; - int ret; dev_dbg(dev, "%s()\n", __func__); @@ -427,13 +426,8 @@ int pm_clk_resume(struct device *dev) spin_lock_irqsave(&psd->lock, flags); - list_for_each_entry(ce, &psd->clock_list, node) { - if (ce->status < PCE_STATUS_ERROR) { - ret = __pm_clk_enable(dev, ce->clk); - if (!ret) - ce->status = PCE_STATUS_ENABLED; - } - } + list_for_each_entry(ce, &psd->clock_list, node) + __pm_clk_enable(dev, ce); spin_unlock_irqrestore(&psd->lock, flags); From 087e9cbab5022f8bb6dc9574ff5e320569903b80 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 7 Nov 2014 09:29:25 -0800 Subject: [PATCH 03/12] powercap / RAPL: abstract per cpu type functions RAPL implementations may vary slightly between Core and Atom CPUs. There are also potential differences among generations of CPUs within the family. This patch adds a per model structure to abstract the differences such that variations can be handled cleanly. Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl.c | 45 +++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 45e05b32f9b6..256efed5b88f 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -188,6 +188,15 @@ struct rapl_package { */ struct list_head plist; }; + +struct rapl_defaults { + int (*check_unit)(struct rapl_package *rp, int cpu); + void (*set_floor_freq)(struct rapl_domain *rd, bool mode); + u64 (*compute_time_window)(struct rapl_package *rp, u64 val, + bool to_raw); +}; +static struct rapl_defaults *rapl_defaults; + #define PACKAGE_PLN_INT_SAVED BIT(0) #define MAX_PRIM_NAME (32) @@ -946,16 +955,28 @@ static void package_power_limit_irq_restore(int package_id) wrmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); } +static const struct rapl_defaults rapl_defaults_core = { +}; + +static const struct rapl_defaults rapl_defaults_atom = { +}; + +#define RAPL_CPU(_model, _ops) { \ + .vendor = X86_VENDOR_INTEL, \ + .family = 6, \ + .model = _model, \ + .driver_data = (kernel_ulong_t)&_ops, \ + } + static const struct x86_cpu_id rapl_ids[] = { - { X86_VENDOR_INTEL, 6, 0x2a},/* Sandy Bridge */ - { X86_VENDOR_INTEL, 6, 0x2d},/* Sandy Bridge EP */ - { X86_VENDOR_INTEL, 6, 0x37},/* Valleyview */ - { X86_VENDOR_INTEL, 6, 0x3a},/* Ivy Bridge */ - { X86_VENDOR_INTEL, 6, 0x3c},/* Haswell */ - { X86_VENDOR_INTEL, 6, 0x3d},/* Broadwell */ - { X86_VENDOR_INTEL, 6, 0x3f},/* Haswell */ - { X86_VENDOR_INTEL, 6, 0x45},/* Haswell ULT */ - /* TODO: Add more CPU IDs after testing */ + RAPL_CPU(0x2a, rapl_defaults_core),/* Sandy Bridge */ + RAPL_CPU(0x2d, rapl_defaults_core),/* Sandy Bridge EP */ + RAPL_CPU(0x37, rapl_defaults_atom),/* Valleyview */ + RAPL_CPU(0x3a, rapl_defaults_core),/* Ivy Bridge */ + RAPL_CPU(0x3c, rapl_defaults_core),/* Haswell */ + RAPL_CPU(0x3d, rapl_defaults_core),/* Broadwell */ + RAPL_CPU(0x3f, rapl_defaults_core),/* Haswell */ + RAPL_CPU(0x45, rapl_defaults_core),/* Haswell ULT */ {} }; MODULE_DEVICE_TABLE(x86cpu, rapl_ids); @@ -1358,14 +1379,18 @@ static struct notifier_block rapl_cpu_notifier = { static int __init rapl_init(void) { int ret = 0; + const struct x86_cpu_id *id; - if (!x86_match_cpu(rapl_ids)) { + id = x86_match_cpu(rapl_ids); + if (!id) { pr_err("driver does not support CPU family %d model %d\n", boot_cpu_data.x86, boot_cpu_data.x86_model); return -ENODEV; } + rapl_defaults = (struct rapl_defaults *)id->driver_data; + cpu_notifier_register_begin(); /* prevent CPU hotplug during detection */ From 3c2c08454ce9cabf55ff927ecc240573c177a659 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 7 Nov 2014 09:29:26 -0800 Subject: [PATCH 04/12] powercap / RAPL: handle atom and core differences RAPL implementation on Atom has made some changes that are not compatible with Core CPUs. Specifically, it is different in the way units are computed as well as floor frequency is enforced. This patch uses the per CPU model functions to handle the differences. Intel Software Developers' Manual has also been updated to reflect the changes in Table 35-7 V3C. Signed-off-by: Ajay Thomas Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl.c | 216 ++++++++++++++++++++++------------ 1 file changed, 139 insertions(+), 77 deletions(-) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 256efed5b88f..4631696a7ccf 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -70,11 +71,6 @@ #define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ #define RAPL_PRIMITIVE_DUMMY BIT(2) -/* scale RAPL units to avoid floating point math inside kernel */ -#define POWER_UNIT_SCALE (1000000) -#define ENERGY_UNIT_SCALE (1000000) -#define TIME_UNIT_SCALE (1000000) - #define TIME_WINDOW_MAX_MSEC 40000 #define TIME_WINDOW_MIN_MSEC 250 @@ -175,9 +171,9 @@ struct rapl_package { unsigned int id; /* physical package/socket id */ unsigned int nr_domains; unsigned long domain_map; /* bit map of active domains */ - unsigned int power_unit_divisor; - unsigned int energy_unit_divisor; - unsigned int time_unit_divisor; + unsigned int power_unit; + unsigned int energy_unit; + unsigned int time_unit; struct rapl_domain *domains; /* array of domains, sized at runtime */ struct powercap_zone *power_zone; /* keep track of parent zone */ int nr_cpus; /* active cpus on the package, topology info is lost during @@ -197,6 +193,9 @@ struct rapl_defaults { }; static struct rapl_defaults *rapl_defaults; +/* Sideband MBI registers */ +#define IOSF_CPU_POWER_BUDGET_CTL (0x2) + #define PACKAGE_PLN_INT_SAVED BIT(0) #define MAX_PRIM_NAME (32) @@ -348,23 +347,13 @@ static int find_nr_power_limit(struct rapl_domain *rd) static int set_domain_enable(struct powercap_zone *power_zone, bool mode) { struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); - int nr_powerlimit; if (rd->state & DOMAIN_STATE_BIOS_LOCKED) return -EACCES; + get_online_cpus(); - nr_powerlimit = find_nr_power_limit(rd); - /* here we activate/deactivate the hardware for power limiting */ rapl_write_data_raw(rd, PL1_ENABLE, mode); - /* always enable clamp such that p-state can go below OS requested - * range. power capping priority over guranteed frequency. - */ - rapl_write_data_raw(rd, PL1_CLAMP, mode); - /* some domains have pl2 */ - if (nr_powerlimit > 1) { - rapl_write_data_raw(rd, PL2_ENABLE, mode); - rapl_write_data_raw(rd, PL2_CLAMP, mode); - } + rapl_defaults->set_floor_freq(rd, mode); put_online_cpus(); return 0; @@ -662,9 +651,7 @@ static void rapl_init_domains(struct rapl_package *rp) static u64 rapl_unit_xlate(int package, enum unit_type type, u64 value, int to_raw) { - u64 divisor = 1; - int scale = 1; /* scale to user friendly data without floating point */ - u64 f, y; /* fraction and exp. used for time unit */ + u64 units = 1; struct rapl_package *rp; rp = find_package_by_id(package); @@ -673,42 +660,24 @@ static u64 rapl_unit_xlate(int package, enum unit_type type, u64 value, switch (type) { case POWER_UNIT: - divisor = rp->power_unit_divisor; - scale = POWER_UNIT_SCALE; + units = rp->power_unit; break; case ENERGY_UNIT: - scale = ENERGY_UNIT_SCALE; - divisor = rp->energy_unit_divisor; + units = rp->energy_unit; break; case TIME_UNIT: - divisor = rp->time_unit_divisor; - scale = TIME_UNIT_SCALE; - /* special processing based on 2^Y*(1+F)/4 = val/divisor, refer - * to Intel Software Developer's manual Vol. 3a, CH 14.7.4. - */ - if (!to_raw) { - f = (value & 0x60) >> 5; - y = value & 0x1f; - value = (1 << y) * (4 + f) * scale / 4; - return div64_u64(value, divisor); - } else { - do_div(value, scale); - value *= divisor; - y = ilog2(value); - f = div64_u64(4 * (value - (1 << y)), 1 << y); - value = (y & 0x1f) | ((f & 0x3) << 5); - return value; - } - break; + return rapl_defaults->compute_time_window(rp, value, to_raw); case ARBITRARY_UNIT: default: return value; }; if (to_raw) - return div64_u64(value * divisor, scale); - else - return div64_u64(value * scale, divisor); + return div64_u64(value, units); + + value *= units; + + return value; } /* in the order of enum rapl_primitives */ @@ -842,12 +811,18 @@ static int rapl_write_data_raw(struct rapl_domain *rd, return 0; } -static const struct x86_cpu_id energy_unit_quirk_ids[] = { - { X86_VENDOR_INTEL, 6, 0x37},/* Valleyview */ - {} -}; - -static int rapl_check_unit(struct rapl_package *rp, int cpu) +/* + * Raw RAPL data stored in MSRs are in certain scales. We need to + * convert them into standard units based on the units reported in + * the RAPL unit MSRs. This is specific to CPUs as the method to + * calculate units differ on different CPUs. + * We convert the units to below format based on CPUs. + * i.e. + * energy unit: microJoules : Represented in microJoules by default + * power unit : microWatts : Represented in milliWatts by default + * time unit : microseconds: Represented in seconds by default + */ +static int rapl_check_unit_core(struct rapl_package *rp, int cpu) { u64 msr_val; u32 value; @@ -858,36 +833,47 @@ static int rapl_check_unit(struct rapl_package *rp, int cpu) return -ENODEV; } - /* Raw RAPL data stored in MSRs are in certain scales. We need to - * convert them into standard units based on the divisors reported in - * the RAPL unit MSRs. - * i.e. - * energy unit: 1/enery_unit_divisor Joules - * power unit: 1/power_unit_divisor Watts - * time unit: 1/time_unit_divisor Seconds - */ value = (msr_val & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; - /* some CPUs have different way to calculate energy unit */ - if (x86_match_cpu(energy_unit_quirk_ids)) - rp->energy_unit_divisor = 1000000 / (1 << value); - else - rp->energy_unit_divisor = 1 << value; + rp->energy_unit = 1000000 / (1 << value); value = (msr_val & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; - rp->power_unit_divisor = 1 << value; + rp->power_unit = 1000000 / (1 << value); value = (msr_val & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; - rp->time_unit_divisor = 1 << value; + rp->time_unit = 1000000 / (1 << value); - pr_debug("Physical package %d units: energy=%d, time=%d, power=%d\n", - rp->id, - rp->energy_unit_divisor, - rp->time_unit_divisor, - rp->power_unit_divisor); + pr_debug("Core CPU package %d energy=%duJ, time=%dus, power=%duW\n", + rp->id, rp->energy_unit, rp->time_unit, rp->power_unit); return 0; } +static int rapl_check_unit_atom(struct rapl_package *rp, int cpu) +{ + u64 msr_val; + u32 value; + + if (rdmsrl_safe_on_cpu(cpu, MSR_RAPL_POWER_UNIT, &msr_val)) { + pr_err("Failed to read power unit MSR 0x%x on CPU %d, exit.\n", + MSR_RAPL_POWER_UNIT, cpu); + return -ENODEV; + } + value = (msr_val & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; + rp->energy_unit = 1 << value; + + value = (msr_val & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; + rp->power_unit = (1 << value) * 1000; + + value = (msr_val & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; + rp->time_unit = 1000000 / (1 << value); + + pr_debug("Atom package %d energy=%duJ, time=%dus, power=%duW\n", + rp->id, rp->energy_unit, rp->time_unit, rp->power_unit); + + return 0; +} + + /* REVISIT: * When package power limit is set artificially low by RAPL, LVT * thermal interrupt for package power limit should be ignored @@ -955,10 +941,86 @@ static void package_power_limit_irq_restore(int package_id) wrmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); } +static void set_floor_freq_default(struct rapl_domain *rd, bool mode) +{ + int nr_powerlimit = find_nr_power_limit(rd); + + /* always enable clamp such that p-state can go below OS requested + * range. power capping priority over guranteed frequency. + */ + rapl_write_data_raw(rd, PL1_CLAMP, mode); + + /* some domains have pl2 */ + if (nr_powerlimit > 1) { + rapl_write_data_raw(rd, PL2_ENABLE, mode); + rapl_write_data_raw(rd, PL2_CLAMP, mode); + } +} + +static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) +{ + static u32 power_ctrl_orig_val; + u32 mdata; + + if (!power_ctrl_orig_val) + iosf_mbi_read(BT_MBI_UNIT_PMC, BT_MBI_PMC_READ, + IOSF_CPU_POWER_BUDGET_CTL, &power_ctrl_orig_val); + mdata = power_ctrl_orig_val; + if (enable) { + mdata &= ~(0x7f << 8); + mdata |= 1 << 8; + } + iosf_mbi_write(BT_MBI_UNIT_PMC, BT_MBI_PMC_WRITE, + IOSF_CPU_POWER_BUDGET_CTL, mdata); +} + +static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value, + bool to_raw) +{ + u64 f, y; /* fraction and exp. used for time unit */ + + /* + * Special processing based on 2^Y*(1+F/4), refer + * to Intel Software Developer's manual Vol.3B: CH 14.9.3. + */ + if (!to_raw) { + f = (value & 0x60) >> 5; + y = value & 0x1f; + value = (1 << y) * (4 + f) * rp->time_unit / 4; + } else { + do_div(value, rp->time_unit); + y = ilog2(value); + f = div64_u64(4 * (value - (1 << y)), 1 << y); + value = (y & 0x1f) | ((f & 0x3) << 5); + } + return value; +} + +static u64 rapl_compute_time_window_atom(struct rapl_package *rp, u64 value, + bool to_raw) +{ + /* + * Atom time unit encoding is straight forward val * time_unit, + * where time_unit is default to 1 sec. Never 0. + */ + if (!to_raw) + return (value) ? value *= rp->time_unit : rp->time_unit; + else + value = div64_u64(value, rp->time_unit); + + return value; +} + static const struct rapl_defaults rapl_defaults_core = { + .check_unit = rapl_check_unit_core, + .set_floor_freq = set_floor_freq_default, + .compute_time_window = rapl_compute_time_window_core, }; static const struct rapl_defaults rapl_defaults_atom = { + .check_unit = rapl_check_unit_atom, + .set_floor_freq = set_floor_freq_atom, + .compute_time_window = rapl_compute_time_window_atom, }; #define RAPL_CPU(_model, _ops) { \ @@ -1262,7 +1324,7 @@ static int rapl_detect_topology(void) /* check if the package contains valid domains */ if (rapl_detect_domains(new_package, i) || - rapl_check_unit(new_package, i)) { + rapl_defaults->check_unit(new_package, i)) { kfree(new_package->domains); kfree(new_package); /* free up the packages already initialized */ @@ -1317,7 +1379,7 @@ static int rapl_add_package(int cpu) rp->nr_cpus = 1; /* check if the package contains valid domains */ if (rapl_detect_domains(rp, cpu) || - rapl_check_unit(rp, cpu)) { + rapl_defaults->check_unit(rp, cpu)) { ret = -ENODEV; goto err_free_package; } From 74af752e489537391294b51d73fafbcd53672ea4 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 7 Nov 2014 09:29:27 -0800 Subject: [PATCH 05/12] powercap / RAPL: add new model ids Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 4631696a7ccf..c71443c4f265 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -1039,6 +1039,9 @@ static const struct x86_cpu_id rapl_ids[] = { RAPL_CPU(0x3d, rapl_defaults_core),/* Broadwell */ RAPL_CPU(0x3f, rapl_defaults_core),/* Haswell */ RAPL_CPU(0x45, rapl_defaults_core),/* Haswell ULT */ + RAPL_CPU(0x4C, rapl_defaults_atom),/* Braswell */ + RAPL_CPU(0x4A, rapl_defaults_atom),/* Tangier */ + RAPL_CPU(0x5A, rapl_defaults_atom),/* Annidale */ {} }; MODULE_DEVICE_TABLE(x86cpu, rapl_ids); From 24b5984118575ec576757df1e2f2bd74bffc9e8d Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 12 Nov 2014 10:50:36 -0800 Subject: [PATCH 06/12] powercap / RAPL: fix build dependency on iosf_mbi RAPL on Atom depends on IOSF_MBI driver for setting floor frequency. This patch adds explicit dependency on CONFIG_IOSF_MB. Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/powercap/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig index a7c81b53d88a..85727ef6ce8e 100644 --- a/drivers/powercap/Kconfig +++ b/drivers/powercap/Kconfig @@ -17,7 +17,7 @@ if POWERCAP # Client driver configurations go here. config INTEL_RAPL tristate "Intel RAPL Support" - depends on X86 + depends on X86 && IOSF_MBI default n ---help--- This enables support for the Intel Running Average Power Limit (RAPL) From 7d3dcd042c328891a61cd69ee5a1f15a59facd1b Mon Sep 17 00:00:00 2001 From: Pankaj Dubey Date: Mon, 17 Nov 2014 11:42:44 +0530 Subject: [PATCH 07/12] PM: Kconfig: fix unmet dependency for CPU_PM If BL_SWITCHER is enabled but SUSPEND and CPU_IDLE is not enabled we are getting following config warning. warning: (BL_SWITCHER) selects CPU_PM which has unmet direct dependencies (SUSPEND || CPU_IDLE) It has been noticed that CPU_PM dependencies in this file are not really required so let's remove these dependencies from CPU_PM. Signed-off-by: Pankaj Dubey Acked-by: Nicolas Pitre Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index bbef57f5bdfd..1eb7da7bc8e8 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -308,4 +308,3 @@ config PM_GENERIC_DOMAINS_OF config CPU_PM bool - depends on SUSPEND || CPU_IDLE From cd1a068a52ee6a3331a62fe53e87d6ca6cfd68a5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 25 Nov 2014 16:04:16 +0530 Subject: [PATCH 08/12] PM / OPP rename 'head' as 'rcu_head' or 'srcu_head' based on its type Both 'struct dev_pm_opp' and 'struct device_opp' have member with name 'head' but with different types. This leads to confusion while reading the code. Name them 'rcu_head' and 'srcu_head'. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c index 89ced955fafa..76ae6fd28345 100644 --- a/drivers/base/power/opp.c +++ b/drivers/base/power/opp.c @@ -53,7 +53,7 @@ * @rate: Frequency in hertz * @u_volt: Nominal voltage in microvolts corresponding to this OPP * @dev_opp: points back to the device_opp struct this opp belongs to - * @head: RCU callback head used for deferred freeing + * @rcu_head: RCU callback head used for deferred freeing * * This structure stores the OPP information for a given device. */ @@ -65,7 +65,7 @@ struct dev_pm_opp { unsigned long u_volt; struct device_opp *dev_opp; - struct rcu_head head; + struct rcu_head rcu_head; }; /** @@ -76,7 +76,7 @@ struct dev_pm_opp { * RCU usage: nodes are not modified in the list of device_opp, * however addition is possible and is secured by dev_opp_list_lock * @dev: device pointer - * @head: notifier head to notify the OPP availability changes. + * @srcu_head: notifier head to notify the OPP availability changes. * @opp_list: list of opps * * This is an internal data structure maintaining the link to opps attached to @@ -87,7 +87,7 @@ struct device_opp { struct list_head node; struct device *dev; - struct srcu_notifier_head head; + struct srcu_notifier_head srcu_head; struct list_head opp_list; }; @@ -436,7 +436,7 @@ int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) } dev_opp->dev = dev; - srcu_init_notifier_head(&dev_opp->head); + srcu_init_notifier_head(&dev_opp->srcu_head); INIT_LIST_HEAD(&dev_opp->opp_list); /* Secure the device list modification */ @@ -481,7 +481,7 @@ int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) * Notify the changes in the availability of the operable * frequency/voltage list. */ - srcu_notifier_call_chain(&dev_opp->head, OPP_EVENT_ADD, new_opp); + srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ADD, new_opp); return 0; } EXPORT_SYMBOL_GPL(dev_pm_opp_add); @@ -557,14 +557,14 @@ static int opp_set_availability(struct device *dev, unsigned long freq, list_replace_rcu(&opp->node, &new_opp->node); mutex_unlock(&dev_opp_list_lock); - kfree_rcu(opp, head); + kfree_rcu(opp, rcu_head); /* Notify the change of the OPP availability */ if (availability_req) - srcu_notifier_call_chain(&dev_opp->head, OPP_EVENT_ENABLE, + srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ENABLE, new_opp); else - srcu_notifier_call_chain(&dev_opp->head, OPP_EVENT_DISABLE, + srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_DISABLE, new_opp); return 0; @@ -629,7 +629,7 @@ struct srcu_notifier_head *dev_pm_opp_get_notifier(struct device *dev) if (IS_ERR(dev_opp)) return ERR_CAST(dev_opp); /* matching type */ - return &dev_opp->head; + return &dev_opp->srcu_head; } #ifdef CONFIG_OF From a7470db6fec481b14afea117846fce383671ba59 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 25 Nov 2014 16:04:17 +0530 Subject: [PATCH 09/12] PM / OPP don't match for existing OPPs when list is empty OPP list is guaranteed to be empty when 'dev_opp' is created. And so we don't need to run the comparison loop with existing OPPs. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c index 76ae6fd28345..a333e2ef5cb2 100644 --- a/drivers/base/power/opp.c +++ b/drivers/base/power/opp.c @@ -417,6 +417,12 @@ int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) /* Hold our list modification lock here */ mutex_lock(&dev_opp_list_lock); + /* populate the opp table */ + new_opp->dev_opp = dev_opp; + new_opp->rate = freq; + new_opp->u_volt = u_volt; + new_opp->available = true; + /* Check for existing list for 'dev' */ dev_opp = find_device_opp(dev); if (IS_ERR(dev_opp)) { @@ -441,14 +447,10 @@ int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) /* Secure the device list modification */ list_add_rcu(&dev_opp->node, &dev_opp_list); + head = &dev_opp->opp_list; + goto list_add; } - /* populate the opp table */ - new_opp->dev_opp = dev_opp; - new_opp->rate = freq; - new_opp->u_volt = u_volt; - new_opp->available = true; - /* * Insert new OPP in order of increasing frequency * and discard if already present @@ -474,6 +476,7 @@ int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) return ret; } +list_add: list_add_rcu(&new_opp->node, head); mutex_unlock(&dev_opp_list_lock); From 38393409da345cd48d94a0e74c7bbc3402742882 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 25 Nov 2014 16:04:18 +0530 Subject: [PATCH 10/12] PM / OPP mark OPPs as 'static' or 'dynamic' Static OPPs are the ones created from Device Tree entries and dynamic are the ones created at runtime by calling dev_pm_opp_add(). There is a need to distinguish them as we need to free static OPPs from cpufreq drivers when they are removed. So, add another field 'dynamic' in 'struct dev_pm_opp' to keep this information. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp.c | 59 +++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c index a333e2ef5cb2..b249b0127eff 100644 --- a/drivers/base/power/opp.c +++ b/drivers/base/power/opp.c @@ -49,6 +49,7 @@ * are protected by the dev_opp_list_lock for integrity. * IMPORTANT: the opp nodes should be maintained in increasing * order. + * @dynamic: not-created from static DT entries. * @available: true/false - marks if this OPP as available or not * @rate: Frequency in hertz * @u_volt: Nominal voltage in microvolts corresponding to this OPP @@ -61,6 +62,7 @@ struct dev_pm_opp { struct list_head node; bool available; + bool dynamic; unsigned long rate; unsigned long u_volt; @@ -378,30 +380,8 @@ struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, } EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor); -/** - * dev_pm_opp_add() - Add an OPP table from a table definitions - * @dev: device for which we do this operation - * @freq: Frequency in Hz for this OPP - * @u_volt: Voltage in uVolts for this OPP - * - * This function adds an opp definition to the opp list and returns status. - * The opp is made available by default and it can be controlled using - * dev_pm_opp_enable/disable functions. - * - * Locking: The internal device_opp and opp structures are RCU protected. - * Hence this function internally uses RCU updater strategy with mutex locks - * to keep the integrity of the internal data structures. Callers should ensure - * that this function is *NOT* called under RCU protection or in contexts where - * mutex cannot be locked. - * - * Return: - * 0: On success OR - * Duplicate OPPs (both freq and volt are same) and opp->available - * -EEXIST: Freq are same and volt are different OR - * Duplicate OPPs (both freq and volt are same) and !opp->available - * -ENOMEM: Memory allocation failure - */ -int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) +static int dev_pm_opp_add_dynamic(struct device *dev, unsigned long freq, + unsigned long u_volt, bool dynamic) { struct device_opp *dev_opp = NULL; struct dev_pm_opp *opp, *new_opp; @@ -422,6 +402,7 @@ int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) new_opp->rate = freq; new_opp->u_volt = u_volt; new_opp->available = true; + new_opp->dynamic = dynamic; /* Check for existing list for 'dev' */ dev_opp = find_device_opp(dev); @@ -487,6 +468,34 @@ int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_ADD, new_opp); return 0; } + +/** + * dev_pm_opp_add() - Add an OPP table from a table definitions + * @dev: device for which we do this operation + * @freq: Frequency in Hz for this OPP + * @u_volt: Voltage in uVolts for this OPP + * + * This function adds an opp definition to the opp list and returns status. + * The opp is made available by default and it can be controlled using + * dev_pm_opp_enable/disable functions. + * + * Locking: The internal device_opp and opp structures are RCU protected. + * Hence this function internally uses RCU updater strategy with mutex locks + * to keep the integrity of the internal data structures. Callers should ensure + * that this function is *NOT* called under RCU protection or in contexts where + * mutex cannot be locked. + * + * Return: + * 0: On success OR + * Duplicate OPPs (both freq and volt are same) and opp->available + * -EEXIST: Freq are same and volt are different OR + * Duplicate OPPs (both freq and volt are same) and !opp->available + * -ENOMEM: Memory allocation failure + */ +int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) +{ + return dev_pm_opp_add_dynamic(dev, freq, u_volt, true); +} EXPORT_SYMBOL_GPL(dev_pm_opp_add); /** @@ -669,7 +678,7 @@ int of_init_opp_table(struct device *dev) unsigned long freq = be32_to_cpup(val++) * 1000; unsigned long volt = be32_to_cpup(val++); - if (dev_pm_opp_add(dev, freq, volt)) + if (dev_pm_opp_add_dynamic(dev, freq, volt, false)) dev_warn(dev, "%s: Failed to add OPP %ld\n", __func__, freq); nr -= 2; From 129eec55df6ab1b5ecdd89fd7db7a2cd103200b5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 27 Nov 2014 08:54:06 +0530 Subject: [PATCH 11/12] PM / OPP Introduce APIs to remove OPPs OPPs are created statically (from DT) or dynamically. Currently we don't free OPPs that are created statically, when the module unloads. And so if the module is inserted back again, we get warning for duplicate OPPs as the same were already present. Also, there might be a need to remove dynamic OPPs in future and so API for that is also added. This patch adds helper APIs to remove/free existing static and dynamic OPPs. Because the OPPs are used both under RCU and SRCU, we have to wait for grace period of both. And so are using kfree_rcu() from within call_srcu(). Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp.c | 102 +++++++++++++++++++++++++++++++++++++++ include/linux/pm_opp.h | 12 ++++- 2 files changed, 113 insertions(+), 1 deletion(-) diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c index b249b0127eff..977474a3c64f 100644 --- a/drivers/base/power/opp.c +++ b/drivers/base/power/opp.c @@ -79,6 +79,7 @@ struct dev_pm_opp { * however addition is possible and is secured by dev_opp_list_lock * @dev: device pointer * @srcu_head: notifier head to notify the OPP availability changes. + * @rcu_head: RCU callback head used for deferred freeing * @opp_list: list of opps * * This is an internal data structure maintaining the link to opps attached to @@ -90,6 +91,7 @@ struct device_opp { struct device *dev; struct srcu_notifier_head srcu_head; + struct rcu_head rcu_head; struct list_head opp_list; }; @@ -498,6 +500,76 @@ int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) } EXPORT_SYMBOL_GPL(dev_pm_opp_add); +static void kfree_opp_rcu(struct rcu_head *head) +{ + struct dev_pm_opp *opp = container_of(head, struct dev_pm_opp, rcu_head); + + kfree_rcu(opp, rcu_head); +} + +static void kfree_device_rcu(struct rcu_head *head) +{ + struct device_opp *device_opp = container_of(head, struct device_opp, rcu_head); + + kfree(device_opp); +} + +void __dev_pm_opp_remove(struct device_opp *dev_opp, struct dev_pm_opp *opp) +{ + /* + * Notify the changes in the availability of the operable + * frequency/voltage list. + */ + srcu_notifier_call_chain(&dev_opp->srcu_head, OPP_EVENT_REMOVE, opp); + list_del_rcu(&opp->node); + call_srcu(&dev_opp->srcu_head.srcu, &opp->rcu_head, kfree_opp_rcu); + + if (list_empty(&dev_opp->opp_list)) { + list_del_rcu(&dev_opp->node); + call_srcu(&dev_opp->srcu_head.srcu, &dev_opp->rcu_head, + kfree_device_rcu); + } +} + +/** + * dev_pm_opp_remove() - Remove an OPP from OPP list + * @dev: device for which we do this operation + * @freq: OPP to remove with matching 'freq' + * + * This function removes an opp from the opp list. + */ +void dev_pm_opp_remove(struct device *dev, unsigned long freq) +{ + struct dev_pm_opp *opp; + struct device_opp *dev_opp; + bool found = false; + + /* Hold our list modification lock here */ + mutex_lock(&dev_opp_list_lock); + + dev_opp = find_device_opp(dev); + if (IS_ERR(dev_opp)) + goto unlock; + + list_for_each_entry(opp, &dev_opp->opp_list, node) { + if (opp->rate == freq) { + found = true; + break; + } + } + + if (!found) { + dev_warn(dev, "%s: Couldn't find OPP with freq: %lu\n", + __func__, freq); + goto unlock; + } + + __dev_pm_opp_remove(dev_opp, opp); +unlock: + mutex_unlock(&dev_opp_list_lock); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_remove); + /** * opp_set_availability() - helper to set the availability of an opp * @dev: device for which we do this operation @@ -687,4 +759,34 @@ int of_init_opp_table(struct device *dev) return 0; } EXPORT_SYMBOL_GPL(of_init_opp_table); + +/** + * of_free_opp_table() - Free OPP table entries created from static DT entries + * @dev: device pointer used to lookup device OPPs. + * + * Free OPPs created using static entries present in DT. + */ +void of_free_opp_table(struct device *dev) +{ + struct device_opp *dev_opp = find_device_opp(dev); + struct dev_pm_opp *opp, *tmp; + + /* Check for existing list for 'dev' */ + dev_opp = find_device_opp(dev); + if (WARN(IS_ERR(dev_opp), "%s: dev_opp: %ld\n", dev_name(dev), + PTR_ERR(dev_opp))) + return; + + /* Hold our list modification lock here */ + mutex_lock(&dev_opp_list_lock); + + /* Free static OPPs */ + list_for_each_entry_safe(opp, tmp, &dev_opp->opp_list, node) { + if (!opp->dynamic) + __dev_pm_opp_remove(dev_opp, opp); + } + + mutex_unlock(&dev_opp_list_lock); +} +EXPORT_SYMBOL_GPL(of_free_opp_table); #endif diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 0330217abfad..cec2d4540914 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -21,7 +21,7 @@ struct dev_pm_opp; struct device; enum dev_pm_opp_event { - OPP_EVENT_ADD, OPP_EVENT_ENABLE, OPP_EVENT_DISABLE, + OPP_EVENT_ADD, OPP_EVENT_REMOVE, OPP_EVENT_ENABLE, OPP_EVENT_DISABLE, }; #if defined(CONFIG_PM_OPP) @@ -44,6 +44,7 @@ struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev, int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt); +void dev_pm_opp_remove(struct device *dev, unsigned long freq); int dev_pm_opp_enable(struct device *dev, unsigned long freq); @@ -90,6 +91,10 @@ static inline int dev_pm_opp_add(struct device *dev, unsigned long freq, return -EINVAL; } +static inline void dev_pm_opp_remove(struct device *dev, unsigned long freq) +{ +} + static inline int dev_pm_opp_enable(struct device *dev, unsigned long freq) { return 0; @@ -109,11 +114,16 @@ static inline struct srcu_notifier_head *dev_pm_opp_get_notifier( #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF) int of_init_opp_table(struct device *dev); +void of_free_opp_table(struct device *dev); #else static inline int of_init_opp_table(struct device *dev) { return -EINVAL; } + +static inline void of_free_opp_table(struct device *dev) +{ +} #endif #endif /* __LINUX_OPP_H__ */ From b4037aaa584bd914bbf578f5ceb2d9884fa7ddb6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 27 Nov 2014 08:54:07 +0530 Subject: [PATCH 12/12] PM / OPP replace kfree_rcu() with call_srcu() in opp_set_availability() This existed before we introduced call_srcu() in opp layer to synchronize with srcu_notifier_call_chain() while removing OPPs. And is a potential bug which wasn't noticed earlier. Let fix it as well by using the right API to free OPP. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/opp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c index 977474a3c64f..2d195f3a1998 100644 --- a/drivers/base/power/opp.c +++ b/drivers/base/power/opp.c @@ -641,7 +641,7 @@ static int opp_set_availability(struct device *dev, unsigned long freq, list_replace_rcu(&opp->node, &new_opp->node); mutex_unlock(&dev_opp_list_lock); - kfree_rcu(opp, rcu_head); + call_srcu(&dev_opp->srcu_head.srcu, &opp->rcu_head, kfree_opp_rcu); /* Notify the change of the OPP availability */ if (availability_req)