From 3b9c10e98021e1f92e6f8c7ce1778b86ba68db10 Mon Sep 17 00:00:00 2001 From: Daniel Fu Date: Fri, 30 Aug 2013 19:48:22 +0800 Subject: [PATCH 01/25] cpuidle: Check the result of cpuidle_get_driver() against NULL If the current CPU has no cpuidle driver, drv will be NULL in cpuidle_driver_ref(). Check if that is the case before trying to bump up the driver's refcount to prevent the kernel from crashing. [rjw: Subject and changelog] Signed-off-by: Daniel Fu Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/driver.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 3ac499d5a207..6e11701f0fca 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -331,7 +331,8 @@ struct cpuidle_driver *cpuidle_driver_ref(void) spin_lock(&cpuidle_driver_lock); drv = cpuidle_get_driver(); - drv->refcnt++; + if (drv) + drv->refcnt++; spin_unlock(&cpuidle_driver_lock); return drv; From e0ae8fee0e11c1a8e9b45ab14ab5fe58d87f031d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 30 Aug 2013 14:19:29 +0200 Subject: [PATCH 02/25] ACPI / scan: Change ordering of locks for device hotplug Change the ordering of device hotplug locks in scan.c so that acpi_scan_lock is always acquired after device_hotplug_lock. This will make it possible to use device_hotplug_lock around some code paths that acquire acpi_scan_lock safely (most importantly system suspend and hibernation). Apart from that, acpi_scan_lock is platform-specific and device_hotplug_lock is general, so the new ordering appears to be more appropriate from the overall design viewpoint. Signed-off-by: Rafael J. 
Wysocki Acked-by: Toshi Kani --- drivers/acpi/scan.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index e2f6d9dbdf0d..42982b522b36 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -207,8 +207,6 @@ static int acpi_scan_hot_remove(struct acpi_device *device) return -EINVAL; } - lock_device_hotplug(); - /* * Carry out two passes here and ignore errors in the first pass, * because if the devices in question are memory blocks and @@ -239,9 +237,6 @@ static int acpi_scan_hot_remove(struct acpi_device *device) ACPI_UINT32_MAX, acpi_bus_online_companions, NULL, NULL, NULL); - - unlock_device_hotplug(); - put_device(&device->dev); return -EBUSY; } @@ -252,8 +247,6 @@ static int acpi_scan_hot_remove(struct acpi_device *device) acpi_bus_trim(device); - unlock_device_hotplug(); - /* Device node has been unregistered. */ put_device(&device->dev); device = NULL; @@ -309,6 +302,7 @@ static void acpi_bus_device_eject(void *context) u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; int error; + lock_device_hotplug(); mutex_lock(&acpi_scan_lock); acpi_bus_get_device(handle, &device); @@ -332,6 +326,7 @@ static void acpi_bus_device_eject(void *context) out: mutex_unlock(&acpi_scan_lock); + unlock_device_hotplug(); return; err_out: @@ -346,8 +341,8 @@ static void acpi_scan_bus_device_check(acpi_handle handle, u32 ost_source) u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; int error; - mutex_lock(&acpi_scan_lock); lock_device_hotplug(); + mutex_lock(&acpi_scan_lock); if (ost_source != ACPI_NOTIFY_BUS_CHECK) { acpi_bus_get_device(handle, &device); @@ -373,9 +368,9 @@ static void acpi_scan_bus_device_check(acpi_handle handle, u32 ost_source) kobject_uevent(&device->dev.kobj, KOBJ_ONLINE); out: - unlock_device_hotplug(); acpi_evaluate_hotplug_ost(handle, ost_source, ost_code, NULL); mutex_unlock(&acpi_scan_lock); + unlock_device_hotplug(); } static void acpi_scan_bus_check(void *context) @@ -466,6 
+461,7 @@ void acpi_bus_hot_remove_device(void *context) acpi_handle handle = device->handle; int error; + lock_device_hotplug(); mutex_lock(&acpi_scan_lock); error = acpi_scan_hot_remove(device); @@ -475,6 +471,7 @@ void acpi_bus_hot_remove_device(void *context) NULL); mutex_unlock(&acpi_scan_lock); + unlock_device_hotplug(); kfree(context); } EXPORT_SYMBOL(acpi_bus_hot_remove_device); From 8fd37a4c9822d58c93f764864582aa13112b1513 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 30 Aug 2013 14:19:38 +0200 Subject: [PATCH 03/25] PM / hibernate: Create memory bitmaps after freezing user space The hibernation core uses special memory bitmaps during image creation and restoration and traditionally those bitmaps are allocated before freezing tasks, because in the past GFP_KERNEL allocations might not work after all tasks had been frozen. However, this is an anachronism, because hibernation_snapshot() now calls hibernate_preallocate_memory() which allocates memory for the image upfront anyway, so the memory bitmaps may be allocated after freezing user space safely. For this reason, move all of the create_basic_memory_bitmaps() calls after freeze_processes() and all of the corresponding free_basic_memory_bitmaps() calls before thaw_processes(). This will allow us to hold device_hotplug_lock around hibernation without the need to worry about freezing issues with user space processes attempting to acquire it via sysfs attributes after the creation of memory bitmaps and before the freezing of tasks. Signed-off-by: Rafael J. 
Wysocki Acked-by: Toshi Kani --- kernel/power/hibernate.c | 41 +++++++++++++++++++--------------------- kernel/power/user.c | 22 +++++++++++---------- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26f5f1e773e..d4e54053d009 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -644,22 +644,22 @@ int hibernate(void) if (error) goto Exit; - /* Allocate memory management structures */ - error = create_basic_memory_bitmaps(); - if (error) - goto Exit; - printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); printk("done.\n"); error = freeze_processes(); if (error) - goto Free_bitmaps; + goto Exit; + + /* Allocate memory management structures */ + error = create_basic_memory_bitmaps(); + if (error) + goto Thaw; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); if (error || freezer_test_done) - goto Thaw; + goto Free_bitmaps; if (in_suspend) { unsigned int flags = 0; @@ -682,14 +682,13 @@ int hibernate(void) pr_debug("PM: Image restored successfully.\n"); } + Free_bitmaps: + free_basic_memory_bitmaps(); Thaw: thaw_processes(); /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; - - Free_bitmaps: - free_basic_memory_bitmaps(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); pm_restore_console(); @@ -806,21 +805,19 @@ static int software_resume(void) pm_prepare_console(); error = pm_notifier_call_chain(PM_RESTORE_PREPARE); if (error) - goto close_finish; - - error = create_basic_memory_bitmaps(); - if (error) - goto close_finish; + goto Close_Finish; pr_debug("PM: Preparing processes for restore.\n"); error = freeze_processes(); - if (error) { - swsusp_close(FMODE_READ); - goto Done; - } + if (error) + goto Close_Finish; pr_debug("PM: Loading hibernation image.\n"); + error = create_basic_memory_bitmaps(); + if (error) + goto Thaw; + error = swsusp_read(&flags); swsusp_close(FMODE_READ); if (!error) @@ -828,9 
+825,9 @@ static int software_resume(void) printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); swsusp_free(); - thaw_processes(); - Done: free_basic_memory_bitmaps(); + Thaw: + thaw_processes(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); pm_restore_console(); @@ -840,7 +837,7 @@ static int software_resume(void) mutex_unlock(&pm_mutex); pr_debug("PM: Hibernation image not present or could not be loaded.\n"); return error; -close_finish: + Close_Finish: swsusp_close(FMODE_READ); goto Finish; } diff --git a/kernel/power/user.c b/kernel/power/user.c index 4ed81e74f86f..63368163e98d 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -60,11 +60,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) error = -ENOSYS; goto Unlock; } - if(create_basic_memory_bitmaps()) { - atomic_inc(&snapshot_device_available); - error = -ENOMEM; - goto Unlock; - } nonseekable_open(inode, filp); data = &snapshot_state; filp->private_data = data; @@ -90,10 +85,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (error) pm_notifier_call_chain(PM_POST_RESTORE); } - if (error) { - free_basic_memory_bitmaps(); + if (error) atomic_inc(&snapshot_device_available); - } + data->frozen = 0; data->ready = 0; data->platform_support = 0; @@ -111,11 +105,11 @@ static int snapshot_release(struct inode *inode, struct file *filp) lock_system_sleep(); swsusp_free(); - free_basic_memory_bitmaps(); data = filp->private_data; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); thaw_processes(); } pm_notifier_call_chain(data->mode == O_RDONLY ? 
@@ -220,14 +214,22 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, printk("done.\n"); error = freeze_processes(); - if (!error) + if (error) + break; + + error = create_basic_memory_bitmaps(); + if (error) + thaw_processes(); + else data->frozen = 1; + break; case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); thaw_processes(); data->frozen = 0; break; From 942f40155a743f4204308d62405dacaa4bfadb11 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 30 Aug 2013 14:19:46 +0200 Subject: [PATCH 04/25] PM / hibernate / memory hotplug: Rework mutual exclusion Since all of the memory hotplug operations have to be carried out under device_hotplug_lock, they won't need to acquire pm_mutex if device_hotplug_lock is held around hibernation. For this reason, make the hibernation code acquire device_hotplug_lock after freezing user space processes and release it before thawing them. At the same time, drop the lock_system_sleep() and unlock_system_sleep() calls from lock_memory_hotplug() and unlock_memory_hotplug(), respectively. Signed-off-by: Rafael J. 
Wysocki Acked-by: Toshi Kani --- kernel/power/hibernate.c | 4 ++++ kernel/power/user.c | 2 ++ mm/memory_hotplug.c | 4 ---- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index d4e54053d009..0b78f72ad39d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -652,6 +652,7 @@ int hibernate(void) if (error) goto Exit; + lock_device_hotplug(); /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) @@ -685,6 +686,7 @@ int hibernate(void) Free_bitmaps: free_basic_memory_bitmaps(); Thaw: + unlock_device_hotplug(); thaw_processes(); /* Don't bother checking whether freezer_test_done is true */ @@ -814,6 +816,7 @@ static int software_resume(void) pr_debug("PM: Loading hibernation image.\n"); + lock_device_hotplug(); error = create_basic_memory_bitmaps(); if (error) goto Thaw; @@ -827,6 +830,7 @@ static int software_resume(void) swsusp_free(); free_basic_memory_bitmaps(); Thaw: + unlock_device_hotplug(); thaw_processes(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); diff --git a/kernel/power/user.c b/kernel/power/user.c index 63368163e98d..72e8f4fd616d 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -201,6 +201,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!mutex_trylock(&pm_mutex)) return -EBUSY; + lock_device_hotplug(); data = filp->private_data; switch (cmd) { @@ -373,6 +374,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } + unlock_device_hotplug(); mutex_unlock(&pm_mutex); return error; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ca1dd3aa5eee..53ad1325d7a7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -51,14 +51,10 @@ DEFINE_MUTEX(mem_hotplug_mutex); void lock_memory_hotplug(void) { mutex_lock(&mem_hotplug_mutex); - - /* for exclusive hibernation if CONFIG_HIBERNATION=y */ - lock_system_sleep(); } void unlock_memory_hotplug(void) { - 
unlock_system_sleep(); mutex_unlock(&mem_hotplug_mutex); } From af65cfe9aeae03e0682bebdf4db94582d75562dd Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 2 Sep 2013 13:30:25 +0300 Subject: [PATCH 05/25] ACPI / LPSS: don't crash if a device has no MMIO resources Intel LPSS devices that are enumerated from ACPI have both MMIO and IRQ resources returned in their _CRS method. However, Apple Macbook Air with Haswell has LPSS devices enumerated from PCI bus instead and _CRS method returns only an interrupt number (but the device has _HID set that causes the scan handler to match it). The current ACPI / LPSS code sets pdata->dev_desc only when MMIO resource is found for the device and in case of Macbook Air it is never found. That leads to a NULL pointer dereference in register_device_clock(). Correct this by always setting the pdata->dev_desc. Reported-and-tested-by: Imre Kaloz Signed-off-by: Mika Westerberg Cc: 3.10+ # 3.10+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_lpss.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 6a382188fa20..fb78bb9ad8f6 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -257,12 +257,13 @@ static int acpi_lpss_create_device(struct acpi_device *adev, pdata->mmio_size = resource_size(&rentry->res); pdata->mmio_base = ioremap(rentry->res.start, pdata->mmio_size); - pdata->dev_desc = dev_desc; break; } acpi_dev_free_resource_list(&resource_list); + pdata->dev_desc = dev_desc; + if (dev_desc->clk_required) { ret = register_device_clock(adev, pdata); if (ret) { From 89ec2f2ee104970329139e6526a075113c92f650 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 5 Sep 2013 23:39:20 +0200 Subject: [PATCH 06/25] ACPI / hotplug / PCI: Don't trim devices before scanning the namespace In acpiphp_bus_add() we first remove device objects corresponding to the given handle and the ACPI namespace branch below it, which are then re-created by acpi_bus_scan(). This used to be done to clean up after surprise removals, but now we do the cleanup through trim_stale_devices() which checks if the devices in question are actually gone before removing them, so the device hierarchy trimming in acpiphp_bus_add() is not necessary any more and, moreover, it may lead to problems if it removes device objects corresponding to devices that are actually present. For this reason, remove the leftover acpiphp_bus_trim() from acpiphp_bus_add(). Reported-by: Alex Williamson Signed-off-by: Rafael J. Wysocki --- drivers/pci/hotplug/acpiphp_glue.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 8054ddcdaed0..3f78212f4eee 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -487,7 +487,6 @@ static void acpiphp_bus_add(acpi_handle handle) { struct acpi_device *adev = NULL; - acpiphp_bus_trim(handle); acpi_bus_scan(handle); acpi_bus_get_device(handle, &adev); if (adev) From 4be4be8fee2ee99a52f94f90d03d2f287ee1db86 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 6 Sep 2013 14:27:15 +0800 Subject: [PATCH 07/25] ACPICA: Fix for a Store->ArgX when ArgX contains a reference to a field. This change fixes a problem where a Store operation to an ArgX object that contained a reference to a field object did not complete the automatic dereference and then write to the actual field object. Instead, the object type of the field object was inadvertently changed to match the type of the source operand. The new behavior will actually write to the field object (buffer field or field unit), thus matching the correct ACPI-defined behavior. 
Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/exstore.c | 166 +++++++++++++++++++++------------- 1 file changed, 102 insertions(+), 64 deletions(-) diff --git a/drivers/acpi/acpica/exstore.c b/drivers/acpi/acpica/exstore.c index 2bdba6f7d762..f0b09bf9887d 100644 --- a/drivers/acpi/acpica/exstore.c +++ b/drivers/acpi/acpica/exstore.c @@ -57,6 +57,11 @@ acpi_ex_store_object_to_index(union acpi_operand_object *val_desc, union acpi_operand_object *dest_desc, struct acpi_walk_state *walk_state); +static acpi_status +acpi_ex_store_direct_to_node(union acpi_operand_object *source_desc, + struct acpi_namespace_node *node, + struct acpi_walk_state *walk_state); + /******************************************************************************* * * FUNCTION: acpi_ex_store @@ -375,7 +380,11 @@ acpi_ex_store_object_to_index(union acpi_operand_object *source_desc, * When storing into an object the data is converted to the * target object type then stored in the object. This means * that the target object type (for an initialized target) will - * not be changed by a store operation. + * not be changed by a store operation. A copy_object can change + * the target type, however. + * + * The implicit_conversion flag is set to NO/FALSE only when + * storing to an arg_x -- as per the rules of the ACPI spec. * * Assumes parameters are already validated. 
* @@ -399,7 +408,7 @@ acpi_ex_store_object_to_node(union acpi_operand_object *source_desc, target_type = acpi_ns_get_type(node); target_desc = acpi_ns_get_attached_object(node); - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "Storing %p(%s) into node %p(%s)\n", + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "Storing %p (%s) to node %p (%s)\n", source_desc, acpi_ut_get_object_type_name(source_desc), node, acpi_ut_get_type_name(target_type))); @@ -413,45 +422,30 @@ acpi_ex_store_object_to_node(union acpi_operand_object *source_desc, return_ACPI_STATUS(status); } - /* If no implicit conversion, drop into the default case below */ - - if ((!implicit_conversion) || - ((walk_state->opcode == AML_COPY_OP) && - (target_type != ACPI_TYPE_LOCAL_REGION_FIELD) && - (target_type != ACPI_TYPE_LOCAL_BANK_FIELD) && - (target_type != ACPI_TYPE_LOCAL_INDEX_FIELD))) { - /* - * Force execution of default (no implicit conversion). Note: - * copy_object does not perform an implicit conversion, as per the ACPI - * spec -- except in case of region/bank/index fields -- because these - * objects must retain their original type permanently. - */ - target_type = ACPI_TYPE_ANY; - } - /* Do the actual store operation */ switch (target_type) { - case ACPI_TYPE_BUFFER_FIELD: - case ACPI_TYPE_LOCAL_REGION_FIELD: - case ACPI_TYPE_LOCAL_BANK_FIELD: - case ACPI_TYPE_LOCAL_INDEX_FIELD: - - /* For fields, copy the source data to the target field. */ - - status = acpi_ex_write_data_to_field(source_desc, target_desc, - &walk_state->result_obj); - break; - case ACPI_TYPE_INTEGER: case ACPI_TYPE_STRING: case ACPI_TYPE_BUFFER: /* - * These target types are all of type Integer/String/Buffer, and - * therefore support implicit conversion before the store. - * - * Copy and/or convert the source object to a new target object + * The simple data types all support implicit source operand + * conversion before the store. 
*/ + + if ((walk_state->opcode == AML_COPY_OP) || !implicit_conversion) { + /* + * However, copy_object and Stores to arg_x do not perform + * an implicit conversion, as per the ACPI specification. + * A direct store is performed instead. + */ + status = acpi_ex_store_direct_to_node(source_desc, node, + walk_state); + break; + } + + /* Store with implicit source operand conversion support */ + status = acpi_ex_store_object_to_object(source_desc, target_desc, &new_desc, walk_state); @@ -465,13 +459,12 @@ acpi_ex_store_object_to_node(union acpi_operand_object *source_desc, * the Name's type to that of the value being stored in it. * source_desc reference count is incremented by attach_object. * - * Note: This may change the type of the node if an explicit store - * has been performed such that the node/object type has been - * changed. + * Note: This may change the type of the node if an explicit + * store has been performed such that the node/object type + * has been changed. */ - status = - acpi_ns_attach_object(node, new_desc, - new_desc->common.type); + status = acpi_ns_attach_object(node, new_desc, + new_desc->common.type); ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "Store %s into %s via Convert/Attach\n", @@ -482,38 +475,83 @@ acpi_ex_store_object_to_node(union acpi_operand_object *source_desc, } break; + case ACPI_TYPE_BUFFER_FIELD: + case ACPI_TYPE_LOCAL_REGION_FIELD: + case ACPI_TYPE_LOCAL_BANK_FIELD: + case ACPI_TYPE_LOCAL_INDEX_FIELD: + /* + * For all fields, always write the source data to the target + * field. Any required implicit source operand conversion is + * performed in the function below as necessary. Note, field + * objects must retain their original type permanently. 
+ */ + status = acpi_ex_write_data_to_field(source_desc, target_desc, + &walk_state->result_obj); + break; + default: - - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, - "Storing [%s] (%p) directly into node [%s] (%p)" - " with no implicit conversion\n", - acpi_ut_get_object_type_name(source_desc), - source_desc, - acpi_ut_get_object_type_name(target_desc), - node)); - /* * No conversions for all other types. Directly store a copy of - * the source object. NOTE: This is a departure from the ACPI - * spec, which states "If conversion is impossible, abort the - * running control method". + * the source object. This is the ACPI spec-defined behavior for + * the copy_object operator. * - * This code implements "If conversion is impossible, treat the - * Store operation as a CopyObject". + * NOTE: For the Store operator, this is a departure from the + * ACPI spec, which states "If conversion is impossible, abort + * the running control method". Instead, this code implements + * "If conversion is impossible, treat the Store operation as + * a CopyObject". */ - status = - acpi_ut_copy_iobject_to_iobject(source_desc, &new_desc, - walk_state); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } - - status = - acpi_ns_attach_object(node, new_desc, - new_desc->common.type); - acpi_ut_remove_reference(new_desc); + status = acpi_ex_store_direct_to_node(source_desc, node, + walk_state); break; } return_ACPI_STATUS(status); } + +/******************************************************************************* + * + * FUNCTION: acpi_ex_store_direct_to_node + * + * PARAMETERS: source_desc - Value to be stored + * node - Named object to receive the value + * walk_state - Current walk state + * + * RETURN: Status + * + * DESCRIPTION: "Store" an object directly to a node. This involves a copy + * and an attach. 
+ * + ******************************************************************************/ + +static acpi_status +acpi_ex_store_direct_to_node(union acpi_operand_object *source_desc, + struct acpi_namespace_node *node, + struct acpi_walk_state *walk_state) +{ + acpi_status status; + union acpi_operand_object *new_desc; + + ACPI_FUNCTION_TRACE(ex_store_direct_to_node); + + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, + "Storing [%s] (%p) directly into node [%s] (%p)" + " with no implicit conversion\n", + acpi_ut_get_object_type_name(source_desc), + source_desc, acpi_ut_get_type_name(node->type), + node)); + + /* Copy the source object to a new object */ + + status = + acpi_ut_copy_iobject_to_iobject(source_desc, &new_desc, walk_state); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + /* Attach the new object to the node */ + + status = acpi_ns_attach_object(node, new_desc, new_desc->common.type); + acpi_ut_remove_reference(new_desc); + return_ACPI_STATUS(status); +} From 2dc41281b1d1178befe4b76adf817570a7f45ec1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Sep 2013 15:41:32 +0200 Subject: [PATCH 08/25] ACPI / hotplug / PCI: Avoid doing too much for spurious notifies Sometimes we may get a spurious device check or bus check notify for a hotplug device and in those cases we should avoid doing all of the configuration work needed when something actually changes. To that end, check the return value of pci_scan_slot() in enable_slot() and bail out early if it is 0. This turns out to help reduce the amount of diagnostic output from the ACPIPHP subsystem and speed up boot on at least one system that generates multiple device check notifies for PCIe devices on the root bus during boot. Reported-and-tested-by: Alex Williamson Signed-off-by: Rafael J. 
Wysocki --- drivers/pci/hotplug/acpiphp_glue.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 3f78212f4eee..65290226e5dd 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -542,12 +542,12 @@ static void __ref enable_slot(struct acpiphp_slot *slot) struct acpiphp_func *func; int max, pass; LIST_HEAD(add_list); + int nr_found; list_for_each_entry(func, &slot->funcs, sibling) acpiphp_bus_add(func_to_handle(func)); - pci_scan_slot(bus, PCI_DEVFN(slot->device, 0)); - + nr_found = pci_scan_slot(bus, PCI_DEVFN(slot->device, 0)); max = acpiphp_max_busnr(bus); for (pass = 0; pass < 2; pass++) { list_for_each_entry(dev, &bus->devices, bus_list) { @@ -566,8 +566,11 @@ static void __ref enable_slot(struct acpiphp_slot *slot) } } } - __pci_bus_assign_resources(bus, &add_list, NULL); + /* Nothing more to do here if there are no new devices on this bus. */ + if (!nr_found && (slot->flags & SLOT_ENABLED)) + return; + acpiphp_sanitize_bus(bus); acpiphp_set_hpp_values(bus); acpiphp_set_acpi_region(slot); From e532e84ea11399a6066f31641425a76dd012ce77 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 6 Sep 2013 15:41:41 +0200 Subject: [PATCH 09/25] ACPI / hotplug / PCI: Use _OST to notify firmware about notify status The spec suggests that we should use _OST to notify the platform about the status of notifications it sends us, for example so that it doesn't repeat a notification that has been handled already. This turns out to help reduce the amount of diagnostic output from the ACPIPHP subsystem and speed up boot on at least one system that generates multiple device check notifies for PCIe devices on the root bus during boot. Reported-and-tested-by: Alex Williamson Signed-off-by: Rafael J. 
Wysocki --- drivers/pci/hotplug/acpiphp_glue.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 65290226e5dd..1971d2943de4 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -870,6 +870,8 @@ static void hotplug_event_work(struct work_struct *work) hotplug_event(hp_work->handle, hp_work->type, context); acpi_scan_lock_release(); + acpi_evaluate_hotplug_ost(hp_work->handle, hp_work->type, + ACPI_OST_SC_SUCCESS, NULL); kfree(hp_work); /* allocated in handle_hotplug_event() */ put_bridge(context->func.parent); } @@ -885,11 +887,15 @@ static void hotplug_event_work(struct work_struct *work) static void handle_hotplug_event(acpi_handle handle, u32 type, void *data) { struct acpiphp_context *context; + u32 ost_code = ACPI_OST_SC_SUCCESS; switch (type) { case ACPI_NOTIFY_BUS_CHECK: case ACPI_NOTIFY_DEVICE_CHECK: + break; case ACPI_NOTIFY_EJECT_REQUEST: + ost_code = ACPI_OST_SC_EJECT_IN_PROGRESS; + acpi_evaluate_hotplug_ost(handle, type, ost_code, NULL); break; case ACPI_NOTIFY_DEVICE_WAKE: @@ -898,20 +904,21 @@ static void handle_hotplug_event(acpi_handle handle, u32 type, void *data) case ACPI_NOTIFY_FREQUENCY_MISMATCH: acpi_handle_err(handle, "Device cannot be configured due " "to a frequency mismatch\n"); - return; + goto out; case ACPI_NOTIFY_BUS_MODE_MISMATCH: acpi_handle_err(handle, "Device cannot be configured due " "to a bus mode mismatch\n"); - return; + goto out; case ACPI_NOTIFY_POWER_FAULT: acpi_handle_err(handle, "Device has suffered a power fault\n"); - return; + goto out; default: acpi_handle_warn(handle, "Unsupported event type 0x%x\n", type); - return; + ost_code = ACPI_OST_SC_UNRECOGNIZED_NOTIFY; + goto out; } mutex_lock(&acpiphp_context_lock); @@ -920,8 +927,14 @@ static void handle_hotplug_event(acpi_handle handle, u32 type, void *data) get_bridge(context->func.parent); 
acpiphp_put_context(context); alloc_acpi_hp_work(handle, type, context, hotplug_event_work); + mutex_unlock(&acpiphp_context_lock); + return; } mutex_unlock(&acpiphp_context_lock); + ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; + + out: + acpi_evaluate_hotplug_ost(handle, type, ost_code, NULL); } /* From a47d8c8e72a5fa2e69117674c4b0b6cc79c5bc53 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 8 Sep 2013 00:07:28 +0200 Subject: [PATCH 10/25] ACPI / hotplug / PCI: Avoid parent bus rescans on spurious device checks In the current ACPIPHP notify handler we always go directly for a rescan of the parent bus if we get a device check notification for a device that is not a bridge. However, this obviously is overzealous if nothing really changes, because this way we may rescan the whole PCI hierarchy pretty much in vain. That happens on Alex Williamson's machine whose ACPI tables contain device objects that are supposed to correspond to PCIe root ports, but those ports aren't physically present (or at least they aren't visible in the PCI config space to us). The BIOS generates multiple device check notifies for those objects during boot and for each of them we go straight for the parent bus rescan, but the parent bus is the root bus in this particular case. In consequence, we rescan the whole PCI bus from the top several times in a row, which is completely unnecessary, increases boot time by 50% (after previous fixes) and generates excess dmesg output from the PCI subsystem. Fix the problem by checking if we can find anything new in the slot corresponding to the device we've got a device check notify for and doing nothing if that's not the case. The spec (ACPI 5.0, Section 5.6.6) appears to mandate this behavior, as it says: Device Check. Used to notify OSPM that the device either appeared or disappeared. If the device has appeared, OSPM will re-enumerate from the parent. If the device has disappeared, OSPM will invalidate the state of the device. 
OSPM may optimize out re-enumeration. Therefore, according to the spec, we are free to do nothing if nothing changes. References: https://bugzilla.kernel.org/show_bug.cgi?id=60865 Reported-and-tested-by: Alex Williamson Signed-off-by: Rafael J. Wysocki --- drivers/pci/hotplug/acpiphp_glue.c | 32 +++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 1971d2943de4..9d6e535e74a1 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -528,6 +528,16 @@ static void check_hotplug_bridge(struct acpiphp_slot *slot, struct pci_dev *dev) } } +static int acpiphp_rescan_slot(struct acpiphp_slot *slot) +{ + struct acpiphp_func *func; + + list_for_each_entry(func, &slot->funcs, sibling) + acpiphp_bus_add(func_to_handle(func)); + + return pci_scan_slot(slot->bus, PCI_DEVFN(slot->device, 0)); +} + /** * enable_slot - enable, configure a slot * @slot: slot to be enabled @@ -544,10 +554,7 @@ static void __ref enable_slot(struct acpiphp_slot *slot) LIST_HEAD(add_list); int nr_found; - list_for_each_entry(func, &slot->funcs, sibling) - acpiphp_bus_add(func_to_handle(func)); - - nr_found = pci_scan_slot(bus, PCI_DEVFN(slot->device, 0)); + nr_found = acpiphp_rescan_slot(slot); max = acpiphp_max_busnr(bus); for (pass = 0; pass < 2; pass++) { list_for_each_entry(dev, &bus->devices, bus_list) { @@ -840,11 +847,22 @@ static void hotplug_event(acpi_handle handle, u32 type, void *data) case ACPI_NOTIFY_DEVICE_CHECK: /* device check */ dbg("%s: Device check notify on %s\n", __func__, objname); - if (bridge) + if (bridge) { acpiphp_check_bridge(bridge); - else - acpiphp_check_bridge(func->parent); + } else { + struct acpiphp_slot *slot = func->slot; + int ret; + /* + * Check if anything has changed in the slot and rescan + * from the parent if that's the case. 
+ */ + mutex_lock(&slot->crit_sect); + ret = acpiphp_rescan_slot(slot); + mutex_unlock(&slot->crit_sect); + if (ret) + acpiphp_check_bridge(func->parent); + } break; case ACPI_NOTIFY_EJECT_REQUEST: From 11b88ee275ec8590a373396888c2460ee89364d6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 9 Sep 2013 23:07:47 +0200 Subject: [PATCH 11/25] ACPI / bind: Prefer device objects with _STA to those without it As reported at https://bugzilla.kernel.org/show_bug.cgi?id=60829, there still are cases in which do_find_child() doesn't choose the ACPI device object it is "expected" to choose if there are more such objects matching one PCI device present. This particular problem may be worked around by making do_find_child() return device objects with _STA whose result indicates that the device is enabled before device objects without _STA if there's more than one device object to choose from. This change doesn't affect the case in which there's only one matching ACPI device object per PCI device. References: https://bugzilla.kernel.org/show_bug.cgi?id=60829 Reported-by: Peter Wu Tested-by: Felix Lisczyk Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/glue.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index 94672297e1b1..10f0f40587bb 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -79,6 +79,9 @@ static struct acpi_bus_type *acpi_get_bus_type(struct device *dev) return ret; } +#define FIND_CHILD_MIN_SCORE 1 +#define FIND_CHILD_MAX_SCORE 2 + static acpi_status acpi_dev_present(acpi_handle handle, u32 lvl_not_used, void *not_used, void **ret_p) { @@ -92,14 +95,17 @@ static acpi_status acpi_dev_present(acpi_handle handle, u32 lvl_not_used, return AE_OK; } -static bool acpi_extra_checks_passed(acpi_handle handle, bool is_bridge) +static int do_find_child_checks(acpi_handle handle, bool is_bridge) { + bool sta_present = true; unsigned long long sta; acpi_status status; - status = acpi_bus_get_status_handle(handle, &sta); - if (ACPI_FAILURE(status) || !(sta & ACPI_STA_DEVICE_ENABLED)) - return false; + status = acpi_evaluate_integer(handle, "_STA", NULL, &sta); + if (status == AE_NOT_FOUND) + sta_present = false; + else if (ACPI_FAILURE(status) || !(sta & ACPI_STA_DEVICE_ENABLED)) + return -ENODEV; if (is_bridge) { void *test = NULL; @@ -107,16 +113,17 @@ static bool acpi_extra_checks_passed(acpi_handle handle, bool is_bridge) /* Check if this object has at least one child device. */ acpi_walk_namespace(ACPI_TYPE_DEVICE, handle, 1, acpi_dev_present, NULL, NULL, &test); - return !!test; + if (!test) + return -ENODEV; } - return true; + return sta_present ? 
FIND_CHILD_MAX_SCORE : FIND_CHILD_MIN_SCORE; } struct find_child_context { u64 addr; bool is_bridge; acpi_handle ret; - bool ret_checked; + int ret_score; }; static acpi_status do_find_child(acpi_handle handle, u32 lvl_not_used, @@ -125,6 +132,7 @@ static acpi_status do_find_child(acpi_handle handle, u32 lvl_not_used, struct find_child_context *context = data; unsigned long long addr; acpi_status status; + int score; status = acpi_evaluate_integer(handle, METHOD_NAME__ADR, NULL, &addr); if (ACPI_FAILURE(status) || addr != context->addr) @@ -144,15 +152,20 @@ static acpi_status do_find_child(acpi_handle handle, u32 lvl_not_used, * its handle if so. Second, check the same for the object that we've * just found. */ - if (!context->ret_checked) { - if (acpi_extra_checks_passed(context->ret, context->is_bridge)) + if (!context->ret_score) { + score = do_find_child_checks(context->ret, context->is_bridge); + if (score == FIND_CHILD_MAX_SCORE) return AE_CTRL_TERMINATE; else - context->ret_checked = true; + context->ret_score = score; } - if (acpi_extra_checks_passed(handle, context->is_bridge)) { + score = do_find_child_checks(handle, context->is_bridge); + if (score == FIND_CHILD_MAX_SCORE) { context->ret = handle; return AE_CTRL_TERMINATE; + } else if (score > context->ret_score) { + context->ret = handle; + context->ret_score = score; } return AE_OK; } From f73d39338444d9915c746403bd98b145ff9d2ba4 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Sat, 31 Aug 2013 17:53:40 +0530 Subject: [PATCH 12/25] cpufreq: don't allow governor limits to be changed when it is disabled __cpufreq_governor() returns with -EBUSY when governor is already stopped and we try to stop it again, but when it is stopped we must not allow calls to CPUFREQ_GOV_LIMITS event as well. This patch adds this check in __cpufreq_governor(). Reported-by: Stephen Boyd Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 5c75e3147a60..06a2496d2075 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1692,8 +1692,9 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, policy->cpu, event); mutex_lock(&cpufreq_governor_lock); - if ((!policy->governor_enabled && (event == CPUFREQ_GOV_STOP)) || - (policy->governor_enabled && (event == CPUFREQ_GOV_START))) { + if ((policy->governor_enabled && event == CPUFREQ_GOV_START) + || (!policy->governor_enabled + && (event == CPUFREQ_GOV_LIMITS || event == CPUFREQ_GOV_STOP))) { mutex_unlock(&cpufreq_governor_lock); return -EBUSY; } From 19c763031acb831a5ab9c1a701b7fedda073eb3f Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Sat, 31 Aug 2013 17:48:23 +0530 Subject: [PATCH 13/25] cpufreq: serialize calls to __cpufreq_governor() We can't take a big lock around __cpufreq_governor() as this causes recursive locking for some cases. But calls to this routine must be serialized for every policy. Otherwise we can see some unpredictable events. For example, consider following scenario: __cpufreq_remove_dev() __cpufreq_governor(policy, CPUFREQ_GOV_STOP); policy->governor->governor(policy, CPUFREQ_GOV_STOP); cpufreq_governor_dbs() case CPUFREQ_GOV_STOP: mutex_destroy(&cpu_cdbs->timer_mutex) cpu_cdbs->cur_policy = NULL; store() __cpufreq_set_policy() __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); policy->governor->governor(policy, CPUFREQ_GOV_LIMITS); case CPUFREQ_GOV_LIMITS: mutex_lock(&cpu_cdbs->timer_mutex); <-- Warning (destroyed mutex) if (policy->max < cpu_cdbs->cur_policy->cur) <- cur_policy == NULL And so store() will eventually result in a crash if cur_policy is NULL at this point. Introduce an additional variable which would guarantee serialization here. Reported-by: Stephen Boyd Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 7 ++++++- include/linux/cpufreq.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 06a2496d2075..7e6baa58a7f2 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1692,13 +1692,15 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, policy->cpu, event); mutex_lock(&cpufreq_governor_lock); - if ((policy->governor_enabled && event == CPUFREQ_GOV_START) + if (policy->governor_busy + || (policy->governor_enabled && event == CPUFREQ_GOV_START) || (!policy->governor_enabled && (event == CPUFREQ_GOV_LIMITS || event == CPUFREQ_GOV_STOP))) { mutex_unlock(&cpufreq_governor_lock); return -EBUSY; } + policy->governor_busy = true; if (event == CPUFREQ_GOV_STOP) policy->governor_enabled = false; else if (event == CPUFREQ_GOV_START) @@ -1727,6 +1729,9 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, ((event == CPUFREQ_GOV_POLICY_EXIT) && !ret)) module_put(policy->governor->owner); + mutex_lock(&cpufreq_governor_lock); + policy->governor_busy = false; + mutex_unlock(&cpufreq_governor_lock); return ret; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index d568f3975eeb..cca885dac1d3 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -76,6 +76,7 @@ struct cpufreq_policy { struct cpufreq_governor *governor; /* see below */ void *governor_data; bool governor_enabled; /* governor start/stop flag */ + bool governor_busy; struct work_struct update; /* if update_policy() needs to be * called, but you're in IRQ context */ From a857c0b9e24e39fe5be82451b65377795f9538d8 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Sat, 7 Sep 2013 18:35:08 +0200 Subject: [PATCH 14/25] cpufreq: Fix wrong time unit conversion The time spent by a CPU under a given frequency is stored in jiffies unit in the cpu var cpufreq_stats_table->time_in_state[i], i being the index of the frequency. 
This is what is displayed in the following file on the right column: cat /sys/devices/system/cpu/cpuX/cpufreq/stats/time_in_state 2301000 19835820 2300000 3172 [...] Now cpufreq converts this jiffies unit delta to clock_t before returning it to the user as in the above file. And that conversion is achieved using the API cputime64_to_clock_t(). Although it accidentally works on traditional tick based cputime accounting, where cputime_t maps directly to jiffies, it doesn't work with other types of cputime accounting such as CONFIG_VIRT_CPU_ACCOUNTING_* where cputime_t can map to nsecs or any granularity preferred by the architecture. For example we get a buggy zero delta on full dyntick configurations: cat /sys/devices/system/cpu/cpuX/cpufreq/stats/time_in_state 2301000 0 2300000 0 [...] Fix this by using the proper jiffies_64_t to clock_t conversion. Reported-and-tested-by: Carsten Emde Signed-off-by: Andreas Schwab Signed-off-by: Frederic Weisbecker Acked-by: Paul E. McKenney Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 04452f026ed0..4cf0d2805cb2 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -74,7 +74,7 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) for (i = 0; i < stat->state_num; i++) { len += sprintf(buf + len, "%u %llu\n", stat->freq_table[i], (unsigned long long) - cputime64_to_clock_t(stat->time_in_state[i])); + jiffies_64_to_clock_t(stat->time_in_state[i])); } return len; } From cedb70afd077b00bff7379042fdbf7eef32606c9 Mon Sep 17 00:00:00 2001 From: "Srivatsa S.
Bhat" Date: Sat, 7 Sep 2013 01:23:09 +0530 Subject: [PATCH 15/25] cpufreq: Split __cpufreq_remove_dev() into two parts During CPU offline, the cpufreq core invokes __cpufreq_remove_dev() to perform work such as stopping the cpufreq governor, clearing the CPU from the policy structure etc, and finally cleaning up the kobject. There are certain subtle issues related to the kobject cleanup, and it would be much easier to deal with them if we separate that part from the rest of the cleanup-work in the CPU offline phase. So split the __cpufreq_remove_dev() function into 2 parts: one that handles the kobject cleanup, and the other that handles the rest of the work. Reported-by: Stephen Boyd Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 65 +++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 12 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 7e6baa58a7f2..a33174e324d1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1141,22 +1141,14 @@ static int cpufreq_nominate_new_policy_cpu(struct cpufreq_policy *policy, return cpu_dev->id; } -/** - * __cpufreq_remove_dev - remove a CPU device - * - * Removes the cpufreq interface for a CPU device. - * Caller should already have policy_rwsem in write mode for this CPU. - * This routine frees the rwsem before returning. 
- */ -static int __cpufreq_remove_dev(struct device *dev, - struct subsys_interface *sif, bool frozen) +static int __cpufreq_remove_dev_prepare(struct device *dev, + struct subsys_interface *sif, + bool frozen) { unsigned int cpu = dev->id, cpus; int new_cpu, ret; unsigned long flags; struct cpufreq_policy *policy; - struct kobject *kobj; - struct completion *cmp; pr_debug("%s: unregistering CPU %u\n", __func__, cpu); @@ -1213,6 +1205,33 @@ static int __cpufreq_remove_dev(struct device *dev, } } + return 0; +} + +static int __cpufreq_remove_dev_finish(struct device *dev, + struct subsys_interface *sif, + bool frozen) +{ + unsigned int cpu = dev->id, cpus; + int ret; + unsigned long flags; + struct cpufreq_policy *policy; + struct kobject *kobj; + struct completion *cmp; + + read_lock_irqsave(&cpufreq_driver_lock, flags); + policy = per_cpu(cpufreq_cpu_data, cpu); + read_unlock_irqrestore(&cpufreq_driver_lock, flags); + + if (!policy) { + pr_debug("%s: No cpu_data found\n", __func__); + return -EINVAL; + } + + lock_policy_rwsem_read(cpu); + cpus = cpumask_weight(policy->cpus); + unlock_policy_rwsem_read(cpu); + /* If cpu is last user of policy, free policy */ if (cpus == 1) { if (cpufreq_driver->target) { @@ -1272,6 +1291,27 @@ static int __cpufreq_remove_dev(struct device *dev, return 0; } +/** + * __cpufreq_remove_dev - remove a CPU device + * + * Removes the cpufreq interface for a CPU device. + * Caller should already have policy_rwsem in write mode for this CPU. + * This routine frees the rwsem before returning. 
+ */ +static inline int __cpufreq_remove_dev(struct device *dev, + struct subsys_interface *sif, + bool frozen) +{ + int ret; + + ret = __cpufreq_remove_dev_prepare(dev, sif, frozen); + + if (!ret) + ret = __cpufreq_remove_dev_finish(dev, sif, frozen); + + return ret; +} + static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif) { unsigned int cpu = dev->id; @@ -2000,7 +2040,8 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb, break; case CPU_DOWN_PREPARE: - __cpufreq_remove_dev(dev, NULL, frozen); + __cpufreq_remove_dev_prepare(dev, NULL, frozen); + __cpufreq_remove_dev_finish(dev, NULL, frozen); break; case CPU_DOWN_FAILED: From 1aee40ac9c86759c05f2ceb4523642b22fc8ea36 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 7 Sep 2013 01:23:27 +0530 Subject: [PATCH 16/25] cpufreq: Invoke __cpufreq_remove_dev_finish() after releasing cpu_hotplug.lock __cpufreq_remove_dev_finish() handles the kobject cleanup for a CPU going offline. But because we destroy the kobject towards the end of the CPU offline phase, there are certain race windows where a task can try to write to a cpufreq sysfs file (eg: using store_scaling_max_freq()) while we are taking that CPU offline, and this can bump up the kobject refcount, which in turn might hinder the CPU offline task from running to completion. (It can also cause other more serious problems such as trying to acquire a destroyed timer-mutex etc., depending on the exact stage of the cleanup at which the task managed to take a new refcount). To fix the race window, we will need to synchronize those store_*() call-sites with CPU hotplug, using get_online_cpus()/put_online_cpus(). However, that in turn can cause a total deadlock because it can end up waiting for the CPU offline task to complete, with incremented refcount! 
Write to sysfs CPU offline task -------------- ---------------- kobj_refcnt++ Acquire cpu_hotplug.lock get_online_cpus(); Wait for kobj_refcnt to drop to zero **DEADLOCK** A simple way to avoid this problem is to perform the kobject cleanup in the CPU offline path, with the cpu_hotplug.lock *released*. That is, we can perform the wait-for-kobj-refcnt-to-drop as well as the subsequent cleanup in the CPU_POST_DEAD stage of CPU offline, which is run with cpu_hotplug.lock released. Doing this helps us avoid deadlocks due to holding kobject refcounts and waiting on each other on the cpu_hotplug.lock. (Note: We can't move all of the cpufreq CPU offline steps to the CPU_POST_DEAD stage, because certain things such as stopping the governors have to be done before the outgoing CPU is marked offline. So retain those parts in the CPU_DOWN_PREPARE stage itself). Reported-by: Stephen Boyd Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index a33174e324d1..34999fc3216f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2041,6 +2041,9 @@ static int cpufreq_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: __cpufreq_remove_dev_prepare(dev, NULL, frozen); + break; + + case CPU_POST_DEAD: __cpufreq_remove_dev_finish(dev, NULL, frozen); break; From 4f750c930822b92df74327a4d1364eff87701360 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 7 Sep 2013 01:23:43 +0530 Subject: [PATCH 17/25] cpufreq: Synchronize the cpufreq store_*() routines with CPU hotplug The functions that are used to write to cpufreq sysfs files (such as store_scaling_max_freq()) are not hotplug safe. They can race with CPU hotplug tasks and lead to problems such as trying to acquire an already destroyed timer-mutex etc. 
Eg: __cpufreq_remove_dev() __cpufreq_governor(policy, CPUFREQ_GOV_STOP); policy->governor->governor(policy, CPUFREQ_GOV_STOP); cpufreq_governor_dbs() case CPUFREQ_GOV_STOP: mutex_destroy(&cpu_cdbs->timer_mutex) cpu_cdbs->cur_policy = NULL; store() __cpufreq_set_policy() __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); policy->governor->governor(policy, CPUFREQ_GOV_LIMITS); case CPUFREQ_GOV_LIMITS: mutex_lock(&cpu_cdbs->timer_mutex); <-- Warning (destroyed mutex) if (policy->max < cpu_cdbs->cur_policy->cur) <- cur_policy == NULL So use get_online_cpus()/put_online_cpus() in the store_*() functions, to synchronize with CPU hotplug. However, there is an additional point to note here: some parts of the CPU teardown in the cpufreq subsystem are done in the CPU_POST_DEAD stage, with cpu_hotplug.lock *released*. So, using the get/put_online_cpus() functions alone is insufficient; we should also ensure that we don't race with those latter steps in the hotplug sequence. We can easily achieve this by checking if the CPU is online before proceeding with the store, since the CPU would have been marked offline by the time the CPU_POST_DEAD notifiers are executed. Reported-by: Stephen Boyd Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 34999fc3216f..cf016584b4ac 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -694,8 +694,13 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, struct freq_attr *fattr = to_attr(attr); ssize_t ret = -EINVAL; + get_online_cpus(); + + if (!cpu_online(policy->cpu)) + goto unlock; + if (!down_read_trylock(&cpufreq_rwsem)) - goto exit; + goto unlock; if (lock_policy_rwsem_write(policy->cpu) < 0) goto up_read; @@ -709,7 +714,9 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, up_read: up_read(&cpufreq_rwsem); -exit: +unlock: + put_online_cpus(); + return ret; } From 56d07db274b7b15ca38b60ea4a762d40de093000 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 7 Sep 2013 01:23:55 +0530 Subject: [PATCH 18/25] cpufreq: Remove temporary fix for race between CPU hotplug and sysfs-writes Commit "cpufreq: serialize calls to __cpufreq_governor()" had been a temporary and partial solution to the race condition between writing to a cpufreq sysfs file and taking a CPU offline. Now that we have a proper and complete solution to that problem, remove the temporary fix. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 7 +------ include/linux/cpufreq.h | 1 - 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index cf016584b4ac..2863214c5381 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1739,15 +1739,13 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, policy->cpu, event); mutex_lock(&cpufreq_governor_lock); - if (policy->governor_busy - || (policy->governor_enabled && event == CPUFREQ_GOV_START) + if ((policy->governor_enabled && event == CPUFREQ_GOV_START) || (!policy->governor_enabled && (event == CPUFREQ_GOV_LIMITS || event == CPUFREQ_GOV_STOP))) { mutex_unlock(&cpufreq_governor_lock); return -EBUSY; } - policy->governor_busy = true; if (event == CPUFREQ_GOV_STOP) policy->governor_enabled = false; else if (event == CPUFREQ_GOV_START) @@ -1776,9 +1774,6 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, ((event == CPUFREQ_GOV_POLICY_EXIT) && !ret)) module_put(policy->governor->owner); - mutex_lock(&cpufreq_governor_lock); - policy->governor_busy = false; - mutex_unlock(&cpufreq_governor_lock); return ret; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cca885dac1d3..d568f3975eeb 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -76,7 +76,6 @@ struct cpufreq_policy { struct cpufreq_governor *governor; /* see below */ void *governor_data; bool governor_enabled; /* governor start/stop flag */ - bool governor_busy; struct work_struct update; /* if update_policy() needs to be * called, but you're in IRQ context */ From 5136fa56582beadb7fa71eb30bc79148bfcba5c1 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 7 Sep 2013 01:24:06 +0530 Subject: [PATCH 19/25] cpufreq: Use signed type for 'ret' variable, to store negative error values There are places where the variable 'ret' is declared as unsigned int and then used to store negative return values such as -EINVAL. 
Fix them by declaring the variable as a signed quantity. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 2863214c5381..73d53d5a16ee 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -437,7 +437,7 @@ static int __cpufreq_set_policy(struct cpufreq_policy *policy, static ssize_t store_##file_name \ (struct cpufreq_policy *policy, const char *buf, size_t count) \ { \ - unsigned int ret; \ + int ret; \ struct cpufreq_policy new_policy; \ \ ret = cpufreq_get_policy(&new_policy, policy->cpu); \ @@ -490,7 +490,7 @@ static ssize_t show_scaling_governor(struct cpufreq_policy *policy, char *buf) static ssize_t store_scaling_governor(struct cpufreq_policy *policy, const char *buf, size_t count) { - unsigned int ret; + int ret; char str_governor[16]; struct cpufreq_policy new_policy; From 798282a8718347b04a2f0a4bae7d775c48c6bcb9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Sep 2013 02:54:50 +0200 Subject: [PATCH 20/25] Revert "cpufreq: make sure frequency transitions are serialized" Commit 7c30ed5 (cpufreq: make sure frequency transitions are serialized) attempted to serialize frequency transitions by adding checks to the CPUFREQ_PRECHANGE and CPUFREQ_POSTCHANGE notifications. However, it assumed that the notifications will always originate from the driver's .target() callback, but they also can be triggered by cpufreq_out_of_sync() and that leads to warnings like this on some systems: WARNING: CPU: 0 PID: 14543 at drivers/cpufreq/cpufreq.c:317 __cpufreq_notify_transition+0x238/0x260() In middle of another frequency transition accompanied by a call trace similar to this one: [] dump_stack+0x46/0x58 [] warn_slowpath_common+0x8c/0xc0 [] ? 
acpi_cpufreq_target+0x320/0x320 [] warn_slowpath_fmt+0x46/0x50 [] __cpufreq_notify_transition+0x238/0x260 [] cpufreq_notify_transition+0x3e/0x70 [] cpufreq_out_of_sync+0x6d/0xb0 [] cpufreq_update_policy+0x10c/0x160 [] ? cpufreq_update_policy+0x160/0x160 [] cpufreq_set_cur_state+0x8c/0xb5 [] processor_set_cur_state+0xa3/0xcf [] thermal_cdev_update+0x9c/0xb0 [] step_wise_throttle+0x5a/0x90 [] handle_thermal_trip+0x4f/0x140 [] thermal_zone_device_update+0x57/0xa0 [] acpi_thermal_check+0x2e/0x30 [] acpi_thermal_notify+0x40/0xdc [] acpi_device_notify+0x19/0x1b [] acpi_ev_notify_dispatch+0x41/0x5c [] acpi_os_execute_deferred+0x25/0x32 [] process_one_work+0x170/0x4a0 [] worker_thread+0x121/0x390 [] ? manage_workers.isra.20+0x170/0x170 [] kthread+0xc0/0xd0 [] ? flush_kthread_worker+0xb0/0xb0 [] ret_from_fork+0x7c/0xb0 [] ? flush_kthread_worker+0xb0/0xb0 For this reason, revert commit 7c30ed5 along with the fix 266c13d (cpufreq: Fix serialization of frequency transitions) on top of it and we will revisit the serialization problem later. Reported-by: Alessandro Bono Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 15 --------------- include/linux/cpufreq.h | 1 - 2 files changed, 16 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 73d53d5a16ee..5a64f66d36e0 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -280,13 +280,6 @@ static void __cpufreq_notify_transition(struct cpufreq_policy *policy, switch (state) { case CPUFREQ_PRECHANGE: - if (WARN(policy->transition_ongoing == - cpumask_weight(policy->cpus), - "In middle of another frequency transition\n")) - return; - - policy->transition_ongoing++; - /* detect if the driver reported a value as "old frequency" * which is not equal to what the cpufreq core thinks is * "old frequency". 
@@ -306,12 +299,6 @@ static void __cpufreq_notify_transition(struct cpufreq_policy *policy, break; case CPUFREQ_POSTCHANGE: - if (WARN(!policy->transition_ongoing, - "No frequency transition in progress\n")) - return; - - policy->transition_ongoing--; - adjust_jiffies(CPUFREQ_POSTCHANGE, freqs); pr_debug("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new, (unsigned long)freqs->cpu); @@ -1657,8 +1644,6 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, if (cpufreq_disabled()) return -ENODEV; - if (policy->transition_ongoing) - return -EBUSY; /* Make sure that target_freq is within supported range */ if (target_freq > policy->max) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index d568f3975eeb..fcabc42d66ab 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -85,7 +85,6 @@ struct cpufreq_policy { struct list_head policy_list; struct kobject kobj; struct completion kobj_unregister; - int transition_ongoing; /* Tracks transition status */ }; /* Only for ACPI */ From 6cdcdb793791f776ea9408581b1242b636d43b37 Mon Sep 17 00:00:00 2001 From: Nell Hardcastle Date: Sun, 30 Jun 2013 15:58:57 -0700 Subject: [PATCH 21/25] intel_pstate: Add Haswell CPU models Enable the intel_pstate driver for Haswell CPUs. One missing Ivy Bridge model (0x3E) is also included. Models referenced from tools/power/x86/turbostat/turbostat.c:has_nehalem_turbo_ratio_limit Signed-off-by: Nell Hardcastle Acked-by: Viresh Kumar Acked-by: Dirk Brandewie Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 6efd96c196b2..9733f29ed148 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -522,6 +522,11 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = { ICPU(0x2a, default_policy), ICPU(0x2d, default_policy), ICPU(0x3a, default_policy), + ICPU(0x3c, default_policy), + ICPU(0x3e, default_policy), + ICPU(0x3f, default_policy), + ICPU(0x45, default_policy), + ICPU(0x46, default_policy), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); From 0d66b91ebff49841f607a3c079984c907c8a4199 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 12 Sep 2013 01:42:59 +0530 Subject: [PATCH 22/25] cpufreq: Fix crash in cpufreq-stats during suspend/resume Stephen Warren reported that the cpufreq-stats code hits a NULL pointer dereference during the second attempt to suspend a system. He also pinpointed the problem to commit 5302c3f "cpufreq: Perform light-weight init/teardown during suspend/resume". That commit actually ensured that the cpufreq-stats table and the cpufreq-stats sysfs entries are *not* torn down (i.e., not freed) during suspend/resume, which makes it all the more surprising. However, it turns out that the root-cause is not that we access already freed memory, but that the reference to the allocated memory gets moved around and we lose track of that during resume, leading to the reported crash in a subsequent suspend attempt. In the suspend path, during CPU offline, the value of policy->cpu is updated by choosing one of the surviving CPUs in that policy, as long as there is at least one CPU in that policy. And cpufreq_stats_update_policy_cpu() is invoked to update the reference to the stats structure by assigning it to the new CPU.
However, in the resume path, during CPU online, we end up assigning a fresh CPU as the policy->cpu, without letting cpufreq-stats know about this. Thus the reference to the stats structure remains (incorrectly) associated with the old CPU. So, in a subsequent suspend attempt, during CPU offline, we end up accessing an incorrect location to get the stats structure, which eventually leads to the NULL pointer dereference. Fix this by letting cpufreq-stats know about the update of the policy->cpu during CPU online in the resume path. (Also, move the update_policy_cpu() function higher up in the file, so that __cpufreq_add_dev() can invoke it). Reported-and-tested-by: Stephen Warren Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 5a64f66d36e0..62bdb955ea56 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -947,6 +947,18 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy) kfree(policy); } +static void update_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu) +{ + policy->last_cpu = policy->cpu; + policy->cpu = cpu; + +#ifdef CONFIG_CPU_FREQ_TABLE + cpufreq_frequency_table_update_policy_cpu(policy); +#endif + blocking_notifier_call_chain(&cpufreq_policy_notifier_list, + CPUFREQ_UPDATE_POLICY_CPU, policy); +} + static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif, bool frozen) { @@ -1000,7 +1012,18 @@ static int __cpufreq_add_dev(struct device *dev, struct subsys_interface *sif, if (!policy) goto nomem_out; - policy->cpu = cpu; + + /* + * In the resume path, since we restore a saved policy, the assignment + * to policy->cpu is like an update of the existing policy, rather than + * the creation of a brand new one. So we need to perform this update + * by invoking update_policy_cpu(). 
+ */ + if (frozen && cpu != policy->cpu) + update_policy_cpu(policy, cpu); + else + policy->cpu = cpu; + policy->governor = CPUFREQ_DEFAULT_GOVERNOR; cpumask_copy(policy->cpus, cpumask_of(cpu)); @@ -1092,18 +1115,6 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif) return __cpufreq_add_dev(dev, sif, false); } -static void update_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu) -{ - policy->last_cpu = policy->cpu; - policy->cpu = cpu; - -#ifdef CONFIG_CPU_FREQ_TABLE - cpufreq_frequency_table_update_policy_cpu(policy); -#endif - blocking_notifier_call_chain(&cpufreq_policy_notifier_list, - CPUFREQ_UPDATE_POLICY_CPU, policy); -} - static int cpufreq_nominate_new_policy_cpu(struct cpufreq_policy *policy, unsigned int old_cpu, bool frozen) { From 61173f256a3bebfbd09b4bd2c164dde378614091 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 12 Sep 2013 01:43:25 +0530 Subject: [PATCH 23/25] cpufreq: Restructure if/else block to avoid unintended behavior In __cpufreq_remove_dev_prepare(), the code which decides whether to remove the sysfs link or nominate a new policy cpu, is governed by an if/else block with a rather complex set of conditionals. Worse, they harbor a subtlety which leads to certain unintended behavior. The code looks like this: if (cpu != policy->cpu && !frozen) { sysfs_remove_link(&dev->kobj, "cpufreq"); } else if (cpus > 1) { new_cpu = cpufreq_nominate_new_policy_cpu(...); ... update_policy_cpu(..., new_cpu); } The original intention was: If the CPU going offline is not policy->cpu, just remove the link. On the other hand, if the CPU going offline is the policy->cpu itself, handover the policy->cpu job to some other surviving CPU in that policy. But because the 'if' condition also includes the 'frozen' check, now there are *two* possibilities by which we can enter the 'else' block: 1. cpu == policy->cpu (intended) 2. 
cpu != policy->cpu && frozen (unintended) Due to the second (unintended) scenario, we end up spuriously nominating a CPU as the policy->cpu, even when the existing policy->cpu is alive and well. This can cause problems further down the line, especially when we end up nominating the same policy->cpu as the new one (i.e., old == new), because it totally confuses update_policy_cpu(). To avoid this mess, restructure the if/else block to only do what was originally intended, and thus prevent any unwelcome surprises. Signed-off-by: Srivatsa S. Bhat Tested-by: Stephen Warren Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 62bdb955ea56..247842b2ee2d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1193,8 +1193,9 @@ static int __cpufreq_remove_dev_prepare(struct device *dev, cpumask_clear_cpu(cpu, policy->cpus); unlock_policy_rwsem_write(cpu); - if (cpu != policy->cpu && !frozen) { - sysfs_remove_link(&dev->kobj, "cpufreq"); + if (cpu != policy->cpu) { + if (!frozen) + sysfs_remove_link(&dev->kobj, "cpufreq"); } else if (cpus > 1) { new_cpu = cpufreq_nominate_new_policy_cpu(policy, cpu, frozen); From cb38ed5cf1c4fdb7454e4b48fb70c396f5acfb21 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 12 Sep 2013 01:43:42 +0530 Subject: [PATCH 24/25] cpufreq: Prevent problems in update_policy_cpu() if last_cpu == new_cpu If update_policy_cpu() is invoked with the existing policy->cpu itself as the new-cpu parameter, then a lot of things can go terribly wrong. In its present form, update_policy_cpu() always assumes that the new-cpu is different from policy->cpu and invokes other functions to perform their respective updates.
And those functions implement the actual update like this: per_cpu(..., new_cpu) = per_cpu(..., last_cpu); per_cpu(..., last_cpu) = NULL; Thus, when new_cpu == last_cpu, the final NULL assignment makes the per-cpu references vanish into thin air! (memory leak). From there, it leads to more problems: cpufreq_stats_create_table() now doesn't find the per-cpu reference and hence tries to create a new sysfs-group; but sysfs had already created the group earlier, so it complains that it cannot create a duplicate filename. In short, the repercussions of a rather innocuous invocation of update_policy_cpu() can turn out to be pretty nasty. Ideally update_policy_cpu() should handle this situation (new == last) gracefully, and not lead to such severe problems. So fix it by adding an appropriate check. Signed-off-by: Srivatsa S. Bhat Tested-by: Stephen Warren Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 247842b2ee2d..d32040cc1c46 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -949,6 +949,9 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy) static void update_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu) { + if (cpu == policy->cpu) + return; + policy->last_cpu = policy->cpu; policy->cpu = cpu; From 44871c9c7f7963f8869dd8bc9620221c9e9db153 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Wed, 11 Sep 2013 15:05:05 +0800 Subject: [PATCH 25/25] cpufreq: Acquire the lock in cpufreq_policy_restore() for reading In cpufreq_policy_restore(), the policy saved before system suspend is read from the per-CPU cpufreq_cpu_data_fallback variable. It's a read operation rather than a write one, so take the lock for reading there. Signed-off-by: Lan Tianyu Reviewed-by: Srivatsa S. Bhat Acked-by: Viresh Kumar Signed-off-by: Rafael J.
Wysocki --- drivers/cpufreq/cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d32040cc1c46..43c24aa756f6 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -906,11 +906,11 @@ static struct cpufreq_policy *cpufreq_policy_restore(unsigned int cpu) struct cpufreq_policy *policy; unsigned long flags; - write_lock_irqsave(&cpufreq_driver_lock, flags); + read_lock_irqsave(&cpufreq_driver_lock, flags); policy = per_cpu(cpufreq_cpu_data_fallback, cpu); - write_unlock_irqrestore(&cpufreq_driver_lock, flags); + read_unlock_irqrestore(&cpufreq_driver_lock, flags); return policy; }