kernel-fxtec-pro1x/kernel/time/tick-sched.c

1431 lines
35 KiB
C
Raw Normal View History

/*
* linux/kernel/time/tick-sched.c
*
* Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
*
* No idle tick implementation for low and high resolution timers
*
* Started by: Thomas Gleixner and Ingo Molnar
*
* Distribute under GPLv2.
*/
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
#include <linux/nmi.h>
#include <linux/profile.h>
#include <linux/sched/signal.h>
#include <linux/sched/clock.h>
#include <linux/sched/stat.h>
#include <linux/sched/nohz.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/timer.h>
#include <linux/context_tracking.h>
#include <linux/mm.h>
#include <linux/rq_stats.h>
#include <asm/irq_regs.h>
#include "tick-internal.h"
#include <trace/events/timer.h>
struct rq_data rq_info;
struct workqueue_struct *rq_wq;
spinlock_t rq_lock;
/*
* Per-CPU nohz control structure
*/
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
[PATCH] Add debugging feature /proc/timer_list add /proc/timer_list, which prints all currently pending (high-res) timers, all clock-event sources and their parameters in a human-readable form. Sample output: Timer List Version: v0.1 HRTIMER_MAX_CLOCK_BASES: 2 now at 4246046273872 nsecs cpu: 0 clock 0: .index: 0 .resolution: 1 nsecs .get_time: ktime_get_real .offset: 1273998312645738432 nsecs active timers: clock 1: .index: 1 .resolution: 1 nsecs .get_time: ktime_get .offset: 0 nsecs active timers: #0: <f5a90ec8>, hrtimer_sched_tick, hrtimer_stop_sched_tick, swapper/0 # expires at 4246432689566 nsecs [in 386415694 nsecs] #1: <f5a90ec8>, hrtimer_wakeup, do_nanosleep, pcscd/2050 # expires at 4247018194689 nsecs [in 971920817 nsecs] #2: <f5a90ec8>, hrtimer_wakeup, do_nanosleep, irqbalance/1909 # expires at 4247351358392 nsecs [in 1305084520 nsecs] #3: <f5a90ec8>, hrtimer_wakeup, do_nanosleep, crond/2157 # expires at 4249097614968 nsecs [in 3051341096 nsecs] #4: <f5a90ec8>, it_real_fn, do_setitimer, syslogd/1888 # expires at 4251329900926 nsecs [in 5283627054 nsecs] .expires_next : 4246432689566 nsecs .hres_active : 1 .check_clocks : 0 .nr_events : 31306 .idle_tick : 4246020791890 nsecs .tick_stopped : 1 .idle_jiffies : 986504 .idle_calls : 40700 .idle_sleeps : 36014 .idle_entrytime : 4246019418883 nsecs .idle_sleeptime : 4178181972709 nsecs cpu: 1 clock 0: .index: 0 .resolution: 1 nsecs .get_time: ktime_get_real .offset: 1273998312645738432 nsecs active timers: clock 1: .index: 1 .resolution: 1 nsecs .get_time: ktime_get .offset: 0 nsecs active timers: #0: <f5a90ec8>, hrtimer_sched_tick, hrtimer_restart_sched_tick, swapper/0 # expires at 4246050084568 nsecs [in 3810696 nsecs] #1: <f5a90ec8>, hrtimer_wakeup, do_nanosleep, atd/2227 # expires at 4261010635003 nsecs [in 14964361131 nsecs] #2: <f5a90ec8>, hrtimer_wakeup, do_nanosleep, smartd/2332 # expires at 5469485798970 nsecs [in 1223439525098 nsecs] .expires_next : 4246050084568 nsecs .hres_active : 1 .check_clocks : 0 .nr_events : 24043 .idle_tick : 4246046084568 nsecs .tick_stopped : 0 .idle_jiffies : 986510 .idle_calls : 26360 .idle_sleeps : 22551 .idle_entrytime : 4246043874339 nsecs .idle_sleeptime : 4170763761184 nsecs tick_broadcast_mask: 00000003 event_broadcast_mask: 00000001 CPU#0's local event device: Clock Event Device: lapic capabilities: 0000000e max_delta_ns: 807385544 min_delta_ns: 1443 mult: 44624025 shift: 32 set_next_event: lapic_next_event set_mode: lapic_timer_setup event_handler: hrtimer_interrupt .installed: 1 .expires: 4246432689566 nsecs CPU#1's local event device: Clock Event Device: lapic capabilities: 0000000e max_delta_ns: 807385544 min_delta_ns: 1443 mult: 44624025 shift: 32 set_next_event: lapic_next_event set_mode: lapic_timer_setup event_handler: hrtimer_interrupt .installed: 1 .expires: 4246050084568 nsecs Clock Event Device: hpet capabilities: 00000007 max_delta_ns: 2147483647 min_delta_ns: 3352 mult: 61496110 shift: 32 set_next_event: hpet_next_event set_mode: hpet_set_mode event_handler: handle_nextevt_broadcast Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: john stultz <johnstul@us.ibm.com> Cc: Roman Zippel <zippel@linux-m68k.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-16 02:28:15 -07:00
struct tick_sched *tick_get_tick_sched(int cpu)
{
return &per_cpu(tick_cpu_sched, cpu);
}
#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
/*
* The time, when the last jiffy update happened. Protected by jiffies_lock.
*/
static ktime_t last_jiffies_update;
/*
* Must be called with interrupts disabled !
*/
static void tick_do_update_jiffies64(ktime_t now)
{
unsigned long ticks = 0;
ktime_t delta;
/*
* Do a quick check without holding jiffies_lock:
tick/sched: Annotate lockless access to last_jiffies_update commit de95a991bb72e009f47e0c4bbc90fc5f594588d5 upstream. syzbot (KCSAN) reported a data-race in tick_do_update_jiffies64(): BUG: KCSAN: data-race in tick_do_update_jiffies64 / tick_do_update_jiffies64 write to 0xffffffff8603d008 of 8 bytes by interrupt on cpu 1: tick_do_update_jiffies64+0x100/0x250 kernel/time/tick-sched.c:73 tick_sched_do_timer+0xd4/0xe0 kernel/time/tick-sched.c:138 tick_sched_timer+0x43/0xe0 kernel/time/tick-sched.c:1292 __run_hrtimer kernel/time/hrtimer.c:1514 [inline] __hrtimer_run_queues+0x274/0x5f0 kernel/time/hrtimer.c:1576 hrtimer_interrupt+0x22a/0x480 kernel/time/hrtimer.c:1638 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1110 [inline] smp_apic_timer_interrupt+0xdc/0x280 arch/x86/kernel/apic/apic.c:1135 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 arch_local_irq_restore arch/x86/include/asm/paravirt.h:756 [inline] kcsan_setup_watchpoint+0x1d4/0x460 kernel/kcsan/core.c:436 check_access kernel/kcsan/core.c:466 [inline] __tsan_read1 kernel/kcsan/core.c:593 [inline] __tsan_read1+0xc2/0x100 kernel/kcsan/core.c:593 kallsyms_expand_symbol.constprop.0+0x70/0x160 kernel/kallsyms.c:79 kallsyms_lookup_name+0x7f/0x120 kernel/kallsyms.c:170 insert_report_filterlist kernel/kcsan/debugfs.c:155 [inline] debugfs_write+0x14b/0x2d0 kernel/kcsan/debugfs.c:256 full_proxy_write+0xbd/0x100 fs/debugfs/file.c:225 __vfs_write+0x67/0xc0 fs/read_write.c:494 vfs_write fs/read_write.c:558 [inline] vfs_write+0x18a/0x390 fs/read_write.c:542 ksys_write+0xd5/0x1b0 fs/read_write.c:611 __do_sys_write fs/read_write.c:623 [inline] __se_sys_write fs/read_write.c:620 [inline] __x64_sys_write+0x4c/0x60 fs/read_write.c:620 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 read to 0xffffffff8603d008 of 8 bytes by task 0 on cpu 0: tick_do_update_jiffies64+0x2b/0x250 kernel/time/tick-sched.c:62 tick_nohz_update_jiffies kernel/time/tick-sched.c:505 [inline] tick_nohz_irq_enter kernel/time/tick-sched.c:1257 [inline] tick_irq_enter+0x139/0x1c0 kernel/time/tick-sched.c:1274 irq_enter+0x4f/0x60 kernel/softirq.c:354 entering_irq arch/x86/include/asm/apic.h:517 [inline] entering_ack_irq arch/x86/include/asm/apic.h:523 [inline] smp_apic_timer_interrupt+0x55/0x280 arch/x86/kernel/apic/apic.c:1133 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:60 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:571 default_idle_call+0x1e/0x40 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x1af/0x280 kernel/sched/idle.c:263 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355 rest_init+0xec/0xf6 init/main.c:452 arch_call_rest_init+0x17/0x37 start_kernel+0x838/0x85e init/main.c:786 x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:490 x86_64_start_kernel+0x72/0x76 arch/x86/kernel/head64.c:471 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.4.0-rc7+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Use READ_ONCE() and WRITE_ONCE() to annotate this expected race. Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Link: https://lore.kernel.org/r/20191205045619.204946-1-edumazet@google.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-12-04 21:56:19 -07:00
* The READ_ONCE() pairs with two updates done later in this function.
*/
tick/sched: Annotate lockless access to last_jiffies_update commit de95a991bb72e009f47e0c4bbc90fc5f594588d5 upstream. syzbot (KCSAN) reported a data-race in tick_do_update_jiffies64(): BUG: KCSAN: data-race in tick_do_update_jiffies64 / tick_do_update_jiffies64 write to 0xffffffff8603d008 of 8 bytes by interrupt on cpu 1: tick_do_update_jiffies64+0x100/0x250 kernel/time/tick-sched.c:73 tick_sched_do_timer+0xd4/0xe0 kernel/time/tick-sched.c:138 tick_sched_timer+0x43/0xe0 kernel/time/tick-sched.c:1292 __run_hrtimer kernel/time/hrtimer.c:1514 [inline] __hrtimer_run_queues+0x274/0x5f0 kernel/time/hrtimer.c:1576 hrtimer_interrupt+0x22a/0x480 kernel/time/hrtimer.c:1638 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1110 [inline] smp_apic_timer_interrupt+0xdc/0x280 arch/x86/kernel/apic/apic.c:1135 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 arch_local_irq_restore arch/x86/include/asm/paravirt.h:756 [inline] kcsan_setup_watchpoint+0x1d4/0x460 kernel/kcsan/core.c:436 check_access kernel/kcsan/core.c:466 [inline] __tsan_read1 kernel/kcsan/core.c:593 [inline] __tsan_read1+0xc2/0x100 kernel/kcsan/core.c:593 kallsyms_expand_symbol.constprop.0+0x70/0x160 kernel/kallsyms.c:79 kallsyms_lookup_name+0x7f/0x120 kernel/kallsyms.c:170 insert_report_filterlist kernel/kcsan/debugfs.c:155 [inline] debugfs_write+0x14b/0x2d0 kernel/kcsan/debugfs.c:256 full_proxy_write+0xbd/0x100 fs/debugfs/file.c:225 __vfs_write+0x67/0xc0 fs/read_write.c:494 vfs_write fs/read_write.c:558 [inline] vfs_write+0x18a/0x390 fs/read_write.c:542 ksys_write+0xd5/0x1b0 fs/read_write.c:611 __do_sys_write fs/read_write.c:623 [inline] __se_sys_write fs/read_write.c:620 [inline] __x64_sys_write+0x4c/0x60 fs/read_write.c:620 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 read to 0xffffffff8603d008 of 8 bytes by task 0 on cpu 0: tick_do_update_jiffies64+0x2b/0x250 kernel/time/tick-sched.c:62 tick_nohz_update_jiffies kernel/time/tick-sched.c:505 [inline] tick_nohz_irq_enter kernel/time/tick-sched.c:1257 [inline] tick_irq_enter+0x139/0x1c0 kernel/time/tick-sched.c:1274 irq_enter+0x4f/0x60 kernel/softirq.c:354 entering_irq arch/x86/include/asm/apic.h:517 [inline] entering_ack_irq arch/x86/include/asm/apic.h:523 [inline] smp_apic_timer_interrupt+0x55/0x280 arch/x86/kernel/apic/apic.c:1133 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:60 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:571 default_idle_call+0x1e/0x40 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x1af/0x280 kernel/sched/idle.c:263 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355 rest_init+0xec/0xf6 init/main.c:452 arch_call_rest_init+0x17/0x37 start_kernel+0x838/0x85e init/main.c:786 x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:490 x86_64_start_kernel+0x72/0x76 arch/x86/kernel/head64.c:471 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.4.0-rc7+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Use READ_ONCE() and WRITE_ONCE() to annotate this expected race. Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Link: https://lore.kernel.org/r/20191205045619.204946-1-edumazet@google.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-12-04 21:56:19 -07:00
delta = ktime_sub(now, READ_ONCE(last_jiffies_update));
if (delta < tick_period)
return;
/* Reevaluate with jiffies_lock held */
write_seqlock(&jiffies_lock);
delta = ktime_sub(now, last_jiffies_update);
if (delta >= tick_period) {
delta = ktime_sub(delta, tick_period);
tick/sched: Annotate lockless access to last_jiffies_update commit de95a991bb72e009f47e0c4bbc90fc5f594588d5 upstream. syzbot (KCSAN) reported a data-race in tick_do_update_jiffies64(): BUG: KCSAN: data-race in tick_do_update_jiffies64 / tick_do_update_jiffies64 write to 0xffffffff8603d008 of 8 bytes by interrupt on cpu 1: tick_do_update_jiffies64+0x100/0x250 kernel/time/tick-sched.c:73 tick_sched_do_timer+0xd4/0xe0 kernel/time/tick-sched.c:138 tick_sched_timer+0x43/0xe0 kernel/time/tick-sched.c:1292 __run_hrtimer kernel/time/hrtimer.c:1514 [inline] __hrtimer_run_queues+0x274/0x5f0 kernel/time/hrtimer.c:1576 hrtimer_interrupt+0x22a/0x480 kernel/time/hrtimer.c:1638 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1110 [inline] smp_apic_timer_interrupt+0xdc/0x280 arch/x86/kernel/apic/apic.c:1135 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 arch_local_irq_restore arch/x86/include/asm/paravirt.h:756 [inline] kcsan_setup_watchpoint+0x1d4/0x460 kernel/kcsan/core.c:436 check_access kernel/kcsan/core.c:466 [inline] __tsan_read1 kernel/kcsan/core.c:593 [inline] __tsan_read1+0xc2/0x100 kernel/kcsan/core.c:593 kallsyms_expand_symbol.constprop.0+0x70/0x160 kernel/kallsyms.c:79 kallsyms_lookup_name+0x7f/0x120 kernel/kallsyms.c:170 insert_report_filterlist kernel/kcsan/debugfs.c:155 [inline] debugfs_write+0x14b/0x2d0 kernel/kcsan/debugfs.c:256 full_proxy_write+0xbd/0x100 fs/debugfs/file.c:225 __vfs_write+0x67/0xc0 fs/read_write.c:494 vfs_write fs/read_write.c:558 [inline] vfs_write+0x18a/0x390 fs/read_write.c:542 ksys_write+0xd5/0x1b0 fs/read_write.c:611 __do_sys_write fs/read_write.c:623 [inline] __se_sys_write fs/read_write.c:620 [inline] __x64_sys_write+0x4c/0x60 fs/read_write.c:620 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 read to 0xffffffff8603d008 of 8 bytes by task 0 on cpu 0: tick_do_update_jiffies64+0x2b/0x250 kernel/time/tick-sched.c:62 tick_nohz_update_jiffies kernel/time/tick-sched.c:505 [inline] tick_nohz_irq_enter kernel/time/tick-sched.c:1257 [inline] tick_irq_enter+0x139/0x1c0 kernel/time/tick-sched.c:1274 irq_enter+0x4f/0x60 kernel/softirq.c:354 entering_irq arch/x86/include/asm/apic.h:517 [inline] entering_ack_irq arch/x86/include/asm/apic.h:523 [inline] smp_apic_timer_interrupt+0x55/0x280 arch/x86/kernel/apic/apic.c:1133 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:60 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:571 default_idle_call+0x1e/0x40 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x1af/0x280 kernel/sched/idle.c:263 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355 rest_init+0xec/0xf6 init/main.c:452 arch_call_rest_init+0x17/0x37 start_kernel+0x838/0x85e init/main.c:786 x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:490 x86_64_start_kernel+0x72/0x76 arch/x86/kernel/head64.c:471 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.4.0-rc7+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Use READ_ONCE() and WRITE_ONCE() to annotate this expected race. Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Link: https://lore.kernel.org/r/20191205045619.204946-1-edumazet@google.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-12-04 21:56:19 -07:00
/* Pairs with the lockless read in this function. */
WRITE_ONCE(last_jiffies_update,
ktime_add(last_jiffies_update, tick_period));
/* Slow path for long timeouts */
if (unlikely(delta >= tick_period)) {
s64 incr = ktime_to_ns(tick_period);
ticks = ktime_divns(delta, incr);
tick/sched: Annotate lockless access to last_jiffies_update commit de95a991bb72e009f47e0c4bbc90fc5f594588d5 upstream. syzbot (KCSAN) reported a data-race in tick_do_update_jiffies64(): BUG: KCSAN: data-race in tick_do_update_jiffies64 / tick_do_update_jiffies64 write to 0xffffffff8603d008 of 8 bytes by interrupt on cpu 1: tick_do_update_jiffies64+0x100/0x250 kernel/time/tick-sched.c:73 tick_sched_do_timer+0xd4/0xe0 kernel/time/tick-sched.c:138 tick_sched_timer+0x43/0xe0 kernel/time/tick-sched.c:1292 __run_hrtimer kernel/time/hrtimer.c:1514 [inline] __hrtimer_run_queues+0x274/0x5f0 kernel/time/hrtimer.c:1576 hrtimer_interrupt+0x22a/0x480 kernel/time/hrtimer.c:1638 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1110 [inline] smp_apic_timer_interrupt+0xdc/0x280 arch/x86/kernel/apic/apic.c:1135 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 arch_local_irq_restore arch/x86/include/asm/paravirt.h:756 [inline] kcsan_setup_watchpoint+0x1d4/0x460 kernel/kcsan/core.c:436 check_access kernel/kcsan/core.c:466 [inline] __tsan_read1 kernel/kcsan/core.c:593 [inline] __tsan_read1+0xc2/0x100 kernel/kcsan/core.c:593 kallsyms_expand_symbol.constprop.0+0x70/0x160 kernel/kallsyms.c:79 kallsyms_lookup_name+0x7f/0x120 kernel/kallsyms.c:170 insert_report_filterlist kernel/kcsan/debugfs.c:155 [inline] debugfs_write+0x14b/0x2d0 kernel/kcsan/debugfs.c:256 full_proxy_write+0xbd/0x100 fs/debugfs/file.c:225 __vfs_write+0x67/0xc0 fs/read_write.c:494 vfs_write fs/read_write.c:558 [inline] vfs_write+0x18a/0x390 fs/read_write.c:542 ksys_write+0xd5/0x1b0 fs/read_write.c:611 __do_sys_write fs/read_write.c:623 [inline] __se_sys_write fs/read_write.c:620 [inline] __x64_sys_write+0x4c/0x60 fs/read_write.c:620 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 read to 0xffffffff8603d008 of 8 bytes by task 0 on cpu 0: tick_do_update_jiffies64+0x2b/0x250 kernel/time/tick-sched.c:62 tick_nohz_update_jiffies kernel/time/tick-sched.c:505 [inline] tick_nohz_irq_enter kernel/time/tick-sched.c:1257 [inline] tick_irq_enter+0x139/0x1c0 kernel/time/tick-sched.c:1274 irq_enter+0x4f/0x60 kernel/softirq.c:354 entering_irq arch/x86/include/asm/apic.h:517 [inline] entering_ack_irq arch/x86/include/asm/apic.h:523 [inline] smp_apic_timer_interrupt+0x55/0x280 arch/x86/kernel/apic/apic.c:1133 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:60 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:571 default_idle_call+0x1e/0x40 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x1af/0x280 kernel/sched/idle.c:263 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355 rest_init+0xec/0xf6 init/main.c:452 arch_call_rest_init+0x17/0x37 start_kernel+0x838/0x85e init/main.c:786 x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:490 x86_64_start_kernel+0x72/0x76 arch/x86/kernel/head64.c:471 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.4.0-rc7+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Use READ_ONCE() and WRITE_ONCE() to annotate this expected race. Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Link: https://lore.kernel.org/r/20191205045619.204946-1-edumazet@google.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-12-04 21:56:19 -07:00
/* Pairs with the lockless read in this function. */
WRITE_ONCE(last_jiffies_update,
ktime_add_ns(last_jiffies_update,
incr * ticks));
}
do_timer(++ticks);
/* Keep the tick_next_period variable up to date */
tick_next_period = ktime_add(last_jiffies_update, tick_period);
} else {
write_sequnlock(&jiffies_lock);
return;
}
write_sequnlock(&jiffies_lock);
update_wall_time();
}
/*
* Initialize and return retrieve the jiffies update.
*/
static ktime_t tick_init_jiffy_update(void)
{
ktime_t period;
write_seqlock(&jiffies_lock);
/* Did we start the jiffies update yet ? */
if (last_jiffies_update == 0)
last_jiffies_update = tick_next_period;
period = last_jiffies_update;
write_sequnlock(&jiffies_lock);
return period;
}
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
int cpu = smp_processor_id();
nohz: Rename CONFIG_NO_HZ to CONFIG_NO_HZ_COMMON We are planning to convert the dynticks Kconfig options layout into a choice menu. The user must be able to easily pick any of the following implementations: constant periodic tick, idle dynticks, full dynticks. As this implies a mutual exclusion, the two dynticks implementions need to converge on the selection of a common Kconfig option in order to ease the sharing of a common infrastructure. It would thus seem pretty natural to reuse CONFIG_NO_HZ to that end. It already implements all the idle dynticks code and the full dynticks depends on all that code for now. So ideally the choice menu would propose CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED then both would select CONFIG_NO_HZ. On the other hand we want to stay backward compatible: if CONFIG_NO_HZ is set in an older config file, we want to enable CONFIG_NO_HZ_IDLE by default. But we can't afford both at the same time or we run into a circular dependency: 1) CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED both select CONFIG_NO_HZ 2) If CONFIG_NO_HZ is set, we default to CONFIG_NO_HZ_IDLE We might be able to support that from Kconfig/Kbuild but it may not be wise to introduce such a confusing behaviour. So to solve this, create a new CONFIG_NO_HZ_COMMON option which gathers the common code between idle and full dynticks (that common code for now is simply the idle dynticks code) and select it from their referring Kconfig. Then we'll later create CONFIG_NO_HZ_IDLE and map CONFIG_NO_HZ to it for backward compatibility. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Christoph Lameter <cl@linux.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Gilad Ben Yossef <gilad@benyossef.com> Cc: Hakan Akkan <hakanakkan@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Kevin Hilman <khilman@linaro.org> Cc: Li Zhong <zhong@linux.vnet.ibm.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de>
2011-08-10 15:21:01 -06:00
#ifdef CONFIG_NO_HZ_COMMON
/*
* Check if the do_timer duty was dropped. We don't care about
* concurrency: This happens only when the CPU in charge went
* into a long sleep. If two CPUs happen to assign themselves to
* this duty, then the jiffies update is still serialized by
* jiffies_lock.
*/
if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
&& !tick_nohz_full_cpu(cpu))
tick_do_timer_cpu = cpu;
#endif
/* Check, if the jiffies need an update */
if (tick_do_timer_cpu == cpu)
tick_do_update_jiffies64(now);
if (ts->inidle)
ts->got_idle_tick = 1;
}
static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
{
nohz: Rename CONFIG_NO_HZ to CONFIG_NO_HZ_COMMON We are planning to convert the dynticks Kconfig options layout into a choice menu. The user must be able to easily pick any of the following implementations: constant periodic tick, idle dynticks, full dynticks. As this implies a mutual exclusion, the two dynticks implementions need to converge on the selection of a common Kconfig option in order to ease the sharing of a common infrastructure. It would thus seem pretty natural to reuse CONFIG_NO_HZ to that end. It already implements all the idle dynticks code and the full dynticks depends on all that code for now. So ideally the choice menu would propose CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED then both would select CONFIG_NO_HZ. On the other hand we want to stay backward compatible: if CONFIG_NO_HZ is set in an older config file, we want to enable CONFIG_NO_HZ_IDLE by default. But we can't afford both at the same time or we run into a circular dependency: 1) CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED both select CONFIG_NO_HZ 2) If CONFIG_NO_HZ is set, we default to CONFIG_NO_HZ_IDLE We might be able to support that from Kconfig/Kbuild but it may not be wise to introduce such a confusing behaviour. So to solve this, create a new CONFIG_NO_HZ_COMMON option which gathers the common code between idle and full dynticks (that common code for now is simply the idle dynticks code) and select it from their referring Kconfig. Then we'll later create CONFIG_NO_HZ_IDLE and map CONFIG_NO_HZ to it for backward compatibility. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Christoph Lameter <cl@linux.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Gilad Ben Yossef <gilad@benyossef.com> Cc: Hakan Akkan <hakanakkan@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Kevin Hilman <khilman@linaro.org> Cc: Li Zhong <zhong@linux.vnet.ibm.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de>
2011-08-10 15:21:01 -06:00
#ifdef CONFIG_NO_HZ_COMMON
/*
* When we are idle and the tick is stopped, we have to touch
* the watchdog as we might not schedule for a really long
* time. This happens on complete idle SMP systems while
* waiting on the login prompt. We also increment the "start of
* idle" jiffy stamp so the idle accounting adjustment we do
* when we go busy again does not account too much ticks.
*/
if (ts->tick_stopped) {
touch_softlockup_watchdog_sched();
if (is_idle_task(current))
ts->idle_jiffies++;
nohz: Fix collision between tick and other hrtimers, again This restores commit: 24b91e360ef5: ("nohz: Fix collision between tick and other hrtimers") ... which got reverted by commit: 558e8e27e73f: ('Revert "nohz: Fix collision between tick and other hrtimers"') ... due to a regression where CPUs spuriously stopped ticking. The bug happened when a tick fired too early past its expected expiration: on IRQ exit the tick was scheduled again to the same deadline but skipped reprogramming because ts->next_tick still kept in cache the deadline. This has been fixed now with resetting ts->next_tick from the tick itself. Extra care has also been taken to prevent from obsolete values throughout CPU hotplug operations. When the tick is stopped and an interrupt occurs afterward, we check on that interrupt exit if the next tick needs to be rescheduled. If it doesn't need any update, we don't want to do anything. In order to check if the tick needs an update, we compare it against the clockevent device deadline. Now that's a problem because the clockevent device is at a lower level than the tick itself if it is implemented on top of hrtimer. Every hrtimer share this clockevent device. So comparing the next tick deadline against the clockevent device deadline is wrong because the device may be programmed for another hrtimer whose deadline collides with the tick. As a result we may end up not reprogramming the tick accidentally. In a worst case scenario under full dynticks mode, the tick stops firing as it is supposed to every 1hz, leaving /proc/stat stalled: Task in a full dynticks CPU ---------------------------- * hrtimer A is queued 2 seconds ahead * the tick is stopped, scheduled 1 second ahead * tick fires 1 second later * on tick exit, nohz schedules the tick 1 second ahead but sees the clockevent device is already programmed to that deadline, fooled by hrtimer A, the tick isn't rescheduled. * hrtimer A is cancelled before its deadline * tick never fires again until an interrupt happens... In order to fix this, store the next tick deadline to the tick_sched local structure and reuse that value later to check whether we need to reprogram the clock after an interrupt. On the other hand, ts->sleep_length still wants to know about the next clock event and not just the tick, so we want to improve the related comment to avoid confusion. Reported-and-tested-by: Tim Wright <tim@binbash.co.uk> Reported-and-tested-by: Pavel Machek <pavel@ucw.cz> Reported-by: James Hartsock <hartsjc@redhat.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Rik van Riel <riel@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1492783255-5051-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-21 08:00:54 -06:00
/*
* In case the current tick fired too early past its expected
* expiration, make sure we don't bypass the next clock reprogramming
* to the same deadline.
*/
ts->next_tick = 0;
}
#endif
update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING);
}
#endif
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
bool tick_nohz_full_running;
static atomic_t tick_dep_mask;
nohz: Basic full dynticks interface For extreme usecases such as Real Time or HPC, having the ability to shutdown the tick when a single task runs on a CPU is a desired feature: * Reducing the amount of interrupts improves throughput for CPU-bound tasks. The CPU is less distracted from its real job, from an execution time and from the cache point of views. * This also improve latency response as we have less critical sections. Start with introducing a very simple interface to define full dynticks CPU: use a boot time option defined cpumask through the "nohz_extended=" kernel parameter. CPUs that are part of this range will have their tick shutdown whenever possible: provided they run a single task and they don't do kernel activity that require the periodic tick. These details will be later documented in Documentation/* An online CPU must be kept outside this range to handle the timekeeping. Suggested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Christoph Lameter <cl@linux.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Gilad Ben Yossef <gilad@benyossef.com> Cc: Hakan Akkan <hakanakkan@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Kevin Hilman <khilman@linaro.org> Cc: Li Zhong <zhong@linux.vnet.ibm.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de>
2012-12-18 09:32:19 -07:00
static bool check_tick_dependency(atomic_t *dep)
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
{
int val = atomic_read(dep);
if (val & TICK_DEP_MASK_POSIX_TIMER) {
trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
return true;
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
}
if (val & TICK_DEP_MASK_PERF_EVENTS) {
trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
return true;
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
}
if (val & TICK_DEP_MASK_SCHED) {
trace_tick_stop(0, TICK_DEP_MASK_SCHED);
return true;
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
}
if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) {
trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
return true;
}
return false;
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
}
static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
{
lockdep_assert_irqs_disabled();
if (unlikely(!cpu_online(cpu)))
return false;
if (check_tick_dependency(&tick_dep_mask))
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
return false;
if (check_tick_dependency(&ts->tick_dep_mask))
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
return false;
if (check_tick_dependency(&current->tick_dep_mask))
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
return false;
if (check_tick_dependency(&current->signal->tick_dep_mask))
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
return false;
return true;
}
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
static void nohz_full_kick_func(struct irq_work *work)
{
/* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
.func = nohz_full_kick_func,
};
nohz: Restore NMI safe local irq work for local nohz kick The local nohz kick is currently used by perf which needs it to be NMI-safe. Recent commit though (7d1311b93e58ed55f3a31cc8f94c4b8fe988a2b9) changed its implementation to fire the local kick using the remote kick API. It was convenient to make the code more generic but the remote kick isn't NMI-safe. As a result: WARNING: CPU: 3 PID: 18062 at kernel/irq_work.c:72 irq_work_queue_on+0x11e/0x140() CPU: 3 PID: 18062 Comm: trinity-subchil Not tainted 3.16.0+ #34 0000000000000009 00000000903774d1 ffff880244e06c00 ffffffff9a7f1e37 0000000000000000 ffff880244e06c38 ffffffff9a0791dd ffff880244fce180 0000000000000003 ffff880244e06d58 ffff880244e06ef8 0000000000000000 Call Trace: <NMI> [<ffffffff9a7f1e37>] dump_stack+0x4e/0x7a [<ffffffff9a0791dd>] warn_slowpath_common+0x7d/0xa0 [<ffffffff9a07930a>] warn_slowpath_null+0x1a/0x20 [<ffffffff9a17ca1e>] irq_work_queue_on+0x11e/0x140 [<ffffffff9a10a2c7>] tick_nohz_full_kick_cpu+0x57/0x90 [<ffffffff9a186cd5>] __perf_event_overflow+0x275/0x350 [<ffffffff9a184f80>] ? perf_event_task_disable+0xa0/0xa0 [<ffffffff9a01a4cf>] ? x86_perf_event_set_period+0xbf/0x150 [<ffffffff9a187934>] perf_event_overflow+0x14/0x20 [<ffffffff9a020386>] intel_pmu_handle_irq+0x206/0x410 [<ffffffff9a0b54d3>] ? arch_vtime_task_switch+0x63/0x130 [<ffffffff9a01937b>] perf_event_nmi_handler+0x2b/0x50 [<ffffffff9a007b72>] nmi_handle+0xd2/0x390 [<ffffffff9a007aa5>] ? nmi_handle+0x5/0x390 [<ffffffff9a0d131b>] ? lock_release+0xab/0x330 [<ffffffff9a008062>] default_do_nmi+0x72/0x1c0 [<ffffffff9a0c925f>] ? cpuacct_account_field+0xcf/0x200 [<ffffffff9a008268>] do_nmi+0xb8/0x100 Lets fix this by restoring the use of local irq work for the nohz local kick. Reported-by: Catalin Iacob <iacobcatalin@gmail.com> Reported-and-tested-by: Dave Jones <davej@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2014-08-13 10:50:16 -06:00
/*
* Kick this CPU if it's full dynticks in order to force it to
* re-evaluate its dependency on the tick and restart it if necessary.
* This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
* is NMI safe.
*/
static void tick_nohz_full_kick(void)
nohz: Restore NMI safe local irq work for local nohz kick The local nohz kick is currently used by perf which needs it to be NMI-safe. Recent commit though (7d1311b93e58ed55f3a31cc8f94c4b8fe988a2b9) changed its implementation to fire the local kick using the remote kick API. It was convenient to make the code more generic but the remote kick isn't NMI-safe. As a result: WARNING: CPU: 3 PID: 18062 at kernel/irq_work.c:72 irq_work_queue_on+0x11e/0x140() CPU: 3 PID: 18062 Comm: trinity-subchil Not tainted 3.16.0+ #34 0000000000000009 00000000903774d1 ffff880244e06c00 ffffffff9a7f1e37 0000000000000000 ffff880244e06c38 ffffffff9a0791dd ffff880244fce180 0000000000000003 ffff880244e06d58 ffff880244e06ef8 0000000000000000 Call Trace: <NMI> [<ffffffff9a7f1e37>] dump_stack+0x4e/0x7a [<ffffffff9a0791dd>] warn_slowpath_common+0x7d/0xa0 [<ffffffff9a07930a>] warn_slowpath_null+0x1a/0x20 [<ffffffff9a17ca1e>] irq_work_queue_on+0x11e/0x140 [<ffffffff9a10a2c7>] tick_nohz_full_kick_cpu+0x57/0x90 [<ffffffff9a186cd5>] __perf_event_overflow+0x275/0x350 [<ffffffff9a184f80>] ? perf_event_task_disable+0xa0/0xa0 [<ffffffff9a01a4cf>] ? x86_perf_event_set_period+0xbf/0x150 [<ffffffff9a187934>] perf_event_overflow+0x14/0x20 [<ffffffff9a020386>] intel_pmu_handle_irq+0x206/0x410 [<ffffffff9a0b54d3>] ? arch_vtime_task_switch+0x63/0x130 [<ffffffff9a01937b>] perf_event_nmi_handler+0x2b/0x50 [<ffffffff9a007b72>] nmi_handle+0xd2/0x390 [<ffffffff9a007aa5>] ? nmi_handle+0x5/0x390 [<ffffffff9a0d131b>] ? lock_release+0xab/0x330 [<ffffffff9a008062>] default_do_nmi+0x72/0x1c0 [<ffffffff9a0c925f>] ? cpuacct_account_field+0xcf/0x200 [<ffffffff9a008268>] do_nmi+0xb8/0x100 Lets fix this by restoring the use of local irq work for the nohz local kick. Reported-by: Catalin Iacob <iacobcatalin@gmail.com> Reported-and-tested-by: Dave Jones <davej@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2014-08-13 10:50:16 -06:00
{
if (!tick_nohz_full_cpu(smp_processor_id()))
return;
irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
nohz: Restore NMI safe local irq work for local nohz kick The local nohz kick is currently used by perf which needs it to be NMI-safe. Recent commit though (7d1311b93e58ed55f3a31cc8f94c4b8fe988a2b9) changed its implementation to fire the local kick using the remote kick API. It was convenient to make the code more generic but the remote kick isn't NMI-safe. As a result: WARNING: CPU: 3 PID: 18062 at kernel/irq_work.c:72 irq_work_queue_on+0x11e/0x140() CPU: 3 PID: 18062 Comm: trinity-subchil Not tainted 3.16.0+ #34 0000000000000009 00000000903774d1 ffff880244e06c00 ffffffff9a7f1e37 0000000000000000 ffff880244e06c38 ffffffff9a0791dd ffff880244fce180 0000000000000003 ffff880244e06d58 ffff880244e06ef8 0000000000000000 Call Trace: <NMI> [<ffffffff9a7f1e37>] dump_stack+0x4e/0x7a [<ffffffff9a0791dd>] warn_slowpath_common+0x7d/0xa0 [<ffffffff9a07930a>] warn_slowpath_null+0x1a/0x20 [<ffffffff9a17ca1e>] irq_work_queue_on+0x11e/0x140 [<ffffffff9a10a2c7>] tick_nohz_full_kick_cpu+0x57/0x90 [<ffffffff9a186cd5>] __perf_event_overflow+0x275/0x350 [<ffffffff9a184f80>] ? perf_event_task_disable+0xa0/0xa0 [<ffffffff9a01a4cf>] ? x86_perf_event_set_period+0xbf/0x150 [<ffffffff9a187934>] perf_event_overflow+0x14/0x20 [<ffffffff9a020386>] intel_pmu_handle_irq+0x206/0x410 [<ffffffff9a0b54d3>] ? arch_vtime_task_switch+0x63/0x130 [<ffffffff9a01937b>] perf_event_nmi_handler+0x2b/0x50 [<ffffffff9a007b72>] nmi_handle+0xd2/0x390 [<ffffffff9a007aa5>] ? nmi_handle+0x5/0x390 [<ffffffff9a0d131b>] ? lock_release+0xab/0x330 [<ffffffff9a008062>] default_do_nmi+0x72/0x1c0 [<ffffffff9a0c925f>] ? cpuacct_account_field+0xcf/0x200 [<ffffffff9a008268>] do_nmi+0xb8/0x100 Lets fix this by restoring the use of local irq work for the nohz local kick. Reported-by: Catalin Iacob <iacobcatalin@gmail.com> Reported-and-tested-by: Dave Jones <davej@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2014-08-13 10:50:16 -06:00
}
/*
* Kick the CPU if it's full dynticks in order to force it to
* re-evaluate its dependency on the tick and restart it if necessary.
*/
void tick_nohz_full_kick_cpu(int cpu)
{
if (!tick_nohz_full_cpu(cpu))
return;
irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}
/*
* Kick all full dynticks CPUs in order to force these to re-evaluate
* their dependency on the tick and restart it if necessary.
*/
static void tick_nohz_full_kick_all(void)
{
int cpu;
if (!tick_nohz_full_running)
return;
preempt_disable();
for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
tick_nohz_full_kick_cpu(cpu);
preempt_enable();
}
static void tick_nohz_dep_set_all(atomic_t *dep,
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
enum tick_dep_bits bit)
{
int prev;
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
prev = atomic_fetch_or(BIT(bit), dep);
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
if (!prev)
tick_nohz_full_kick_all();
}
/*
* Set a global tick dependency. Used by perf events that rely on freq and
* by unstable clock.
*/
void tick_nohz_dep_set(enum tick_dep_bits bit)
{
tick_nohz_dep_set_all(&tick_dep_mask, bit);
}
void tick_nohz_dep_clear(enum tick_dep_bits bit)
{
atomic_andnot(BIT(bit), &tick_dep_mask);
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
}
/*
* Set per-CPU tick dependency. Used by scheduler and perf events in order to
* manage events throttling.
*/
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
int prev;
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
struct tick_sched *ts;
ts = per_cpu_ptr(&tick_cpu_sched, cpu);
prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask);
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
if (!prev) {
preempt_disable();
/* Perf needs local kick that is NMI safe */
if (cpu == smp_processor_id()) {
tick_nohz_full_kick();
} else {
/* Remote irq work not NMI-safe */
if (!WARN_ON_ONCE(in_nmi()))
tick_nohz_full_kick_cpu(cpu);
}
preempt_enable();
}
}
void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
atomic_andnot(BIT(bit), &ts->tick_dep_mask);
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
}
/*
* Set a per-task tick dependency. Posix CPU timers need this in order to elapse
* per task timers.
*/
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
/*
* We could optimize this with just kicking the target running the task
* if that noise matters for nohz full users.
*/
tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
}
void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
}
/*
* Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
* per process timers.
*/
void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
}
void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
atomic_andnot(BIT(bit), &sig->tick_dep_mask);
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
}
/*
* Re-evaluate the need for the tick as we switch the current task.
* It might need the tick due to per task/process properties:
* perf events, posix CPU timers, ...
*/
void __tick_nohz_task_switch(void)
{
unsigned long flags;
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
struct tick_sched *ts;
local_irq_save(flags);
if (!tick_nohz_full_cpu(smp_processor_id()))
goto out;
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
ts = this_cpu_ptr(&tick_cpu_sched);
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
if (ts->tick_stopped) {
if (atomic_read(&current->tick_dep_mask) ||
atomic_read(&current->signal->tick_dep_mask))
nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner <tglx@linutronix.de> Suggested-by: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Viresh Kumar <viresh.kumar@linaro.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2015-06-07 07:54:30 -06:00
tick_nohz_full_kick();
}
out:
local_irq_restore(flags);
}
/* Get the boot-time nohz CPU list from the kernel parameters. */
void __init tick_nohz_full_setup(cpumask_var_t cpumask)
nohz: Basic full dynticks interface For extreme usecases such as Real Time or HPC, having the ability to shutdown the tick when a single task runs on a CPU is a desired feature: * Reducing the amount of interrupts improves throughput for CPU-bound tasks. The CPU is less distracted from its real job, from an execution time and from the cache point of views. * This also improve latency response as we have less critical sections. Start with introducing a very simple interface to define full dynticks CPU: use a boot time option defined cpumask through the "nohz_extended=" kernel parameter. CPUs that are part of this range will have their tick shutdown whenever possible: provided they run a single task and they don't do kernel activity that require the periodic tick. These details will be later documented in Documentation/* An online CPU must be kept outside this range to handle the timekeeping. Suggested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Christoph Lameter <cl@linux.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Gilad Ben Yossef <gilad@benyossef.com> Cc: Hakan Akkan <hakanakkan@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Kevin Hilman <khilman@linaro.org> Cc: Li Zhong <zhong@linux.vnet.ibm.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de>
2012-12-18 09:32:19 -07:00
{
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
cpumask_copy(tick_nohz_full_mask, cpumask);
tick_nohz_full_running = true;
nohz: Basic full dynticks interface For extreme usecases such as Real Time or HPC, having the ability to shutdown the tick when a single task runs on a CPU is a desired feature: * Reducing the amount of interrupts improves throughput for CPU-bound tasks. The CPU is less distracted from its real job, from an execution time and from the cache point of views. * This also improve latency response as we have less critical sections. Start with introducing a very simple interface to define full dynticks CPU: use a boot time option defined cpumask through the "nohz_extended=" kernel parameter. CPUs that are part of this range will have their tick shutdown whenever possible: provided they run a single task and they don't do kernel activity that require the periodic tick. These details will be later documented in Documentation/* An online CPU must be kept outside this range to handle the timekeeping. Suggested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Christoph Lameter <cl@linux.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Gilad Ben Yossef <gilad@benyossef.com> Cc: Hakan Akkan <hakanakkan@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Kevin Hilman <khilman@linaro.org> Cc: Li Zhong <zhong@linux.vnet.ibm.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de>
2012-12-18 09:32:19 -07:00
}
static int tick_nohz_cpu_down(unsigned int cpu)
{
/*
* The boot CPU handles housekeeping duty (unbound timers,
* workqueues, timekeeping, ...) on behalf of full dynticks
* CPUs. It must remain online when nohz full is enabled.
*/
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
return -EBUSY;
return 0;
}
void __init tick_nohz_init(void)
nohz: Basic full dynticks interface For extreme usecases such as Real Time or HPC, having the ability to shutdown the tick when a single task runs on a CPU is a desired feature: * Reducing the amount of interrupts improves throughput for CPU-bound tasks. The CPU is less distracted from its real job, from an execution time and from the cache point of views. * This also improve latency response as we have less critical sections. Start with introducing a very simple interface to define full dynticks CPU: use a boot time option defined cpumask through the "nohz_extended=" kernel parameter. CPUs that are part of this range will have their tick shutdown whenever possible: provided they run a single task and they don't do kernel activity that require the periodic tick. These details will be later documented in Documentation/* An online CPU must be kept outside this range to handle the timekeeping. Suggested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Christoph Lameter <cl@linux.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Gilad Ben Yossef <gilad@benyossef.com> Cc: Hakan Akkan <hakanakkan@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Kevin Hilman <khilman@linaro.org> Cc: Li Zhong <zhong@linux.vnet.ibm.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de>
2012-12-18 09:32:19 -07:00
{
int cpu, ret;
sched/isolation: Eliminate NO_HZ_FULL_ALL Commit 6f1982fedd59 ("sched/isolation: Handle the nohz_full= parameter") broke CONFIG_NO_HZ_FULL_ALL=y kernels. This breakage is due to the code under CONFIG_NO_HZ_FULL_ALL failing to invoke the shiny new housekeeping functions. This means that rcutorture scenario TREE04 now emits RCU CPU stall warnings due to the RCU grace-period kthreads not being awakened at a time of their choosing, or perhaps even not at all: [ 27.731422] rcu_bh kthread starved for 21001 jiffies! g18446744073709551369 c18446744073709551368 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x402 ->cpu=3 [ 27.731423] rcu_bh I14936 9 2 0x80080000 [ 27.731435] Call Trace: [ 27.731440] __schedule+0x31a/0x6d0 [ 27.731442] schedule+0x31/0x80 [ 27.731446] schedule_timeout+0x15a/0x320 [ 27.731453] ? call_timer_fn+0x130/0x130 [ 27.731457] rcu_gp_kthread+0x66c/0xea0 [ 27.731458] ? rcu_gp_kthread+0x66c/0xea0 Because no one has complained about CONFIG_NO_HZ_FULL_ALL=y being broken, I hypothesize that no one is in fact using it, other than rcutorture. This commit therefore eliminates CONFIG_NO_HZ_FULL_ALL and updates rcutorture's config files to instead use the nohz_full= kernel parameter to put the desired CPUs into nohz_full mode. Fixes: 6f1982fedd59 ("sched/isolation: Handle the nohz_full= parameter") Reported-by: kernel test robot <xiaolong.ye@intel.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Frederic Weisbecker <frederic@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Chris Metcalf <cmetcalf@mellanox.com> Cc: Christoph Lameter <cl@linux.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Wanpeng Li <kernellwp@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: John Stultz <john.stultz@linaro.org> Cc: Jonathan Corbet <corbet@lwn.net>
2017-11-30 16:36:35 -07:00
if (!tick_nohz_full_running)
return;
/*
* Full dynticks uses irq work to drive the tick rescheduling on safe
* locking contexts. But then we need irq work to raise its own
* interrupts to avoid circular dependency on the tick
*/
if (!arch_irq_work_has_interrupt()) {
pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
cpumask_clear(tick_nohz_full_mask);
tick_nohz_full_running = false;
return;
}
cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n",
cpu);
cpumask_clear_cpu(cpu, tick_nohz_full_mask);
}
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
"kernel/nohz:predown", NULL,
tick_nohz_cpu_down);
WARN_ON(ret < 0);
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
cpumask_pr_args(tick_nohz_full_mask));
nohz: Basic full dynticks interface For extreme usecases such as Real Time or HPC, having the ability to shutdown the tick when a single task runs on a CPU is a desired feature: * Reducing the amount of interrupts improves throughput for CPU-bound tasks. The CPU is less distracted from its real job, from an execution time and from the cache point of views. * This also improve latency response as we have less critical sections. Start with introducing a very simple interface to define full dynticks CPU: use a boot time option defined cpumask through the "nohz_extended=" kernel parameter. CPUs that are part of this range will have their tick shutdown whenever possible: provided they run a single task and they don't do kernel activity that require the periodic tick. These details will be later documented in Documentation/* An online CPU must be kept outside this range to handle the timekeeping. Suggested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Christoph Lameter <cl@linux.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Gilad Ben Yossef <gilad@benyossef.com> Cc: Hakan Akkan <hakanakkan@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Kevin Hilman <khilman@linaro.org> Cc: Li Zhong <zhong@linux.vnet.ibm.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de>
2012-12-18 09:32:19 -07:00
}
#endif
/*
* NOHZ - aka dynamic tick functionality
*/
nohz: Rename CONFIG_NO_HZ to CONFIG_NO_HZ_COMMON We are planning to convert the dynticks Kconfig options layout into a choice menu. The user must be able to easily pick any of the following implementations: constant periodic tick, idle dynticks, full dynticks. As this implies a mutual exclusion, the two dynticks implementions need to converge on the selection of a common Kconfig option in order to ease the sharing of a common infrastructure. It would thus seem pretty natural to reuse CONFIG_NO_HZ to that end. It already implements all the idle dynticks code and the full dynticks depends on all that code for now. So ideally the choice menu would propose CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED then both would select CONFIG_NO_HZ. On the other hand we want to stay backward compatible: if CONFIG_NO_HZ is set in an older config file, we want to enable CONFIG_NO_HZ_IDLE by default. But we can't afford both at the same time or we run into a circular dependency: 1) CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED both select CONFIG_NO_HZ 2) If CONFIG_NO_HZ is set, we default to CONFIG_NO_HZ_IDLE We might be able to support that from Kconfig/Kbuild but it may not be wise to introduce such a confusing behaviour. So to solve this, create a new CONFIG_NO_HZ_COMMON option which gathers the common code between idle and full dynticks (that common code for now is simply the idle dynticks code) and select it from their referring Kconfig. Then we'll later create CONFIG_NO_HZ_IDLE and map CONFIG_NO_HZ to it for backward compatibility. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Christoph Lameter <cl@linux.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Gilad Ben Yossef <gilad@benyossef.com> Cc: Hakan Akkan <hakanakkan@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Kevin Hilman <khilman@linaro.org> Cc: Li Zhong <zhong@linux.vnet.ibm.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de>
2011-08-10 15:21:01 -06:00
#ifdef CONFIG_NO_HZ_COMMON
/*
* NO HZ enabled ?
*/
bool tick_nohz_enabled __read_mostly = true;
timer: Reduce timer migration overhead if disabled Eric reported that the timer_migration sysctl is not really nice performance wise as it needs to check at every timer insertion whether the feature is enabled or not. Further the check does not live in the timer code, so we have an extra function call which checks an extra cache line to figure out that it is disabled. We can do better and store that information in the per cpu (hr)timer bases. I pondered to use a static key, but that's a nightmare to update from the nohz code and the timer base cache line is hot anyway when we select a timer base. The old logic enabled the timer migration unconditionally if CONFIG_NO_HZ was set even if nohz was disabled on the kernel command line. With this modification, we start off with migration disabled. The user visible sysctl is still set to enabled. If the kernel switches to NOHZ migration is enabled, if the user did not disable it via the sysctl prior to the switch. If nohz=off is on the kernel command line, migration stays disabled no matter what. Before: 47.76% hog [.] main 14.84% [kernel] [k] _raw_spin_lock_irqsave 9.55% [kernel] [k] _raw_spin_unlock_irqrestore 6.71% [kernel] [k] mod_timer 6.24% [kernel] [k] lock_timer_base.isra.38 3.76% [kernel] [k] detach_if_pending 3.71% [kernel] [k] del_timer 2.50% [kernel] [k] internal_add_timer 1.51% [kernel] [k] get_nohz_timer_target 1.28% [kernel] [k] __internal_add_timer 0.78% [kernel] [k] timerfn 0.48% [kernel] [k] wake_up_nohz_cpu After: 48.10% hog [.] main 15.25% [kernel] [k] _raw_spin_lock_irqsave 9.76% [kernel] [k] _raw_spin_unlock_irqrestore 6.50% [kernel] [k] mod_timer 6.44% [kernel] [k] lock_timer_base.isra.38 3.87% [kernel] [k] detach_if_pending 3.80% [kernel] [k] del_timer 2.67% [kernel] [k] internal_add_timer 1.33% [kernel] [k] __internal_add_timer 0.73% [kernel] [k] timerfn 0.54% [kernel] [k] wake_up_nohz_cpu Reported-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Viresh Kumar <viresh.kumar@linaro.org> Cc: John Stultz <john.stultz@linaro.org> Cc: Joonwoo Park <joonwoop@codeaurora.org> Cc: Wenbo Wang <wenbo.wang@memblaze.com> Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-05-26 16:50:33 -06:00
unsigned long tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
*/
static int __init setup_tick_nohz(char *str)
{
return (kstrtobool(str, &tick_nohz_enabled) == 0);
}
__setup("nohz=", setup_tick_nohz);
bool tick_nohz_tick_stopped(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
return ts->tick_stopped;
}
bool tick_nohz_tick_stopped_cpu(int cpu)
{
struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
return ts->tick_stopped;
}
/**
* tick_nohz_update_jiffies - update jiffies when idle was interrupted
*
* Called from interrupt entry when the CPU was idle
*
* In case the sched_tick was stopped on this CPU, we have to check if jiffies
* must be updated. Otherwise an interrupt handler could use a stale jiffy
* value. We do this unconditionally on any CPU, as we don't know whether the
* CPU, which has the update task assigned is in a long sleep.
*/
static void tick_nohz_update_jiffies(ktime_t now)
{
unsigned long flags;
__this_cpu_write(tick_cpu_sched.idle_waketime, now);
local_irq_save(flags);
tick_do_update_jiffies64(now);
local_irq_restore(flags);
touch_softlockup_watchdog_sched();
}
/*
* Updates the per-CPU time idle statistics counters
*/
static void
update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
{
ktime_t delta;
if (ts->idle_active) {
delta = ktime_sub(now, ts->idle_entrytime);
if (nr_iowait_cpu(cpu) > 0)
ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
else
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
ts->idle_entrytime = now;
}
if (last_update_time)
*last_update_time = ktime_to_us(now);
}
static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
update_ts_time_stats(smp_processor_id(), ts, now, NULL);
ts->idle_active = 0;
sched_clock_idle_wakeup_event();
}
static void tick_nohz_start_idle(struct tick_sched *ts)
{
ts->idle_entrytime = ktime_get();
ts->idle_active = 1;
sched_clock_idle_sleep_event();
}
/**
* get_cpu_idle_time_us - get the total idle time of a CPU
* @cpu: CPU number to query
* @last_update_time: variable to store update time in. Do not update
* counters if NULL.
*
* Return the cumulative idle time (since boot) for a given
* CPU, in microseconds.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
*
* This function returns -1 if NOHZ is not enabled.
*/
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t now, idle;
if (!tick_nohz_active)
return -1;
now = ktime_get();
if (last_update_time) {
update_ts_time_stats(cpu, ts, now, last_update_time);
idle = ts->idle_sleeptime;
} else {
if (ts->idle_active && !nr_iowait_cpu(cpu)) {
ktime_t delta = ktime_sub(now, ts->idle_entrytime);
idle = ktime_add(ts->idle_sleeptime, delta);
} else {
idle = ts->idle_sleeptime;
}
}
return ktime_to_us(idle);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
/**
* get_cpu_iowait_time_us - get the total iowait time of a CPU
* @cpu: CPU number to query
* @last_update_time: variable to store update time in. Do not update
* counters if NULL.
*
* Return the cumulative iowait time (since boot) for a given
* CPU, in microseconds.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
*
* This function returns -1 if NOHZ is not enabled.
*/
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t now, iowait;
if (!tick_nohz_active)
return -1;
now = ktime_get();
if (last_update_time) {
update_ts_time_stats(cpu, ts, now, last_update_time);
iowait = ts->iowait_sleeptime;
} else {
if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
ktime_t delta = ktime_sub(now, ts->idle_entrytime);
iowait = ktime_add(ts->iowait_sleeptime, delta);
} else {
iowait = ts->iowait_sleeptime;
}
}
return ktime_to_us(iowait);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
hrtimer_cancel(&ts->sched_timer);
hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
/* Forward the time to expire in the future */
hrtimer_forward(&ts->sched_timer, now, tick_period);
if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
else
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
nohz: Fix collision between tick and other hrtimers, again This restores commit: 24b91e360ef5: ("nohz: Fix collision between tick and other hrtimers") ... which got reverted by commit: 558e8e27e73f: ('Revert "nohz: Fix collision between tick and other hrtimers"') ... due to a regression where CPUs spuriously stopped ticking. The bug happened when a tick fired too early past its expected expiration: on IRQ exit the tick was scheduled again to the same deadline but skipped reprogramming because ts->next_tick still kept in cache the deadline. This has been fixed now with resetting ts->next_tick from the tick itself. Extra care has also been taken to prevent from obsolete values throughout CPU hotplug operations. When the tick is stopped and an interrupt occurs afterward, we check on that interrupt exit if the next tick needs to be rescheduled. If it doesn't need any update, we don't want to do anything. In order to check if the tick needs an update, we compare it against the clockevent device deadline. Now that's a problem because the clockevent device is at a lower level than the tick itself if it is implemented on top of hrtimer. Every hrtimer share this clockevent device. So comparing the next tick deadline against the clockevent device deadline is wrong because the device may be programmed for another hrtimer whose deadline collides with the tick. As a result we may end up not reprogramming the tick accidentally. In a worst case scenario under full dynticks mode, the tick stops firing as it is supposed to every 1hz, leaving /proc/stat stalled: Task in a full dynticks CPU ---------------------------- * hrtimer A is queued 2 seconds ahead * the tick is stopped, scheduled 1 second ahead * tick fires 1 second later * on tick exit, nohz schedules the tick 1 second ahead but sees the clockevent device is already programmed to that deadline, fooled by hrtimer A, the tick isn't rescheduled. * hrtimer A is cancelled before its deadline * tick never fires again until an interrupt happens... In order to fix this, store the next tick deadline to the tick_sched local structure and reuse that value later to check whether we need to reprogram the clock after an interrupt. On the other hand, ts->sleep_length still wants to know about the next clock event and not just the tick, so we want to improve the related comment to avoid confusion. Reported-and-tested-by: Tim Wright <tim@binbash.co.uk> Reported-and-tested-by: Pavel Machek <pavel@ucw.cz> Reported-by: James Hartsock <hartsjc@redhat.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Rik van Riel <riel@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1492783255-5051-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-21 08:00:54 -06:00
/*
* Reset to make sure next tick stop doesn't get fooled by past
* cached clock deadline.
*/
ts->next_tick = 0;
}
nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick() The conditions in irq_exit() to invoke tick_nohz_irq_exit() which subsequently invokes tick_nohz_stop_sched_tick() are: if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) If need_resched() is not set, but a timer softirq is pending then this is an indication that the softirq code punted and delegated the execution to softirqd. need_resched() is not true because the current interrupted task takes precedence over softirqd. Invoking tick_nohz_irq_exit() in this case can cause an endless loop of timer interrupts because the timer wheel contains an expired timer, but softirqs are not yet executed. So it returns an immediate expiry request, which causes the timer to fire immediately again. Lather, rinse and repeat.... Prevent that by adding a check for a pending timer soft interrupt to the conditions in tick_nohz_stop_sched_tick() which avoid calling get_next_timer_interrupt(). That keeps the tick sched timer on the tick and prevents a repetitive programming of an already expired timer. Reported-by: Sebastian Siewior <bigeasy@linutronix.d> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Cc: Anna-Maria Gleixner <anna-maria@linutronix.de> Cc: Sebastian Siewior <bigeasy@linutronix.de> Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272156050.2431@nanos
2017-12-22 07:51:13 -07:00
static inline bool local_timer_softirq_pending(void)
{
return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick() The conditions in irq_exit() to invoke tick_nohz_irq_exit() which subsequently invokes tick_nohz_stop_sched_tick() are: if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) If need_resched() is not set, but a timer softirq is pending then this is an indication that the softirq code punted and delegated the execution to softirqd. need_resched() is not true because the current interrupted task takes precedence over softirqd. Invoking tick_nohz_irq_exit() in this case can cause an endless loop of timer interrupts because the timer wheel contains an expired timer, but softirqs are not yet executed. So it returns an immediate expiry request, which causes the timer to fire immediately again. Lather, rinse and repeat.... Prevent that by adding a check for a pending timer soft interrupt to the conditions in tick_nohz_stop_sched_tick() which avoid calling get_next_timer_interrupt(). That keeps the tick sched timer on the tick and prevents a repetitive programming of an already expired timer. Reported-by: Sebastian Siewior <bigeasy@linutronix.d> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Cc: Anna-Maria Gleixner <anna-maria@linutronix.de> Cc: Sebastian Siewior <bigeasy@linutronix.de> Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272156050.2431@nanos
2017-12-22 07:51:13 -07:00
}
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
unsigned long seq, basejiff;
/* Read jiffies and the time when jiffies were updated last */
do {
seq = read_seqbegin(&jiffies_lock);
basemono = last_jiffies_update;
basejiff = jiffies;
} while (read_seqretry(&jiffies_lock, seq));
ts->last_jiffies = basejiff;
ts->timer_expires_base = basemono;
nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick() The conditions in irq_exit() to invoke tick_nohz_irq_exit() which subsequently invokes tick_nohz_stop_sched_tick() are: if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) If need_resched() is not set, but a timer softirq is pending then this is an indication that the softirq code punted and delegated the execution to softirqd. need_resched() is not true because the current interrupted task takes precedence over softirqd. Invoking tick_nohz_irq_exit() in this case can cause an endless loop of timer interrupts because the timer wheel contains an expired timer, but softirqs are not yet executed. So it returns an immediate expiry request, which causes the timer to fire immediately again. Lather, rinse and repeat.... Prevent that by adding a check for a pending timer soft interrupt to the conditions in tick_nohz_stop_sched_tick() which avoid calling get_next_timer_interrupt(). That keeps the tick sched timer on the tick and prevents a repetitive programming of an already expired timer. Reported-by: Sebastian Siewior <bigeasy@linutronix.d> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Cc: Anna-Maria Gleixner <anna-maria@linutronix.de> Cc: Sebastian Siewior <bigeasy@linutronix.de> Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1712272156050.2431@nanos
2017-12-22 07:51:13 -07:00
/*
* Keep the periodic tick, when RCU, architecture or irq_work
* requests it.
* Aside of that check whether the local timer softirq is
* pending. If so its a bad idea to call get_next_timer_interrupt()
* because there is an already expired timer, so it will request
* immeditate expiry, which rearms the hardware timer with a
* minimal delta which brings us back to this place
* immediately. Lather, rinse and repeat...
*/
if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
irq_work_needs_cpu() || local_timer_softirq_pending()) {
next_tick = basemono + TICK_NSEC;
} else {
/*
* Get the next pending timer. If high resolution
* timers are enabled this only takes the timer wheel
* timers into account. If high resolution timers are
* disabled this also looks at the next expiring
* hrtimer.
*/
next_tmr = get_next_timer_interrupt(basejiff, basemono);
ts->next_timer = next_tmr;
/* Take the next rcu event into account */
next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
}
/*
* If the tick is due in the next period, keep it ticking or
* force prod the timer.
*/
delta = next_tick - basemono;
if (delta <= (u64)TICK_NSEC) {
/*
* Tell the timer code that the base is not idle, i.e. undo
* the effect of get_next_timer_interrupt():
*/
timer_clear_idle();
/*
* We've not stopped the tick yet, and there's a timer in the
* next period, so no point in stopping it either, bail.
*/
nohz: Fix buggy tick delay on IRQ storms When the tick is stopped and we reach the dynticks evaluation code on IRQ exit, we perform a soft tick restart if we observe an expired timer from there. It means we program the nearest possible tick but we stay in dynticks mode (ts->tick_stopped = 1) because we may need to stop the tick again after that expired timer is handled. Now this solution works most of the time but if we suffer an IRQ storm and those interrupts trigger faster than the hardware clockevents min delay, our tick won't fire until that IRQ storm is finished. Here is the problem: on IRQ exit we reprog the timer to at least NOW() + min_clockevents_delay. Another IRQ fires before the tick so we reschedule again to NOW() + min_clockevents_delay, etc... The tick is eternally rescheduled min_clockevents_delay ahead. A solution is to simply remove this soft tick restart. After all the normal dynticks evaluation path can handle 0 delay just fine. And by doing that we benefit from the optimization branch which avoids clock reprogramming if the clockevents deadline hasn't changed since the last reprog. This fixes our issue because we don't do repetitive clock reprog that always add hardware min delay. As a side effect it should even optimize the 0 delay path in general. Reported-and-tested-by: Octavian Purdila <octavian.purdila@nxp.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/1496328429-13317-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-01 08:47:09 -06:00
if (!ts->tick_stopped) {
ts->timer_expires = 0;
goto out;
}
}
/*
* If this CPU is the one which had the do_timer() duty last, we limit
* the sleep time to the timekeeping max_deferment value.
* Otherwise we can sleep as long as we want.
*/
delta = timekeeping_max_deferment();
if (cpu != tick_do_timer_cpu &&
(tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last))
delta = KTIME_MAX;
/* Calculate the next expiry time */
if (delta < (KTIME_MAX - basemono))
expires = basemono + delta;
else
expires = KTIME_MAX;
ts->timer_expires = min_t(u64, expires, next_tick);
out:
return ts->timer_expires;
}
static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
u64 basemono = ts->timer_expires_base;
u64 expires = ts->timer_expires;
ktime_t tick = expires;
/* Make sure we won't be trying to stop it twice in a row. */
ts->timer_expires_base = 0;
/*
* If this CPU is the one which updates jiffies, then give up
* the assignment and let it be taken by the CPU which runs
* the tick timer next, which might be this CPU as well. If we
* don't drop this here the jiffies might be stale and
* do_timer() never invoked. Keep track of the fact that it
* was the one which had the do_timer() duty last.
*/
if (cpu == tick_do_timer_cpu) {
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
ts->do_timer_last = 1;
} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
ts->do_timer_last = 0;
}
/* Skip reprogram of event if its not changed */
nohz: Fix collision between tick and other hrtimers, again This restores commit: 24b91e360ef5: ("nohz: Fix collision between tick and other hrtimers") ... which got reverted by commit: 558e8e27e73f: ('Revert "nohz: Fix collision between tick and other hrtimers"') ... due to a regression where CPUs spuriously stopped ticking. The bug happened when a tick fired too early past its expected expiration: on IRQ exit the tick was scheduled again to the same deadline but skipped reprogramming because ts->next_tick still kept in cache the deadline. This has been fixed now with resetting ts->next_tick from the tick itself. Extra care has also been taken to prevent from obsolete values throughout CPU hotplug operations. When the tick is stopped and an interrupt occurs afterward, we check on that interrupt exit if the next tick needs to be rescheduled. If it doesn't need any update, we don't want to do anything. In order to check if the tick needs an update, we compare it against the clockevent device deadline. Now that's a problem because the clockevent device is at a lower level than the tick itself if it is implemented on top of hrtimer. Every hrtimer share this clockevent device. So comparing the next tick deadline against the clockevent device deadline is wrong because the device may be programmed for another hrtimer whose deadline collides with the tick. As a result we may end up not reprogramming the tick accidentally. In a worst case scenario under full dynticks mode, the tick stops firing as it is supposed to every 1hz, leaving /proc/stat stalled: Task in a full dynticks CPU ---------------------------- * hrtimer A is queued 2 seconds ahead * the tick is stopped, scheduled 1 second ahead * tick fires 1 second later * on tick exit, nohz schedules the tick 1 second ahead but sees the clockevent device is already programmed to that deadline, fooled by hrtimer A, the tick isn't rescheduled. * hrtimer A is cancelled before its deadline * tick never fires again until an interrupt happens... In order to fix this, store the next tick deadline to the tick_sched local structure and reuse that value later to check whether we need to reprogram the clock after an interrupt. On the other hand, ts->sleep_length still wants to know about the next clock event and not just the tick, so we want to improve the related comment to avoid confusion. Reported-and-tested-by: Tim Wright <tim@binbash.co.uk> Reported-and-tested-by: Pavel Machek <pavel@ucw.cz> Reported-by: James Hartsock <hartsjc@redhat.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Rik van Riel <riel@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1492783255-5051-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-21 08:00:54 -06:00
if (ts->tick_stopped && (expires == ts->next_tick)) {
/* Sanity check: make sure clockevent is actually programmed */
nohz: Fix spurious warning when hrtimer and clockevent get out of sync The sanity check ensuring that the tick expiry cache (ts->next_tick) is actually in sync with the hardware clock (dev->next_event) makes the wrong assumption that the clock can't be programmed later than the hrtimer deadline. In fact the clock hardware can be programmed later on some conditions such as: * The hrtimer deadline is already in the past. * The hrtimer deadline is earlier than the minimum delay supported by the hardware. Such conditions can be met when we program the tick, for example if the last jiffies update hasn't been seen by the current CPU yet, we may program the hrtimer to a deadline that is earlier than ktime_get() because last_jiffies_update is our timestamp base to compute the next tick. As a result, we can randomly observe such warning: WARNING: CPU: 5 PID: 0 at kernel/time/tick-sched.c:794 tick_nohz_stop_sched_tick kernel/time/tick-sched.c:791 [inline] Call Trace: tick_nohz_irq_exit tick_irq_exit irq_exit exiting_irq smp_call_function_interrupt smp_call_function_single_interrupt call_function_single_interrupt Therefore, let's rather make sure that the tick expiry cache is sync'ed with the tick hrtimer deadline, against which it is not supposed to drift away. The clock hardware instead has its own will and can't be used as a reliable comparison point. Reported-and-tested-by: Sasha Levin <alexander.levin@verizon.com> Reported-and-tested-by: Abdul Haleem <abdhalee@linux.vnet.ibm.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: James Hartsock <hartsjc@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tim Wright <tim@binbash.co.uk> Link: http://lkml.kernel.org/r/1497326654-14122-1-git-send-email-fweisbec@gmail.com [ Minor readability edit. ] Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-06-12 22:04:14 -06:00
if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
return;
nohz: Fix collision between tick and other hrtimers, again This restores commit: 24b91e360ef5: ("nohz: Fix collision between tick and other hrtimers") ... which got reverted by commit: 558e8e27e73f: ('Revert "nohz: Fix collision between tick and other hrtimers"') ... due to a regression where CPUs spuriously stopped ticking. The bug happened when a tick fired too early past its expected expiration: on IRQ exit the tick was scheduled again to the same deadline but skipped reprogramming because ts->next_tick still kept in cache the deadline. This has been fixed now with resetting ts->next_tick from the tick itself. Extra care has also been taken to prevent from obsolete values throughout CPU hotplug operations. When the tick is stopped and an interrupt occurs afterward, we check on that interrupt exit if the next tick needs to be rescheduled. If it doesn't need any update, we don't want to do anything. In order to check if the tick needs an update, we compare it against the clockevent device deadline. Now that's a problem because the clockevent device is at a lower level than the tick itself if it is implemented on top of hrtimer. Every hrtimer share this clockevent device. So comparing the next tick deadline against the clockevent device deadline is wrong because the device may be programmed for another hrtimer whose deadline collides with the tick. As a result we may end up not reprogramming the tick accidentally. In a worst case scenario under full dynticks mode, the tick stops firing as it is supposed to every 1hz, leaving /proc/stat stalled: Task in a full dynticks CPU ---------------------------- * hrtimer A is queued 2 seconds ahead * the tick is stopped, scheduled 1 second ahead * tick fires 1 second later * on tick exit, nohz schedules the tick 1 second ahead but sees the clockevent device is already programmed to that deadline, fooled by hrtimer A, the tick isn't rescheduled. * hrtimer A is cancelled before its deadline * tick never fires again until an interrupt happens... In order to fix this, store the next tick deadline to the tick_sched local structure and reuse that value later to check whether we need to reprogram the clock after an interrupt. On the other hand, ts->sleep_length still wants to know about the next clock event and not just the tick, so we want to improve the related comment to avoid confusion. Reported-and-tested-by: Tim Wright <tim@binbash.co.uk> Reported-and-tested-by: Pavel Machek <pavel@ucw.cz> Reported-by: James Hartsock <hartsjc@redhat.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Rik van Riel <riel@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1492783255-5051-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-21 08:00:54 -06:00
WARN_ON_ONCE(1);
printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
basemono, ts->next_tick, dev->next_event,
hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer));
}
/*
* nohz_stop_sched_tick can be called several times before
* the nohz_restart_sched_tick is called. This happens when
* interrupts arrive which do not cause a reschedule. In the
* first call we save the current tick time, so we can restart
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
calc_load_nohz_start();
sched/fair: Correctly handle nohz ticks CPU load accounting Ticks can happen while the CPU is in dynticks-idle or dynticks-singletask mode. In fact "nohz" or "dynticks" only mean that we exit the periodic mode and we try to minimize the ticks as much as possible. The nohz subsystem uses a confusing terminology with the internal state "ts->tick_stopped" which is also available through its public interface with tick_nohz_tick_stopped(). This is a misnomer as the tick is instead reduced with the best effort rather than stopped. In the best case the tick can indeed be actually stopped but there is no guarantee about that. If a timer needs to fire one second later, a tick will fire while the CPU is in nohz mode and this is a very common scenario. Now this confusion happens to be a problem with CPU load updates: cpu_load_update_active() doesn't handle nohz ticks correctly because it assumes that ticks are completely stopped in nohz mode and that cpu_load_update_active() can't be called in dynticks mode. When that happens, the whole previous tickless load is ignored and the function just records the load for the current tick, ignoring potentially long idle periods behind. In order to solve this, we could account the current load for the previous nohz time but there is a risk that we account the load of a task that got freshly enqueued for the whole nohz period. So instead, lets record the dynticks load on nohz frame entry so we know what to record in case of nohz ticks, then use this record to account the tickless load on nohz ticks and nohz frame end. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Byungchul Park <byungchul.park@lge.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/1460555812-25375-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-13 07:56:51 -06:00
cpu_load_update_nohz_start();
quiet_vmstat();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
trace_tick_stop(1, TICK_DEP_MASK_NONE);
}
nohz: Fix collision between tick and other hrtimers, again This restores commit: 24b91e360ef5: ("nohz: Fix collision between tick and other hrtimers") ... which got reverted by commit: 558e8e27e73f: ('Revert "nohz: Fix collision between tick and other hrtimers"') ... due to a regression where CPUs spuriously stopped ticking. The bug happened when a tick fired too early past its expected expiration: on IRQ exit the tick was scheduled again to the same deadline but skipped reprogramming because ts->next_tick still kept in cache the deadline. This has been fixed now with resetting ts->next_tick from the tick itself. Extra care has also been taken to prevent from obsolete values throughout CPU hotplug operations. When the tick is stopped and an interrupt occurs afterward, we check on that interrupt exit if the next tick needs to be rescheduled. If it doesn't need any update, we don't want to do anything. In order to check if the tick needs an update, we compare it against the clockevent device deadline. Now that's a problem because the clockevent device is at a lower level than the tick itself if it is implemented on top of hrtimer. Every hrtimer share this clockevent device. So comparing the next tick deadline against the clockevent device deadline is wrong because the device may be programmed for another hrtimer whose deadline collides with the tick. As a result we may end up not reprogramming the tick accidentally. In a worst case scenario under full dynticks mode, the tick stops firing as it is supposed to every 1hz, leaving /proc/stat stalled: Task in a full dynticks CPU ---------------------------- * hrtimer A is queued 2 seconds ahead * the tick is stopped, scheduled 1 second ahead * tick fires 1 second later * on tick exit, nohz schedules the tick 1 second ahead but sees the clockevent device is already programmed to that deadline, fooled by hrtimer A, the tick isn't rescheduled. * hrtimer A is cancelled before its deadline * tick never fires again until an interrupt happens... In order to fix this, store the next tick deadline to the tick_sched local structure and reuse that value later to check whether we need to reprogram the clock after an interrupt. On the other hand, ts->sleep_length still wants to know about the next clock event and not just the tick, so we want to improve the related comment to avoid confusion. Reported-and-tested-by: Tim Wright <tim@binbash.co.uk> Reported-and-tested-by: Pavel Machek <pavel@ucw.cz> Reported-by: James Hartsock <hartsjc@redhat.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Rik van Riel <riel@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1492783255-5051-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-21 08:00:54 -06:00
ts->next_tick = tick;
/*
* If the expiration time == KTIME_MAX, then we simply stop
* the tick timer.
*/
if (unlikely(expires == KTIME_MAX)) {
if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
hrtimer_cancel(&ts->sched_timer);
return;
}
tick/sched: Do not mess with an enqueued hrtimer Kaike reported that in tests rdma hrtimers occasionaly stopped working. He did great debugging, which provided enough context to decode the problem. CPU 3 CPU 2 idle start sched_timer expires = 712171000000 queue->next = sched_timer start rdmavt timer. expires = 712172915662 lock(baseof(CPU3)) tick_nohz_stop_tick() tick = 716767000000 timerqueue_add(tmr) hrtimer_set_expires(sched_timer, tick); sched_timer->expires = 716767000000 <---- FAIL if (tmr->expires < queue->next->expires) hrtimer_start(sched_timer) queue->next = tmr; lock(baseof(CPU3)) unlock(baseof(CPU3)) timerqueue_remove() timerqueue_add() ts->sched_timer is queued and queue->next is pointing to it, but then ts->sched_timer.expires is modified. This not only corrupts the ordering of the timerqueue RB tree, it also makes CPU2 see the new expiry time of timerqueue->next->expires when checking whether timerqueue->next needs to be updated. So CPU2 sees that the rdma timer is earlier than timerqueue->next and sets the rdma timer as new next. Depending on whether it had also seen the new time at RB tree enqueue, it might have queued the rdma timer at the wrong place and then after removing the sched_timer the RB tree is completely hosed. The problem was introduced with a commit which tried to solve inconsistency between the hrtimer in the tick_sched data and the underlying hardware clockevent. It split out hrtimer_set_expires() to store the new tick time in both the NOHZ and the NOHZ + HIGHRES case, but missed the fact that in the NOHZ + HIGHRES case the hrtimer might still be queued. Use hrtimer_start(timer, tick...) for the NOHZ + HIGHRES case which sets timer->expires after canceling the timer and move the hrtimer_set_expires() invocation into the NOHZ only code path which is not affected as it merily uses the hrtimer as next event storage so code pathes can be shared with the NOHZ + HIGHRES case. Fixes: d4af6d933ccf ("nohz: Fix spurious warning when hrtimer and clockevent get out of sync") Reported-by: "Wan Kaike" <kaike.wan@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Frederic Weisbecker <frederic@kernel.org> Cc: "Marciniszyn Mike" <mike.marciniszyn@intel.com> Cc: Anna-Maria Gleixner <anna-maria@linutronix.de> Cc: linux-rdma@vger.kernel.org Cc: "Dalessandro Dennis" <dennis.dalessandro@intel.com> Cc: "Fleck John" <john.fleck@intel.com> Cc: stable@vger.kernel.org Cc: Peter Zijlstra <peterz@infradead.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: "Weiny Ira" <ira.weiny@intel.com> Cc: "linux-rdma@vger.kernel.org" Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1804241637390.1679@nanos.tec.linutronix.de Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1804242119210.1597@nanos.tec.linutronix.de
2018-04-24 13:22:18 -06:00
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
} else {
hrtimer_set_expires(&ts->sched_timer, tick);
tick_program_event(tick, 1);
tick/sched: Do not mess with an enqueued hrtimer Kaike reported that in tests rdma hrtimers occasionaly stopped working. He did great debugging, which provided enough context to decode the problem. CPU 3 CPU 2 idle start sched_timer expires = 712171000000 queue->next = sched_timer start rdmavt timer. expires = 712172915662 lock(baseof(CPU3)) tick_nohz_stop_tick() tick = 716767000000 timerqueue_add(tmr) hrtimer_set_expires(sched_timer, tick); sched_timer->expires = 716767000000 <---- FAIL if (tmr->expires < queue->next->expires) hrtimer_start(sched_timer) queue->next = tmr; lock(baseof(CPU3)) unlock(baseof(CPU3)) timerqueue_remove() timerqueue_add() ts->sched_timer is queued and queue->next is pointing to it, but then ts->sched_timer.expires is modified. This not only corrupts the ordering of the timerqueue RB tree, it also makes CPU2 see the new expiry time of timerqueue->next->expires when checking whether timerqueue->next needs to be updated. So CPU2 sees that the rdma timer is earlier than timerqueue->next and sets the rdma timer as new next. Depending on whether it had also seen the new time at RB tree enqueue, it might have queued the rdma timer at the wrong place and then after removing the sched_timer the RB tree is completely hosed. The problem was introduced with a commit which tried to solve inconsistency between the hrtimer in the tick_sched data and the underlying hardware clockevent. It split out hrtimer_set_expires() to store the new tick time in both the NOHZ and the NOHZ + HIGHRES case, but missed the fact that in the NOHZ + HIGHRES case the hrtimer might still be queued. Use hrtimer_start(timer, tick...) for the NOHZ + HIGHRES case which sets timer->expires after canceling the timer and move the hrtimer_set_expires() invocation into the NOHZ only code path which is not affected as it merily uses the hrtimer as next event storage so code pathes can be shared with the NOHZ + HIGHRES case. Fixes: d4af6d933ccf ("nohz: Fix spurious warning when hrtimer and clockevent get out of sync") Reported-by: "Wan Kaike" <kaike.wan@intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Frederic Weisbecker <frederic@kernel.org> Cc: "Marciniszyn Mike" <mike.marciniszyn@intel.com> Cc: Anna-Maria Gleixner <anna-maria@linutronix.de> Cc: linux-rdma@vger.kernel.org Cc: "Dalessandro Dennis" <dennis.dalessandro@intel.com> Cc: "Fleck John" <john.fleck@intel.com> Cc: stable@vger.kernel.org Cc: Peter Zijlstra <peterz@infradead.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: "Weiny Ira" <ira.weiny@intel.com> Cc: "linux-rdma@vger.kernel.org" Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1804241637390.1679@nanos.tec.linutronix.de Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1804242119210.1597@nanos.tec.linutronix.de
2018-04-24 13:22:18 -06:00
}
nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-07 10:22:06 -06:00
}
static void tick_nohz_retain_tick(struct tick_sched *ts)
{
ts->timer_expires_base = 0;
}
#ifdef CONFIG_NO_HZ_FULL
static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu)
{
if (tick_nohz_next_event(ts, cpu))
tick_nohz_stop_tick(ts, cpu);
else
tick_nohz_retain_tick(ts);
}
#endif /* CONFIG_NO_HZ_FULL */
sched/fair: Correctly handle nohz ticks CPU load accounting Ticks can happen while the CPU is in dynticks-idle or dynticks-singletask mode. In fact "nohz" or "dynticks" only mean that we exit the periodic mode and we try to minimize the ticks as much as possible. The nohz subsystem uses a confusing terminology with the internal state "ts->tick_stopped" which is also available through its public interface with tick_nohz_tick_stopped(). This is a misnomer as the tick is instead reduced with the best effort rather than stopped. In the best case the tick can indeed be actually stopped but there is no guarantee about that. If a timer needs to fire one second later, a tick will fire while the CPU is in nohz mode and this is a very common scenario. Now this confusion happens to be a problem with CPU load updates: cpu_load_update_active() doesn't handle nohz ticks correctly because it assumes that ticks are completely stopped in nohz mode and that cpu_load_update_active() can't be called in dynticks mode. When that happens, the whole previous tickless load is ignored and the function just records the load for the current tick, ignoring potentially long idle periods behind. In order to solve this, we could account the current load for the previous nohz time but there is a risk that we account the load of a task that got freshly enqueued for the whole nohz period. So instead, lets record the dynticks load on nohz frame entry so we know what to record in case of nohz ticks, then use this record to account the tickless load on nohz ticks and nohz frame end. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Byungchul Park <byungchul.park@lge.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/1460555812-25375-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-13 07:56:51 -06:00
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
/* Update jiffies first */
tick_do_update_jiffies64(now);
sched/fair: Correctly handle nohz ticks CPU load accounting Ticks can happen while the CPU is in dynticks-idle or dynticks-singletask mode. In fact "nohz" or "dynticks" only mean that we exit the periodic mode and we try to minimize the ticks as much as possible. The nohz subsystem uses a confusing terminology with the internal state "ts->tick_stopped" which is also available through its public interface with tick_nohz_tick_stopped(). This is a misnomer as the tick is instead reduced with the best effort rather than stopped. In the best case the tick can indeed be actually stopped but there is no guarantee about that. If a timer needs to fire one second later, a tick will fire while the CPU is in nohz mode and this is a very common scenario. Now this confusion happens to be a problem with CPU load updates: cpu_load_update_active() doesn't handle nohz ticks correctly because it assumes that ticks are completely stopped in nohz mode and that cpu_load_update_active() can't be called in dynticks mode. When that happens, the whole previous tickless load is ignored and the function just records the load for the current tick, ignoring potentially long idle periods behind. In order to solve this, we could account the current load for the previous nohz time but there is a risk that we account the load of a task that got freshly enqueued for the whole nohz period. So instead, lets record the dynticks load on nohz frame entry so we know what to record in case of nohz ticks, then use this record to account the tickless load on nohz ticks and nohz frame end. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Byungchul Park <byungchul.park@lge.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/1460555812-25375-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-13 07:56:51 -06:00
cpu_load_update_nohz_stop();
/*
* Clear the timer idle flag, so we avoid IPIs on remote queueing and
* the clock forward checks in the enqueue path:
*/
timer_clear_idle();
calc_load_nohz_stop();
touch_softlockup_watchdog_sched();
/*
* Cancel the scheduled timer and restore the tick
*/
ts->tick_stopped = 0;
ts->idle_exittime = now;
tick_nohz_restart(ts, now);
}
static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
#ifdef CONFIG_NO_HZ_FULL
int cpu = smp_processor_id();
if (!tick_nohz_full_cpu(cpu))
return;
if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
return;
if (can_stop_full_tick(cpu, ts))
tick_nohz_stop_sched_tick(ts, cpu);
else if (ts->tick_stopped)
sched/fair: Correctly handle nohz ticks CPU load accounting Ticks can happen while the CPU is in dynticks-idle or dynticks-singletask mode. In fact "nohz" or "dynticks" only mean that we exit the periodic mode and we try to minimize the ticks as much as possible. The nohz subsystem uses a confusing terminology with the internal state "ts->tick_stopped" which is also available through its public interface with tick_nohz_tick_stopped(). This is a misnomer as the tick is instead reduced with the best effort rather than stopped. In the best case the tick can indeed be actually stopped but there is no guarantee about that. If a timer needs to fire one second later, a tick will fire while the CPU is in nohz mode and this is a very common scenario. Now this confusion happens to be a problem with CPU load updates: cpu_load_update_active() doesn't handle nohz ticks correctly because it assumes that ticks are completely stopped in nohz mode and that cpu_load_update_active() can't be called in dynticks mode. When that happens, the whole previous tickless load is ignored and the function just records the load for the current tick, ignoring potentially long idle periods behind. In order to solve this, we could account the current load for the previous nohz time but there is a risk that we account the load of a task that got freshly enqueued for the whole nohz period. So instead, lets record the dynticks load on nohz frame entry so we know what to record in case of nohz ticks, then use this record to account the tickless load on nohz ticks and nohz frame end. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Byungchul Park <byungchul.park@lge.com> Cc: Chris Metcalf <cmetcalf@ezchip.com> Cc: Christoph Lameter <cl@linux.com> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/1460555812-25375-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-13 07:56:51 -06:00
tick_nohz_restart_sched_tick(ts, ktime_get());
#endif
}
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
/*
* If this CPU is offline and it is the one which updates
* jiffies, then give up the assignment and let it be taken by
* the CPU which runs the tick timer next. If we don't drop
* this here the jiffies might be stale and do_timer() never
* invoked.
*/
if (unlikely(!cpu_online(cpu))) {
if (cpu == tick_do_timer_cpu)
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
nohz: Fix collision between tick and other hrtimers, again This restores commit: 24b91e360ef5: ("nohz: Fix collision between tick and other hrtimers") ... which got reverted by commit: 558e8e27e73f: ('Revert "nohz: Fix collision between tick and other hrtimers"') ... due to a regression where CPUs spuriously stopped ticking. The bug happened when a tick fired too early past its expected expiration: on IRQ exit the tick was scheduled again to the same deadline but skipped reprogramming because ts->next_tick still kept in cache the deadline. This has been fixed now with resetting ts->next_tick from the tick itself. Extra care has also been taken to prevent from obsolete values throughout CPU hotplug operations. When the tick is stopped and an interrupt occurs afterward, we check on that interrupt exit if the next tick needs to be rescheduled. If it doesn't need any update, we don't want to do anything. In order to check if the tick needs an update, we compare it against the clockevent device deadline. Now that's a problem because the clockevent device is at a lower level than the tick itself if it is implemented on top of hrtimer. Every hrtimer share this clockevent device. So comparing the next tick deadline against the clockevent device deadline is wrong because the device may be programmed for another hrtimer whose deadline collides with the tick. As a result we may end up not reprogramming the tick accidentally. In a worst case scenario under full dynticks mode, the tick stops firing as it is supposed to every 1hz, leaving /proc/stat stalled: Task in a full dynticks CPU ---------------------------- * hrtimer A is queued 2 seconds ahead * the tick is stopped, scheduled 1 second ahead * tick fires 1 second later * on tick exit, nohz schedules the tick 1 second ahead but sees the clockevent device is already programmed to that deadline, fooled by hrtimer A, the tick isn't rescheduled. * hrtimer A is cancelled before its deadline * tick never fires again until an interrupt happens... In order to fix this, store the next tick deadline to the tick_sched local structure and reuse that value later to check whether we need to reprogram the clock after an interrupt. On the other hand, ts->sleep_length still wants to know about the next clock event and not just the tick, so we want to improve the related comment to avoid confusion. Reported-and-tested-by: Tim Wright <tim@binbash.co.uk> Reported-and-tested-by: Pavel Machek <pavel@ucw.cz> Reported-by: James Hartsock <hartsjc@redhat.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Rik van Riel <riel@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1492783255-5051-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-04-21 08:00:54 -06:00
/*
* Make sure the CPU doesn't get fooled by obsolete tick
* deadline if it comes back online later.
*/
ts->next_tick = 0;
return false;
}
if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
return false;
if (need_resched())
return false;
if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
static int ratelimit;
if (ratelimit < 10 &&
(local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
pr_warn("NOHZ: local_softirq_pending %02x\n",
(unsigned int) local_softirq_pending());
ratelimit++;
}
return false;
}
if (tick_nohz_full_enabled()) {
/*
* Keep the tick alive to guarantee timekeeping progression
* if there are full dynticks CPUs around
*/
if (tick_do_timer_cpu == cpu)
return false;
/*
* Boot safety: make sure the timekeeping duty has been
* assigned before entering dyntick-idle mode,
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
return false;
}
return true;
}
static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
{
ktime_t expires;
int cpu = smp_processor_id();
#ifdef CONFIG_SMP
if (check_pending_deferrable_timers(cpu))
raise_softirq_irqoff(TIMER_SOFTIRQ);
#endif
sched: idle: Select idle state before stopping the tick In order to address the issue with short idle duration predictions by the idle governor after the scheduler tick has been stopped, reorder the code in cpuidle_idle_call() so that the governor idle state selection runs before tick_nohz_idle_go_idle() and use the "nohz" hint returned by cpuidle_select() to decide whether or not to stop the tick. This isn't straightforward, because menu_select() invokes tick_nohz_get_sleep_length() to get the time to the next timer event and the number returned by the latter comes from __tick_nohz_idle_stop_tick(). Fortunately, however, it is possible to compute that number without actually stopping the tick and with the help of the existing code. Namely, tick_nohz_get_sleep_length() can be made call tick_nohz_next_event(), introduced earlier, to get the time to the next non-highres timer event. If that happens, tick_nohz_next_event() need not be called by __tick_nohz_idle_stop_tick() again. If it turns out that the scheduler tick cannot be stopped going forward or the next timer event is too close for the tick to be stopped, tick_nohz_get_sleep_length() can simply return the time to the next event currently programmed into the corresponding clock event device. In addition to knowing the return value of tick_nohz_next_event(), however, tick_nohz_get_sleep_length() needs to know the time to the next highres timer event, but with the scheduler tick timer excluded, which can be computed with the help of hrtimer_get_next_event(). That minimum of that number and the tick_nohz_next_event() return value is the total time to the next timer event with the assumption that the tick will be stopped. It can be returned to the idle governor which can use it for predicting idle duration (under the assumption that the tick will be stopped) and deciding whether or not it makes sense to stop the tick before putting the CPU into the selected idle state. With the above, the sleep_length field in struct tick_sched is not necessary any more, so drop it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=199227 Reported-by: Doug Smythies <dsmythies@telus.net> Reported-by: Thomas Ilsche <thomas.ilsche@tu-dresden.de> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
2018-04-03 15:17:11 -06:00
/*
* If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
* tick timer expiration time is known already.
*/
if (ts->timer_expires_base)
expires = ts->timer_expires;
else if (can_stop_idle_tick(cpu, ts))
expires = tick_nohz_next_event(ts, cpu);
else
return;
ts->idle_calls++;
tick/nohz: Fix softlockup on scheduler stalls in kvm guest tick_nohz_start_idle() is prevented to be called if the idle tick can't be stopped since commit 1f3b0f8243cb934 ("tick/nohz: Optimize nohz idle enter"). As a result, after suspend/resume the host machine, full dynticks kvm guest will softlockup: NMI watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [swapper/0:0] Call Trace: default_idle+0x31/0x1a0 arch_cpu_idle+0xf/0x20 default_idle_call+0x2a/0x50 cpu_startup_entry+0x39b/0x4d0 rest_init+0x138/0x140 ? rest_init+0x5/0x140 start_kernel+0x4c1/0x4ce ? set_init_arg+0x55/0x55 ? early_idt_handler_array+0x120/0x120 x86_64_start_reservations+0x24/0x26 x86_64_start_kernel+0x142/0x14f In addition, cat /proc/stat | grep cpu in guest or host: cpu 398 16 5049 15754 5490 0 1 46 0 0 cpu0 206 5 450 0 0 0 1 14 0 0 cpu1 81 0 3937 3149 1514 0 0 9 0 0 cpu2 45 6 332 6052 2243 0 0 11 0 0 cpu3 65 2 328 6552 1732 0 0 11 0 0 The idle and iowait states are weird 0 for cpu0(housekeeping). The bug is present in both guest and host kernels, and they both have cpu0's idle and iowait states issue, however, host kernel's suspend/resume path etc will touch watchdog to avoid the softlockup. - The watchdog will not be touched in tick_nohz_stop_idle path (need be touched since the scheduler stall is expected) if idle_active flags are not detected. - The idle and iowait states will not be accounted when exit idle loop (resched or interrupt) if idle start time and idle_active flags are not set. This patch fixes it by reverting commit 1f3b0f8243cb934 since can't stop idle tick doesn't mean can't be idle. Fixes: 1f3b0f8243cb934 ("tick/nohz: Optimize nohz idle enter") Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com> Cc: Sanjeev Yadav<sanjeev.yadav@spreadtrum.com> Cc: Gaurav Jindal<gaurav.jindal@spreadtrum.com> Cc: stable@vger.kernel.org Cc: kvm@vger.kernel.org Cc: Radim Krčmář <rkrcmar@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paolo Bonzini <pbonzini@redhat.com> Link: http://lkml.kernel.org/r/1472798303-4154-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-09-02 00:38:23 -06:00
if (expires > 0LL) {
int was_stopped = ts->tick_stopped;
tick_nohz_stop_tick(ts, cpu);
ts->idle_sleeps++;
ts->idle_expires = expires;
if (!was_stopped && ts->tick_stopped) {
ts->idle_jiffies = ts->last_jiffies;
nohz_balance_enter_idle(cpu);
}
} else {
tick_nohz_retain_tick(ts);
}
nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-07 10:22:06 -06:00
}
/**
* tick_nohz_idle_stop_tick - stop the idle tick from the idle task
nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-07 10:22:06 -06:00
*
* When the next event is more than a tick into the future, stop the idle tick
*/
void tick_nohz_idle_stop_tick(void)
{
__tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
}
sched: idle: Select idle state before stopping the tick In order to address the issue with short idle duration predictions by the idle governor after the scheduler tick has been stopped, reorder the code in cpuidle_idle_call() so that the governor idle state selection runs before tick_nohz_idle_go_idle() and use the "nohz" hint returned by cpuidle_select() to decide whether or not to stop the tick. This isn't straightforward, because menu_select() invokes tick_nohz_get_sleep_length() to get the time to the next timer event and the number returned by the latter comes from __tick_nohz_idle_stop_tick(). Fortunately, however, it is possible to compute that number without actually stopping the tick and with the help of the existing code. Namely, tick_nohz_get_sleep_length() can be made call tick_nohz_next_event(), introduced earlier, to get the time to the next non-highres timer event. If that happens, tick_nohz_next_event() need not be called by __tick_nohz_idle_stop_tick() again. If it turns out that the scheduler tick cannot be stopped going forward or the next timer event is too close for the tick to be stopped, tick_nohz_get_sleep_length() can simply return the time to the next event currently programmed into the corresponding clock event device. In addition to knowing the return value of tick_nohz_next_event(), however, tick_nohz_get_sleep_length() needs to know the time to the next highres timer event, but with the scheduler tick timer excluded, which can be computed with the help of hrtimer_get_next_event(). That minimum of that number and the tick_nohz_next_event() return value is the total time to the next timer event with the assumption that the tick will be stopped. It can be returned to the idle governor which can use it for predicting idle duration (under the assumption that the tick will be stopped) and deciding whether or not it makes sense to stop the tick before putting the CPU into the selected idle state. With the above, the sleep_length field in struct tick_sched is not necessary any more, so drop it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=199227 Reported-by: Doug Smythies <dsmythies@telus.net> Reported-by: Thomas Ilsche <thomas.ilsche@tu-dresden.de> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
2018-04-03 15:17:11 -06:00
void tick_nohz_idle_retain_tick(void)
{
tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
/*
* Undo the effect of get_next_timer_interrupt() called from
* tick_nohz_next_event().
*/
timer_clear_idle();
}
/**
* tick_nohz_idle_enter - prepare for entering idle on the current CPU
nohz: Allow rcu extended quiescent state handling seperately from tick stop It is assumed that rcu won't be used once we switch to tickless mode and until we restart the tick. However this is not always true, as in x86-64 where we dereference the idle notifiers after the tick is stopped. To prepare for fixing this, add two new APIs: tick_nohz_idle_enter_norcu() and tick_nohz_idle_exit_norcu(). If no use of RCU is made in the idle loop between tick_nohz_enter_idle() and tick_nohz_exit_idle() calls, the arch must instead call the new *_norcu() version such that the arch doesn't need to call rcu_idle_enter() and rcu_idle_exit(). Otherwise the arch must call tick_nohz_enter_idle() and tick_nohz_exit_idle() and also call explicitly: - rcu_idle_enter() after its last use of RCU before the CPU is put to sleep. - rcu_idle_exit() before the first use of RCU after the CPU is woken up. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2011-10-08 08:01:00 -06:00
*
* Called when we start the idle loop.
nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-07 10:22:06 -06:00
*/
void tick_nohz_idle_enter(void)
nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-07 10:22:06 -06:00
{
struct tick_sched *ts;
lockdep_assert_irqs_enabled();
local_irq_disable();
ts = this_cpu_ptr(&tick_cpu_sched);
WARN_ON_ONCE(ts->timer_expires_base);
nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-07 10:22:06 -06:00
ts->inidle = 1;
tick_nohz_start_idle(ts);
local_irq_enable();
nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-07 10:22:06 -06:00
}
/**
* tick_nohz_irq_exit - update next tick event from interrupt exit
*
* When an interrupt fires while we are idle and it doesn't cause
* a reschedule, it may still add, modify or delete a timer, enqueue
* an RCU callback, etc...
* So we need to re-calculate and reprogram the next tick event.
*/
void tick_nohz_irq_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-07 10:22:06 -06:00
Revert "cpuidle: Quickly notice prediction failure for repeat mode" Revert commit 69a37bea (cpuidle: Quickly notice prediction failure for repeat mode), because it has been identified as the source of a significant performance regression in v3.8 and later as explained by Jeremy Eder: We believe we've identified a particular commit to the cpuidle code that seems to be impacting performance of variety of workloads. The simplest way to reproduce is using netperf TCP_RR test, so we're using that, on a pair of Sandy Bridge based servers. We also have data from a large database setup where performance is also measurably/positively impacted, though that test data isn't easily share-able. Included below are test results from 3 test kernels: kernel reverts ----------------------------------------------------------- 1) vanilla upstream (no reverts) 2) perfteam2 reverts e11538d1f03914eb92af5a1a378375c05ae8520c 3) test reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 e11538d1f03914eb92af5a1a378375c05ae8520c In summary, netperf TCP_RR numbers improve by approximately 4% after reverting 69a37beabf1f0a6705c08e879bdd5d82ff6486c4. When 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 is included, C0 residency never seems to get above 40%. Taking that patch out gets C0 near 100% quite often, and performance increases. The below data are histograms representing the %c0 residency @ 1-second sample rates (using turbostat), while under netperf test. - If you look at the first 4 histograms, you can see %c0 residency almost entirely in the 30,40% bin. - The last pair, which reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4, shows %c0 in the 80,90,100% bins. Below each kernel name are netperf TCP_RR trans/s numbers for the particular kernel that can be disclosed publicly, comparing the 3 test kernels. We ran a 4th test with the vanilla kernel where we've also set /dev/cpu_dma_latency=0 to show overall impact boosting single-threaded TCP_RR performance over 11% above baseline. 3.10-rc2 vanilla RX + c0 lock (/dev/cpu_dma_latency=0): TCP_RR trans/s 54323.78 ----------------------------------------------------------- 3.10-rc2 vanilla RX (no reverts) TCP_RR trans/s 48192.47 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 59]: *********************************************************** 40.0000 - 50.0000 [ 1]: * 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: Sender %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 11]: *********** 40.0000 - 50.0000 [ 49]: ************************************************* 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: ----------------------------------------------------------- 3.10-rc2 perfteam2 RX (reverts commit e11538d1f03914eb92af5a1a378375c05ae8520c) TCP_RR trans/s 49698.69 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 1]: * 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 59]: *********************************************************** 40.0000 - 50.0000 [ 0]: 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: Sender %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 2]: ** 40.0000 - 50.0000 [ 58]: ********************************************************** 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: ----------------------------------------------------------- 3.10-rc2 test RX (reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 and e11538d1f03914eb92af5a1a378375c05ae8520c) TCP_RR trans/s 47766.95 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 1]: * 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 27]: *************************** 40.0000 - 50.0000 [ 2]: ** 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 2]: ** 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 28]: **************************** Sender: 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 11]: *********** 40.0000 - 50.0000 [ 0]: 50.0000 - 60.0000 [ 1]: * 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 3]: *** 80.0000 - 90.0000 [ 7]: ******* 90.0000 - 100.0000 [ 38]: ************************************** These results demonstrate gaining back the tendency of the CPU to stay in more responsive, performant C-states (and thus yield measurably better performance), by reverting commit 69a37beabf1f0a6705c08e879bdd5d82ff6486c4. Requested-by: Jeremy Eder <jeder@redhat.com> Tested-by: Len Brown <len.brown@intel.com> Cc: 3.8+ <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-07-26 17:41:34 -06:00
if (ts->inidle)
tick_nohz_start_idle(ts);
Revert "cpuidle: Quickly notice prediction failure for repeat mode" Revert commit 69a37bea (cpuidle: Quickly notice prediction failure for repeat mode), because it has been identified as the source of a significant performance regression in v3.8 and later as explained by Jeremy Eder: We believe we've identified a particular commit to the cpuidle code that seems to be impacting performance of variety of workloads. The simplest way to reproduce is using netperf TCP_RR test, so we're using that, on a pair of Sandy Bridge based servers. We also have data from a large database setup where performance is also measurably/positively impacted, though that test data isn't easily share-able. Included below are test results from 3 test kernels: kernel reverts ----------------------------------------------------------- 1) vanilla upstream (no reverts) 2) perfteam2 reverts e11538d1f03914eb92af5a1a378375c05ae8520c 3) test reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 e11538d1f03914eb92af5a1a378375c05ae8520c In summary, netperf TCP_RR numbers improve by approximately 4% after reverting 69a37beabf1f0a6705c08e879bdd5d82ff6486c4. When 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 is included, C0 residency never seems to get above 40%. Taking that patch out gets C0 near 100% quite often, and performance increases. The below data are histograms representing the %c0 residency @ 1-second sample rates (using turbostat), while under netperf test. - If you look at the first 4 histograms, you can see %c0 residency almost entirely in the 30,40% bin. - The last pair, which reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4, shows %c0 in the 80,90,100% bins. Below each kernel name are netperf TCP_RR trans/s numbers for the particular kernel that can be disclosed publicly, comparing the 3 test kernels. We ran a 4th test with the vanilla kernel where we've also set /dev/cpu_dma_latency=0 to show overall impact boosting single-threaded TCP_RR performance over 11% above baseline. 3.10-rc2 vanilla RX + c0 lock (/dev/cpu_dma_latency=0): TCP_RR trans/s 54323.78 ----------------------------------------------------------- 3.10-rc2 vanilla RX (no reverts) TCP_RR trans/s 48192.47 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 59]: *********************************************************** 40.0000 - 50.0000 [ 1]: * 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: Sender %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 11]: *********** 40.0000 - 50.0000 [ 49]: ************************************************* 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: ----------------------------------------------------------- 3.10-rc2 perfteam2 RX (reverts commit e11538d1f03914eb92af5a1a378375c05ae8520c) TCP_RR trans/s 49698.69 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 1]: * 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 59]: *********************************************************** 40.0000 - 50.0000 [ 0]: 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: Sender %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 2]: ** 40.0000 - 50.0000 [ 58]: ********************************************************** 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: ----------------------------------------------------------- 3.10-rc2 test RX (reverts 69a37beabf1f0a6705c08e879bdd5d82ff6486c4 and e11538d1f03914eb92af5a1a378375c05ae8520c) TCP_RR trans/s 47766.95 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 1]: * 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 27]: *************************** 40.0000 - 50.0000 [ 2]: ** 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 2]: ** 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 28]: **************************** Sender: 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 11]: *********** 40.0000 - 50.0000 [ 0]: 50.0000 - 60.0000 [ 1]: * 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 3]: *** 80.0000 - 90.0000 [ 7]: ******* 90.0000 - 100.0000 [ 38]: ************************************** These results demonstrate gaining back the tendency of the CPU to stay in more responsive, performant C-states (and thus yield measurably better performance), by reverting commit 69a37beabf1f0a6705c08e879bdd5d82ff6486c4. Requested-by: Jeremy Eder <jeder@redhat.com> Tested-by: Len Brown <len.brown@intel.com> Cc: 3.8+ <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
2013-07-26 17:41:34 -06:00
else
tick_nohz_full_update_tick(ts);
}
cpuidle: consolidate 2.6.22 cpuidle branch into one patch commit e5a16b1f9eec0af7cfa0830304b41c1c0833cf9f Author: Len Brown <len.brown@intel.com> Date: Tue Oct 2 23:44:44 2007 -0400 cpuidle: shrink diff processor_idle.c | 440 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 429 insertions(+), 11 deletions(-) Signed-off-by: Len Brown <len.brown@intel.com> commit dfbb9d5aedfb18848a3e0d6f6e3e4969febb209c Author: Len Brown <len.brown@intel.com> Date: Wed Sep 26 02:17:55 2007 -0400 cpuidle: reduce diff size Reduces the cpuidle processor_idle.c diff vs 2.6.22 from this processor_idle.c | 2006 ++++++++++++++++++++++++++----------------- 1 file changed, 1219 insertions(+), 787 deletions(-) to this: processor_idle.c | 502 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 458 insertions(+), 44 deletions(-) ...for the purpose of making the cpuilde patch less invasive and easier to review. no functional changes. build tested only. Signed-off-by: Len Brown <len.brown@intel.com> commit 889172fc915f5a7fe20f35b133cbd205ce69bf6c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:40:05 2007 -0700 cpuidle: Retain old ACPI policy for !CONFIG_CPU_IDLE Retain the old policy in processor_idle, so that when CPU_IDLE is not configured, old C-state policy will still be used. This provides a clean gradual migration path from old ACPI policy to new cpuidle based policy. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 9544a8181edc7ecc33b3bfd69271571f98ed08bc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:39:17 2007 -0700 cpuidle: Configure governors by default Quoting Len "Do not give an option to users to shoot themselves in the foot". Remove the configurability of ladder and menu governors as they are needed for default policy of cpuidle. That way users will not be able to have cpuidle without any policy loosing all C-state power savings. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 8975059a2c1e56cfe83d1bcf031bcf4cb39be743 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:27:07 2007 -0400 CPUIDLE: load ACPI properly when CPUIDLE is disabled Change the registration return codes for when CPUIDLE support is not compiled into the kernel. As a result, the ACPI processor driver will load properly even if CPUIDLE is unavailable. However, it may be possible to cleanup the ACPI processor driver further and eliminate some dead code paths. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit e0322e2b58dd1b12ec669bf84693efe0dc2414a8 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:26:06 2007 -0400 CPUIDLE: remove cpuidle_get_bm_activity() Remove cpuidle_get_bm_activity() and updates governors accordingly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 18a6e770d5c82ba26653e53d240caa617e09e9ab Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:58 2007 -0400 CPUIDLE: max_cstate fix Currently max_cstate is limited to 0, resulting in no idle processor power management on ACPI platforms. This patch restores the value to the array size. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 1fdc0887286179b40ce24bcdbde663172e205ef0 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:40 2007 -0400 CPUIDLE: handle BM detection inside the ACPI Processor driver Update the ACPI processor driver to detect BM activity and limit state entry depth internally, rather than exposing such requirements to CPUIDLE. As a result, CPUIDLE can drop this ACPI-specific interface and become more platform independent. BM activity is now handled much more aggressively than it was in the original implementation, so some testing coverage may be needed to verify that this doesn't introduce any DMA buffer under-run issues. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ef38840db666f48e3cdd2b769da676c57228dd9 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:14 2007 -0400 CPUIDLE: menu governor updates Tweak the menu governor to more effectively handle non-timer break events. Non-timer break events are detected by comparing the actual sleep time to the expected sleep time. In future revisions, it may be more reliable to use the timer data structures directly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit bb4d74fca63fa96cf3ace644b15ae0f12b7df5a1 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:24:40 2007 -0400 CPUIDLE: fix 'current_governor' sysfs entry Allow the "current_governor" sysfs entry to properly handle input terminated with '\n'. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df3c71559bb69b125f1a48971bf0d17f78bbdf47 Author: Len Brown <len.brown@intel.com> Date: Sun Aug 12 02:00:45 2007 -0400 cpuidle: fix IA64 build (again) Signed-off-by: Len Brown <len.brown@intel.com> commit a02064579e3f9530fd31baae16b1fc46b5a7bca8 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:27 2007 -0400 cpuidle: Remove support for runtime changing of max_cstate Remove support for runtime changeability of max_cstate. Drivers can use use latency APIs. max_cstate can still be used as a boot time option and dmi override. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0912a44b13adf22f5e3f607d263aed23b4910d7e Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:16 2007 -0400 cpuidle: Remove ACPI cstate_limit calls from ipw2100 ipw2100 already has code to use accetable_latency interfaces to limit the C-state. Remove the calls to acpi_set_cstate_limit and acpi_get_cstate_limit as they are redundant. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit c649a76e76be6bff1fd770d0a775798813a3f6e0 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:35:39 2007 -0400 cpuidle: compile fix for pause and resume functions Fix the compilation failure when cpuidle is not compiled in. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Adam Belay <adam.belay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2305a5920fb8ee6ccec1c62ade05aa8351091d71 Author: Adam Belay <abelay@novell.com> Date: Thu Jul 19 00:49:00 2007 -0400 cpuidle: re-write Some portions have been rewritten to make the code cleaner and lighter weight. The following is a list of changes: 1.) the state name is now included in the sysfs interface 2.) detection, hotplug, and available state modifications are handled by CPUIDLE drivers directly 3.) the CPUIDLE idle handler is only ever installed when at least one cpuidle_device is enabled and ready 4.) the menu governor BM code no longer overflows 5.) the sysfs attributes are now printed as unsigned integers, avoiding negative values 6.) a variety of other small cleanups Also, Idle drivers are no longer swappable during runtime through the CPUIDLE sysfs inteface. On i386 and x86_64 most idle handlers (e.g. poll, mwait, halt, etc.) don't benefit from an infrastructure that supports multiple states, so I think using a more general case idle handler selection mechanism would be cleaner. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df25b6b56955714e6e24b574d88d1fd11f0c3ee5 Author: Len Brown <len.brown@intel.com> Date: Tue Jul 24 17:08:21 2007 -0400 cpuidle: fix IA64 buid Signed-off-by: Len Brown <len.brown@intel.com> commit fd6ada4c14488755ff7068860078c437431fbccd Author: Adrian Bunk <bunk@stusta.de> Date: Mon Jul 9 11:33:13 2007 -0700 cpuidle: static make cpuidle_replace_governor() static Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c1d4a2cebcadf2429c0c72e1d29aa2a9684c32e0 Author: Adrian Bunk <bunk@stusta.de> Date: Tue Jul 3 00:54:40 2007 -0400 cpuidle: static This patch makes the needlessly global struct menu_governor static. Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit dbf8780c6e8d572c2c273da97ed1cca7608fd999 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:49:14 2007 -0400 export symbol tick_nohz_get_sleep_length ERROR: "tick_nohz_get_sleep_length" [drivers/cpuidle/governors/menu.ko] undefined! ERROR: "tick_nohz_get_idle_jiffies" [drivers/cpuidle/governors/menu.ko] undefined! And please be sure to get your changes to core kernel suitably reviewed. Cc: Adam Belay <abelay@novell.com> Cc: Venki Pallipadi <venkatesh.pallipadi@intel.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: john stultz <johnstul@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 29f0e248e7017be15f99febf9143a2cef00b2961 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:43:04 2007 -0400 tick.h needs hrtimer.h It uses hrtimers. Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit e40cede7d63a029e92712a3fe02faee60cc38fb4 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:40:34 2007 -0400 cpuidle: first round of documentation updates Documentation changes based on Pavel's feedback. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 83b42be2efece386976507555c29e7773a0dfcd1 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:39:25 2007 -0400 cpuidle: add rating to the governors and pick the one with highest rating by default Introduce a governor rating scheme to pick the right governor by default. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit d2a74b8c5e8f22def4709330d4bfc4a29209b71c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:38:08 2007 -0400 cpuidle: make cpuidle sysfs driver governor switch off by default Make default cpuidle sysfs to show current_governor and current_driver in read-only mode. More elaborate available_governors and available_drivers with writeable current_governor and current_driver interface only appear with "cpuidle_sysfs_switch" boot parameter. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1f60a0e80bf83cf6b55c8845bbe5596ed8f6307b Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:37:00 2007 -0400 cpuidle: menu governor: change the early break condition Change the C-state early break out algorithm in menu governor. We only look at early breakouts that result in wakeups shorter than idle state's target_residency. If such a breakout is frequent enough, eliminate the particular idle state upto a timeout period. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 45a42095cf64b003b4a69be3ce7f434f97d7af51 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:35:38 2007 -0400 cpuidle: fix uninitialized variable in sysfs routine Fix the uninitialized usage of ret. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 80dca7cdba3e6ee13eae277660873ab9584eb3be Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:34:16 2007 -0400 cpuidle: reenable /proc/acpi//power interface for the time being Keep /proc/acpi/processor/CPU*/power around for a while as powertop depends on it. It will be marked deprecated and removed in future. powertop can use cpuidle interfaces instead. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 589c37c2646c5e3813a51255a5ee1159cb4c33fc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:32:37 2007 -0400 cpuidle: menu governor and hrtimer compile fix Compile fix for menu governor. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ba80bd9ab3ed304cb4f19b722e4cc6740588b5e Author: Len Brown <len.brown@intel.com> Date: Thu May 31 22:51:43 2007 -0400 cpuidle: build fix - cpuidle vs ipw2100 module ERROR: "acpi_set_cstate_limit" [drivers/net/wireless/ipw2100.ko] undefined! Signed-off-by: Len Brown <len.brown@intel.com> commit d7d8fa7f96a7f7682be7c6cc0cc53fa7a18c3b58 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:07 2007 -0400 cpuidle: add the 'menu' governor Here is my first take at implementing an idle PM governor that takes full advantage of NO_HZ. I call it the 'menu' governor because it considers the full list of idle states before each entry. I've kept the implementation fairly simple. It attempts to guess the next residency time and then chooses a state that would meet at least the break-even point between power savings and entry cost. To this end, it selects the deepest idle state that satisfies the following constraints: 1. If the idle time elapsed since bus master activity was detected is below a threshold (currently 20 ms), then limit the selection to C2-type or above. 2. Do not choose a state with a break-even residency that exceeds the expected time remaining until the next timer interrupt. 3. Do not choose a state with a break-even residency that exceeds the elapsed time between the last pair of break events, excluding timer interrupts. This governor has an advantage over "ladder" governor because it proactively checks how much time remains until the next timer interrupt using the tick infrastructure. Also, it handles device interrupt activity more intelligently by not including timer interrupts in break event calculations. Finally, it doesn't make policy decisions using the number of state entries, which can have variable residency times (NO_HZ makes these potentially very large), and instead only considers sleep time deltas. The menu governor can be selected during runtime using the cpuidle sysfs interface like so: "echo "menu" > /sys/devices/system/cpu/cpuidle/current_governor" Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit a4bec7e65aa3b7488b879d971651cc99a6c410fe Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:03 2007 -0400 cpuidle: export time until next timer interrupt using NO_HZ Expose information about the time remaining until the next timer interrupt expires by utilizing the dynticks infrastructure. Also modify the main idle loop to allow dynticks to handle non-interrupt break events (e.g. DMA). Finally, expose sleep ticks information to external code. Thomas Gleixner is responsible for much of the code in this patch. However, I've made some additional changes, so I'm probably responsible if there are any bugs or oversights :) Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2929d8996fbc77f41a5ff86bb67cdde3ca7d2d72 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:46:58 2007 -0400 cpuidle: governor API changes This patch prepares cpuidle for the menu governor. It adds an optional stage after idle state entry to give the governor an opportunity to check why the state was exited. Also it makes sure the idle loop returns after each state entry, allowing the appropriate dynticks code to run. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 3a7fd42f9825c3b03e364ca59baa751bb350775f Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Apr 26 00:03:59 2007 -0700 cpuidle: hang fix Prevent hang on x86-64, when ACPI processor driver is added as a module on a system that does not support C-states. x86-64 expects all idle handlers to enable interrupts before returning from idle handler. This is due to enter_idle(), exit_idle() races. Make cpuidle_idle_call() confirm to this when there is no pm_idle_old. Also, cpuidle look at the return values of attch_driver() and set current_driver to NULL if attach fails on all CPUs. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4893339a142afbd5b7c01ffadfd53d14746e858e Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:09 2007 +0800 cpuidle: add support for max_cstate limit With CPUIDLE framework, the max_cstate (to limit max cpu c-state) parameter is ingored. Some systems require it to ignore C2/C3 and some drivers like ipw require it too. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 43bbbbe1cb998cbd2df656f55bb3bfe30f30e7d1 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:13 2007 +0800 cpuidle: add cpuidle_fore_redetect_devices API add cpuidle_force_redetect_devices API, which forces all CPU redetect idle states. Next patch will use it. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit d1edadd608f24836def5ec483d2edccfb37b1d19 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:01 2007 +0800 cpuidle: fix sysfs related issue Fix the cpuidle sysfs issue. a. make kobject dynamicaly allocated b. fixed sysfs init issue to avoid suspend/resume issue Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 7169a5cc0d67b263978859672e86c13c23a5570d Author: Randy Dunlap <randy.dunlap@oracle.com> Date: Wed Mar 28 22:52:53 2007 -0400 cpuidle: 1-bit field must be unsigned A 1-bit bitfield has no room for a sign bit. drivers/cpuidle/governors/ladder.c:54:16: error: dubious bitfield without explicit `signed' or `unsigned' Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4658620158dc2fbd9e4bcb213c5b6fb5d05ba7d4 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 28 22:52:41 2007 -0400 cpuidle: fix boot hang Patch for cpuidle boot hang reported by Larry Finger here. http://www.ussg.iu.edu/hypermail/linux/kernel/0703.2/2025.html Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Larry Finger <larry.finger@lwfinger.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c17e168aa6e5fe3851baaae8df2fbc1cf11443a9 Author: Len Brown <len.brown@intel.com> Date: Wed Mar 7 04:37:53 2007 -0500 cpuidle: ladder does not depend on ACPI build fix for CONFIG_ACPI=n In file included from drivers/cpuidle/governors/ladder.c:21: include/acpi/processor.h:88: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:106: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:168: error: expected specifier-qualifier-list before ‘acpi_handle’ Signed-off-by: Len Brown <len.brown@intel.com> commit 8c91d958246bde68db0c3f0c57b535962ce861cb Author: Adrian Bunk <bunk@stusta.de> Date: Tue Mar 6 02:29:40 2007 -0800 cpuidle: make code static This patch makes the following needlessly global code static: - driver.c: __cpuidle_find_driver() - governor.c: __cpuidle_find_governor() - ladder.c: struct ladder_governor Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0c39dc3187094c72c33ab65a64d2017b21f372d2 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 7 02:38:22 2007 -0500 cpu_idle: fix build break This patch fixes a build breakage with !CONFIG_HOTPLUG_CPU and CONFIG_CPU_IDLE. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 8112e3b115659b07df340ef170515799c0105f82 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Mar 6 02:29:39 2007 -0800 cpuidle: build fix for !CPU_IDLE Fix the compile issues when CPU_IDLE is not configured. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1eb4431e9599cd25e0d9872f3c2c8986821839dd Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:57 2007 -0800 cpuidle take2: Basic documentation for cpuidle Documentation for cpuidle infrastructure Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit ef5f15a8b79123a047285ec2e3899108661df779 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:03 2007 -0800 cpuidle take2: Hookup ACPI C-states driver with cpuidle Hookup ACPI C-states onto generic cpuidle infrastructure. drivers/acpi/procesor_idle.c is now a ACPI C-states driver that registers as a driver in cpuidle infrastructure and the policy part is removed from drivers/acpi/processor_idle.c. We use governor in cpuidle instead. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 987196fa82d4db52c407e8c9d5dec884ba602183 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:52:57 2007 -0800 cpuidle take2: Core cpuidle infrastructure Announcing 'cpuidle', a new CPU power management infrastructure to manage idle CPUs in a clean and efficient manner. cpuidle separates out the drivers that can provide support for multiple types of idle states and policy governors that decide on what idle state to use at run time. A cpuidle driver can support multiple idle states based on parameters like varying power consumption, wakeup latency, etc (ACPI C-states for example). A cpuidle governor can be usage model specific (laptop, server, laptop on battery etc). Main advantage of the infrastructure being, it allows independent development of drivers and governors and allows for better CPU power management. A huge thanks to Adam Belay and Shaohua Li who were part of this mini-project since its beginning and are greatly responsible for this patchset. This patch: Core cpuidle infrastructure. Introduces a new abstraction layer for cpuidle: * which manages drivers that can support multiple idles states. Drivers can be generic or particular to specific hardware/platform * allows pluging in multiple policy governors that can take idle state policy decision * The core also has a set of sysfs interfaces with which administrato can know about supported drivers and governors and switch them at run time. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> Signed-off-by: Len Brown <len.brown@intel.com>
2007-10-03 16:58:00 -06:00
/**
cpuidle: Return nohz hint from cpuidle_select() Add a new pointer argument to cpuidle_select() and to the ->select cpuidle governor callback to allow a boolean value indicating whether or not the tick should be stopped before entering the selected state to be returned from there. Make the ladder governor ignore that pointer (to preserve its current behavior) and make the menu governor return 'false" through it if: (1) the idle exit latency is constrained at 0, or (2) the selected state is a polling one, or (3) the expected idle period duration is within the tick period range. In addition to that, the correction factor computations in the menu governor need to take the possibility that the tick may not be stopped into account to avoid artificially small correction factor values. To that end, add a mechanism to record tick wakeups, as suggested by Peter Zijlstra, and use it to modify the menu_update() behavior when tick wakeup occurs. Namely, if the CPU is woken up by the tick and the return value of tick_nohz_get_sleep_length() is not within the tick boundary, the predicted idle duration is likely too short, so make menu_update() try to compensate for that by updating the governor statistics as though the CPU was idle for a long time. Since the value returned through the new argument pointer of cpuidle_select() is not used by its caller yet, this change by itself is not expected to alter the functionality of the code. Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2018-03-22 10:50:49 -06:00
* tick_nohz_idle_got_tick - Check whether or not the tick handler has run
*/
bool tick_nohz_idle_got_tick(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->got_idle_tick) {
ts->got_idle_tick = 0;
cpuidle: Return nohz hint from cpuidle_select() Add a new pointer argument to cpuidle_select() and to the ->select cpuidle governor callback to allow a boolean value indicating whether or not the tick should be stopped before entering the selected state to be returned from there. Make the ladder governor ignore that pointer (to preserve its current behavior) and make the menu governor return 'false" through it if: (1) the idle exit latency is constrained at 0, or (2) the selected state is a polling one, or (3) the expected idle period duration is within the tick period range. In addition to that, the correction factor computations in the menu governor need to take the possibility that the tick may not be stopped into account to avoid artificially small correction factor values. To that end, add a mechanism to record tick wakeups, as suggested by Peter Zijlstra, and use it to modify the menu_update() behavior when tick wakeup occurs. Namely, if the CPU is woken up by the tick and the return value of tick_nohz_get_sleep_length() is not within the tick boundary, the predicted idle duration is likely too short, so make menu_update() try to compensate for that by updating the governor statistics as though the CPU was idle for a long time. Since the value returned through the new argument pointer of cpuidle_select() is not used by its caller yet, this change by itself is not expected to alter the functionality of the code. Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2018-03-22 10:50:49 -06:00
return true;
}
return false;
}
cpuidle: consolidate 2.6.22 cpuidle branch into one patch commit e5a16b1f9eec0af7cfa0830304b41c1c0833cf9f Author: Len Brown <len.brown@intel.com> Date: Tue Oct 2 23:44:44 2007 -0400 cpuidle: shrink diff processor_idle.c | 440 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 429 insertions(+), 11 deletions(-) Signed-off-by: Len Brown <len.brown@intel.com> commit dfbb9d5aedfb18848a3e0d6f6e3e4969febb209c Author: Len Brown <len.brown@intel.com> Date: Wed Sep 26 02:17:55 2007 -0400 cpuidle: reduce diff size Reduces the cpuidle processor_idle.c diff vs 2.6.22 from this processor_idle.c | 2006 ++++++++++++++++++++++++++----------------- 1 file changed, 1219 insertions(+), 787 deletions(-) to this: processor_idle.c | 502 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 458 insertions(+), 44 deletions(-) ...for the purpose of making the cpuilde patch less invasive and easier to review. no functional changes. build tested only. Signed-off-by: Len Brown <len.brown@intel.com> commit 889172fc915f5a7fe20f35b133cbd205ce69bf6c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:40:05 2007 -0700 cpuidle: Retain old ACPI policy for !CONFIG_CPU_IDLE Retain the old policy in processor_idle, so that when CPU_IDLE is not configured, old C-state policy will still be used. This provides a clean gradual migration path from old ACPI policy to new cpuidle based policy. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 9544a8181edc7ecc33b3bfd69271571f98ed08bc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:39:17 2007 -0700 cpuidle: Configure governors by default Quoting Len "Do not give an option to users to shoot themselves in the foot". Remove the configurability of ladder and menu governors as they are needed for default policy of cpuidle. That way users will not be able to have cpuidle without any policy loosing all C-state power savings. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 8975059a2c1e56cfe83d1bcf031bcf4cb39be743 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:27:07 2007 -0400 CPUIDLE: load ACPI properly when CPUIDLE is disabled Change the registration return codes for when CPUIDLE support is not compiled into the kernel. As a result, the ACPI processor driver will load properly even if CPUIDLE is unavailable. However, it may be possible to cleanup the ACPI processor driver further and eliminate some dead code paths. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit e0322e2b58dd1b12ec669bf84693efe0dc2414a8 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:26:06 2007 -0400 CPUIDLE: remove cpuidle_get_bm_activity() Remove cpuidle_get_bm_activity() and updates governors accordingly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 18a6e770d5c82ba26653e53d240caa617e09e9ab Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:58 2007 -0400 CPUIDLE: max_cstate fix Currently max_cstate is limited to 0, resulting in no idle processor power management on ACPI platforms. This patch restores the value to the array size. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 1fdc0887286179b40ce24bcdbde663172e205ef0 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:40 2007 -0400 CPUIDLE: handle BM detection inside the ACPI Processor driver Update the ACPI processor driver to detect BM activity and limit state entry depth internally, rather than exposing such requirements to CPUIDLE. As a result, CPUIDLE can drop this ACPI-specific interface and become more platform independent. BM activity is now handled much more aggressively than it was in the original implementation, so some testing coverage may be needed to verify that this doesn't introduce any DMA buffer under-run issues. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ef38840db666f48e3cdd2b769da676c57228dd9 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:14 2007 -0400 CPUIDLE: menu governor updates Tweak the menu governor to more effectively handle non-timer break events. Non-timer break events are detected by comparing the actual sleep time to the expected sleep time. In future revisions, it may be more reliable to use the timer data structures directly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit bb4d74fca63fa96cf3ace644b15ae0f12b7df5a1 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:24:40 2007 -0400 CPUIDLE: fix 'current_governor' sysfs entry Allow the "current_governor" sysfs entry to properly handle input terminated with '\n'. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df3c71559bb69b125f1a48971bf0d17f78bbdf47 Author: Len Brown <len.brown@intel.com> Date: Sun Aug 12 02:00:45 2007 -0400 cpuidle: fix IA64 build (again) Signed-off-by: Len Brown <len.brown@intel.com> commit a02064579e3f9530fd31baae16b1fc46b5a7bca8 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:27 2007 -0400 cpuidle: Remove support for runtime changing of max_cstate Remove support for runtime changeability of max_cstate. Drivers can use use latency APIs. max_cstate can still be used as a boot time option and dmi override. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0912a44b13adf22f5e3f607d263aed23b4910d7e Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:16 2007 -0400 cpuidle: Remove ACPI cstate_limit calls from ipw2100 ipw2100 already has code to use accetable_latency interfaces to limit the C-state. Remove the calls to acpi_set_cstate_limit and acpi_get_cstate_limit as they are redundant. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit c649a76e76be6bff1fd770d0a775798813a3f6e0 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:35:39 2007 -0400 cpuidle: compile fix for pause and resume functions Fix the compilation failure when cpuidle is not compiled in. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Adam Belay <adam.belay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2305a5920fb8ee6ccec1c62ade05aa8351091d71 Author: Adam Belay <abelay@novell.com> Date: Thu Jul 19 00:49:00 2007 -0400 cpuidle: re-write Some portions have been rewritten to make the code cleaner and lighter weight. The following is a list of changes: 1.) the state name is now included in the sysfs interface 2.) detection, hotplug, and available state modifications are handled by CPUIDLE drivers directly 3.) the CPUIDLE idle handler is only ever installed when at least one cpuidle_device is enabled and ready 4.) the menu governor BM code no longer overflows 5.) the sysfs attributes are now printed as unsigned integers, avoiding negative values 6.) a variety of other small cleanups Also, Idle drivers are no longer swappable during runtime through the CPUIDLE sysfs inteface. On i386 and x86_64 most idle handlers (e.g. poll, mwait, halt, etc.) don't benefit from an infrastructure that supports multiple states, so I think using a more general case idle handler selection mechanism would be cleaner. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df25b6b56955714e6e24b574d88d1fd11f0c3ee5 Author: Len Brown <len.brown@intel.com> Date: Tue Jul 24 17:08:21 2007 -0400 cpuidle: fix IA64 buid Signed-off-by: Len Brown <len.brown@intel.com> commit fd6ada4c14488755ff7068860078c437431fbccd Author: Adrian Bunk <bunk@stusta.de> Date: Mon Jul 9 11:33:13 2007 -0700 cpuidle: static make cpuidle_replace_governor() static Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c1d4a2cebcadf2429c0c72e1d29aa2a9684c32e0 Author: Adrian Bunk <bunk@stusta.de> Date: Tue Jul 3 00:54:40 2007 -0400 cpuidle: static This patch makes the needlessly global struct menu_governor static. Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit dbf8780c6e8d572c2c273da97ed1cca7608fd999 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:49:14 2007 -0400 export symbol tick_nohz_get_sleep_length ERROR: "tick_nohz_get_sleep_length" [drivers/cpuidle/governors/menu.ko] undefined! ERROR: "tick_nohz_get_idle_jiffies" [drivers/cpuidle/governors/menu.ko] undefined! And please be sure to get your changes to core kernel suitably reviewed. Cc: Adam Belay <abelay@novell.com> Cc: Venki Pallipadi <venkatesh.pallipadi@intel.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: john stultz <johnstul@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 29f0e248e7017be15f99febf9143a2cef00b2961 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:43:04 2007 -0400 tick.h needs hrtimer.h It uses hrtimers. Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit e40cede7d63a029e92712a3fe02faee60cc38fb4 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:40:34 2007 -0400 cpuidle: first round of documentation updates Documentation changes based on Pavel's feedback. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 83b42be2efece386976507555c29e7773a0dfcd1 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:39:25 2007 -0400 cpuidle: add rating to the governors and pick the one with highest rating by default Introduce a governor rating scheme to pick the right governor by default. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit d2a74b8c5e8f22def4709330d4bfc4a29209b71c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:38:08 2007 -0400 cpuidle: make cpuidle sysfs driver governor switch off by default Make default cpuidle sysfs to show current_governor and current_driver in read-only mode. More elaborate available_governors and available_drivers with writeable current_governor and current_driver interface only appear with "cpuidle_sysfs_switch" boot parameter. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1f60a0e80bf83cf6b55c8845bbe5596ed8f6307b Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:37:00 2007 -0400 cpuidle: menu governor: change the early break condition Change the C-state early break out algorithm in menu governor. We only look at early breakouts that result in wakeups shorter than idle state's target_residency. If such a breakout is frequent enough, eliminate the particular idle state upto a timeout period. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 45a42095cf64b003b4a69be3ce7f434f97d7af51 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:35:38 2007 -0400 cpuidle: fix uninitialized variable in sysfs routine Fix the uninitialized usage of ret. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 80dca7cdba3e6ee13eae277660873ab9584eb3be Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:34:16 2007 -0400 cpuidle: reenable /proc/acpi//power interface for the time being Keep /proc/acpi/processor/CPU*/power around for a while as powertop depends on it. It will be marked deprecated and removed in future. powertop can use cpuidle interfaces instead. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 589c37c2646c5e3813a51255a5ee1159cb4c33fc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:32:37 2007 -0400 cpuidle: menu governor and hrtimer compile fix Compile fix for menu governor. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ba80bd9ab3ed304cb4f19b722e4cc6740588b5e Author: Len Brown <len.brown@intel.com> Date: Thu May 31 22:51:43 2007 -0400 cpuidle: build fix - cpuidle vs ipw2100 module ERROR: "acpi_set_cstate_limit" [drivers/net/wireless/ipw2100.ko] undefined! Signed-off-by: Len Brown <len.brown@intel.com> commit d7d8fa7f96a7f7682be7c6cc0cc53fa7a18c3b58 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:07 2007 -0400 cpuidle: add the 'menu' governor Here is my first take at implementing an idle PM governor that takes full advantage of NO_HZ. I call it the 'menu' governor because it considers the full list of idle states before each entry. I've kept the implementation fairly simple. It attempts to guess the next residency time and then chooses a state that would meet at least the break-even point between power savings and entry cost. To this end, it selects the deepest idle state that satisfies the following constraints: 1. If the idle time elapsed since bus master activity was detected is below a threshold (currently 20 ms), then limit the selection to C2-type or above. 2. Do not choose a state with a break-even residency that exceeds the expected time remaining until the next timer interrupt. 3. Do not choose a state with a break-even residency that exceeds the elapsed time between the last pair of break events, excluding timer interrupts. This governor has an advantage over "ladder" governor because it proactively checks how much time remains until the next timer interrupt using the tick infrastructure. Also, it handles device interrupt activity more intelligently by not including timer interrupts in break event calculations. Finally, it doesn't make policy decisions using the number of state entries, which can have variable residency times (NO_HZ makes these potentially very large), and instead only considers sleep time deltas. The menu governor can be selected during runtime using the cpuidle sysfs interface like so: "echo "menu" > /sys/devices/system/cpu/cpuidle/current_governor" Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit a4bec7e65aa3b7488b879d971651cc99a6c410fe Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:03 2007 -0400 cpuidle: export time until next timer interrupt using NO_HZ Expose information about the time remaining until the next timer interrupt expires by utilizing the dynticks infrastructure. Also modify the main idle loop to allow dynticks to handle non-interrupt break events (e.g. DMA). Finally, expose sleep ticks information to external code. Thomas Gleixner is responsible for much of the code in this patch. However, I've made some additional changes, so I'm probably responsible if there are any bugs or oversights :) Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2929d8996fbc77f41a5ff86bb67cdde3ca7d2d72 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:46:58 2007 -0400 cpuidle: governor API changes This patch prepares cpuidle for the menu governor. It adds an optional stage after idle state entry to give the governor an opportunity to check why the state was exited. Also it makes sure the idle loop returns after each state entry, allowing the appropriate dynticks code to run. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 3a7fd42f9825c3b03e364ca59baa751bb350775f Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Apr 26 00:03:59 2007 -0700 cpuidle: hang fix Prevent hang on x86-64, when ACPI processor driver is added as a module on a system that does not support C-states. x86-64 expects all idle handlers to enable interrupts before returning from idle handler. This is due to enter_idle(), exit_idle() races. Make cpuidle_idle_call() confirm to this when there is no pm_idle_old. Also, cpuidle look at the return values of attch_driver() and set current_driver to NULL if attach fails on all CPUs. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4893339a142afbd5b7c01ffadfd53d14746e858e Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:09 2007 +0800 cpuidle: add support for max_cstate limit With CPUIDLE framework, the max_cstate (to limit max cpu c-state) parameter is ingored. Some systems require it to ignore C2/C3 and some drivers like ipw require it too. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 43bbbbe1cb998cbd2df656f55bb3bfe30f30e7d1 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:13 2007 +0800 cpuidle: add cpuidle_fore_redetect_devices API add cpuidle_force_redetect_devices API, which forces all CPU redetect idle states. Next patch will use it. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit d1edadd608f24836def5ec483d2edccfb37b1d19 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:01 2007 +0800 cpuidle: fix sysfs related issue Fix the cpuidle sysfs issue. a. make kobject dynamicaly allocated b. fixed sysfs init issue to avoid suspend/resume issue Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 7169a5cc0d67b263978859672e86c13c23a5570d Author: Randy Dunlap <randy.dunlap@oracle.com> Date: Wed Mar 28 22:52:53 2007 -0400 cpuidle: 1-bit field must be unsigned A 1-bit bitfield has no room for a sign bit. drivers/cpuidle/governors/ladder.c:54:16: error: dubious bitfield without explicit `signed' or `unsigned' Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4658620158dc2fbd9e4bcb213c5b6fb5d05ba7d4 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 28 22:52:41 2007 -0400 cpuidle: fix boot hang Patch for cpuidle boot hang reported by Larry Finger here. http://www.ussg.iu.edu/hypermail/linux/kernel/0703.2/2025.html Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Larry Finger <larry.finger@lwfinger.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c17e168aa6e5fe3851baaae8df2fbc1cf11443a9 Author: Len Brown <len.brown@intel.com> Date: Wed Mar 7 04:37:53 2007 -0500 cpuidle: ladder does not depend on ACPI build fix for CONFIG_ACPI=n In file included from drivers/cpuidle/governors/ladder.c:21: include/acpi/processor.h:88: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:106: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:168: error: expected specifier-qualifier-list before ‘acpi_handle’ Signed-off-by: Len Brown <len.brown@intel.com> commit 8c91d958246bde68db0c3f0c57b535962ce861cb Author: Adrian Bunk <bunk@stusta.de> Date: Tue Mar 6 02:29:40 2007 -0800 cpuidle: make code static This patch makes the following needlessly global code static: - driver.c: __cpuidle_find_driver() - governor.c: __cpuidle_find_governor() - ladder.c: struct ladder_governor Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0c39dc3187094c72c33ab65a64d2017b21f372d2 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 7 02:38:22 2007 -0500 cpu_idle: fix build break This patch fixes a build breakage with !CONFIG_HOTPLUG_CPU and CONFIG_CPU_IDLE. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 8112e3b115659b07df340ef170515799c0105f82 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Mar 6 02:29:39 2007 -0800 cpuidle: build fix for !CPU_IDLE Fix the compile issues when CPU_IDLE is not configured. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1eb4431e9599cd25e0d9872f3c2c8986821839dd Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:57 2007 -0800 cpuidle take2: Basic documentation for cpuidle Documentation for cpuidle infrastructure Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit ef5f15a8b79123a047285ec2e3899108661df779 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:03 2007 -0800 cpuidle take2: Hookup ACPI C-states driver with cpuidle Hookup ACPI C-states onto generic cpuidle infrastructure. drivers/acpi/procesor_idle.c is now a ACPI C-states driver that registers as a driver in cpuidle infrastructure and the policy part is removed from drivers/acpi/processor_idle.c. We use governor in cpuidle instead. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 987196fa82d4db52c407e8c9d5dec884ba602183 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:52:57 2007 -0800 cpuidle take2: Core cpuidle infrastructure Announcing 'cpuidle', a new CPU power management infrastructure to manage idle CPUs in a clean and efficient manner. cpuidle separates out the drivers that can provide support for multiple types of idle states and policy governors that decide on what idle state to use at run time. A cpuidle driver can support multiple idle states based on parameters like varying power consumption, wakeup latency, etc (ACPI C-states for example). A cpuidle governor can be usage model specific (laptop, server, laptop on battery etc). Main advantage of the infrastructure being, it allows independent development of drivers and governors and allows for better CPU power management. A huge thanks to Adam Belay and Shaohua Li who were part of this mini-project since its beginning and are greatly responsible for this patchset. This patch: Core cpuidle infrastructure. Introduces a new abstraction layer for cpuidle: * which manages drivers that can support multiple idles states. Drivers can be generic or particular to specific hardware/platform * allows pluging in multiple policy governors that can take idle state policy decision * The core also has a set of sysfs interfaces with which administrato can know about supported drivers and governors and switch them at run time. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> Signed-off-by: Len Brown <len.brown@intel.com>
2007-10-03 16:58:00 -06:00
/**
sched: idle: Select idle state before stopping the tick In order to address the issue with short idle duration predictions by the idle governor after the scheduler tick has been stopped, reorder the code in cpuidle_idle_call() so that the governor idle state selection runs before tick_nohz_idle_go_idle() and use the "nohz" hint returned by cpuidle_select() to decide whether or not to stop the tick. This isn't straightforward, because menu_select() invokes tick_nohz_get_sleep_length() to get the time to the next timer event and the number returned by the latter comes from __tick_nohz_idle_stop_tick(). Fortunately, however, it is possible to compute that number without actually stopping the tick and with the help of the existing code. Namely, tick_nohz_get_sleep_length() can be made call tick_nohz_next_event(), introduced earlier, to get the time to the next non-highres timer event. If that happens, tick_nohz_next_event() need not be called by __tick_nohz_idle_stop_tick() again. If it turns out that the scheduler tick cannot be stopped going forward or the next timer event is too close for the tick to be stopped, tick_nohz_get_sleep_length() can simply return the time to the next event currently programmed into the corresponding clock event device. In addition to knowing the return value of tick_nohz_next_event(), however, tick_nohz_get_sleep_length() needs to know the time to the next highres timer event, but with the scheduler tick timer excluded, which can be computed with the help of hrtimer_get_next_event(). That minimum of that number and the tick_nohz_next_event() return value is the total time to the next timer event with the assumption that the tick will be stopped. It can be returned to the idle governor which can use it for predicting idle duration (under the assumption that the tick will be stopped) and deciding whether or not it makes sense to stop the tick before putting the CPU into the selected idle state. With the above, the sleep_length field in struct tick_sched is not necessary any more, so drop it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=199227 Reported-by: Doug Smythies <dsmythies@telus.net> Reported-by: Thomas Ilsche <thomas.ilsche@tu-dresden.de> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
2018-04-03 15:17:11 -06:00
* tick_nohz_get_sleep_length - return the expected length of the current sleep
* @delta_next: duration until the next event if the tick cannot be stopped
cpuidle: consolidate 2.6.22 cpuidle branch into one patch commit e5a16b1f9eec0af7cfa0830304b41c1c0833cf9f Author: Len Brown <len.brown@intel.com> Date: Tue Oct 2 23:44:44 2007 -0400 cpuidle: shrink diff processor_idle.c | 440 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 429 insertions(+), 11 deletions(-) Signed-off-by: Len Brown <len.brown@intel.com> commit dfbb9d5aedfb18848a3e0d6f6e3e4969febb209c Author: Len Brown <len.brown@intel.com> Date: Wed Sep 26 02:17:55 2007 -0400 cpuidle: reduce diff size Reduces the cpuidle processor_idle.c diff vs 2.6.22 from this processor_idle.c | 2006 ++++++++++++++++++++++++++----------------- 1 file changed, 1219 insertions(+), 787 deletions(-) to this: processor_idle.c | 502 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 458 insertions(+), 44 deletions(-) ...for the purpose of making the cpuilde patch less invasive and easier to review. no functional changes. build tested only. Signed-off-by: Len Brown <len.brown@intel.com> commit 889172fc915f5a7fe20f35b133cbd205ce69bf6c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:40:05 2007 -0700 cpuidle: Retain old ACPI policy for !CONFIG_CPU_IDLE Retain the old policy in processor_idle, so that when CPU_IDLE is not configured, old C-state policy will still be used. This provides a clean gradual migration path from old ACPI policy to new cpuidle based policy. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 9544a8181edc7ecc33b3bfd69271571f98ed08bc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:39:17 2007 -0700 cpuidle: Configure governors by default Quoting Len "Do not give an option to users to shoot themselves in the foot". Remove the configurability of ladder and menu governors as they are needed for default policy of cpuidle. That way users will not be able to have cpuidle without any policy loosing all C-state power savings. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 8975059a2c1e56cfe83d1bcf031bcf4cb39be743 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:27:07 2007 -0400 CPUIDLE: load ACPI properly when CPUIDLE is disabled Change the registration return codes for when CPUIDLE support is not compiled into the kernel. As a result, the ACPI processor driver will load properly even if CPUIDLE is unavailable. However, it may be possible to cleanup the ACPI processor driver further and eliminate some dead code paths. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit e0322e2b58dd1b12ec669bf84693efe0dc2414a8 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:26:06 2007 -0400 CPUIDLE: remove cpuidle_get_bm_activity() Remove cpuidle_get_bm_activity() and updates governors accordingly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 18a6e770d5c82ba26653e53d240caa617e09e9ab Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:58 2007 -0400 CPUIDLE: max_cstate fix Currently max_cstate is limited to 0, resulting in no idle processor power management on ACPI platforms. This patch restores the value to the array size. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 1fdc0887286179b40ce24bcdbde663172e205ef0 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:40 2007 -0400 CPUIDLE: handle BM detection inside the ACPI Processor driver Update the ACPI processor driver to detect BM activity and limit state entry depth internally, rather than exposing such requirements to CPUIDLE. As a result, CPUIDLE can drop this ACPI-specific interface and become more platform independent. BM activity is now handled much more aggressively than it was in the original implementation, so some testing coverage may be needed to verify that this doesn't introduce any DMA buffer under-run issues. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ef38840db666f48e3cdd2b769da676c57228dd9 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:14 2007 -0400 CPUIDLE: menu governor updates Tweak the menu governor to more effectively handle non-timer break events. Non-timer break events are detected by comparing the actual sleep time to the expected sleep time. In future revisions, it may be more reliable to use the timer data structures directly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit bb4d74fca63fa96cf3ace644b15ae0f12b7df5a1 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:24:40 2007 -0400 CPUIDLE: fix 'current_governor' sysfs entry Allow the "current_governor" sysfs entry to properly handle input terminated with '\n'. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df3c71559bb69b125f1a48971bf0d17f78bbdf47 Author: Len Brown <len.brown@intel.com> Date: Sun Aug 12 02:00:45 2007 -0400 cpuidle: fix IA64 build (again) Signed-off-by: Len Brown <len.brown@intel.com> commit a02064579e3f9530fd31baae16b1fc46b5a7bca8 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:27 2007 -0400 cpuidle: Remove support for runtime changing of max_cstate Remove support for runtime changeability of max_cstate. Drivers can use use latency APIs. max_cstate can still be used as a boot time option and dmi override. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0912a44b13adf22f5e3f607d263aed23b4910d7e Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:16 2007 -0400 cpuidle: Remove ACPI cstate_limit calls from ipw2100 ipw2100 already has code to use accetable_latency interfaces to limit the C-state. Remove the calls to acpi_set_cstate_limit and acpi_get_cstate_limit as they are redundant. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit c649a76e76be6bff1fd770d0a775798813a3f6e0 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:35:39 2007 -0400 cpuidle: compile fix for pause and resume functions Fix the compilation failure when cpuidle is not compiled in. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Adam Belay <adam.belay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2305a5920fb8ee6ccec1c62ade05aa8351091d71 Author: Adam Belay <abelay@novell.com> Date: Thu Jul 19 00:49:00 2007 -0400 cpuidle: re-write Some portions have been rewritten to make the code cleaner and lighter weight. The following is a list of changes: 1.) the state name is now included in the sysfs interface 2.) detection, hotplug, and available state modifications are handled by CPUIDLE drivers directly 3.) the CPUIDLE idle handler is only ever installed when at least one cpuidle_device is enabled and ready 4.) the menu governor BM code no longer overflows 5.) the sysfs attributes are now printed as unsigned integers, avoiding negative values 6.) a variety of other small cleanups Also, Idle drivers are no longer swappable during runtime through the CPUIDLE sysfs inteface. On i386 and x86_64 most idle handlers (e.g. poll, mwait, halt, etc.) don't benefit from an infrastructure that supports multiple states, so I think using a more general case idle handler selection mechanism would be cleaner. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df25b6b56955714e6e24b574d88d1fd11f0c3ee5 Author: Len Brown <len.brown@intel.com> Date: Tue Jul 24 17:08:21 2007 -0400 cpuidle: fix IA64 buid Signed-off-by: Len Brown <len.brown@intel.com> commit fd6ada4c14488755ff7068860078c437431fbccd Author: Adrian Bunk <bunk@stusta.de> Date: Mon Jul 9 11:33:13 2007 -0700 cpuidle: static make cpuidle_replace_governor() static Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c1d4a2cebcadf2429c0c72e1d29aa2a9684c32e0 Author: Adrian Bunk <bunk@stusta.de> Date: Tue Jul 3 00:54:40 2007 -0400 cpuidle: static This patch makes the needlessly global struct menu_governor static. Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit dbf8780c6e8d572c2c273da97ed1cca7608fd999 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:49:14 2007 -0400 export symbol tick_nohz_get_sleep_length ERROR: "tick_nohz_get_sleep_length" [drivers/cpuidle/governors/menu.ko] undefined! ERROR: "tick_nohz_get_idle_jiffies" [drivers/cpuidle/governors/menu.ko] undefined! And please be sure to get your changes to core kernel suitably reviewed. Cc: Adam Belay <abelay@novell.com> Cc: Venki Pallipadi <venkatesh.pallipadi@intel.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: john stultz <johnstul@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 29f0e248e7017be15f99febf9143a2cef00b2961 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:43:04 2007 -0400 tick.h needs hrtimer.h It uses hrtimers. Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit e40cede7d63a029e92712a3fe02faee60cc38fb4 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:40:34 2007 -0400 cpuidle: first round of documentation updates Documentation changes based on Pavel's feedback. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 83b42be2efece386976507555c29e7773a0dfcd1 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:39:25 2007 -0400 cpuidle: add rating to the governors and pick the one with highest rating by default Introduce a governor rating scheme to pick the right governor by default. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit d2a74b8c5e8f22def4709330d4bfc4a29209b71c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:38:08 2007 -0400 cpuidle: make cpuidle sysfs driver governor switch off by default Make default cpuidle sysfs to show current_governor and current_driver in read-only mode. More elaborate available_governors and available_drivers with writeable current_governor and current_driver interface only appear with "cpuidle_sysfs_switch" boot parameter. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1f60a0e80bf83cf6b55c8845bbe5596ed8f6307b Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:37:00 2007 -0400 cpuidle: menu governor: change the early break condition Change the C-state early break out algorithm in menu governor. We only look at early breakouts that result in wakeups shorter than idle state's target_residency. If such a breakout is frequent enough, eliminate the particular idle state upto a timeout period. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 45a42095cf64b003b4a69be3ce7f434f97d7af51 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:35:38 2007 -0400 cpuidle: fix uninitialized variable in sysfs routine Fix the uninitialized usage of ret. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 80dca7cdba3e6ee13eae277660873ab9584eb3be Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:34:16 2007 -0400 cpuidle: reenable /proc/acpi//power interface for the time being Keep /proc/acpi/processor/CPU*/power around for a while as powertop depends on it. It will be marked deprecated and removed in future. powertop can use cpuidle interfaces instead. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 589c37c2646c5e3813a51255a5ee1159cb4c33fc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:32:37 2007 -0400 cpuidle: menu governor and hrtimer compile fix Compile fix for menu governor. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ba80bd9ab3ed304cb4f19b722e4cc6740588b5e Author: Len Brown <len.brown@intel.com> Date: Thu May 31 22:51:43 2007 -0400 cpuidle: build fix - cpuidle vs ipw2100 module ERROR: "acpi_set_cstate_limit" [drivers/net/wireless/ipw2100.ko] undefined! Signed-off-by: Len Brown <len.brown@intel.com> commit d7d8fa7f96a7f7682be7c6cc0cc53fa7a18c3b58 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:07 2007 -0400 cpuidle: add the 'menu' governor Here is my first take at implementing an idle PM governor that takes full advantage of NO_HZ. I call it the 'menu' governor because it considers the full list of idle states before each entry. I've kept the implementation fairly simple. It attempts to guess the next residency time and then chooses a state that would meet at least the break-even point between power savings and entry cost. To this end, it selects the deepest idle state that satisfies the following constraints: 1. If the idle time elapsed since bus master activity was detected is below a threshold (currently 20 ms), then limit the selection to C2-type or above. 2. Do not choose a state with a break-even residency that exceeds the expected time remaining until the next timer interrupt. 3. Do not choose a state with a break-even residency that exceeds the elapsed time between the last pair of break events, excluding timer interrupts. This governor has an advantage over "ladder" governor because it proactively checks how much time remains until the next timer interrupt using the tick infrastructure. Also, it handles device interrupt activity more intelligently by not including timer interrupts in break event calculations. Finally, it doesn't make policy decisions using the number of state entries, which can have variable residency times (NO_HZ makes these potentially very large), and instead only considers sleep time deltas. The menu governor can be selected during runtime using the cpuidle sysfs interface like so: "echo "menu" > /sys/devices/system/cpu/cpuidle/current_governor" Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit a4bec7e65aa3b7488b879d971651cc99a6c410fe Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:03 2007 -0400 cpuidle: export time until next timer interrupt using NO_HZ Expose information about the time remaining until the next timer interrupt expires by utilizing the dynticks infrastructure. Also modify the main idle loop to allow dynticks to handle non-interrupt break events (e.g. DMA). Finally, expose sleep ticks information to external code. Thomas Gleixner is responsible for much of the code in this patch. However, I've made some additional changes, so I'm probably responsible if there are any bugs or oversights :) Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2929d8996fbc77f41a5ff86bb67cdde3ca7d2d72 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:46:58 2007 -0400 cpuidle: governor API changes This patch prepares cpuidle for the menu governor. It adds an optional stage after idle state entry to give the governor an opportunity to check why the state was exited. Also it makes sure the idle loop returns after each state entry, allowing the appropriate dynticks code to run. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 3a7fd42f9825c3b03e364ca59baa751bb350775f Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Apr 26 00:03:59 2007 -0700 cpuidle: hang fix Prevent hang on x86-64, when ACPI processor driver is added as a module on a system that does not support C-states. x86-64 expects all idle handlers to enable interrupts before returning from idle handler. This is due to enter_idle(), exit_idle() races. Make cpuidle_idle_call() confirm to this when there is no pm_idle_old. Also, cpuidle look at the return values of attch_driver() and set current_driver to NULL if attach fails on all CPUs. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4893339a142afbd5b7c01ffadfd53d14746e858e Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:09 2007 +0800 cpuidle: add support for max_cstate limit With CPUIDLE framework, the max_cstate (to limit max cpu c-state) parameter is ingored. Some systems require it to ignore C2/C3 and some drivers like ipw require it too. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 43bbbbe1cb998cbd2df656f55bb3bfe30f30e7d1 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:13 2007 +0800 cpuidle: add cpuidle_fore_redetect_devices API add cpuidle_force_redetect_devices API, which forces all CPU redetect idle states. Next patch will use it. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit d1edadd608f24836def5ec483d2edccfb37b1d19 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:01 2007 +0800 cpuidle: fix sysfs related issue Fix the cpuidle sysfs issue. a. make kobject dynamicaly allocated b. fixed sysfs init issue to avoid suspend/resume issue Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 7169a5cc0d67b263978859672e86c13c23a5570d Author: Randy Dunlap <randy.dunlap@oracle.com> Date: Wed Mar 28 22:52:53 2007 -0400 cpuidle: 1-bit field must be unsigned A 1-bit bitfield has no room for a sign bit. drivers/cpuidle/governors/ladder.c:54:16: error: dubious bitfield without explicit `signed' or `unsigned' Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4658620158dc2fbd9e4bcb213c5b6fb5d05ba7d4 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 28 22:52:41 2007 -0400 cpuidle: fix boot hang Patch for cpuidle boot hang reported by Larry Finger here. http://www.ussg.iu.edu/hypermail/linux/kernel/0703.2/2025.html Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Larry Finger <larry.finger@lwfinger.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c17e168aa6e5fe3851baaae8df2fbc1cf11443a9 Author: Len Brown <len.brown@intel.com> Date: Wed Mar 7 04:37:53 2007 -0500 cpuidle: ladder does not depend on ACPI build fix for CONFIG_ACPI=n In file included from drivers/cpuidle/governors/ladder.c:21: include/acpi/processor.h:88: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:106: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:168: error: expected specifier-qualifier-list before ‘acpi_handle’ Signed-off-by: Len Brown <len.brown@intel.com> commit 8c91d958246bde68db0c3f0c57b535962ce861cb Author: Adrian Bunk <bunk@stusta.de> Date: Tue Mar 6 02:29:40 2007 -0800 cpuidle: make code static This patch makes the following needlessly global code static: - driver.c: __cpuidle_find_driver() - governor.c: __cpuidle_find_governor() - ladder.c: struct ladder_governor Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0c39dc3187094c72c33ab65a64d2017b21f372d2 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 7 02:38:22 2007 -0500 cpu_idle: fix build break This patch fixes a build breakage with !CONFIG_HOTPLUG_CPU and CONFIG_CPU_IDLE. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 8112e3b115659b07df340ef170515799c0105f82 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Mar 6 02:29:39 2007 -0800 cpuidle: build fix for !CPU_IDLE Fix the compile issues when CPU_IDLE is not configured. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1eb4431e9599cd25e0d9872f3c2c8986821839dd Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:57 2007 -0800 cpuidle take2: Basic documentation for cpuidle Documentation for cpuidle infrastructure Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit ef5f15a8b79123a047285ec2e3899108661df779 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:03 2007 -0800 cpuidle take2: Hookup ACPI C-states driver with cpuidle Hookup ACPI C-states onto generic cpuidle infrastructure. drivers/acpi/procesor_idle.c is now a ACPI C-states driver that registers as a driver in cpuidle infrastructure and the policy part is removed from drivers/acpi/processor_idle.c. We use governor in cpuidle instead. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 987196fa82d4db52c407e8c9d5dec884ba602183 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:52:57 2007 -0800 cpuidle take2: Core cpuidle infrastructure Announcing 'cpuidle', a new CPU power management infrastructure to manage idle CPUs in a clean and efficient manner. cpuidle separates out the drivers that can provide support for multiple types of idle states and policy governors that decide on what idle state to use at run time. A cpuidle driver can support multiple idle states based on parameters like varying power consumption, wakeup latency, etc (ACPI C-states for example). A cpuidle governor can be usage model specific (laptop, server, laptop on battery etc). Main advantage of the infrastructure being, it allows independent development of drivers and governors and allows for better CPU power management. A huge thanks to Adam Belay and Shaohua Li who were part of this mini-project since its beginning and are greatly responsible for this patchset. This patch: Core cpuidle infrastructure. Introduces a new abstraction layer for cpuidle: * which manages drivers that can support multiple idles states. Drivers can be generic or particular to specific hardware/platform * allows pluging in multiple policy governors that can take idle state policy decision * The core also has a set of sysfs interfaces with which administrato can know about supported drivers and governors and switch them at run time. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> Signed-off-by: Len Brown <len.brown@intel.com>
2007-10-03 16:58:00 -06:00
*
* Called from power state control code with interrupts disabled
*/
ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
cpuidle: consolidate 2.6.22 cpuidle branch into one patch commit e5a16b1f9eec0af7cfa0830304b41c1c0833cf9f Author: Len Brown <len.brown@intel.com> Date: Tue Oct 2 23:44:44 2007 -0400 cpuidle: shrink diff processor_idle.c | 440 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 429 insertions(+), 11 deletions(-) Signed-off-by: Len Brown <len.brown@intel.com> commit dfbb9d5aedfb18848a3e0d6f6e3e4969febb209c Author: Len Brown <len.brown@intel.com> Date: Wed Sep 26 02:17:55 2007 -0400 cpuidle: reduce diff size Reduces the cpuidle processor_idle.c diff vs 2.6.22 from this processor_idle.c | 2006 ++++++++++++++++++++++++++----------------- 1 file changed, 1219 insertions(+), 787 deletions(-) to this: processor_idle.c | 502 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 458 insertions(+), 44 deletions(-) ...for the purpose of making the cpuilde patch less invasive and easier to review. no functional changes. build tested only. Signed-off-by: Len Brown <len.brown@intel.com> commit 889172fc915f5a7fe20f35b133cbd205ce69bf6c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:40:05 2007 -0700 cpuidle: Retain old ACPI policy for !CONFIG_CPU_IDLE Retain the old policy in processor_idle, so that when CPU_IDLE is not configured, old C-state policy will still be used. This provides a clean gradual migration path from old ACPI policy to new cpuidle based policy. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 9544a8181edc7ecc33b3bfd69271571f98ed08bc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:39:17 2007 -0700 cpuidle: Configure governors by default Quoting Len "Do not give an option to users to shoot themselves in the foot". Remove the configurability of ladder and menu governors as they are needed for default policy of cpuidle. That way users will not be able to have cpuidle without any policy loosing all C-state power savings. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 8975059a2c1e56cfe83d1bcf031bcf4cb39be743 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:27:07 2007 -0400 CPUIDLE: load ACPI properly when CPUIDLE is disabled Change the registration return codes for when CPUIDLE support is not compiled into the kernel. As a result, the ACPI processor driver will load properly even if CPUIDLE is unavailable. However, it may be possible to cleanup the ACPI processor driver further and eliminate some dead code paths. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit e0322e2b58dd1b12ec669bf84693efe0dc2414a8 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:26:06 2007 -0400 CPUIDLE: remove cpuidle_get_bm_activity() Remove cpuidle_get_bm_activity() and updates governors accordingly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 18a6e770d5c82ba26653e53d240caa617e09e9ab Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:58 2007 -0400 CPUIDLE: max_cstate fix Currently max_cstate is limited to 0, resulting in no idle processor power management on ACPI platforms. This patch restores the value to the array size. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 1fdc0887286179b40ce24bcdbde663172e205ef0 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:40 2007 -0400 CPUIDLE: handle BM detection inside the ACPI Processor driver Update the ACPI processor driver to detect BM activity and limit state entry depth internally, rather than exposing such requirements to CPUIDLE. As a result, CPUIDLE can drop this ACPI-specific interface and become more platform independent. BM activity is now handled much more aggressively than it was in the original implementation, so some testing coverage may be needed to verify that this doesn't introduce any DMA buffer under-run issues. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ef38840db666f48e3cdd2b769da676c57228dd9 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:14 2007 -0400 CPUIDLE: menu governor updates Tweak the menu governor to more effectively handle non-timer break events. Non-timer break events are detected by comparing the actual sleep time to the expected sleep time. In future revisions, it may be more reliable to use the timer data structures directly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit bb4d74fca63fa96cf3ace644b15ae0f12b7df5a1 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:24:40 2007 -0400 CPUIDLE: fix 'current_governor' sysfs entry Allow the "current_governor" sysfs entry to properly handle input terminated with '\n'. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df3c71559bb69b125f1a48971bf0d17f78bbdf47 Author: Len Brown <len.brown@intel.com> Date: Sun Aug 12 02:00:45 2007 -0400 cpuidle: fix IA64 build (again) Signed-off-by: Len Brown <len.brown@intel.com> commit a02064579e3f9530fd31baae16b1fc46b5a7bca8 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:27 2007 -0400 cpuidle: Remove support for runtime changing of max_cstate Remove support for runtime changeability of max_cstate. Drivers can use use latency APIs. max_cstate can still be used as a boot time option and dmi override. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0912a44b13adf22f5e3f607d263aed23b4910d7e Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:16 2007 -0400 cpuidle: Remove ACPI cstate_limit calls from ipw2100 ipw2100 already has code to use accetable_latency interfaces to limit the C-state. Remove the calls to acpi_set_cstate_limit and acpi_get_cstate_limit as they are redundant. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit c649a76e76be6bff1fd770d0a775798813a3f6e0 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:35:39 2007 -0400 cpuidle: compile fix for pause and resume functions Fix the compilation failure when cpuidle is not compiled in. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Adam Belay <adam.belay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2305a5920fb8ee6ccec1c62ade05aa8351091d71 Author: Adam Belay <abelay@novell.com> Date: Thu Jul 19 00:49:00 2007 -0400 cpuidle: re-write Some portions have been rewritten to make the code cleaner and lighter weight. The following is a list of changes: 1.) the state name is now included in the sysfs interface 2.) detection, hotplug, and available state modifications are handled by CPUIDLE drivers directly 3.) the CPUIDLE idle handler is only ever installed when at least one cpuidle_device is enabled and ready 4.) the menu governor BM code no longer overflows 5.) the sysfs attributes are now printed as unsigned integers, avoiding negative values 6.) a variety of other small cleanups Also, Idle drivers are no longer swappable during runtime through the CPUIDLE sysfs inteface. On i386 and x86_64 most idle handlers (e.g. poll, mwait, halt, etc.) don't benefit from an infrastructure that supports multiple states, so I think using a more general case idle handler selection mechanism would be cleaner. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df25b6b56955714e6e24b574d88d1fd11f0c3ee5 Author: Len Brown <len.brown@intel.com> Date: Tue Jul 24 17:08:21 2007 -0400 cpuidle: fix IA64 buid Signed-off-by: Len Brown <len.brown@intel.com> commit fd6ada4c14488755ff7068860078c437431fbccd Author: Adrian Bunk <bunk@stusta.de> Date: Mon Jul 9 11:33:13 2007 -0700 cpuidle: static make cpuidle_replace_governor() static Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c1d4a2cebcadf2429c0c72e1d29aa2a9684c32e0 Author: Adrian Bunk <bunk@stusta.de> Date: Tue Jul 3 00:54:40 2007 -0400 cpuidle: static This patch makes the needlessly global struct menu_governor static. Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit dbf8780c6e8d572c2c273da97ed1cca7608fd999 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:49:14 2007 -0400 export symbol tick_nohz_get_sleep_length ERROR: "tick_nohz_get_sleep_length" [drivers/cpuidle/governors/menu.ko] undefined! ERROR: "tick_nohz_get_idle_jiffies" [drivers/cpuidle/governors/menu.ko] undefined! And please be sure to get your changes to core kernel suitably reviewed. Cc: Adam Belay <abelay@novell.com> Cc: Venki Pallipadi <venkatesh.pallipadi@intel.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: john stultz <johnstul@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 29f0e248e7017be15f99febf9143a2cef00b2961 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:43:04 2007 -0400 tick.h needs hrtimer.h It uses hrtimers. Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit e40cede7d63a029e92712a3fe02faee60cc38fb4 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:40:34 2007 -0400 cpuidle: first round of documentation updates Documentation changes based on Pavel's feedback. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 83b42be2efece386976507555c29e7773a0dfcd1 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:39:25 2007 -0400 cpuidle: add rating to the governors and pick the one with highest rating by default Introduce a governor rating scheme to pick the right governor by default. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit d2a74b8c5e8f22def4709330d4bfc4a29209b71c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:38:08 2007 -0400 cpuidle: make cpuidle sysfs driver governor switch off by default Make default cpuidle sysfs to show current_governor and current_driver in read-only mode. More elaborate available_governors and available_drivers with writeable current_governor and current_driver interface only appear with "cpuidle_sysfs_switch" boot parameter. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1f60a0e80bf83cf6b55c8845bbe5596ed8f6307b Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:37:00 2007 -0400 cpuidle: menu governor: change the early break condition Change the C-state early break out algorithm in menu governor. We only look at early breakouts that result in wakeups shorter than idle state's target_residency. If such a breakout is frequent enough, eliminate the particular idle state upto a timeout period. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 45a42095cf64b003b4a69be3ce7f434f97d7af51 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:35:38 2007 -0400 cpuidle: fix uninitialized variable in sysfs routine Fix the uninitialized usage of ret. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 80dca7cdba3e6ee13eae277660873ab9584eb3be Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:34:16 2007 -0400 cpuidle: reenable /proc/acpi//power interface for the time being Keep /proc/acpi/processor/CPU*/power around for a while as powertop depends on it. It will be marked deprecated and removed in future. powertop can use cpuidle interfaces instead. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 589c37c2646c5e3813a51255a5ee1159cb4c33fc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:32:37 2007 -0400 cpuidle: menu governor and hrtimer compile fix Compile fix for menu governor. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ba80bd9ab3ed304cb4f19b722e4cc6740588b5e Author: Len Brown <len.brown@intel.com> Date: Thu May 31 22:51:43 2007 -0400 cpuidle: build fix - cpuidle vs ipw2100 module ERROR: "acpi_set_cstate_limit" [drivers/net/wireless/ipw2100.ko] undefined! Signed-off-by: Len Brown <len.brown@intel.com> commit d7d8fa7f96a7f7682be7c6cc0cc53fa7a18c3b58 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:07 2007 -0400 cpuidle: add the 'menu' governor Here is my first take at implementing an idle PM governor that takes full advantage of NO_HZ. I call it the 'menu' governor because it considers the full list of idle states before each entry. I've kept the implementation fairly simple. It attempts to guess the next residency time and then chooses a state that would meet at least the break-even point between power savings and entry cost. To this end, it selects the deepest idle state that satisfies the following constraints: 1. If the idle time elapsed since bus master activity was detected is below a threshold (currently 20 ms), then limit the selection to C2-type or above. 2. Do not choose a state with a break-even residency that exceeds the expected time remaining until the next timer interrupt. 3. Do not choose a state with a break-even residency that exceeds the elapsed time between the last pair of break events, excluding timer interrupts. This governor has an advantage over "ladder" governor because it proactively checks how much time remains until the next timer interrupt using the tick infrastructure. Also, it handles device interrupt activity more intelligently by not including timer interrupts in break event calculations. Finally, it doesn't make policy decisions using the number of state entries, which can have variable residency times (NO_HZ makes these potentially very large), and instead only considers sleep time deltas. The menu governor can be selected during runtime using the cpuidle sysfs interface like so: "echo "menu" > /sys/devices/system/cpu/cpuidle/current_governor" Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit a4bec7e65aa3b7488b879d971651cc99a6c410fe Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:03 2007 -0400 cpuidle: export time until next timer interrupt using NO_HZ Expose information about the time remaining until the next timer interrupt expires by utilizing the dynticks infrastructure. Also modify the main idle loop to allow dynticks to handle non-interrupt break events (e.g. DMA). Finally, expose sleep ticks information to external code. Thomas Gleixner is responsible for much of the code in this patch. However, I've made some additional changes, so I'm probably responsible if there are any bugs or oversights :) Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2929d8996fbc77f41a5ff86bb67cdde3ca7d2d72 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:46:58 2007 -0400 cpuidle: governor API changes This patch prepares cpuidle for the menu governor. It adds an optional stage after idle state entry to give the governor an opportunity to check why the state was exited. Also it makes sure the idle loop returns after each state entry, allowing the appropriate dynticks code to run. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 3a7fd42f9825c3b03e364ca59baa751bb350775f Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Apr 26 00:03:59 2007 -0700 cpuidle: hang fix Prevent hang on x86-64, when ACPI processor driver is added as a module on a system that does not support C-states. x86-64 expects all idle handlers to enable interrupts before returning from idle handler. This is due to enter_idle(), exit_idle() races. Make cpuidle_idle_call() confirm to this when there is no pm_idle_old. Also, cpuidle look at the return values of attch_driver() and set current_driver to NULL if attach fails on all CPUs. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4893339a142afbd5b7c01ffadfd53d14746e858e Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:09 2007 +0800 cpuidle: add support for max_cstate limit With CPUIDLE framework, the max_cstate (to limit max cpu c-state) parameter is ingored. Some systems require it to ignore C2/C3 and some drivers like ipw require it too. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 43bbbbe1cb998cbd2df656f55bb3bfe30f30e7d1 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:13 2007 +0800 cpuidle: add cpuidle_fore_redetect_devices API add cpuidle_force_redetect_devices API, which forces all CPU redetect idle states. Next patch will use it. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit d1edadd608f24836def5ec483d2edccfb37b1d19 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:01 2007 +0800 cpuidle: fix sysfs related issue Fix the cpuidle sysfs issue. a. make kobject dynamicaly allocated b. fixed sysfs init issue to avoid suspend/resume issue Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 7169a5cc0d67b263978859672e86c13c23a5570d Author: Randy Dunlap <randy.dunlap@oracle.com> Date: Wed Mar 28 22:52:53 2007 -0400 cpuidle: 1-bit field must be unsigned A 1-bit bitfield has no room for a sign bit. drivers/cpuidle/governors/ladder.c:54:16: error: dubious bitfield without explicit `signed' or `unsigned' Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4658620158dc2fbd9e4bcb213c5b6fb5d05ba7d4 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 28 22:52:41 2007 -0400 cpuidle: fix boot hang Patch for cpuidle boot hang reported by Larry Finger here. http://www.ussg.iu.edu/hypermail/linux/kernel/0703.2/2025.html Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Larry Finger <larry.finger@lwfinger.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c17e168aa6e5fe3851baaae8df2fbc1cf11443a9 Author: Len Brown <len.brown@intel.com> Date: Wed Mar 7 04:37:53 2007 -0500 cpuidle: ladder does not depend on ACPI build fix for CONFIG_ACPI=n In file included from drivers/cpuidle/governors/ladder.c:21: include/acpi/processor.h:88: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:106: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:168: error: expected specifier-qualifier-list before ‘acpi_handle’ Signed-off-by: Len Brown <len.brown@intel.com> commit 8c91d958246bde68db0c3f0c57b535962ce861cb Author: Adrian Bunk <bunk@stusta.de> Date: Tue Mar 6 02:29:40 2007 -0800 cpuidle: make code static This patch makes the following needlessly global code static: - driver.c: __cpuidle_find_driver() - governor.c: __cpuidle_find_governor() - ladder.c: struct ladder_governor Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0c39dc3187094c72c33ab65a64d2017b21f372d2 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 7 02:38:22 2007 -0500 cpu_idle: fix build break This patch fixes a build breakage with !CONFIG_HOTPLUG_CPU and CONFIG_CPU_IDLE. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 8112e3b115659b07df340ef170515799c0105f82 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Mar 6 02:29:39 2007 -0800 cpuidle: build fix for !CPU_IDLE Fix the compile issues when CPU_IDLE is not configured. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1eb4431e9599cd25e0d9872f3c2c8986821839dd Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:57 2007 -0800 cpuidle take2: Basic documentation for cpuidle Documentation for cpuidle infrastructure Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit ef5f15a8b79123a047285ec2e3899108661df779 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:03 2007 -0800 cpuidle take2: Hookup ACPI C-states driver with cpuidle Hookup ACPI C-states onto generic cpuidle infrastructure. drivers/acpi/procesor_idle.c is now a ACPI C-states driver that registers as a driver in cpuidle infrastructure and the policy part is removed from drivers/acpi/processor_idle.c. We use governor in cpuidle instead. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 987196fa82d4db52c407e8c9d5dec884ba602183 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:52:57 2007 -0800 cpuidle take2: Core cpuidle infrastructure Announcing 'cpuidle', a new CPU power management infrastructure to manage idle CPUs in a clean and efficient manner. cpuidle separates out the drivers that can provide support for multiple types of idle states and policy governors that decide on what idle state to use at run time. A cpuidle driver can support multiple idle states based on parameters like varying power consumption, wakeup latency, etc (ACPI C-states for example). A cpuidle governor can be usage model specific (laptop, server, laptop on battery etc). Main advantage of the infrastructure being, it allows independent development of drivers and governors and allows for better CPU power management. A huge thanks to Adam Belay and Shaohua Li who were part of this mini-project since its beginning and are greatly responsible for this patchset. This patch: Core cpuidle infrastructure. Introduces a new abstraction layer for cpuidle: * which manages drivers that can support multiple idles states. Drivers can be generic or particular to specific hardware/platform * allows pluging in multiple policy governors that can take idle state policy decision * The core also has a set of sysfs interfaces with which administrato can know about supported drivers and governors and switch them at run time. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> Signed-off-by: Len Brown <len.brown@intel.com>
2007-10-03 16:58:00 -06:00
{
sched: idle: Select idle state before stopping the tick In order to address the issue with short idle duration predictions by the idle governor after the scheduler tick has been stopped, reorder the code in cpuidle_idle_call() so that the governor idle state selection runs before tick_nohz_idle_go_idle() and use the "nohz" hint returned by cpuidle_select() to decide whether or not to stop the tick. This isn't straightforward, because menu_select() invokes tick_nohz_get_sleep_length() to get the time to the next timer event and the number returned by the latter comes from __tick_nohz_idle_stop_tick(). Fortunately, however, it is possible to compute that number without actually stopping the tick and with the help of the existing code. Namely, tick_nohz_get_sleep_length() can be made call tick_nohz_next_event(), introduced earlier, to get the time to the next non-highres timer event. If that happens, tick_nohz_next_event() need not be called by __tick_nohz_idle_stop_tick() again. If it turns out that the scheduler tick cannot be stopped going forward or the next timer event is too close for the tick to be stopped, tick_nohz_get_sleep_length() can simply return the time to the next event currently programmed into the corresponding clock event device. In addition to knowing the return value of tick_nohz_next_event(), however, tick_nohz_get_sleep_length() needs to know the time to the next highres timer event, but with the scheduler tick timer excluded, which can be computed with the help of hrtimer_get_next_event(). That minimum of that number and the tick_nohz_next_event() return value is the total time to the next timer event with the assumption that the tick will be stopped. It can be returned to the idle governor which can use it for predicting idle duration (under the assumption that the tick will be stopped) and deciding whether or not it makes sense to stop the tick before putting the CPU into the selected idle state. With the above, the sleep_length field in struct tick_sched is not necessary any more, so drop it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=199227 Reported-by: Doug Smythies <dsmythies@telus.net> Reported-by: Thomas Ilsche <thomas.ilsche@tu-dresden.de> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
2018-04-03 15:17:11 -06:00
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
sched: idle: Select idle state before stopping the tick In order to address the issue with short idle duration predictions by the idle governor after the scheduler tick has been stopped, reorder the code in cpuidle_idle_call() so that the governor idle state selection runs before tick_nohz_idle_go_idle() and use the "nohz" hint returned by cpuidle_select() to decide whether or not to stop the tick. This isn't straightforward, because menu_select() invokes tick_nohz_get_sleep_length() to get the time to the next timer event and the number returned by the latter comes from __tick_nohz_idle_stop_tick(). Fortunately, however, it is possible to compute that number without actually stopping the tick and with the help of the existing code. Namely, tick_nohz_get_sleep_length() can be made call tick_nohz_next_event(), introduced earlier, to get the time to the next non-highres timer event. If that happens, tick_nohz_next_event() need not be called by __tick_nohz_idle_stop_tick() again. If it turns out that the scheduler tick cannot be stopped going forward or the next timer event is too close for the tick to be stopped, tick_nohz_get_sleep_length() can simply return the time to the next event currently programmed into the corresponding clock event device. In addition to knowing the return value of tick_nohz_next_event(), however, tick_nohz_get_sleep_length() needs to know the time to the next highres timer event, but with the scheduler tick timer excluded, which can be computed with the help of hrtimer_get_next_event(). That minimum of that number and the tick_nohz_next_event() return value is the total time to the next timer event with the assumption that the tick will be stopped. It can be returned to the idle governor which can use it for predicting idle duration (under the assumption that the tick will be stopped) and deciding whether or not it makes sense to stop the tick before putting the CPU into the selected idle state. With the above, the sleep_length field in struct tick_sched is not necessary any more, so drop it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=199227 Reported-by: Doug Smythies <dsmythies@telus.net> Reported-by: Thomas Ilsche <thomas.ilsche@tu-dresden.de> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
2018-04-03 15:17:11 -06:00
int cpu = smp_processor_id();
/*
* The idle entry time is expected to be a sufficient approximation of
* the current time at this point.
*/
ktime_t now = ts->idle_entrytime;
ktime_t next_event;
WARN_ON_ONCE(!ts->inidle);
*delta_next = ktime_sub(dev->next_event, now);
sched: idle: Select idle state before stopping the tick In order to address the issue with short idle duration predictions by the idle governor after the scheduler tick has been stopped, reorder the code in cpuidle_idle_call() so that the governor idle state selection runs before tick_nohz_idle_go_idle() and use the "nohz" hint returned by cpuidle_select() to decide whether or not to stop the tick. This isn't straightforward, because menu_select() invokes tick_nohz_get_sleep_length() to get the time to the next timer event and the number returned by the latter comes from __tick_nohz_idle_stop_tick(). Fortunately, however, it is possible to compute that number without actually stopping the tick and with the help of the existing code. Namely, tick_nohz_get_sleep_length() can be made call tick_nohz_next_event(), introduced earlier, to get the time to the next non-highres timer event. If that happens, tick_nohz_next_event() need not be called by __tick_nohz_idle_stop_tick() again. If it turns out that the scheduler tick cannot be stopped going forward or the next timer event is too close for the tick to be stopped, tick_nohz_get_sleep_length() can simply return the time to the next event currently programmed into the corresponding clock event device. In addition to knowing the return value of tick_nohz_next_event(), however, tick_nohz_get_sleep_length() needs to know the time to the next highres timer event, but with the scheduler tick timer excluded, which can be computed with the help of hrtimer_get_next_event(). That minimum of that number and the tick_nohz_next_event() return value is the total time to the next timer event with the assumption that the tick will be stopped. It can be returned to the idle governor which can use it for predicting idle duration (under the assumption that the tick will be stopped) and deciding whether or not it makes sense to stop the tick before putting the CPU into the selected idle state. With the above, the sleep_length field in struct tick_sched is not necessary any more, so drop it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=199227 Reported-by: Doug Smythies <dsmythies@telus.net> Reported-by: Thomas Ilsche <thomas.ilsche@tu-dresden.de> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
2018-04-03 15:17:11 -06:00
if (!can_stop_idle_tick(cpu, ts))
return *delta_next;
sched: idle: Select idle state before stopping the tick In order to address the issue with short idle duration predictions by the idle governor after the scheduler tick has been stopped, reorder the code in cpuidle_idle_call() so that the governor idle state selection runs before tick_nohz_idle_go_idle() and use the "nohz" hint returned by cpuidle_select() to decide whether or not to stop the tick. This isn't straightforward, because menu_select() invokes tick_nohz_get_sleep_length() to get the time to the next timer event and the number returned by the latter comes from __tick_nohz_idle_stop_tick(). Fortunately, however, it is possible to compute that number without actually stopping the tick and with the help of the existing code. Namely, tick_nohz_get_sleep_length() can be made call tick_nohz_next_event(), introduced earlier, to get the time to the next non-highres timer event. If that happens, tick_nohz_next_event() need not be called by __tick_nohz_idle_stop_tick() again. If it turns out that the scheduler tick cannot be stopped going forward or the next timer event is too close for the tick to be stopped, tick_nohz_get_sleep_length() can simply return the time to the next event currently programmed into the corresponding clock event device. In addition to knowing the return value of tick_nohz_next_event(), however, tick_nohz_get_sleep_length() needs to know the time to the next highres timer event, but with the scheduler tick timer excluded, which can be computed with the help of hrtimer_get_next_event(). That minimum of that number and the tick_nohz_next_event() return value is the total time to the next timer event with the assumption that the tick will be stopped. It can be returned to the idle governor which can use it for predicting idle duration (under the assumption that the tick will be stopped) and deciding whether or not it makes sense to stop the tick before putting the CPU into the selected idle state. With the above, the sleep_length field in struct tick_sched is not necessary any more, so drop it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=199227 Reported-by: Doug Smythies <dsmythies@telus.net> Reported-by: Thomas Ilsche <thomas.ilsche@tu-dresden.de> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
2018-04-03 15:17:11 -06:00
next_event = tick_nohz_next_event(ts, cpu);
if (!next_event)
return *delta_next;
sched: idle: Select idle state before stopping the tick In order to address the issue with short idle duration predictions by the idle governor after the scheduler tick has been stopped, reorder the code in cpuidle_idle_call() so that the governor idle state selection runs before tick_nohz_idle_go_idle() and use the "nohz" hint returned by cpuidle_select() to decide whether or not to stop the tick. This isn't straightforward, because menu_select() invokes tick_nohz_get_sleep_length() to get the time to the next timer event and the number returned by the latter comes from __tick_nohz_idle_stop_tick(). Fortunately, however, it is possible to compute that number without actually stopping the tick and with the help of the existing code. Namely, tick_nohz_get_sleep_length() can be made call tick_nohz_next_event(), introduced earlier, to get the time to the next non-highres timer event. If that happens, tick_nohz_next_event() need not be called by __tick_nohz_idle_stop_tick() again. If it turns out that the scheduler tick cannot be stopped going forward or the next timer event is too close for the tick to be stopped, tick_nohz_get_sleep_length() can simply return the time to the next event currently programmed into the corresponding clock event device. In addition to knowing the return value of tick_nohz_next_event(), however, tick_nohz_get_sleep_length() needs to know the time to the next highres timer event, but with the scheduler tick timer excluded, which can be computed with the help of hrtimer_get_next_event(). That minimum of that number and the tick_nohz_next_event() return value is the total time to the next timer event with the assumption that the tick will be stopped. It can be returned to the idle governor which can use it for predicting idle duration (under the assumption that the tick will be stopped) and deciding whether or not it makes sense to stop the tick before putting the CPU into the selected idle state. With the above, the sleep_length field in struct tick_sched is not necessary any more, so drop it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=199227 Reported-by: Doug Smythies <dsmythies@telus.net> Reported-by: Thomas Ilsche <thomas.ilsche@tu-dresden.de> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
2018-04-03 15:17:11 -06:00
/*
* If the next highres timer to expire is earlier than next_event, the
* idle governor needs to know that.
*/
next_event = min_t(u64, next_event,
hrtimer_next_event_without(&ts->sched_timer));
cpuidle: consolidate 2.6.22 cpuidle branch into one patch commit e5a16b1f9eec0af7cfa0830304b41c1c0833cf9f Author: Len Brown <len.brown@intel.com> Date: Tue Oct 2 23:44:44 2007 -0400 cpuidle: shrink diff processor_idle.c | 440 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 429 insertions(+), 11 deletions(-) Signed-off-by: Len Brown <len.brown@intel.com> commit dfbb9d5aedfb18848a3e0d6f6e3e4969febb209c Author: Len Brown <len.brown@intel.com> Date: Wed Sep 26 02:17:55 2007 -0400 cpuidle: reduce diff size Reduces the cpuidle processor_idle.c diff vs 2.6.22 from this processor_idle.c | 2006 ++++++++++++++++++++++++++----------------- 1 file changed, 1219 insertions(+), 787 deletions(-) to this: processor_idle.c | 502 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 458 insertions(+), 44 deletions(-) ...for the purpose of making the cpuilde patch less invasive and easier to review. no functional changes. build tested only. Signed-off-by: Len Brown <len.brown@intel.com> commit 889172fc915f5a7fe20f35b133cbd205ce69bf6c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:40:05 2007 -0700 cpuidle: Retain old ACPI policy for !CONFIG_CPU_IDLE Retain the old policy in processor_idle, so that when CPU_IDLE is not configured, old C-state policy will still be used. This provides a clean gradual migration path from old ACPI policy to new cpuidle based policy. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 9544a8181edc7ecc33b3bfd69271571f98ed08bc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:39:17 2007 -0700 cpuidle: Configure governors by default Quoting Len "Do not give an option to users to shoot themselves in the foot". Remove the configurability of ladder and menu governors as they are needed for default policy of cpuidle. That way users will not be able to have cpuidle without any policy loosing all C-state power savings. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 8975059a2c1e56cfe83d1bcf031bcf4cb39be743 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:27:07 2007 -0400 CPUIDLE: load ACPI properly when CPUIDLE is disabled Change the registration return codes for when CPUIDLE support is not compiled into the kernel. As a result, the ACPI processor driver will load properly even if CPUIDLE is unavailable. However, it may be possible to cleanup the ACPI processor driver further and eliminate some dead code paths. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit e0322e2b58dd1b12ec669bf84693efe0dc2414a8 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:26:06 2007 -0400 CPUIDLE: remove cpuidle_get_bm_activity() Remove cpuidle_get_bm_activity() and updates governors accordingly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 18a6e770d5c82ba26653e53d240caa617e09e9ab Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:58 2007 -0400 CPUIDLE: max_cstate fix Currently max_cstate is limited to 0, resulting in no idle processor power management on ACPI platforms. This patch restores the value to the array size. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 1fdc0887286179b40ce24bcdbde663172e205ef0 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:40 2007 -0400 CPUIDLE: handle BM detection inside the ACPI Processor driver Update the ACPI processor driver to detect BM activity and limit state entry depth internally, rather than exposing such requirements to CPUIDLE. As a result, CPUIDLE can drop this ACPI-specific interface and become more platform independent. BM activity is now handled much more aggressively than it was in the original implementation, so some testing coverage may be needed to verify that this doesn't introduce any DMA buffer under-run issues. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ef38840db666f48e3cdd2b769da676c57228dd9 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:14 2007 -0400 CPUIDLE: menu governor updates Tweak the menu governor to more effectively handle non-timer break events. Non-timer break events are detected by comparing the actual sleep time to the expected sleep time. In future revisions, it may be more reliable to use the timer data structures directly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit bb4d74fca63fa96cf3ace644b15ae0f12b7df5a1 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:24:40 2007 -0400 CPUIDLE: fix 'current_governor' sysfs entry Allow the "current_governor" sysfs entry to properly handle input terminated with '\n'. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df3c71559bb69b125f1a48971bf0d17f78bbdf47 Author: Len Brown <len.brown@intel.com> Date: Sun Aug 12 02:00:45 2007 -0400 cpuidle: fix IA64 build (again) Signed-off-by: Len Brown <len.brown@intel.com> commit a02064579e3f9530fd31baae16b1fc46b5a7bca8 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:27 2007 -0400 cpuidle: Remove support for runtime changing of max_cstate Remove support for runtime changeability of max_cstate. Drivers can use use latency APIs. max_cstate can still be used as a boot time option and dmi override. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0912a44b13adf22f5e3f607d263aed23b4910d7e Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:16 2007 -0400 cpuidle: Remove ACPI cstate_limit calls from ipw2100 ipw2100 already has code to use accetable_latency interfaces to limit the C-state. Remove the calls to acpi_set_cstate_limit and acpi_get_cstate_limit as they are redundant. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit c649a76e76be6bff1fd770d0a775798813a3f6e0 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:35:39 2007 -0400 cpuidle: compile fix for pause and resume functions Fix the compilation failure when cpuidle is not compiled in. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Adam Belay <adam.belay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2305a5920fb8ee6ccec1c62ade05aa8351091d71 Author: Adam Belay <abelay@novell.com> Date: Thu Jul 19 00:49:00 2007 -0400 cpuidle: re-write Some portions have been rewritten to make the code cleaner and lighter weight. The following is a list of changes: 1.) the state name is now included in the sysfs interface 2.) detection, hotplug, and available state modifications are handled by CPUIDLE drivers directly 3.) the CPUIDLE idle handler is only ever installed when at least one cpuidle_device is enabled and ready 4.) the menu governor BM code no longer overflows 5.) the sysfs attributes are now printed as unsigned integers, avoiding negative values 6.) a variety of other small cleanups Also, Idle drivers are no longer swappable during runtime through the CPUIDLE sysfs inteface. On i386 and x86_64 most idle handlers (e.g. poll, mwait, halt, etc.) don't benefit from an infrastructure that supports multiple states, so I think using a more general case idle handler selection mechanism would be cleaner. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df25b6b56955714e6e24b574d88d1fd11f0c3ee5 Author: Len Brown <len.brown@intel.com> Date: Tue Jul 24 17:08:21 2007 -0400 cpuidle: fix IA64 buid Signed-off-by: Len Brown <len.brown@intel.com> commit fd6ada4c14488755ff7068860078c437431fbccd Author: Adrian Bunk <bunk@stusta.de> Date: Mon Jul 9 11:33:13 2007 -0700 cpuidle: static make cpuidle_replace_governor() static Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c1d4a2cebcadf2429c0c72e1d29aa2a9684c32e0 Author: Adrian Bunk <bunk@stusta.de> Date: Tue Jul 3 00:54:40 2007 -0400 cpuidle: static This patch makes the needlessly global struct menu_governor static. Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit dbf8780c6e8d572c2c273da97ed1cca7608fd999 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:49:14 2007 -0400 export symbol tick_nohz_get_sleep_length ERROR: "tick_nohz_get_sleep_length" [drivers/cpuidle/governors/menu.ko] undefined! ERROR: "tick_nohz_get_idle_jiffies" [drivers/cpuidle/governors/menu.ko] undefined! And please be sure to get your changes to core kernel suitably reviewed. Cc: Adam Belay <abelay@novell.com> Cc: Venki Pallipadi <venkatesh.pallipadi@intel.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: john stultz <johnstul@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 29f0e248e7017be15f99febf9143a2cef00b2961 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:43:04 2007 -0400 tick.h needs hrtimer.h It uses hrtimers. Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit e40cede7d63a029e92712a3fe02faee60cc38fb4 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:40:34 2007 -0400 cpuidle: first round of documentation updates Documentation changes based on Pavel's feedback. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 83b42be2efece386976507555c29e7773a0dfcd1 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:39:25 2007 -0400 cpuidle: add rating to the governors and pick the one with highest rating by default Introduce a governor rating scheme to pick the right governor by default. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit d2a74b8c5e8f22def4709330d4bfc4a29209b71c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:38:08 2007 -0400 cpuidle: make cpuidle sysfs driver governor switch off by default Make default cpuidle sysfs to show current_governor and current_driver in read-only mode. More elaborate available_governors and available_drivers with writeable current_governor and current_driver interface only appear with "cpuidle_sysfs_switch" boot parameter. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1f60a0e80bf83cf6b55c8845bbe5596ed8f6307b Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:37:00 2007 -0400 cpuidle: menu governor: change the early break condition Change the C-state early break out algorithm in menu governor. We only look at early breakouts that result in wakeups shorter than idle state's target_residency. If such a breakout is frequent enough, eliminate the particular idle state upto a timeout period. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 45a42095cf64b003b4a69be3ce7f434f97d7af51 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:35:38 2007 -0400 cpuidle: fix uninitialized variable in sysfs routine Fix the uninitialized usage of ret. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 80dca7cdba3e6ee13eae277660873ab9584eb3be Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:34:16 2007 -0400 cpuidle: reenable /proc/acpi//power interface for the time being Keep /proc/acpi/processor/CPU*/power around for a while as powertop depends on it. It will be marked deprecated and removed in future. powertop can use cpuidle interfaces instead. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 589c37c2646c5e3813a51255a5ee1159cb4c33fc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:32:37 2007 -0400 cpuidle: menu governor and hrtimer compile fix Compile fix for menu governor. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ba80bd9ab3ed304cb4f19b722e4cc6740588b5e Author: Len Brown <len.brown@intel.com> Date: Thu May 31 22:51:43 2007 -0400 cpuidle: build fix - cpuidle vs ipw2100 module ERROR: "acpi_set_cstate_limit" [drivers/net/wireless/ipw2100.ko] undefined! Signed-off-by: Len Brown <len.brown@intel.com> commit d7d8fa7f96a7f7682be7c6cc0cc53fa7a18c3b58 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:07 2007 -0400 cpuidle: add the 'menu' governor Here is my first take at implementing an idle PM governor that takes full advantage of NO_HZ. I call it the 'menu' governor because it considers the full list of idle states before each entry. I've kept the implementation fairly simple. It attempts to guess the next residency time and then chooses a state that would meet at least the break-even point between power savings and entry cost. To this end, it selects the deepest idle state that satisfies the following constraints: 1. If the idle time elapsed since bus master activity was detected is below a threshold (currently 20 ms), then limit the selection to C2-type or above. 2. Do not choose a state with a break-even residency that exceeds the expected time remaining until the next timer interrupt. 3. Do not choose a state with a break-even residency that exceeds the elapsed time between the last pair of break events, excluding timer interrupts. This governor has an advantage over "ladder" governor because it proactively checks how much time remains until the next timer interrupt using the tick infrastructure. Also, it handles device interrupt activity more intelligently by not including timer interrupts in break event calculations. Finally, it doesn't make policy decisions using the number of state entries, which can have variable residency times (NO_HZ makes these potentially very large), and instead only considers sleep time deltas. The menu governor can be selected during runtime using the cpuidle sysfs interface like so: "echo "menu" > /sys/devices/system/cpu/cpuidle/current_governor" Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit a4bec7e65aa3b7488b879d971651cc99a6c410fe Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:03 2007 -0400 cpuidle: export time until next timer interrupt using NO_HZ Expose information about the time remaining until the next timer interrupt expires by utilizing the dynticks infrastructure. Also modify the main idle loop to allow dynticks to handle non-interrupt break events (e.g. DMA). Finally, expose sleep ticks information to external code. Thomas Gleixner is responsible for much of the code in this patch. However, I've made some additional changes, so I'm probably responsible if there are any bugs or oversights :) Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2929d8996fbc77f41a5ff86bb67cdde3ca7d2d72 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:46:58 2007 -0400 cpuidle: governor API changes This patch prepares cpuidle for the menu governor. It adds an optional stage after idle state entry to give the governor an opportunity to check why the state was exited. Also it makes sure the idle loop returns after each state entry, allowing the appropriate dynticks code to run. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 3a7fd42f9825c3b03e364ca59baa751bb350775f Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Apr 26 00:03:59 2007 -0700 cpuidle: hang fix Prevent hang on x86-64, when ACPI processor driver is added as a module on a system that does not support C-states. x86-64 expects all idle handlers to enable interrupts before returning from idle handler. This is due to enter_idle(), exit_idle() races. Make cpuidle_idle_call() confirm to this when there is no pm_idle_old. Also, cpuidle look at the return values of attch_driver() and set current_driver to NULL if attach fails on all CPUs. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4893339a142afbd5b7c01ffadfd53d14746e858e Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:09 2007 +0800 cpuidle: add support for max_cstate limit With CPUIDLE framework, the max_cstate (to limit max cpu c-state) parameter is ingored. Some systems require it to ignore C2/C3 and some drivers like ipw require it too. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 43bbbbe1cb998cbd2df656f55bb3bfe30f30e7d1 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:13 2007 +0800 cpuidle: add cpuidle_fore_redetect_devices API add cpuidle_force_redetect_devices API, which forces all CPU redetect idle states. Next patch will use it. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit d1edadd608f24836def5ec483d2edccfb37b1d19 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:01 2007 +0800 cpuidle: fix sysfs related issue Fix the cpuidle sysfs issue. a. make kobject dynamicaly allocated b. fixed sysfs init issue to avoid suspend/resume issue Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 7169a5cc0d67b263978859672e86c13c23a5570d Author: Randy Dunlap <randy.dunlap@oracle.com> Date: Wed Mar 28 22:52:53 2007 -0400 cpuidle: 1-bit field must be unsigned A 1-bit bitfield has no room for a sign bit. drivers/cpuidle/governors/ladder.c:54:16: error: dubious bitfield without explicit `signed' or `unsigned' Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4658620158dc2fbd9e4bcb213c5b6fb5d05ba7d4 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 28 22:52:41 2007 -0400 cpuidle: fix boot hang Patch for cpuidle boot hang reported by Larry Finger here. http://www.ussg.iu.edu/hypermail/linux/kernel/0703.2/2025.html Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Larry Finger <larry.finger@lwfinger.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c17e168aa6e5fe3851baaae8df2fbc1cf11443a9 Author: Len Brown <len.brown@intel.com> Date: Wed Mar 7 04:37:53 2007 -0500 cpuidle: ladder does not depend on ACPI build fix for CONFIG_ACPI=n In file included from drivers/cpuidle/governors/ladder.c:21: include/acpi/processor.h:88: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:106: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:168: error: expected specifier-qualifier-list before ‘acpi_handle’ Signed-off-by: Len Brown <len.brown@intel.com> commit 8c91d958246bde68db0c3f0c57b535962ce861cb Author: Adrian Bunk <bunk@stusta.de> Date: Tue Mar 6 02:29:40 2007 -0800 cpuidle: make code static This patch makes the following needlessly global code static: - driver.c: __cpuidle_find_driver() - governor.c: __cpuidle_find_governor() - ladder.c: struct ladder_governor Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0c39dc3187094c72c33ab65a64d2017b21f372d2 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 7 02:38:22 2007 -0500 cpu_idle: fix build break This patch fixes a build breakage with !CONFIG_HOTPLUG_CPU and CONFIG_CPU_IDLE. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 8112e3b115659b07df340ef170515799c0105f82 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Mar 6 02:29:39 2007 -0800 cpuidle: build fix for !CPU_IDLE Fix the compile issues when CPU_IDLE is not configured. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1eb4431e9599cd25e0d9872f3c2c8986821839dd Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:57 2007 -0800 cpuidle take2: Basic documentation for cpuidle Documentation for cpuidle infrastructure Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit ef5f15a8b79123a047285ec2e3899108661df779 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:03 2007 -0800 cpuidle take2: Hookup ACPI C-states driver with cpuidle Hookup ACPI C-states onto generic cpuidle infrastructure. drivers/acpi/procesor_idle.c is now a ACPI C-states driver that registers as a driver in cpuidle infrastructure and the policy part is removed from drivers/acpi/processor_idle.c. We use governor in cpuidle instead. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 987196fa82d4db52c407e8c9d5dec884ba602183 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:52:57 2007 -0800 cpuidle take2: Core cpuidle infrastructure Announcing 'cpuidle', a new CPU power management infrastructure to manage idle CPUs in a clean and efficient manner. cpuidle separates out the drivers that can provide support for multiple types of idle states and policy governors that decide on what idle state to use at run time. A cpuidle driver can support multiple idle states based on parameters like varying power consumption, wakeup latency, etc (ACPI C-states for example). A cpuidle governor can be usage model specific (laptop, server, laptop on battery etc). Main advantage of the infrastructure being, it allows independent development of drivers and governors and allows for better CPU power management. A huge thanks to Adam Belay and Shaohua Li who were part of this mini-project since its beginning and are greatly responsible for this patchset. This patch: Core cpuidle infrastructure. Introduces a new abstraction layer for cpuidle: * which manages drivers that can support multiple idles states. Drivers can be generic or particular to specific hardware/platform * allows pluging in multiple policy governors that can take idle state policy decision * The core also has a set of sysfs interfaces with which administrato can know about supported drivers and governors and switch them at run time. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> Signed-off-by: Len Brown <len.brown@intel.com>
2007-10-03 16:58:00 -06:00
sched: idle: Select idle state before stopping the tick In order to address the issue with short idle duration predictions by the idle governor after the scheduler tick has been stopped, reorder the code in cpuidle_idle_call() so that the governor idle state selection runs before tick_nohz_idle_go_idle() and use the "nohz" hint returned by cpuidle_select() to decide whether or not to stop the tick. This isn't straightforward, because menu_select() invokes tick_nohz_get_sleep_length() to get the time to the next timer event and the number returned by the latter comes from __tick_nohz_idle_stop_tick(). Fortunately, however, it is possible to compute that number without actually stopping the tick and with the help of the existing code. Namely, tick_nohz_get_sleep_length() can be made call tick_nohz_next_event(), introduced earlier, to get the time to the next non-highres timer event. If that happens, tick_nohz_next_event() need not be called by __tick_nohz_idle_stop_tick() again. If it turns out that the scheduler tick cannot be stopped going forward or the next timer event is too close for the tick to be stopped, tick_nohz_get_sleep_length() can simply return the time to the next event currently programmed into the corresponding clock event device. In addition to knowing the return value of tick_nohz_next_event(), however, tick_nohz_get_sleep_length() needs to know the time to the next highres timer event, but with the scheduler tick timer excluded, which can be computed with the help of hrtimer_get_next_event(). That minimum of that number and the tick_nohz_next_event() return value is the total time to the next timer event with the assumption that the tick will be stopped. It can be returned to the idle governor which can use it for predicting idle duration (under the assumption that the tick will be stopped) and deciding whether or not it makes sense to stop the tick before putting the CPU into the selected idle state. With the above, the sleep_length field in struct tick_sched is not necessary any more, so drop it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=199227 Reported-by: Doug Smythies <dsmythies@telus.net> Reported-by: Thomas Ilsche <thomas.ilsche@tu-dresden.de> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
2018-04-03 15:17:11 -06:00
return ktime_sub(next_event, now);
cpuidle: consolidate 2.6.22 cpuidle branch into one patch commit e5a16b1f9eec0af7cfa0830304b41c1c0833cf9f Author: Len Brown <len.brown@intel.com> Date: Tue Oct 2 23:44:44 2007 -0400 cpuidle: shrink diff processor_idle.c | 440 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 429 insertions(+), 11 deletions(-) Signed-off-by: Len Brown <len.brown@intel.com> commit dfbb9d5aedfb18848a3e0d6f6e3e4969febb209c Author: Len Brown <len.brown@intel.com> Date: Wed Sep 26 02:17:55 2007 -0400 cpuidle: reduce diff size Reduces the cpuidle processor_idle.c diff vs 2.6.22 from this processor_idle.c | 2006 ++++++++++++++++++++++++++----------------- 1 file changed, 1219 insertions(+), 787 deletions(-) to this: processor_idle.c | 502 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 458 insertions(+), 44 deletions(-) ...for the purpose of making the cpuilde patch less invasive and easier to review. no functional changes. build tested only. Signed-off-by: Len Brown <len.brown@intel.com> commit 889172fc915f5a7fe20f35b133cbd205ce69bf6c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:40:05 2007 -0700 cpuidle: Retain old ACPI policy for !CONFIG_CPU_IDLE Retain the old policy in processor_idle, so that when CPU_IDLE is not configured, old C-state policy will still be used. This provides a clean gradual migration path from old ACPI policy to new cpuidle based policy. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 9544a8181edc7ecc33b3bfd69271571f98ed08bc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:39:17 2007 -0700 cpuidle: Configure governors by default Quoting Len "Do not give an option to users to shoot themselves in the foot". Remove the configurability of ladder and menu governors as they are needed for default policy of cpuidle. That way users will not be able to have cpuidle without any policy loosing all C-state power savings. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 8975059a2c1e56cfe83d1bcf031bcf4cb39be743 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:27:07 2007 -0400 CPUIDLE: load ACPI properly when CPUIDLE is disabled Change the registration return codes for when CPUIDLE support is not compiled into the kernel. As a result, the ACPI processor driver will load properly even if CPUIDLE is unavailable. However, it may be possible to cleanup the ACPI processor driver further and eliminate some dead code paths. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit e0322e2b58dd1b12ec669bf84693efe0dc2414a8 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:26:06 2007 -0400 CPUIDLE: remove cpuidle_get_bm_activity() Remove cpuidle_get_bm_activity() and updates governors accordingly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 18a6e770d5c82ba26653e53d240caa617e09e9ab Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:58 2007 -0400 CPUIDLE: max_cstate fix Currently max_cstate is limited to 0, resulting in no idle processor power management on ACPI platforms. This patch restores the value to the array size. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 1fdc0887286179b40ce24bcdbde663172e205ef0 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:40 2007 -0400 CPUIDLE: handle BM detection inside the ACPI Processor driver Update the ACPI processor driver to detect BM activity and limit state entry depth internally, rather than exposing such requirements to CPUIDLE. As a result, CPUIDLE can drop this ACPI-specific interface and become more platform independent. BM activity is now handled much more aggressively than it was in the original implementation, so some testing coverage may be needed to verify that this doesn't introduce any DMA buffer under-run issues. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ef38840db666f48e3cdd2b769da676c57228dd9 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:14 2007 -0400 CPUIDLE: menu governor updates Tweak the menu governor to more effectively handle non-timer break events. Non-timer break events are detected by comparing the actual sleep time to the expected sleep time. In future revisions, it may be more reliable to use the timer data structures directly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit bb4d74fca63fa96cf3ace644b15ae0f12b7df5a1 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:24:40 2007 -0400 CPUIDLE: fix 'current_governor' sysfs entry Allow the "current_governor" sysfs entry to properly handle input terminated with '\n'. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df3c71559bb69b125f1a48971bf0d17f78bbdf47 Author: Len Brown <len.brown@intel.com> Date: Sun Aug 12 02:00:45 2007 -0400 cpuidle: fix IA64 build (again) Signed-off-by: Len Brown <len.brown@intel.com> commit a02064579e3f9530fd31baae16b1fc46b5a7bca8 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:27 2007 -0400 cpuidle: Remove support for runtime changing of max_cstate Remove support for runtime changeability of max_cstate. Drivers can use use latency APIs. max_cstate can still be used as a boot time option and dmi override. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0912a44b13adf22f5e3f607d263aed23b4910d7e Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:16 2007 -0400 cpuidle: Remove ACPI cstate_limit calls from ipw2100 ipw2100 already has code to use accetable_latency interfaces to limit the C-state. Remove the calls to acpi_set_cstate_limit and acpi_get_cstate_limit as they are redundant. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit c649a76e76be6bff1fd770d0a775798813a3f6e0 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:35:39 2007 -0400 cpuidle: compile fix for pause and resume functions Fix the compilation failure when cpuidle is not compiled in. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Adam Belay <adam.belay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2305a5920fb8ee6ccec1c62ade05aa8351091d71 Author: Adam Belay <abelay@novell.com> Date: Thu Jul 19 00:49:00 2007 -0400 cpuidle: re-write Some portions have been rewritten to make the code cleaner and lighter weight. The following is a list of changes: 1.) the state name is now included in the sysfs interface 2.) detection, hotplug, and available state modifications are handled by CPUIDLE drivers directly 3.) the CPUIDLE idle handler is only ever installed when at least one cpuidle_device is enabled and ready 4.) the menu governor BM code no longer overflows 5.) the sysfs attributes are now printed as unsigned integers, avoiding negative values 6.) a variety of other small cleanups Also, Idle drivers are no longer swappable during runtime through the CPUIDLE sysfs inteface. On i386 and x86_64 most idle handlers (e.g. poll, mwait, halt, etc.) don't benefit from an infrastructure that supports multiple states, so I think using a more general case idle handler selection mechanism would be cleaner. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df25b6b56955714e6e24b574d88d1fd11f0c3ee5 Author: Len Brown <len.brown@intel.com> Date: Tue Jul 24 17:08:21 2007 -0400 cpuidle: fix IA64 buid Signed-off-by: Len Brown <len.brown@intel.com> commit fd6ada4c14488755ff7068860078c437431fbccd Author: Adrian Bunk <bunk@stusta.de> Date: Mon Jul 9 11:33:13 2007 -0700 cpuidle: static make cpuidle_replace_governor() static Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c1d4a2cebcadf2429c0c72e1d29aa2a9684c32e0 Author: Adrian Bunk <bunk@stusta.de> Date: Tue Jul 3 00:54:40 2007 -0400 cpuidle: static This patch makes the needlessly global struct menu_governor static. Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit dbf8780c6e8d572c2c273da97ed1cca7608fd999 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:49:14 2007 -0400 export symbol tick_nohz_get_sleep_length ERROR: "tick_nohz_get_sleep_length" [drivers/cpuidle/governors/menu.ko] undefined! ERROR: "tick_nohz_get_idle_jiffies" [drivers/cpuidle/governors/menu.ko] undefined! And please be sure to get your changes to core kernel suitably reviewed. Cc: Adam Belay <abelay@novell.com> Cc: Venki Pallipadi <venkatesh.pallipadi@intel.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: john stultz <johnstul@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 29f0e248e7017be15f99febf9143a2cef00b2961 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:43:04 2007 -0400 tick.h needs hrtimer.h It uses hrtimers. Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit e40cede7d63a029e92712a3fe02faee60cc38fb4 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:40:34 2007 -0400 cpuidle: first round of documentation updates Documentation changes based on Pavel's feedback. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 83b42be2efece386976507555c29e7773a0dfcd1 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:39:25 2007 -0400 cpuidle: add rating to the governors and pick the one with highest rating by default Introduce a governor rating scheme to pick the right governor by default. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit d2a74b8c5e8f22def4709330d4bfc4a29209b71c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:38:08 2007 -0400 cpuidle: make cpuidle sysfs driver governor switch off by default Make default cpuidle sysfs to show current_governor and current_driver in read-only mode. More elaborate available_governors and available_drivers with writeable current_governor and current_driver interface only appear with "cpuidle_sysfs_switch" boot parameter. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1f60a0e80bf83cf6b55c8845bbe5596ed8f6307b Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:37:00 2007 -0400 cpuidle: menu governor: change the early break condition Change the C-state early break out algorithm in menu governor. We only look at early breakouts that result in wakeups shorter than idle state's target_residency. If such a breakout is frequent enough, eliminate the particular idle state upto a timeout period. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 45a42095cf64b003b4a69be3ce7f434f97d7af51 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:35:38 2007 -0400 cpuidle: fix uninitialized variable in sysfs routine Fix the uninitialized usage of ret. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 80dca7cdba3e6ee13eae277660873ab9584eb3be Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:34:16 2007 -0400 cpuidle: reenable /proc/acpi//power interface for the time being Keep /proc/acpi/processor/CPU*/power around for a while as powertop depends on it. It will be marked deprecated and removed in future. powertop can use cpuidle interfaces instead. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 589c37c2646c5e3813a51255a5ee1159cb4c33fc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:32:37 2007 -0400 cpuidle: menu governor and hrtimer compile fix Compile fix for menu governor. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ba80bd9ab3ed304cb4f19b722e4cc6740588b5e Author: Len Brown <len.brown@intel.com> Date: Thu May 31 22:51:43 2007 -0400 cpuidle: build fix - cpuidle vs ipw2100 module ERROR: "acpi_set_cstate_limit" [drivers/net/wireless/ipw2100.ko] undefined! Signed-off-by: Len Brown <len.brown@intel.com> commit d7d8fa7f96a7f7682be7c6cc0cc53fa7a18c3b58 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:07 2007 -0400 cpuidle: add the 'menu' governor Here is my first take at implementing an idle PM governor that takes full advantage of NO_HZ. I call it the 'menu' governor because it considers the full list of idle states before each entry. I've kept the implementation fairly simple. It attempts to guess the next residency time and then chooses a state that would meet at least the break-even point between power savings and entry cost. To this end, it selects the deepest idle state that satisfies the following constraints: 1. If the idle time elapsed since bus master activity was detected is below a threshold (currently 20 ms), then limit the selection to C2-type or above. 2. Do not choose a state with a break-even residency that exceeds the expected time remaining until the next timer interrupt. 3. Do not choose a state with a break-even residency that exceeds the elapsed time between the last pair of break events, excluding timer interrupts. This governor has an advantage over "ladder" governor because it proactively checks how much time remains until the next timer interrupt using the tick infrastructure. Also, it handles device interrupt activity more intelligently by not including timer interrupts in break event calculations. Finally, it doesn't make policy decisions using the number of state entries, which can have variable residency times (NO_HZ makes these potentially very large), and instead only considers sleep time deltas. The menu governor can be selected during runtime using the cpuidle sysfs interface like so: "echo "menu" > /sys/devices/system/cpu/cpuidle/current_governor" Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit a4bec7e65aa3b7488b879d971651cc99a6c410fe Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:03 2007 -0400 cpuidle: export time until next timer interrupt using NO_HZ Expose information about the time remaining until the next timer interrupt expires by utilizing the dynticks infrastructure. Also modify the main idle loop to allow dynticks to handle non-interrupt break events (e.g. DMA). Finally, expose sleep ticks information to external code. Thomas Gleixner is responsible for much of the code in this patch. However, I've made some additional changes, so I'm probably responsible if there are any bugs or oversights :) Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2929d8996fbc77f41a5ff86bb67cdde3ca7d2d72 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:46:58 2007 -0400 cpuidle: governor API changes This patch prepares cpuidle for the menu governor. It adds an optional stage after idle state entry to give the governor an opportunity to check why the state was exited. Also it makes sure the idle loop returns after each state entry, allowing the appropriate dynticks code to run. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 3a7fd42f9825c3b03e364ca59baa751bb350775f Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Apr 26 00:03:59 2007 -0700 cpuidle: hang fix Prevent hang on x86-64, when ACPI processor driver is added as a module on a system that does not support C-states. x86-64 expects all idle handlers to enable interrupts before returning from idle handler. This is due to enter_idle(), exit_idle() races. Make cpuidle_idle_call() confirm to this when there is no pm_idle_old. Also, cpuidle look at the return values of attch_driver() and set current_driver to NULL if attach fails on all CPUs. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4893339a142afbd5b7c01ffadfd53d14746e858e Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:09 2007 +0800 cpuidle: add support for max_cstate limit With CPUIDLE framework, the max_cstate (to limit max cpu c-state) parameter is ingored. Some systems require it to ignore C2/C3 and some drivers like ipw require it too. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 43bbbbe1cb998cbd2df656f55bb3bfe30f30e7d1 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:13 2007 +0800 cpuidle: add cpuidle_fore_redetect_devices API add cpuidle_force_redetect_devices API, which forces all CPU redetect idle states. Next patch will use it. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit d1edadd608f24836def5ec483d2edccfb37b1d19 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:01 2007 +0800 cpuidle: fix sysfs related issue Fix the cpuidle sysfs issue. a. make kobject dynamicaly allocated b. fixed sysfs init issue to avoid suspend/resume issue Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 7169a5cc0d67b263978859672e86c13c23a5570d Author: Randy Dunlap <randy.dunlap@oracle.com> Date: Wed Mar 28 22:52:53 2007 -0400 cpuidle: 1-bit field must be unsigned A 1-bit bitfield has no room for a sign bit. drivers/cpuidle/governors/ladder.c:54:16: error: dubious bitfield without explicit `signed' or `unsigned' Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4658620158dc2fbd9e4bcb213c5b6fb5d05ba7d4 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 28 22:52:41 2007 -0400 cpuidle: fix boot hang Patch for cpuidle boot hang reported by Larry Finger here. http://www.ussg.iu.edu/hypermail/linux/kernel/0703.2/2025.html Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Larry Finger <larry.finger@lwfinger.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c17e168aa6e5fe3851baaae8df2fbc1cf11443a9 Author: Len Brown <len.brown@intel.com> Date: Wed Mar 7 04:37:53 2007 -0500 cpuidle: ladder does not depend on ACPI build fix for CONFIG_ACPI=n In file included from drivers/cpuidle/governors/ladder.c:21: include/acpi/processor.h:88: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:106: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:168: error: expected specifier-qualifier-list before ‘acpi_handle’ Signed-off-by: Len Brown <len.brown@intel.com> commit 8c91d958246bde68db0c3f0c57b535962ce861cb Author: Adrian Bunk <bunk@stusta.de> Date: Tue Mar 6 02:29:40 2007 -0800 cpuidle: make code static This patch makes the following needlessly global code static: - driver.c: __cpuidle_find_driver() - governor.c: __cpuidle_find_governor() - ladder.c: struct ladder_governor Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0c39dc3187094c72c33ab65a64d2017b21f372d2 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 7 02:38:22 2007 -0500 cpu_idle: fix build break This patch fixes a build breakage with !CONFIG_HOTPLUG_CPU and CONFIG_CPU_IDLE. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 8112e3b115659b07df340ef170515799c0105f82 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Mar 6 02:29:39 2007 -0800 cpuidle: build fix for !CPU_IDLE Fix the compile issues when CPU_IDLE is not configured. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1eb4431e9599cd25e0d9872f3c2c8986821839dd Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:57 2007 -0800 cpuidle take2: Basic documentation for cpuidle Documentation for cpuidle infrastructure Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit ef5f15a8b79123a047285ec2e3899108661df779 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:03 2007 -0800 cpuidle take2: Hookup ACPI C-states driver with cpuidle Hookup ACPI C-states onto generic cpuidle infrastructure. drivers/acpi/procesor_idle.c is now a ACPI C-states driver that registers as a driver in cpuidle infrastructure and the policy part is removed from drivers/acpi/processor_idle.c. We use governor in cpuidle instead. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 987196fa82d4db52c407e8c9d5dec884ba602183 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:52:57 2007 -0800 cpuidle take2: Core cpuidle infrastructure Announcing 'cpuidle', a new CPU power management infrastructure to manage idle CPUs in a clean and efficient manner. cpuidle separates out the drivers that can provide support for multiple types of idle states and policy governors that decide on what idle state to use at run time. A cpuidle driver can support multiple idle states based on parameters like varying power consumption, wakeup latency, etc (ACPI C-states for example). A cpuidle governor can be usage model specific (laptop, server, laptop on battery etc). Main advantage of the infrastructure being, it allows independent development of drivers and governors and allows for better CPU power management. A huge thanks to Adam Belay and Shaohua Li who were part of this mini-project since its beginning and are greatly responsible for this patchset. This patch: Core cpuidle infrastructure. Introduces a new abstraction layer for cpuidle: * which manages drivers that can support multiple idles states. Drivers can be generic or particular to specific hardware/platform * allows pluging in multiple policy governors that can take idle state policy decision * The core also has a set of sysfs interfaces with which administrato can know about supported drivers and governors and switch them at run time. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> Signed-off-by: Len Brown <len.brown@intel.com>
2007-10-03 16:58:00 -06:00
}
EXPORT_SYMBOL_GPL(tick_nohz_get_sleep_length);
cpuidle: consolidate 2.6.22 cpuidle branch into one patch commit e5a16b1f9eec0af7cfa0830304b41c1c0833cf9f Author: Len Brown <len.brown@intel.com> Date: Tue Oct 2 23:44:44 2007 -0400 cpuidle: shrink diff processor_idle.c | 440 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 429 insertions(+), 11 deletions(-) Signed-off-by: Len Brown <len.brown@intel.com> commit dfbb9d5aedfb18848a3e0d6f6e3e4969febb209c Author: Len Brown <len.brown@intel.com> Date: Wed Sep 26 02:17:55 2007 -0400 cpuidle: reduce diff size Reduces the cpuidle processor_idle.c diff vs 2.6.22 from this processor_idle.c | 2006 ++++++++++++++++++++++++++----------------- 1 file changed, 1219 insertions(+), 787 deletions(-) to this: processor_idle.c | 502 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 458 insertions(+), 44 deletions(-) ...for the purpose of making the cpuilde patch less invasive and easier to review. no functional changes. build tested only. Signed-off-by: Len Brown <len.brown@intel.com> commit 889172fc915f5a7fe20f35b133cbd205ce69bf6c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:40:05 2007 -0700 cpuidle: Retain old ACPI policy for !CONFIG_CPU_IDLE Retain the old policy in processor_idle, so that when CPU_IDLE is not configured, old C-state policy will still be used. This provides a clean gradual migration path from old ACPI policy to new cpuidle based policy. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 9544a8181edc7ecc33b3bfd69271571f98ed08bc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Sep 13 13:39:17 2007 -0700 cpuidle: Configure governors by default Quoting Len "Do not give an option to users to shoot themselves in the foot". Remove the configurability of ladder and menu governors as they are needed for default policy of cpuidle. That way users will not be able to have cpuidle without any policy loosing all C-state power savings. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 8975059a2c1e56cfe83d1bcf031bcf4cb39be743 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:27:07 2007 -0400 CPUIDLE: load ACPI properly when CPUIDLE is disabled Change the registration return codes for when CPUIDLE support is not compiled into the kernel. As a result, the ACPI processor driver will load properly even if CPUIDLE is unavailable. However, it may be possible to cleanup the ACPI processor driver further and eliminate some dead code paths. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit e0322e2b58dd1b12ec669bf84693efe0dc2414a8 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:26:06 2007 -0400 CPUIDLE: remove cpuidle_get_bm_activity() Remove cpuidle_get_bm_activity() and updates governors accordingly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 18a6e770d5c82ba26653e53d240caa617e09e9ab Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:58 2007 -0400 CPUIDLE: max_cstate fix Currently max_cstate is limited to 0, resulting in no idle processor power management on ACPI platforms. This patch restores the value to the array size. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 1fdc0887286179b40ce24bcdbde663172e205ef0 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:40 2007 -0400 CPUIDLE: handle BM detection inside the ACPI Processor driver Update the ACPI processor driver to detect BM activity and limit state entry depth internally, rather than exposing such requirements to CPUIDLE. As a result, CPUIDLE can drop this ACPI-specific interface and become more platform independent. BM activity is now handled much more aggressively than it was in the original implementation, so some testing coverage may be needed to verify that this doesn't introduce any DMA buffer under-run issues. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ef38840db666f48e3cdd2b769da676c57228dd9 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:25:14 2007 -0400 CPUIDLE: menu governor updates Tweak the menu governor to more effectively handle non-timer break events. Non-timer break events are detected by comparing the actual sleep time to the expected sleep time. In future revisions, it may be more reliable to use the timer data structures directly. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit bb4d74fca63fa96cf3ace644b15ae0f12b7df5a1 Author: Adam Belay <abelay@novell.com> Date: Tue Aug 21 18:24:40 2007 -0400 CPUIDLE: fix 'current_governor' sysfs entry Allow the "current_governor" sysfs entry to properly handle input terminated with '\n'. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df3c71559bb69b125f1a48971bf0d17f78bbdf47 Author: Len Brown <len.brown@intel.com> Date: Sun Aug 12 02:00:45 2007 -0400 cpuidle: fix IA64 build (again) Signed-off-by: Len Brown <len.brown@intel.com> commit a02064579e3f9530fd31baae16b1fc46b5a7bca8 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:27 2007 -0400 cpuidle: Remove support for runtime changing of max_cstate Remove support for runtime changeability of max_cstate. Drivers can use use latency APIs. max_cstate can still be used as a boot time option and dmi override. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 0912a44b13adf22f5e3f607d263aed23b4910d7e Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:39:16 2007 -0400 cpuidle: Remove ACPI cstate_limit calls from ipw2100 ipw2100 already has code to use accetable_latency interfaces to limit the C-state. Remove the calls to acpi_set_cstate_limit and acpi_get_cstate_limit as they are redundant. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit c649a76e76be6bff1fd770d0a775798813a3f6e0 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Sun Aug 12 01:35:39 2007 -0400 cpuidle: compile fix for pause and resume functions Fix the compilation failure when cpuidle is not compiled in. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Adam Belay <adam.belay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2305a5920fb8ee6ccec1c62ade05aa8351091d71 Author: Adam Belay <abelay@novell.com> Date: Thu Jul 19 00:49:00 2007 -0400 cpuidle: re-write Some portions have been rewritten to make the code cleaner and lighter weight. The following is a list of changes: 1.) the state name is now included in the sysfs interface 2.) detection, hotplug, and available state modifications are handled by CPUIDLE drivers directly 3.) the CPUIDLE idle handler is only ever installed when at least one cpuidle_device is enabled and ready 4.) the menu governor BM code no longer overflows 5.) the sysfs attributes are now printed as unsigned integers, avoiding negative values 6.) a variety of other small cleanups Also, Idle drivers are no longer swappable during runtime through the CPUIDLE sysfs inteface. On i386 and x86_64 most idle handlers (e.g. poll, mwait, halt, etc.) don't benefit from an infrastructure that supports multiple states, so I think using a more general case idle handler selection mechanism would be cleaner. Signed-off-by: Adam Belay <abelay@novell.com> Acked-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Acked-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit df25b6b56955714e6e24b574d88d1fd11f0c3ee5 Author: Len Brown <len.brown@intel.com> Date: Tue Jul 24 17:08:21 2007 -0400 cpuidle: fix IA64 buid Signed-off-by: Len Brown <len.brown@intel.com> commit fd6ada4c14488755ff7068860078c437431fbccd Author: Adrian Bunk <bunk@stusta.de> Date: Mon Jul 9 11:33:13 2007 -0700 cpuidle: static make cpuidle_replace_governor() static Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c1d4a2cebcadf2429c0c72e1d29aa2a9684c32e0 Author: Adrian Bunk <bunk@stusta.de> Date: Tue Jul 3 00:54:40 2007 -0400 cpuidle: static This patch makes the needlessly global struct menu_governor static. Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit dbf8780c6e8d572c2c273da97ed1cca7608fd999 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:49:14 2007 -0400 export symbol tick_nohz_get_sleep_length ERROR: "tick_nohz_get_sleep_length" [drivers/cpuidle/governors/menu.ko] undefined! ERROR: "tick_nohz_get_idle_jiffies" [drivers/cpuidle/governors/menu.ko] undefined! And please be sure to get your changes to core kernel suitably reviewed. Cc: Adam Belay <abelay@novell.com> Cc: Venki Pallipadi <venkatesh.pallipadi@intel.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: john stultz <johnstul@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 29f0e248e7017be15f99febf9143a2cef00b2961 Author: Andrew Morton <akpm@linux-foundation.org> Date: Tue Jul 3 00:43:04 2007 -0400 tick.h needs hrtimer.h It uses hrtimers. Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit e40cede7d63a029e92712a3fe02faee60cc38fb4 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:40:34 2007 -0400 cpuidle: first round of documentation updates Documentation changes based on Pavel's feedback. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 83b42be2efece386976507555c29e7773a0dfcd1 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:39:25 2007 -0400 cpuidle: add rating to the governors and pick the one with highest rating by default Introduce a governor rating scheme to pick the right governor by default. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit d2a74b8c5e8f22def4709330d4bfc4a29209b71c Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:38:08 2007 -0400 cpuidle: make cpuidle sysfs driver governor switch off by default Make default cpuidle sysfs to show current_governor and current_driver in read-only mode. More elaborate available_governors and available_drivers with writeable current_governor and current_driver interface only appear with "cpuidle_sysfs_switch" boot parameter. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1f60a0e80bf83cf6b55c8845bbe5596ed8f6307b Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:37:00 2007 -0400 cpuidle: menu governor: change the early break condition Change the C-state early break out algorithm in menu governor. We only look at early breakouts that result in wakeups shorter than idle state's target_residency. If such a breakout is frequent enough, eliminate the particular idle state upto a timeout period. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 45a42095cf64b003b4a69be3ce7f434f97d7af51 Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:35:38 2007 -0400 cpuidle: fix uninitialized variable in sysfs routine Fix the uninitialized usage of ret. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 80dca7cdba3e6ee13eae277660873ab9584eb3be Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:34:16 2007 -0400 cpuidle: reenable /proc/acpi//power interface for the time being Keep /proc/acpi/processor/CPU*/power around for a while as powertop depends on it. It will be marked deprecated and removed in future. powertop can use cpuidle interfaces instead. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 589c37c2646c5e3813a51255a5ee1159cb4c33fc Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Jul 3 00:32:37 2007 -0400 cpuidle: menu governor and hrtimer compile fix Compile fix for menu governor. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0ba80bd9ab3ed304cb4f19b722e4cc6740588b5e Author: Len Brown <len.brown@intel.com> Date: Thu May 31 22:51:43 2007 -0400 cpuidle: build fix - cpuidle vs ipw2100 module ERROR: "acpi_set_cstate_limit" [drivers/net/wireless/ipw2100.ko] undefined! Signed-off-by: Len Brown <len.brown@intel.com> commit d7d8fa7f96a7f7682be7c6cc0cc53fa7a18c3b58 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:07 2007 -0400 cpuidle: add the 'menu' governor Here is my first take at implementing an idle PM governor that takes full advantage of NO_HZ. I call it the 'menu' governor because it considers the full list of idle states before each entry. I've kept the implementation fairly simple. It attempts to guess the next residency time and then chooses a state that would meet at least the break-even point between power savings and entry cost. To this end, it selects the deepest idle state that satisfies the following constraints: 1. If the idle time elapsed since bus master activity was detected is below a threshold (currently 20 ms), then limit the selection to C2-type or above. 2. Do not choose a state with a break-even residency that exceeds the expected time remaining until the next timer interrupt. 3. Do not choose a state with a break-even residency that exceeds the elapsed time between the last pair of break events, excluding timer interrupts. This governor has an advantage over "ladder" governor because it proactively checks how much time remains until the next timer interrupt using the tick infrastructure. Also, it handles device interrupt activity more intelligently by not including timer interrupts in break event calculations. Finally, it doesn't make policy decisions using the number of state entries, which can have variable residency times (NO_HZ makes these potentially very large), and instead only considers sleep time deltas. The menu governor can be selected during runtime using the cpuidle sysfs interface like so: "echo "menu" > /sys/devices/system/cpu/cpuidle/current_governor" Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit a4bec7e65aa3b7488b879d971651cc99a6c410fe Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:47:03 2007 -0400 cpuidle: export time until next timer interrupt using NO_HZ Expose information about the time remaining until the next timer interrupt expires by utilizing the dynticks infrastructure. Also modify the main idle loop to allow dynticks to handle non-interrupt break events (e.g. DMA). Finally, expose sleep ticks information to external code. Thomas Gleixner is responsible for much of the code in this patch. However, I've made some additional changes, so I'm probably responsible if there are any bugs or oversights :) Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 2929d8996fbc77f41a5ff86bb67cdde3ca7d2d72 Author: Adam Belay <abelay@novell.com> Date: Sat Mar 24 03:46:58 2007 -0400 cpuidle: governor API changes This patch prepares cpuidle for the menu governor. It adds an optional stage after idle state entry to give the governor an opportunity to check why the state was exited. Also it makes sure the idle loop returns after each state entry, allowing the appropriate dynticks code to run. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 3a7fd42f9825c3b03e364ca59baa751bb350775f Author: Venki Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Apr 26 00:03:59 2007 -0700 cpuidle: hang fix Prevent hang on x86-64, when ACPI processor driver is added as a module on a system that does not support C-states. x86-64 expects all idle handlers to enable interrupts before returning from idle handler. This is due to enter_idle(), exit_idle() races. Make cpuidle_idle_call() confirm to this when there is no pm_idle_old. Also, cpuidle look at the return values of attch_driver() and set current_driver to NULL if attach fails on all CPUs. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4893339a142afbd5b7c01ffadfd53d14746e858e Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:09 2007 +0800 cpuidle: add support for max_cstate limit With CPUIDLE framework, the max_cstate (to limit max cpu c-state) parameter is ingored. Some systems require it to ignore C2/C3 and some drivers like ipw require it too. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 43bbbbe1cb998cbd2df656f55bb3bfe30f30e7d1 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:13 2007 +0800 cpuidle: add cpuidle_fore_redetect_devices API add cpuidle_force_redetect_devices API, which forces all CPU redetect idle states. Next patch will use it. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit d1edadd608f24836def5ec483d2edccfb37b1d19 Author: Shaohua Li <shaohua.li@intel.com> Date: Thu Apr 26 10:40:01 2007 +0800 cpuidle: fix sysfs related issue Fix the cpuidle sysfs issue. a. make kobject dynamicaly allocated b. fixed sysfs init issue to avoid suspend/resume issue Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 7169a5cc0d67b263978859672e86c13c23a5570d Author: Randy Dunlap <randy.dunlap@oracle.com> Date: Wed Mar 28 22:52:53 2007 -0400 cpuidle: 1-bit field must be unsigned A 1-bit bitfield has no room for a sign bit. drivers/cpuidle/governors/ladder.c:54:16: error: dubious bitfield without explicit `signed' or `unsigned' Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 4658620158dc2fbd9e4bcb213c5b6fb5d05ba7d4 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 28 22:52:41 2007 -0400 cpuidle: fix boot hang Patch for cpuidle boot hang reported by Larry Finger here. http://www.ussg.iu.edu/hypermail/linux/kernel/0703.2/2025.html Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Larry Finger <larry.finger@lwfinger.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit c17e168aa6e5fe3851baaae8df2fbc1cf11443a9 Author: Len Brown <len.brown@intel.com> Date: Wed Mar 7 04:37:53 2007 -0500 cpuidle: ladder does not depend on ACPI build fix for CONFIG_ACPI=n In file included from drivers/cpuidle/governors/ladder.c:21: include/acpi/processor.h:88: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:106: error: expected specifier-qualifier-list before ‘acpi_integer’ include/acpi/processor.h:168: error: expected specifier-qualifier-list before ‘acpi_handle’ Signed-off-by: Len Brown <len.brown@intel.com> commit 8c91d958246bde68db0c3f0c57b535962ce861cb Author: Adrian Bunk <bunk@stusta.de> Date: Tue Mar 6 02:29:40 2007 -0800 cpuidle: make code static This patch makes the following needlessly global code static: - driver.c: __cpuidle_find_driver() - governor.c: __cpuidle_find_governor() - ladder.c: struct ladder_governor Signed-off-by: Adrian Bunk <bunk@stusta.de> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 0c39dc3187094c72c33ab65a64d2017b21f372d2 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Wed Mar 7 02:38:22 2007 -0500 cpu_idle: fix build break This patch fixes a build breakage with !CONFIG_HOTPLUG_CPU and CONFIG_CPU_IDLE. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 8112e3b115659b07df340ef170515799c0105f82 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue Mar 6 02:29:39 2007 -0800 cpuidle: build fix for !CPU_IDLE Fix the compile issues when CPU_IDLE is not configured. Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Adam Belay <abelay@novell.com> Cc: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Len Brown <len.brown@intel.com> commit 1eb4431e9599cd25e0d9872f3c2c8986821839dd Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:57 2007 -0800 cpuidle take2: Basic documentation for cpuidle Documentation for cpuidle infrastructure Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> commit ef5f15a8b79123a047285ec2e3899108661df779 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:54:03 2007 -0800 cpuidle take2: Hookup ACPI C-states driver with cpuidle Hookup ACPI C-states onto generic cpuidle infrastructure. drivers/acpi/procesor_idle.c is now a ACPI C-states driver that registers as a driver in cpuidle infrastructure and the policy part is removed from drivers/acpi/processor_idle.c. We use governor in cpuidle instead. Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Len Brown <len.brown@intel.com> commit 987196fa82d4db52c407e8c9d5dec884ba602183 Author: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Thu Feb 22 13:52:57 2007 -0800 cpuidle take2: Core cpuidle infrastructure Announcing 'cpuidle', a new CPU power management infrastructure to manage idle CPUs in a clean and efficient manner. cpuidle separates out the drivers that can provide support for multiple types of idle states and policy governors that decide on what idle state to use at run time. A cpuidle driver can support multiple idle states based on parameters like varying power consumption, wakeup latency, etc (ACPI C-states for example). A cpuidle governor can be usage model specific (laptop, server, laptop on battery etc). Main advantage of the infrastructure being, it allows independent development of drivers and governors and allows for better CPU power management. A huge thanks to Adam Belay and Shaohua Li who were part of this mini-project since its beginning and are greatly responsible for this patchset. This patch: Core cpuidle infrastructure. Introduces a new abstraction layer for cpuidle: * which manages drivers that can support multiple idles states. Drivers can be generic or particular to specific hardware/platform * allows pluging in multiple policy governors that can take idle state policy decision * The core also has a set of sysfs interfaces with which administrato can know about supported drivers and governors and switch them at run time. Signed-off-by: Adam Belay <abelay@novell.com> Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Len Brown <len.brown@intel.com> Signed-off-by: Len Brown <len.brown@intel.com>
2007-10-03 16:58:00 -06:00
/**
* tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
* for a particular CPU.
*
* Called from the schedutil frequency scaling governor in scheduler context.
*/
unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
{
struct tick_sched *ts = tick_get_tick_sched(cpu);
return ts->idle_calls;
}
cpufreq: schedutil: Avoid reducing frequency of busy CPUs prematurely The way the schedutil governor uses the PELT metric causes it to underestimate the CPU utilization in some cases. That can be easily demonstrated by running kernel compilation on a Sandy Bridge Intel processor, running turbostat in parallel with it and looking at the values written to the MSR_IA32_PERF_CTL register. Namely, the expected result would be that when all CPUs were 100% busy, all of them would be requested to run in the maximum P-state, but observation shows that this clearly isn't the case. The CPUs run in the maximum P-state for a while and then are requested to run slower and go back to the maximum P-state after a while again. That causes the actual frequency of the processor to visibly oscillate below the sustainable maximum in a jittery fashion which clearly is not desirable. That has been attributed to CPU utilization metric updates on task migration that cause the total utilization value for the CPU to be reduced by the utilization of the migrated task. If that happens, the schedutil governor may see a CPU utilization reduction and will attempt to reduce the CPU frequency accordingly right away. That may be premature, though, for example if the system is generally busy and there are other runnable tasks waiting to be run on that CPU already. This is unlikely to be an issue on systems where cpufreq policies are shared between multiple CPUs, because in those cases the policy utilization is computed as the maximum of the CPU utilization values over the whole policy and if that turns out to be low, reducing the frequency for the policy most likely is a good idea anyway. On systems with one CPU per policy, however, it may affect performance adversely and even lead to increased energy consumption in some cases. On those systems it may be addressed by taking another utilization metric into consideration, like whether or not the CPU whose frequency is about to be reduced has been idle recently, because if that's not the case, the CPU is likely to be busy in the near future and its frequency should not be reduced. To that end, use the counter of idle calls in the timekeeping code. Namely, make the schedutil governor look at that counter for the current CPU every time before its frequency is about to be reduced. If the counter has not changed since the previous iteration of the governor computations for that CPU, the CPU has been busy for all that time and its frequency should not be decreased, so if the new frequency would be lower than the one set previously, the governor will skip the frequency update. Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Viresh Kumar <viresh.kumar@linaro.org> Reviewed-by: Joel Fernandes <joelaf@google.com>
2017-03-21 17:08:50 -06:00
/**
* tick_nohz_get_idle_calls - return the current idle calls counter value
*
* Called from the schedutil frequency scaling governor in scheduler context.
*/
unsigned long tick_nohz_get_idle_calls(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
return ts->idle_calls;
}
nohz: Make nohz API agnostic against idle ticks cputime accounting When the timer tick fires, it accounts the new jiffy as either part of system, user or idle time. This is how we record the cputime statistics. But when the tick is stopped from the idle task, we still need to record the number of jiffies spent tickless until we restart the tick and fall back to traditional tick-based cputime accounting. To do this, we take a snapshot of jiffies when the tick is stopped and compute the difference against the new value of jiffies when the tick is restarted. Then we account this whole difference to the idle cputime. However we are preparing to be able to stop the tick from other places than idle. So this idle time accounting needs to be performed from the callers of nohz APIs, not from the nohz APIs themselves because we now want them to be agnostic against places that stop/restart tick. Therefore, we pull the tickless idle time accounting out of generic nohz helpers up to idle entry/exit callers. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Alessio Igor Bogani <abogani@kernel.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Avi Kivity <avi@redhat.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Christoph Lameter <cl@linux.com> Cc: Daniel Lezcano <daniel.lezcano@linaro.org> Cc: Geoff Levand <geoff@infradead.org> Cc: Gilad Ben Yossef <gilad@benyossef.com> Cc: Hakan Akkan <hakanakkan@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Kevin Hilman <khilman@ti.com> Cc: Max Krasnyansky <maxk@qualcomm.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephen Hemminger <shemminger@vyatta.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Sven-Thorsten Dietrich <thebigcorporation@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de>
2011-07-27 20:00:47 -06:00
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
nohz: Make nohz API agnostic against idle ticks cputime accounting When the timer tick fires, it accounts the new jiffy as either part of system, user or idle time. This is how we record the cputime statistics. But when the tick is stopped from the idle task, we still need to record the number of jiffies spent tickless until we restart the tick and fall back to traditional tick-based cputime accounting. To do this, we take a snapshot of jiffies when the tick is stopped and compute the difference against the new value of jiffies when the tick is restarted. Then we account this whole difference to the idle cputime. However we are preparing to be able to stop the tick from other places than idle. So this idle time accounting needs to be performed from the callers of nohz APIs, not from the nohz APIs themselves because we now want them to be agnostic against places that stop/restart tick. Therefore, we pull the tickless idle time accounting out of generic nohz helpers up to idle entry/exit callers. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Alessio Igor Bogani <abogani@kernel.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Avi Kivity <avi@redhat.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Christoph Lameter <cl@linux.com> Cc: Daniel Lezcano <daniel.lezcano@linaro.org> Cc: Geoff Levand <geoff@infradead.org> Cc: Gilad Ben Yossef <gilad@benyossef.com> Cc: Hakan Akkan <hakanakkan@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Kevin Hilman <khilman@ti.com> Cc: Max Krasnyansky <maxk@qualcomm.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephen Hemminger <shemminger@vyatta.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Sven-Thorsten Dietrich <thebigcorporation@gmail.com> Cc: Thomas Gleixner <tglx@linutronix.de>
2011-07-27 20:00:47 -06:00
unsigned long ticks;
if (vtime_accounting_cpu_enabled())
return;
/*
* We stopped the tick in idle. Update process times would miss the
* time we slept as update_process_times does only a 1 tick
* accounting. Enforce that this is accounted to idle !
*/
ticks = jiffies - ts->idle_jiffies;
/*
* We might be one off. Do not randomly account a huge number of ticks!
*/
2008-12-31 07:11:38 -07:00
if (ticks && ticks < LONG_MAX)
account_idle_ticks(ticks);
#endif
}
static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now)
{
tick_nohz_restart_sched_tick(ts, now);
tick_nohz_account_idle_ticks(ts);
}
void tick_nohz_idle_restart_tick(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->tick_stopped)
__tick_nohz_idle_restart_tick(ts, ktime_get());
}
/**
nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-07 10:22:06 -06:00
* tick_nohz_idle_exit - restart the idle tick from the idle task
*
* Restart the idle tick when the CPU is woken up from idle
nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-07 10:22:06 -06:00
* This also exit the RCU extended quiescent state. The CPU
* can use RCU again after this function is called.
*/
nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Reviewed-by: Josh Triplett <josh@joshtriplett.org>
2011-10-07 10:22:06 -06:00
void tick_nohz_idle_exit(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
bool idle_active, tick_stopped;
ktime_t now;
local_irq_disable();
nohz: Allow rcu extended quiescent state handling seperately from tick stop It is assumed that rcu won't be used once we switch to tickless mode and until we restart the tick. However this is not always true, as in x86-64 where we dereference the idle notifiers after the tick is stopped. To prepare for fixing this, add two new APIs: tick_nohz_idle_enter_norcu() and tick_nohz_idle_exit_norcu(). If no use of RCU is made in the idle loop between tick_nohz_enter_idle() and tick_nohz_exit_idle() calls, the arch must instead call the new *_norcu() version such that the arch doesn't need to call rcu_idle_enter() and rcu_idle_exit(). Otherwise the arch must call tick_nohz_enter_idle() and tick_nohz_exit_idle() and also call explicitly: - rcu_idle_enter() after its last use of RCU before the CPU is put to sleep. - rcu_idle_exit() before the first use of RCU after the CPU is woken up. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Mike Frysinger <vapier@gentoo.org> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: David Miller <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@atmel.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mackerras <paulus@samba.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2011-10-08 08:01:00 -06:00
WARN_ON_ONCE(!ts->inidle);
WARN_ON_ONCE(ts->timer_expires_base);
ts->inidle = 0;
idle_active = ts->idle_active;
tick_stopped = ts->tick_stopped;
if (idle_active || tick_stopped)
now = ktime_get();
if (idle_active)
tick_nohz_stop_idle(ts, now);
if (tick_stopped)
__tick_nohz_idle_restart_tick(ts, now);
local_irq_enable();
}
/*
* The nohz low res interrupt handler
*/
static void tick_nohz_handler(struct clock_event_device *dev)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
dev->next_event = KTIME_MAX;
tick_sched_do_timer(ts, now);
tick_sched_handle(ts, regs);
nohz: Fix spurious periodic tick behaviour in low-res dynticks mode When we reach the end of the tick handler, we unconditionally reschedule the next tick to the next jiffy. Then on irq exit, the nohz code overrides that setting if needed and defers the next tick as far away in the future as possible. Now in the best dynticks case, when we actually don't need any tick in the future (ie: expires == KTIME_MAX), low-res and high-res behave differently. What we want in this case is to cancel the next tick programmed by the previous one. That's what we do in high-res mode. OTOH we lack a low-res mode equivalent of hrtimer_cancel() so we simply don't do anything in this case and the next tick remains scheduled to jiffies + 1. As a result, in low-res mode, when the dynticks code determines that no tick is needed in the future, we can recursively get a spurious tick every jiffy because then the next tick is always reprogrammed from the tick handler and is never cancelled. And this can happen indefinetly until some subsystem actually needs a precise tick in the future and only then we eventually overwrite the previous tick handler setting to defer the next tick. We are fixing this by introducing the ONESHOT_STOPPED mode which will let us pause a clockevent when no further interrupt is needed. Meanwhile we can't expect all drivers to support this new mode. So lets reduce much of the symptoms by skipping the nohz-blind tick rescheduling from the tick-handler when the CPU is in dynticks mode. That tick rescheduling wrongly assumed periodicity and the low-res dynticks code can't cancel such decision. This breaks the recursive (and thus the worst) part of the problem. In the worst case now, we'll get only one extra tick due to uncancelled tick scheduled before we entered dynticks mode. This also removes a needless clockevent write on idle ticks. Since those clock write are usually considered to be slow, it's a general win. Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2014-06-12 04:54:41 -06:00
/* No need to reprogram if we are running tickless */
if (unlikely(ts->tick_stopped))
return;
hrtimer_forward(&ts->sched_timer, now, tick_period);
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}
timer: Reduce timer migration overhead if disabled Eric reported that the timer_migration sysctl is not really nice performance wise as it needs to check at every timer insertion whether the feature is enabled or not. Further the check does not live in the timer code, so we have an extra function call which checks an extra cache line to figure out that it is disabled. We can do better and store that information in the per cpu (hr)timer bases. I pondered to use a static key, but that's a nightmare to update from the nohz code and the timer base cache line is hot anyway when we select a timer base. The old logic enabled the timer migration unconditionally if CONFIG_NO_HZ was set even if nohz was disabled on the kernel command line. With this modification, we start off with migration disabled. The user visible sysctl is still set to enabled. If the kernel switches to NOHZ migration is enabled, if the user did not disable it via the sysctl prior to the switch. If nohz=off is on the kernel command line, migration stays disabled no matter what. Before: 47.76% hog [.] main 14.84% [kernel] [k] _raw_spin_lock_irqsave 9.55% [kernel] [k] _raw_spin_unlock_irqrestore 6.71% [kernel] [k] mod_timer 6.24% [kernel] [k] lock_timer_base.isra.38 3.76% [kernel] [k] detach_if_pending 3.71% [kernel] [k] del_timer 2.50% [kernel] [k] internal_add_timer 1.51% [kernel] [k] get_nohz_timer_target 1.28% [kernel] [k] __internal_add_timer 0.78% [kernel] [k] timerfn 0.48% [kernel] [k] wake_up_nohz_cpu After: 48.10% hog [.] main 15.25% [kernel] [k] _raw_spin_lock_irqsave 9.76% [kernel] [k] _raw_spin_unlock_irqrestore 6.50% [kernel] [k] mod_timer 6.44% [kernel] [k] lock_timer_base.isra.38 3.87% [kernel] [k] detach_if_pending 3.80% [kernel] [k] del_timer 2.67% [kernel] [k] internal_add_timer 1.33% [kernel] [k] __internal_add_timer 0.73% [kernel] [k] timerfn 0.54% [kernel] [k] wake_up_nohz_cpu Reported-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Viresh Kumar <viresh.kumar@linaro.org> Cc: John Stultz <john.stultz@linaro.org> Cc: Joonwoo Park <joonwoop@codeaurora.org> Cc: Wenbo Wang <wenbo.wang@memblaze.com> Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-05-26 16:50:33 -06:00
static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
{
if (!tick_nohz_enabled)
return;
ts->nohz_mode = mode;
/* One update is enough */
if (!test_and_set_bit(0, &tick_nohz_active))
timers_update_nohz();
timer: Reduce timer migration overhead if disabled Eric reported that the timer_migration sysctl is not really nice performance wise as it needs to check at every timer insertion whether the feature is enabled or not. Further the check does not live in the timer code, so we have an extra function call which checks an extra cache line to figure out that it is disabled. We can do better and store that information in the per cpu (hr)timer bases. I pondered to use a static key, but that's a nightmare to update from the nohz code and the timer base cache line is hot anyway when we select a timer base. The old logic enabled the timer migration unconditionally if CONFIG_NO_HZ was set even if nohz was disabled on the kernel command line. With this modification, we start off with migration disabled. The user visible sysctl is still set to enabled. If the kernel switches to NOHZ migration is enabled, if the user did not disable it via the sysctl prior to the switch. If nohz=off is on the kernel command line, migration stays disabled no matter what. Before: 47.76% hog [.] main 14.84% [kernel] [k] _raw_spin_lock_irqsave 9.55% [kernel] [k] _raw_spin_unlock_irqrestore 6.71% [kernel] [k] mod_timer 6.24% [kernel] [k] lock_timer_base.isra.38 3.76% [kernel] [k] detach_if_pending 3.71% [kernel] [k] del_timer 2.50% [kernel] [k] internal_add_timer 1.51% [kernel] [k] get_nohz_timer_target 1.28% [kernel] [k] __internal_add_timer 0.78% [kernel] [k] timerfn 0.48% [kernel] [k] wake_up_nohz_cpu After: 48.10% hog [.] main 15.25% [kernel] [k] _raw_spin_lock_irqsave 9.76% [kernel] [k] _raw_spin_unlock_irqrestore 6.50% [kernel] [k] mod_timer 6.44% [kernel] [k] lock_timer_base.isra.38 3.87% [kernel] [k] detach_if_pending 3.80% [kernel] [k] del_timer 2.67% [kernel] [k] internal_add_timer 1.33% [kernel] [k] __internal_add_timer 0.73% [kernel] [k] timerfn 0.54% [kernel] [k] wake_up_nohz_cpu Reported-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Viresh Kumar <viresh.kumar@linaro.org> Cc: John Stultz <john.stultz@linaro.org> Cc: Joonwoo Park <joonwoop@codeaurora.org> Cc: Wenbo Wang <wenbo.wang@memblaze.com> Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-05-26 16:50:33 -06:00
}
/**
* tick_nohz_switch_to_nohz - switch to nohz mode
*/
static void tick_nohz_switch_to_nohz(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t next;
if (!tick_nohz_enabled)
return;
if (tick_switch_to_oneshot(tick_nohz_handler))
return;
/*
* Recycle the hrtimer in ts, so we can share the
* hrtimer_forward with the highres code.
*/
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
/* Get the next period */
next = tick_init_jiffy_update();
hrtimer_set_expires(&ts->sched_timer, next);
hrtimer_forward_now(&ts->sched_timer, tick_period);
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
timer: Reduce timer migration overhead if disabled Eric reported that the timer_migration sysctl is not really nice performance wise as it needs to check at every timer insertion whether the feature is enabled or not. Further the check does not live in the timer code, so we have an extra function call which checks an extra cache line to figure out that it is disabled. We can do better and store that information in the per cpu (hr)timer bases. I pondered to use a static key, but that's a nightmare to update from the nohz code and the timer base cache line is hot anyway when we select a timer base. The old logic enabled the timer migration unconditionally if CONFIG_NO_HZ was set even if nohz was disabled on the kernel command line. With this modification, we start off with migration disabled. The user visible sysctl is still set to enabled. If the kernel switches to NOHZ migration is enabled, if the user did not disable it via the sysctl prior to the switch. If nohz=off is on the kernel command line, migration stays disabled no matter what. Before: 47.76% hog [.] main 14.84% [kernel] [k] _raw_spin_lock_irqsave 9.55% [kernel] [k] _raw_spin_unlock_irqrestore 6.71% [kernel] [k] mod_timer 6.24% [kernel] [k] lock_timer_base.isra.38 3.76% [kernel] [k] detach_if_pending 3.71% [kernel] [k] del_timer 2.50% [kernel] [k] internal_add_timer 1.51% [kernel] [k] get_nohz_timer_target 1.28% [kernel] [k] __internal_add_timer 0.78% [kernel] [k] timerfn 0.48% [kernel] [k] wake_up_nohz_cpu After: 48.10% hog [.] main 15.25% [kernel] [k] _raw_spin_lock_irqsave 9.76% [kernel] [k] _raw_spin_unlock_irqrestore 6.50% [kernel] [k] mod_timer 6.44% [kernel] [k] lock_timer_base.isra.38 3.87% [kernel] [k] detach_if_pending 3.80% [kernel] [k] del_timer 2.67% [kernel] [k] internal_add_timer 1.33% [kernel] [k] __internal_add_timer 0.73% [kernel] [k] timerfn 0.54% [kernel] [k] wake_up_nohz_cpu Reported-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Viresh Kumar <viresh.kumar@linaro.org> Cc: John Stultz <john.stultz@linaro.org> Cc: Joonwoo Park <joonwoop@codeaurora.org> Cc: Wenbo Wang <wenbo.wang@memblaze.com> Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-05-26 16:50:33 -06:00
tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}
static inline void tick_nohz_irq_enter(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now;
if (!ts->idle_active && !ts->tick_stopped)
return;
now = ktime_get();
if (ts->idle_active)
tick_nohz_stop_idle(ts, now);
if (ts->tick_stopped)
tick_nohz_update_jiffies(now);
}
#else
static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }
timer: Reduce timer migration overhead if disabled Eric reported that the timer_migration sysctl is not really nice performance wise as it needs to check at every timer insertion whether the feature is enabled or not. Further the check does not live in the timer code, so we have an extra function call which checks an extra cache line to figure out that it is disabled. We can do better and store that information in the per cpu (hr)timer bases. I pondered to use a static key, but that's a nightmare to update from the nohz code and the timer base cache line is hot anyway when we select a timer base. The old logic enabled the timer migration unconditionally if CONFIG_NO_HZ was set even if nohz was disabled on the kernel command line. With this modification, we start off with migration disabled. The user visible sysctl is still set to enabled. If the kernel switches to NOHZ migration is enabled, if the user did not disable it via the sysctl prior to the switch. If nohz=off is on the kernel command line, migration stays disabled no matter what. Before: 47.76% hog [.] main 14.84% [kernel] [k] _raw_spin_lock_irqsave 9.55% [kernel] [k] _raw_spin_unlock_irqrestore 6.71% [kernel] [k] mod_timer 6.24% [kernel] [k] lock_timer_base.isra.38 3.76% [kernel] [k] detach_if_pending 3.71% [kernel] [k] del_timer 2.50% [kernel] [k] internal_add_timer 1.51% [kernel] [k] get_nohz_timer_target 1.28% [kernel] [k] __internal_add_timer 0.78% [kernel] [k] timerfn 0.48% [kernel] [k] wake_up_nohz_cpu After: 48.10% hog [.] main 15.25% [kernel] [k] _raw_spin_lock_irqsave 9.76% [kernel] [k] _raw_spin_unlock_irqrestore 6.50% [kernel] [k] mod_timer 6.44% [kernel] [k] lock_timer_base.isra.38 3.87% [kernel] [k] detach_if_pending 3.80% [kernel] [k] del_timer 2.67% [kernel] [k] internal_add_timer 1.33% [kernel] [k] __internal_add_timer 0.73% [kernel] [k] timerfn 0.54% [kernel] [k] wake_up_nohz_cpu Reported-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Viresh Kumar <viresh.kumar@linaro.org> Cc: John Stultz <john.stultz@linaro.org> Cc: Joonwoo Park <joonwoop@codeaurora.org> Cc: Wenbo Wang <wenbo.wang@memblaze.com> Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-05-26 16:50:33 -06:00
static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
nohz: Rename CONFIG_NO_HZ to CONFIG_NO_HZ_COMMON We are planning to convert the dynticks Kconfig options layout into a choice menu. The user must be able to easily pick any of the following implementations: constant periodic tick, idle dynticks, full dynticks. As this implies a mutual exclusion, the two dynticks implementions need to converge on the selection of a common Kconfig option in order to ease the sharing of a common infrastructure. It would thus seem pretty natural to reuse CONFIG_NO_HZ to that end. It already implements all the idle dynticks code and the full dynticks depends on all that code for now. So ideally the choice menu would propose CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED then both would select CONFIG_NO_HZ. On the other hand we want to stay backward compatible: if CONFIG_NO_HZ is set in an older config file, we want to enable CONFIG_NO_HZ_IDLE by default. But we can't afford both at the same time or we run into a circular dependency: 1) CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED both select CONFIG_NO_HZ 2) If CONFIG_NO_HZ is set, we default to CONFIG_NO_HZ_IDLE We might be able to support that from Kconfig/Kbuild but it may not be wise to introduce such a confusing behaviour. So to solve this, create a new CONFIG_NO_HZ_COMMON option which gathers the common code between idle and full dynticks (that common code for now is simply the idle dynticks code) and select it from their referring Kconfig. Then we'll later create CONFIG_NO_HZ_IDLE and map CONFIG_NO_HZ to it for backward compatibility. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Christoph Lameter <cl@linux.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Gilad Ben Yossef <gilad@benyossef.com> Cc: Hakan Akkan <hakanakkan@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Kevin Hilman <khilman@linaro.org> Cc: Li Zhong <zhong@linux.vnet.ibm.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de>
2011-08-10 15:21:01 -06:00
#endif /* CONFIG_NO_HZ_COMMON */
/*
* Called from irq_enter to notify about the possible interruption of idle()
*/
void tick_irq_enter(void)
{
tick_check_oneshot_broadcast_this_cpu();
tick_nohz_irq_enter();
}
/*
* High resolution timer specific code
*/
#ifdef CONFIG_HIGH_RES_TIMERS
static void (*wake_callback)(void);
void register_tick_sched_wakeup_callback(void (*cb)(void))
{
if (!wake_callback)
wake_callback = cb;
else
pr_warn("tick-sched wake cb already exists; skipping.\n");
}
EXPORT_SYMBOL_GPL(register_tick_sched_wakeup_callback);
/*
* We rearm the timer until we get disabled by the idle code.
* Called with interrupts disabled.
*/
static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
{
struct tick_sched *ts =
container_of(timer, struct tick_sched, sched_timer);
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
tick_sched_do_timer(ts, now);
/*
* Do not call, when we are not in irq context and have
* no valid regs pointer
*/
if (regs) {
tick_sched_handle(ts, regs);
Merge android-4.19-stable.136 (204dd19) into msm-4.19 * refs/heads/tmp-204dd19: UPSTREAM: driver core: Avoid deferred probe due to fw_devlink_pause/resume() UPSTREAM: driver core: Rename dev_links_info.defer_sync to defer_hook UPSTREAM: driver core: Don't do deferred probe in parallel with kernel_init thread Restore sdcardfs feature Revert rpmh and usb changes Linux 4.19.136 regmap: debugfs: check count when read regmap file rtnetlink: Fix memory(net_device) leak when ->newlink fails udp: Improve load balancing for SO_REUSEPORT. udp: Copy has_conns in reuseport_grow(). sctp: shrink stream outq when fails to do addstream reconf sctp: shrink stream outq only when new outcnt < old outcnt AX.25: Prevent integer overflows in connect and sendmsg tcp: allow at most one TLP probe per flight rxrpc: Fix sendmsg() returning EPIPE due to recvmsg() returning ENODATA qrtr: orphan socket in qrtr_release() net: udp: Fix wrong clean up for IS_UDPLITE macro net-sysfs: add a newline when printing 'tx_timeout' by sysfs ip6_gre: fix null-ptr-deref in ip6gre_init_net() drivers/net/wan/x25_asy: Fix to make it work dev: Defer free of skbs in flush_backlog AX.25: Prevent out-of-bounds read in ax25_sendmsg() AX.25: Fix out-of-bounds read in ax25_connect() Linux 4.19.135 ath9k: Fix regression with Atheros 9271 ath9k: Fix general protection fault in ath9k_hif_usb_rx_cb dm integrity: fix integrity recalculation that is improperly skipped ASoC: qcom: Drop HAS_DMA dependency to fix link failure ASoC: rt5670: Add new gpio1_is_ext_spk_en quirk and enable it on the Lenovo Miix 2 10 x86, vmlinux.lds: Page-align end of ..page_aligned sections parisc: Add atomic64_set_release() define to avoid CPU soft lockups drm/amd/powerplay: fix a crash when overclocking Vega M drm/amdgpu: Fix NULL dereference in dpm sysfs handlers io-mapping: indicate mapping failure mm: memcg/slab: fix memory leak at non-root kmem_cache destroy mm: memcg/slab: synchronize access to kmem_cache dying flag using a spinlock mm/memcg: fix refcount error while moving and swapping Makefile: Fix GCC_TOOLCHAIN_DIR prefix for Clang cross compilation vt: Reject zero-sized screen buffer size. fbdev: Detect integer underflow at "struct fbcon_ops"->clear_margins. serial: 8250_mtk: Fix high-speed baud rates clamping serial: 8250: fix null-ptr-deref in serial8250_start_tx() staging: comedi: addi_apci_1564: check INSN_CONFIG_DIGITAL_TRIG shift staging: comedi: addi_apci_1500: check INSN_CONFIG_DIGITAL_TRIG shift staging: comedi: ni_6527: fix INSN_CONFIG_DIGITAL_TRIG support staging: comedi: addi_apci_1032: check INSN_CONFIG_DIGITAL_TRIG shift staging: wlan-ng: properly check endpoint types Revert "cifs: Fix the target file was deleted when rename failed." usb: xhci: Fix ASM2142/ASM3142 DMA addressing usb: xhci-mtk: fix the failure of bandwidth allocation binder: Don't use mmput() from shrinker function. RISC-V: Upgrade smp_mb__after_spinlock() to iorw,iorw x86: math-emu: Fix up 'cmp' insn for clang ias arm64: Use test_tsk_thread_flag() for checking TIF_SINGLESTEP hwmon: (scmi) Fix potential buffer overflow in scmi_hwmon_probe() hwmon: (adm1275) Make sure we are reading enough data for different chips usb: gadget: udc: gr_udc: fix memleak on error handling path in gr_ep_init() Input: synaptics - enable InterTouch for ThinkPad X1E 1st gen dmaengine: ioat setting ioat timeout as module parameter hwmon: (aspeed-pwm-tacho) Avoid possible buffer overflow regmap: dev_get_regmap_match(): fix string comparison spi: mediatek: use correct SPI_CFG2_REG MACRO Input: add `SW_MACHINE_COVER` dmaengine: tegra210-adma: Fix runtime PM imbalance on error HID: apple: Disable Fn-key key-re-mapping on clone keyboards HID: steam: fixes race in handling device list. HID: alps: support devices with report id 2 HID: i2c-hid: add Mediacom FlexBook edge13 to descriptor override scripts/gdb: fix lx-symbols 'gdb.error' while loading modules scripts/decode_stacktrace: strip basepath from all paths serial: exar: Fix GPIO configuration for Sealevel cards based on XR17V35X bonding: check return value of register_netdevice() in bond_newlink() i2c: rcar: always clear ICSAR to avoid side effects net: ethernet: ave: Fix error returns in ave_init ipvs: fix the connection sync failed in some cases qed: suppress "don't support RoCE & iWARP" flooding on HW init mlxsw: destroy workqueue when trap_register in mlxsw_emad_init bonding: check error value of register_netdevice() immediately net: smc91x: Fix possible memory leak in smc_drv_probe() drm: sun4i: hdmi: Fix inverted HPD result ieee802154: fix one possible memleak in adf7242_probe net: dp83640: fix SIOCSHWTSTAMP to update the struct with actual configuration ax88172a: fix ax88172a_unbind() failures hippi: Fix a size used in a 'pci_free_consistent()' in an error handling path fpga: dfl: fix bug in port reset handshake bnxt_en: Fix race when modifying pause settings. btrfs: fix page leaks after failure to lock page for delalloc btrfs: fix mount failure caused by race with umount btrfs: fix double free on ulist after backref resolution failure ASoC: rt5670: Correct RT5670_LDO_SEL_MASK ALSA: info: Drop WARN_ON() from buffer NULL sanity check uprobes: Change handle_swbp() to send SIGTRAP with si_code=SI_KERNEL, to fix GDB regression IB/umem: fix reference count leak in ib_umem_odp_get() tipc: clean up skb list lock handling on send path spi: spi-fsl-dspi: Exit the ISR with IRQ_NONE when it's not ours SUNRPC reverting d03727b248d0 ("NFSv4 fix CLOSE not waiting for direct IO compeletion") irqdomain/treewide: Keep firmware node unconditionally allocated fuse: fix weird page warning drivers/firmware/psci: Fix memory leakage in alloc_init_cpu_groups() drm/nouveau/i2c/g94-: increase NV_PMGR_DP_AUXCTL_TRANSACTREQ timeout net: sky2: initialize return of gm_phy_read drivers/net/wan/lapbether: Fixed the value of hard_header_len xtensa: update *pos in cpuinfo_op.next xtensa: fix __sync_fetch_and_{and,or}_4 declarations scsi: scsi_transport_spi: Fix function pointer check mac80211: allow rx of mesh eapol frames with default rx key pinctrl: amd: fix npins for uart0 in kerncz_groups gpio: arizona: put pm_runtime in case of failure gpio: arizona: handle pm_runtime_get_sync failure case soc: qcom: rpmh: Dirt can only make you dirtier, not cleaner ANDROID: build: update ABI definitions ANDROID: update the kernel release format for GKI ANDROID: Incremental fs: magic number compatible 32-bit ANDROID: kbuild: don't merge .*..compoundliteral in modules ANDROID: GKI: preserve ABI for struct sock_cgroup_data Revert "genetlink: remove genl_bind" Revert "arm64/alternatives: use subsections for replacement sequences" Linux 4.19.134 spi: sprd: switch the sequence of setting WDG_LOAD_LOW and _HIGH rxrpc: Fix trace string libceph: don't omit recovery_deletes in target_copy() printk: queue wake_up_klogd irq_work only if per-CPU areas are ready genirq/affinity: Handle affinity setting on inactive interrupts correctly sched/fair: handle case of task_h_load() returning 0 sched: Fix unreliable rseq cpu_id for new tasks arm64: compat: Ensure upper 32 bits of x0 are zero on syscall return arm64: ptrace: Consistently use pseudo-singlestep exceptions arm64: ptrace: Override SPSR.SS when single-stepping is enabled thermal/drivers/cpufreq_cooling: Fix wrong frequency converted from power misc: atmel-ssc: lock with mutex instead of spinlock dmaengine: fsl-edma: Fix NULL pointer exception in fsl_edma_tx_handler intel_th: Fix a NULL dereference when hub driver is not loaded intel_th: pci: Add Emmitsburg PCH support intel_th: pci: Add Tiger Lake PCH-H support intel_th: pci: Add Jasper Lake CPU support powerpc/book3s64/pkeys: Fix pkey_access_permitted() for execute disable pkey hwmon: (emc2103) fix unable to change fan pwm1_enable attribute riscv: use 16KB kernel stack on 64-bit MIPS: Fix build for LTS kernel caused by backporting lpj adjustment timer: Fix wheel index calculation on last level timer: Prevent base->clk from moving backward uio_pdrv_genirq: fix use without device tree and no interrupt Input: i8042 - add Lenovo XiaoXin Air 12 to i8042 nomux list mei: bus: don't clean driver pointer Revert "zram: convert remaining CLASS_ATTR() to CLASS_ATTR_RO()" fuse: Fix parameter for FS_IOC_{GET,SET}FLAGS ovl: fix unneeded call to ovl_change_flags() ovl: relax WARN_ON() when decoding lower directory file handle ovl: inode reference leak in ovl_is_inuse true case. serial: mxs-auart: add missed iounmap() in probe failure and remove virtio: virtio_console: add missing MODULE_DEVICE_TABLE() for rproc serial virt: vbox: Fix guest capabilities mask check virt: vbox: Fix VBGL_IOCTL_VMMDEV_REQUEST_BIG and _LOG req numbers to match upstream USB: serial: option: add Quectel EG95 LTE modem USB: serial: option: add GosunCn GM500 series USB: serial: ch341: add new Product ID for CH340 USB: serial: cypress_m8: enable Simply Automated UPB PIM USB: serial: iuu_phoenix: fix memory corruption usb: gadget: function: fix missing spinlock in f_uac1_legacy usb: chipidea: core: add wakeup support for extcon usb: dwc2: Fix shutdown callback in platform USB: c67x00: fix use after free in c67x00_giveback_urb ALSA: hda/realtek - Enable Speaker for ASUS UX533 and UX534 ALSA: hda/realtek - change to suitable link model for ASUS platform ALSA: usb-audio: Fix race against the error recovery URB submission ALSA: line6: Sync the pending work cancel at disconnection ALSA: line6: Perform sanity check for each URB creation HID: quirks: Ignore Simply Automated UPB PIM HID: quirks: Always poll Obins Anne Pro 2 keyboard HID: magicmouse: do not set up autorepeat slimbus: core: Fix mismatch in of_node_get/put mtd: rawnand: oxnas: Release all devices in the _remove() path mtd: rawnand: oxnas: Unregister all devices on error mtd: rawnand: oxnas: Keep track of registered devices mtd: rawnand: brcmnand: fix CS0 layout mtd: rawnand: timings: Fix default tR_max and tCCS_min timings mtd: rawnand: marvell: Fix probe error path mtd: rawnand: marvell: Use nand_cleanup() when the device is not yet registered soc: qcom: rpmh-rsc: Allow using free WAKE TCS for active request soc: qcom: rpmh-rsc: Clear active mode configuration for wake TCS soc: qcom: rpmh: Invalidate SLEEP and WAKE TCSes before flushing new data soc: qcom: rpmh: Update dirty flag only when data changes perf stat: Zero all the 'ena' and 'run' array slot stats for interval mode apparmor: ensure that dfa state tables have entries copy_xstate_to_kernel: Fix typo which caused GDB regression regmap: debugfs: Don't sleep while atomic for fast_io regmaps ARM: dts: socfpga: Align L2 cache-controller nodename with dtschema Revert "thermal: mediatek: fix register index error" staging: comedi: verify array index is correct before using it usb: gadget: udc: atmel: fix uninitialized read in debug printk spi: spi-sun6i: sun6i_spi_transfer_one(): fix setting of clock rate arm64: dts: meson: add missing gxl rng clock phy: sun4i-usb: fix dereference of pointer phy0 before it is null checked iio:health:afe4404 Fix timestamp alignment and prevent data leak. ALSA: usb-audio: Add registration quirk for Kingston HyperX Cloud Flight S ACPI: video: Use native backlight on Acer TravelMate 5735Z Input: mms114 - add extra compatible for mms345l ALSA: usb-audio: Add registration quirk for Kingston HyperX Cloud Alpha S ACPI: video: Use native backlight on Acer Aspire 5783z ALSA: usb-audio: Rewrite registration quirk handling mmc: sdhci: do not enable card detect interrupt for gpio cd type doc: dt: bindings: usb: dwc3: Update entries for disabling SS instances in park mode ALSA: usb-audio: Create a registration quirk for Kingston HyperX Amp (0951:16d8) scsi: sr: remove references to BLK_DEV_SR_VENDOR, leave it enabled ARM: at91: pm: add quirk for sam9x60's ulp1 HID: quirks: Remove ITE 8595 entry from hid_have_special_driver net: sfp: add some quirks for GPON modules net: sfp: add support for module quirks Revert "usb/ehci-platform: Set PM runtime as active on resume" Revert "usb/xhci-plat: Set PM runtime as active on resume" Revert "usb/ohci-platform: Fix a warning when hibernating" of: of_mdio: Correct loop scanning logic net: dsa: bcm_sf2: Fix node reference count spi: spi-fsl-dspi: Fix lockup if device is shutdown during SPI transfer spi: fix initial SPI_SR value in spi-fsl-dspi iio:health:afe4403 Fix timestamp alignment and prevent data leak. iio:pressure:ms5611 Fix buffer element alignment iio:humidity:hts221 Fix alignment and data leak issues iio: pressure: zpa2326: handle pm_runtime_get_sync failure iio: mma8452: Add missed iio_device_unregister() call in mma8452_probe() iio: magnetometer: ak8974: Fix runtime PM imbalance on error iio:humidity:hdc100x Fix alignment and data leak issues iio:magnetometer:ak8974: Fix alignment and data leak issues arm64/alternatives: don't patch up internal branches i2c: eg20t: Load module automatically if ID matches gfs2: read-only mounts should grab the sd_freeze_gl glock tpm_tis: extra chip->ops check on error path in tpm_tis_core_init arm64/alternatives: use subsections for replacement sequences m68k: mm: fix node memblock init m68k: nommu: register start of the memory with memblock drm/exynos: fix ref count leak in mic_pre_enable drm/msm: fix potential memleak in error branch vlan: consolidate VLAN parsing code and limit max parsing depth sched: consistently handle layer3 header accesses in the presence of VLANs cgroup: Fix sock_cgroup_data on big-endian. cgroup: fix cgroup_sk_alloc() for sk_clone_lock() tcp: md5: allow changing MD5 keys in all socket states tcp: md5: refine tcp_md5_do_add()/tcp_md5_hash_key() barriers tcp: md5: do not send silly options in SYNCOOKIES tcp: md5: add missing memory barriers in tcp_md5_do_add()/tcp_md5_hash_key() tcp: make sure listeners don't initialize congestion-control state tcp: fix SO_RCVLOWAT possible hangs under high mem pressure net: usb: qmi_wwan: add support for Quectel EG95 LTE modem net_sched: fix a memory leak in atm_tc_init() net: Added pointer check for dst->ops->neigh_lookup in dst_neigh_lookup_skb llc: make sure applications use ARPHRD_ETHER l2tp: remove skb_dst_set() from l2tp_xmit_skb() ipv4: fill fl4_icmp_{type,code} in ping_v4_sendmsg genetlink: remove genl_bind net: rmnet: fix lower interface leak perf: Make perf able to build with latest libbfd UPSTREAM: media: v4l2-ctrl: Add H264 profile and levels UPSTREAM: media: v4l2-ctrl: Add control for h.264 chroma qp offset ANDROID: GKI: ASoC: compress: revert some code to avoid race condition ANDROID: GKI: Update the ABI xml representation. ANDROID: GKI: kernel: tick-sched: Add an API for wakeup callbacks ANDROID: ASoC: Compress: Check and set pcm_new driver op Revert "ANDROID: GKI: arm64: gki_defconfig: Disable CONFIG_ARM64_TAGGED_ADDR_ABI" ANDROID: arm64: configs: enabe CONFIG_TMPFS Revert "ALSA: compress: fix partial_drain completion state" ANDROID: GKI: enable CONFIG_EXT4_FS_POSIX_ACL. ANDROID: GKI: set CONFIG_STATIC_USERMODEHELPER_PATH Linux 4.19.133 s390/mm: fix huge pte soft dirty copying ARC: elf: use right ELF_ARCH ARC: entry: fix potential EFA clobber when TIF_SYSCALL_TRACE dm: use noio when sending kobject event drm/radeon: fix double free btrfs: fix fatal extent_buffer readahead vs releasepage race Revert "ath9k: Fix general protection fault in ath9k_hif_usb_rx_cb" bpf: Check correct cred for CAP_SYSLOG in bpf_dump_raw_ok() kprobes: Do not expose probe addresses to non-CAP_SYSLOG module: Do not expose section addresses to non-CAP_SYSLOG module: Refactor section attr into bin attribute kernel: module: Use struct_size() helper kallsyms: Refactor kallsyms_show_value() to take cred KVM: x86: Mark CR4.TSD as being possibly owned by the guest KVM: x86: Inject #GP if guest attempts to toggle CR4.LA57 in 64-bit mode KVM: x86: bit 8 of non-leaf PDPEs is not reserved KVM: arm64: Stop clobbering x0 for HVC_SOFT_RESTART KVM: arm64: Fix definition of PAGE_HYP_DEVICE ALSA: usb-audio: add quirk for MacroSilicon MS2109 ALSA: hda - let hs_mic be picked ahead of hp_mic ALSA: opl3: fix infoleak in opl3 mlxsw: spectrum_router: Remove inappropriate usage of WARN_ON() net: macb: mark device wake capable when "magic-packet" property present bnxt_en: fix NULL dereference in case SR-IOV configuration fails cxgb4: fix all-mask IP address comparison nbd: Fix memory leak in nbd_add_socket arm64: kgdb: Fix single-step exception handling oops ALSA: compress: fix partial_drain completion state net: hns3: fix use-after-free when doing self test smsc95xx: avoid memory leak in smsc95xx_bind smsc95xx: check return value of smsc95xx_reset net: cxgb4: fix return error value in t4_prep_fw drm/mediatek: Check plane visibility in atomic_update net: qrtr: Fix an out of bounds read qrtr_endpoint_post() x86/entry: Increase entry_stack size to a full page nvme-rdma: assign completion vector correctly block: release bip in a right way in error path usb: dwc3: pci: Fix reference count leak in dwc3_pci_resume_work scsi: mptscsih: Fix read sense data size ARM: imx6: add missing put_device() call in imx6q_suspend_init() cifs: update ctime and mtime during truncate s390/kasan: fix early pgm check handler execution drm: panel-orientation-quirks: Use generic orientation-data for Acer S1003 drm: panel-orientation-quirks: Add quirk for Asus T101HA panel i40e: protect ring accesses with READ- and WRITE_ONCE ixgbe: protect ring accesses with READ- and WRITE_ONCE spi: spidev: fix a potential use-after-free in spidev_release() spi: spidev: fix a race between spidev_release and spidev_remove gpu: host1x: Detach driver on unregister drm/tegra: hub: Do not enable orphaned window group ARM: dts: omap4-droid4: Fix spi configuration and increase rate regmap: fix alignment issue spi: spi-fsl-dspi: Fix external abort on interrupt in resume or exit paths spi: spi-fsl-dspi: use IRQF_SHARED mode to request IRQ spi: spi-fsl-dspi: Fix lockup if device is removed during SPI transfer spi: spi-fsl-dspi: Adding shutdown hook KVM: s390: reduce number of IO pins to 1 ANDROID: GKI: update abi based on padding fields being added ANDROID: GKI: USB: Gadget: add Android ABI padding to struct usb_gadget ANDROID: GKI: sound/usb/card.h: add Android ABI padding to struct snd_usb_endpoint ANDROID: fscrypt: fix DUN contiguity with inline encryption + IV_INO_LBLK_32 policies ANDROID: f2fs: add back compress inode check Linux 4.19.132 efi: Make it possible to disable efivar_ssdt entirely dm zoned: assign max_io_len correctly irqchip/gic: Atomically update affinity MIPS: Add missing EHB in mtc0 -> mfc0 sequence for DSPen cifs: Fix the target file was deleted when rename failed. SMB3: Honor lease disabling for multiuser mounts SMB3: Honor persistent/resilient handle flags for multiuser mounts SMB3: Honor 'seal' flag for multiuser mounts Revert "ALSA: usb-audio: Improve frames size computation" nfsd: apply umask on fs without ACL support i2c: mlxcpld: check correct size of maximum RECV_LEN packet i2c: algo-pca: Add 0x78 as SCL stuck low status for PCA9665 nvme: fix a crash in nvme_mpath_add_disk SMB3: Honor 'posix' flag for multiuser mounts virtio-blk: free vblk-vqs in error path of virtblk_probe() drm: sun4i: hdmi: Remove extra HPD polling hwmon: (acpi_power_meter) Fix potential memory leak in acpi_power_meter_add() hwmon: (max6697) Make sure the OVERT mask is set correctly cxgb4: fix SGE queue dump destination buffer context cxgb4: use correct type for all-mask IP address comparison cxgb4: parse TC-U32 key values and masks natively cxgb4: use unaligned conversion for fetching timestamp drm/msm/dpu: fix error return code in dpu_encoder_init crypto: af_alg - fix use-after-free in af_alg_accept() due to bh_lock_sock() kgdb: Avoid suspicious RCU usage warning nvme-multipath: fix deadlock between ana_work and scan_work nvme-multipath: set bdi capabilities once s390/debug: avoid kernel warning on too large number of pages usb: usbtest: fix missing kfree(dev->buf) in usbtest_disconnect mm/slub: fix stack overruns with SLUB_STATS mm/slub.c: fix corrupted freechain in deactivate_slab() usbnet: smsc95xx: Fix use-after-free after removal EDAC/amd64: Read back the scrub rate PCI register on F15h mm: fix swap cache node allocation mask btrfs: fix a block group ref counter leak after failure to remove block group ANDROID: Update ABI representation for libabigail update ANDROID: Update the ABI representation ANDROID: Update the ABI xml representation ANDROID: GKI: fix ABI diffs caused by GPU heap and pool vmstat additions ANDROID: sched: consider stune boost margin when computing energy ANDROID: GKI: move abi files to android/ ANDROID: GKI: drop unneeded "_whitelist" off of symbol filenames UPSTREAM: binder: fix null deref of proc->context ANDROID: cpufreq: schedutil: maintain raw cache when next_f is not changed UPSTREAM: net: bpf: Make bpf_ktime_get_ns() available to non GPL programs UPSTREAM: usb: musb: mediatek: add reset FADDR to zero in reset interrupt handle ANDROID: GKI: scripts: Makefile: update the lz4 command (#2) ANDROID: Update the ABI xml representation Revert "drm/dsi: Fix byte order of DCS set/get brightness" Linux 4.19.131 Revert "tty: hvc: Fix data abort due to race in hvc_open" xfs: add agf freeblocks verify in xfs_agf_verify dm writecache: add cond_resched to loop in persistent_memory_claim() dm writecache: correct uncommitted_block when discarding uncommitted entry NFSv4 fix CLOSE not waiting for direct IO compeletion pNFS/flexfiles: Fix list corruption if the mirror count changes SUNRPC: Properly set the @subbuf parameter of xdr_buf_subsegment() sunrpc: fixed rollback in rpc_gssd_dummy_populate() Staging: rtl8723bs: prevent buffer overflow in update_sta_support_rate() drm/radeon: fix fb_div check in ni_init_smc_spll_table() drm: rcar-du: Fix build error ring-buffer: Zero out time extend if it is nested and not absolute tracing: Fix event trigger to accept redundant spaces arm64: perf: Report the PC value in REGS_ABI_32 mode ocfs2: fix panic on nfs server over ocfs2 ocfs2: fix value of OCFS2_INVALID_SLOT ocfs2: load global_inode_alloc ocfs2: avoid inode removal while nfsd is accessing it mm/slab: use memzero_explicit() in kzfree() btrfs: fix failure of RWF_NOWAIT write into prealloc extent beyond eof btrfs: fix data block group relocation failure due to concurrent scrub x86/asm/64: Align start of __clear_user() loop to 16-bytes KVM: nVMX: Plumb L2 GPA through to PML emulation KVM: X86: Fix MSR range of APIC registers in X2APIC mode erofs: fix partially uninitialized misuse in z_erofs_onlinepage_fixup ACPI: sysfs: Fix pm_profile_attr type ALSA: hda/realtek - Add quirk for MSI GE63 laptop ALSA: hda: Add NVIDIA codec IDs 9a & 9d through a0 to patch table RISC-V: Don't allow write+exec only page mapping request in mmap blktrace: break out of blktrace setup on concurrent calls kbuild: improve cc-option to clean up all temporary files arm64: sve: Fix build failure when ARM64_SVE=y and SYSCTL=n s390/vdso: fix vDSO clock_getres() s390/ptrace: fix setting syscall number net: alx: fix race condition in alx_remove ibmvnic: Harden device login requests hwrng: ks-sa - Fix runtime PM imbalance on error riscv/atomic: Fix sign extension for RV64I drm/amd/display: Use kfree() to free rgb_user in calculate_user_regamma_ramp() ata/libata: Fix usage of page address by page_address in ata_scsi_mode_select_xlat function sata_rcar: handle pm_runtime_get_sync failure cases sched/core: Fix PI boosting between RT and DEADLINE tasks sched/deadline: Initialize ->dl_boosted i2c: core: check returned size of emulated smbus block read i2c: fsi: Fix the port number field in status register net: bcmgenet: use hardware padding of runt frames netfilter: ipset: fix unaligned atomic access usb: gadget: udc: Potential Oops in error handling code ARM: imx5: add missing put_device() call in imx_suspend_alloc_ocram() cxgb4: move handling L2T ARP failures to caller net: qed: fix excessive QM ILT lines consumption net: qed: fix NVMe login fails over VFs net: qed: fix left elements count calculation RDMA/mad: Fix possible memory leak in ib_mad_post_receive_mads() ASoC: rockchip: Fix a reference count leak. RDMA/cma: Protect bind_list and listen_list while finding matching cm id RDMA/qedr: Fix KASAN: use-after-free in ucma_event_handler+0x532 rxrpc: Fix handling of rwind from an ACK packet ARM: dts: NSP: Correct FA2 mailbox node regmap: Fix memory leak from regmap_register_patch x86/resctrl: Fix a NULL vs IS_ERR() static checker warning in rdt_cdp_peer_get() ARM: dts: Fix duovero smsc interrupt for suspend ASoC: fsl_ssi: Fix bclk calculation for mono channel regualtor: pfuze100: correct sw1a/sw2 on pfuze3000 efi/esrt: Fix reference count leak in esre_create_sysfs_entry. ASoC: q6asm: handle EOS correctly xfrm: Fix double ESP trailer insertion in IPsec crypto offload. cifs/smb3: Fix data inconsistent when zero file range cifs/smb3: Fix data inconsistent when punch hole IB/mad: Fix use after free when destroying MAD agent loop: replace kill_bdev with invalidate_bdev cdc-acm: Add DISABLE_ECHO quirk for Microchip/SMSC chip xhci: Return if xHCI doesn't support LPM xhci: Fix enumeration issue when setting max packet size for FS devices. xhci: Fix incorrect EP_STATE_MASK scsi: zfcp: Fix panic on ERP timeout for previously dismissed ERP action ALSA: usb-audio: Fix OOB access of mixer element list ALSA: usb-audio: add quirk for Samsung USBC Headset (AKG) ALSA: usb-audio: add quirk for Denon DCD-1500RE usb: typec: tcpci_rt1711h: avoid screaming irq causing boot hangs usb: host: ehci-exynos: Fix error check in exynos_ehci_probe() xhci: Poll for U0 after disabling USB2 LPM usb: host: xhci-mtk: avoid runtime suspend when removing hcd USB: ehci: reopen solution for Synopsys HC bug usb: add USB_QUIRK_DELAY_INIT for Logitech C922 usb: dwc2: Postponed gadget registration to the udc class driver USB: ohci-sm501: Add missed iounmap() in remove net: core: reduce recursion limit value net: Do not clear the sock TX queue in sk_set_socket() net: Fix the arp error in some cases sch_cake: don't call diffserv parsing code when it is not needed tcp_cubic: fix spurious HYSTART_DELAY exit upon drop in min RTT sch_cake: fix a few style nits sch_cake: don't try to reallocate or unshare skb unconditionally ip_tunnel: fix use-after-free in ip_tunnel_lookup() net: phy: Check harder for errors in get_phy_id() ip6_gre: fix use-after-free in ip6gre_tunnel_lookup() tg3: driver sleeps indefinitely when EEH errors exceed eeh_max_freezes tcp: grow window for OOO packets only for SACK flows tcp: don't ignore ECN CWR on pure ACK sctp: Don't advertise IPv4 addresses if ipv6only is set on the socket rxrpc: Fix notification call on completion of discarded calls rocker: fix incorrect error handling in dma_rings_init net: usb: ax88179_178a: fix packet alignment padding net: increment xmit_recursion level in dev_direct_xmit() net: use correct this_cpu primitive in dev_recursion_level net: place xmit recursion in softnet data net: fix memleak in register_netdevice() net: bridge: enfore alignment for ethernet address mld: fix memory leak in ipv6_mc_destroy_dev() ibmveth: Fix max MTU limit apparmor: don't try to replace stale label in ptraceme check ALSA: hda/realtek - Enable micmute LED on and HP system ALSA: hda/realtek: Enable mute LED on an HP system ALSA: hda/realtek - Enable the headset of ASUS B9450FA with ALC294 fix a braino in "sparc32: fix register window handling in genregs32_[gs]et()" i2c: tegra: Fix Maximum transfer size i2c: tegra: Add missing kerneldoc for some fields i2c: tegra: Cleanup kerneldoc comments EDAC/amd64: Add Family 17h Model 30h PCI IDs net: sched: export __netdev_watchdog_up() net: bcmgenet: remove HFB_CTRL access mtd: rawnand: marvell: Fix the condition on a return code fanotify: fix ignore mask logic for events on child and on dir block/bio-integrity: don't free 'buf' if bio_integrity_add_page() failed net: be more gentle about silly gso requests coming from user ANDROID: lib/vdso: do not update timespec if clock_getres() fails Revert "ANDROID: fscrypt: add key removal notifier chain" ANDROID: update the ABI xml and qcom whitelist ANDROID: fs: export vfs_{read|write} ANDROID: GKI: update abi definitions now that sdcardfs is gone Revert "ANDROID: sdcardfs: Enable modular sdcardfs" Revert "ANDROID: vfs: Add setattr2 for filesystems with per mount permissions" Revert "ANDROID: vfs: fix export symbol type" Revert "ANDROID: vfs: Add permission2 for filesystems with per mount permissions" Revert "ANDROID: vfs: fix export symbol types" Revert "ANDROID: vfs: add d_canonical_path for stacked filesystem support" Revert "ANDROID: fs: Restore vfs_path_lookup() export" ANDROID: sdcardfs: remove sdcardfs from system Revert "ALSA: usb-audio: Improve frames size computation" ANDROID: Makefile: append BUILD_NUMBER to version string when defined ANDROID: GKI: Update ABI for incremental fs ANDROID: GKI: Update cuttlefish whitelist ANDROID: GKI: Disable INCREMENTAL_FS on x86 too ANDROID: cpufreq: schedutil: drop cache when update skipped due to rate limit Linux 4.19.130 KVM: x86/mmu: Set mmio_value to '0' if reserved #PF can't be generated kvm: x86: Fix reserved bits related calculation errors caused by MKTME kvm: x86: Move kvm_set_mmio_spte_mask() from x86.c to mmu.c md: add feature flag MD_FEATURE_RAID0_LAYOUT Revert "dpaa_eth: fix usage as DSA master, try 3" net: core: device_rename: Use rwsem instead of a seqcount sched/rt, net: Use CONFIG_PREEMPTION.patch kretprobe: Prevent triggering kretprobe from within kprobe_flush_task net: octeon: mgmt: Repair filling of RX ring e1000e: Do not wake up the system via WOL if device wakeup is disabled kprobes: Fix to protect kick_kprobe_optimizer() by kprobe_mutex crypto: algboss - don't wait during notifier callback crypto: algif_skcipher - Cap recv SG list at ctx->used drm/i915/icl+: Fix hotplug interrupt disabling after storm detection drm/i915: Whitelist context-local timestamp in the gen9 cmdparser s390: fix syscall_get_error for compat processes mtd: rawnand: tmio: Fix the probe error path mtd: rawnand: mtk: Fix the probe error path mtd: rawnand: plat_nand: Fix the probe error path mtd: rawnand: socrates: Fix the probe error path mtd: rawnand: oxnas: Fix the probe error path mtd: rawnand: oxnas: Add of_node_put() mtd: rawnand: orion: Fix the probe error path mtd: rawnand: xway: Fix the probe error path mtd: rawnand: sharpsl: Fix the probe error path mtd: rawnand: diskonchip: Fix the probe error path mtd: rawnand: Pass a nand_chip object to nand_release() mtd: rawnand: Pass a nand_chip object to nand_scan() block: nr_sects_write(): Disable preemption on seqcount write x86/boot/compressed: Relax sed symbol type regex for LLVM ld.lld drm/dp_mst: Increase ACT retry timeout to 3s ext4: avoid race conditions when remounting with options that change dax ext4: fix partial cluster initialization when splitting extent selinux: fix double free drm/amdgpu: Replace invalid device ID with a valid device ID drm/qxl: Use correct notify port address when creating cursor ring drm/dp_mst: Reformat drm_dp_check_act_status() a bit drm: encoder_slave: fix refcouting error for modules libata: Use per port sync for detach arm64: hw_breakpoint: Don't invoke overflow handler on uaccess watchpoints block: Fix use-after-free in blkdev_get() afs: afs_write_end() should change i_size under the right lock afs: Fix non-setting of mtime when writing into mmap bcache: fix potential deadlock problem in btree_gc_coalesce ext4: stop overwrite the errcode in ext4_setup_super perf report: Fix NULL pointer dereference in hists__fprintf_nr_sample_events() usb/ehci-platform: Set PM runtime as active on resume usb: host: ehci-platform: add a quirk to avoid stuck usb/xhci-plat: Set PM runtime as active on resume xdp: Fix xsk_generic_xmit errno net/filter: Permit reading NET in load_bytes_relative when MAC not set x86/idt: Keep spurious entries unset in system_vectors scsi: acornscsi: Fix an error handling path in acornscsi_probe() drm/sun4i: hdmi ddc clk: Fix size of m divider ASoC: rt5645: Add platform-data for Asus T101HA ASoC: Intel: bytcr_rt5640: Add quirk for Toshiba Encore WT10-A tablet ASoC: core: only convert non DPCM link to DPCM link afs: Fix memory leak in afs_put_sysnames() selftests/net: in timestamping, strncpy needs to preserve null byte drivers/perf: hisi: Fix wrong value for all counters enable NTB: ntb_test: Fix bug when counting remote files NTB: perf: Fix race condition when run with ntb_test NTB: perf: Fix support for hardware that doesn't have port numbers NTB: perf: Don't require one more memory window than number of peers NTB: Revert the change to use the NTB device dev for DMA allocations NTB: ntb_tool: reading the link file should not end in a NULL byte ntb_tool: pass correct struct device to dma_alloc_coherent ntb_perf: pass correct struct device to dma_alloc_coherent gfs2: fix use-after-free on transaction ail lists blktrace: fix endianness for blk_log_remap() blktrace: fix endianness in get_pdu_int() blktrace: use errno instead of bi_status selftests/vm/pkeys: fix alloc_random_pkey() to make it really random elfnote: mark all .note sections SHF_ALLOC include/linux/bitops.h: avoid clang shift-count-overflow warnings lib/zlib: remove outdated and incorrect pre-increment optimization geneve: change from tx_error to tx_dropped on missing metadata crypto: omap-sham - add proper load balancing support for multicore pinctrl: freescale: imx: Fix an error handling path in 'imx_pinctrl_probe()' pinctrl: imxl: Fix an error handling path in 'imx1_pinctrl_core_probe()' scsi: ufs: Don't update urgent bkops level when toggling auto bkops scsi: iscsi: Fix reference count leak in iscsi_boot_create_kobj gfs2: Allow lock_nolock mount to specify jid=X openrisc: Fix issue with argument clobbering for clone/fork rxrpc: Adjust /proc/net/rxrpc/calls to display call->debug_id not user_ID vfio/mdev: Fix reference count leak in add_mdev_supported_type ASoC: fsl_asrc_dma: Fix dma_chan leak when config DMA channel failed extcon: adc-jack: Fix an error handling path in 'adc_jack_probe()' powerpc/4xx: Don't unmap NULL mbase of: Fix a refcounting bug in __of_attach_node_sysfs() NFSv4.1 fix rpc_call_done assignment for BIND_CONN_TO_SESSION net: sunrpc: Fix off-by-one issues in 'rpc_ntop6' clk: sprd: return correct type of value for _sprd_pll_recalc_rate KVM: PPC: Book3S HV: Ignore kmemleak false positives scsi: ufs-qcom: Fix scheduling while atomic issue clk: bcm2835: Fix return type of bcm2835_register_gate scsi: target: tcmu: Fix a use after free in tcmu_check_expired_queue_cmd() ASoC: fix incomplete error-handling in img_i2s_in_probe. x86/apic: Make TSC deadline timer detection message visible RDMA/iw_cxgb4: cleanup device debugfs entries on ULD remove usb: gadget: Fix issue with config_ep_by_speed function usb: gadget: fix potential double-free in m66592_probe. usb: gadget: lpc32xx_udc: don't dereference ep pointer before null check USB: gadget: udc: s3c2410_udc: Remove pointless NULL check in s3c2410_udc_nuke usb: dwc2: gadget: move gadget resume after the core is in L0 state watchdog: da9062: No need to ping manually before setting timeout IB/cma: Fix ports memory leak in cma_configfs PCI: dwc: Fix inner MSI IRQ domain registration PCI/PTM: Inherit Switch Downstream Port PTM settings from Upstream Port dm zoned: return NULL if dmz_get_zone_for_reclaim() fails to find a zone powerpc/64s/pgtable: fix an undefined behaviour arm64: tegra: Fix ethernet phy-mode for Jetson Xavier scsi: target: tcmu: Userspace must not complete queued commands clk: samsung: exynos5433: Add IGNORE_UNUSED flag to sclk_i2s1 fpga: dfl: afu: Corrected error handling levels tty: n_gsm: Fix bogus i++ in gsm_data_kick USB: host: ehci-mxc: Add error handling in ehci_mxc_drv_probe() ASoC: Intel: bytcr_rt5640: Add quirk for Toshiba Encore WT8-A tablet drm/msm/mdp5: Fix mdp5_init error path for failed mdp5_kms allocation usb/ohci-platform: Fix a warning when hibernating vfio-pci: Mask cap zero powerpc/ps3: Fix kexec shutdown hang powerpc/pseries/ras: Fix FWNMI_VALID off by one ipmi: use vzalloc instead of kmalloc for user creation HID: Add quirks for Trust Panora Graphic Tablet tty: n_gsm: Fix waking up upper tty layer when room available tty: n_gsm: Fix SOF skipping powerpc/64: Don't initialise init_task->thread.regs PCI: Fix pci_register_host_bridge() device_register() error handling clk: ti: composite: fix memory leak dlm: remove BUG() before panic() pinctrl: rockchip: fix memleak in rockchip_dt_node_to_map scsi: mpt3sas: Fix double free warnings power: supply: smb347-charger: IRQSTAT_D is volatile power: supply: lp8788: Fix an error handling path in 'lp8788_charger_probe()' scsi: qla2xxx: Fix warning after FC target reset PCI/ASPM: Allow ASPM on links to PCIe-to-PCI/PCI-X Bridges PCI: rcar: Fix incorrect programming of OB windows drivers: base: Fix NULL pointer exception in __platform_driver_probe() if a driver developer is foolish serial: amba-pl011: Make sure we initialize the port.lock spinlock i2c: pxa: fix i2c_pxa_scream_blue_murder() debug output PCI: v3-semi: Fix a memory leak in v3_pci_probe() error handling paths staging: sm750fb: add missing case while setting FB_VISUAL usb: dwc3: gadget: Properly handle failed kick_transfer thermal/drivers/ti-soc-thermal: Avoid dereferencing ERR_PTR slimbus: ngd: get drvdata from correct device tty: hvc: Fix data abort due to race in hvc_open s390/qdio: put thinint indicator after early error ALSA: usb-audio: Fix racy list management in output queue ALSA: usb-audio: Improve frames size computation staging: gasket: Fix mapping refcnt leak when register/store fails staging: gasket: Fix mapping refcnt leak when put attribute fails firmware: qcom_scm: fix bogous abuse of dma-direct internals pinctrl: rza1: Fix wrong array assignment of rza1l_swio_entries scsi: qedf: Fix crash when MFW calls for protocol stats while function is still probing gpio: dwapb: Append MODULE_ALIAS for platform driver ARM: dts: sun8i-h2-plus-bananapi-m2-zero: Fix led polarity scsi: qedi: Do not flush offload work if ARP not resolved arm64: dts: mt8173: fix unit name warnings staging: greybus: fix a missing-check bug in gb_lights_light_config() x86/purgatory: Disable various profiling and sanitizing options apparmor: fix nnp subset test for unconfined scsi: ibmvscsi: Don't send host info in adapter info MAD after LPM scsi: sr: Fix sr_probe() missing deallocate of device minor ASoC: meson: add missing free_irq() in error path apparmor: check/put label on apparmor_sk_clone_security() apparmor: fix introspection of of task mode for unconfined tasks mksysmap: Fix the mismatch of '.L' symbols in System.map NTB: Fix the default port and peer numbers for legacy drivers NTB: ntb_pingpong: Choose doorbells based on port number yam: fix possible memory leak in yam_init_driver pwm: img: Call pm_runtime_put() in pm_runtime_get_sync() failed case powerpc/crashkernel: Take "mem=" option into account PCI: vmd: Filter resource type bits from shadow register nfsd: Fix svc_xprt refcnt leak when setup callback client failed powerpc/perf/hv-24x7: Fix inconsistent output values incase multiple hv-24x7 events run clk: clk-flexgen: fix clock-critical handling scsi: lpfc: Fix lpfc_nodelist leak when processing unsolicited event mfd: wm8994: Fix driver operation if loaded as modules gpio: dwapb: Call acpi_gpiochip_free_interrupts() on GPIO chip de-registration m68k/PCI: Fix a memory leak in an error handling path RDMA/mlx5: Add init2init as a modify command vfio/pci: fix memory leaks in alloc_perm_bits() ps3disk: use the default segment boundary PCI: aardvark: Don't blindly enable ASPM L0s and don't write to read-only register dm mpath: switch paths in dm_blk_ioctl() code path serial: 8250: Fix max baud limit in generic 8250 port usblp: poison URBs upon disconnect clk: samsung: Mark top ISP and CAM clocks on Exynos542x as critical i2c: pxa: clear all master action bits in i2c_pxa_stop_message() f2fs: report delalloc reserve as non-free in statfs for project quota iio: bmp280: fix compensation of humidity scsi: qla2xxx: Fix issue with adapter's stopping state PCI: Allow pci_resize_resource() for devices on root bus ALSA: isa/wavefront: prevent out of bounds write in ioctl ALSA: hda/realtek - Introduce polarity for micmute LED GPIO scsi: qedi: Check for buffer overflow in qedi_set_path() ARM: integrator: Add some Kconfig selections ASoC: davinci-mcasp: Fix dma_chan refcnt leak when getting dma type backlight: lp855x: Ensure regulators are disabled on probe failure clk: qcom: msm8916: Fix the address location of pll->config_reg remoteproc: Fix IDR initialisation in rproc_alloc() iio: pressure: bmp280: Tolerate IRQ before registering i2c: piix4: Detect secondary SMBus controller on AMD AM4 chipsets ASoC: tegra: tegra_wm8903: Support nvidia, headset property clk: sunxi: Fix incorrect usage of round_down() power: supply: bq24257_charger: Replace depends on REGMAP_I2C with select ANDROID: ext4: Optimize match for casefolded encrypted dirs ANDROID: ext4: Handle casefolding with encryption ANDROID: extcon: Remove redundant EXPORT_SYMBOL_GPL ANDROID: update the ABI xml representation ANDROID: GKI: cfg80211: add ABI changes for CONFIG_NL80211_TESTMODE ANDROID: gki_defconfig: x86: Enable KERNEL_LZ4 ANDROID: GKI: scripts: Makefile: update the lz4 command FROMLIST: f2fs: fix use-after-free when accessing bio->bi_crypt_context UPSTREAM: fdt: Update CRC check for rng-seed ANDROID: GKI: Update ABI for incremental fs ANDROID: GKI: Update whitelist and defconfig for incfs ANDROID: Use depmod from the hermetic toolchain Linux 4.19.129 perf symbols: Fix debuginfo search for Ubuntu perf probe: Check address correctness by map instead of _etext perf probe: Fix to check blacklist address correctly perf probe: Do not show the skipped events w1: omap-hdq: cleanup to add missing newline for some dev_dbg mtd: rawnand: pasemi: Fix the probe error path mtd: rawnand: brcmnand: fix hamming oob layout sunrpc: clean up properly in gss_mech_unregister() sunrpc: svcauth_gss_register_pseudoflavor must reject duplicate registrations. kbuild: force to build vmlinux if CONFIG_MODVERSION=y powerpc/64s: Save FSCR to init_task.thread.fscr after feature init powerpc/64s: Don't let DT CPU features set FSCR_DSCR drivers/macintosh: Fix memleak in windfarm_pm112 driver ARM: dts: s5pv210: Set keep-power-in-suspend for SDHCI1 on Aries ARM: dts: at91: sama5d2_ptc_ek: fix vbus pin ARM: dts: exynos: Fix GPIO polarity for thr GalaxyS3 CM36651 sensor's bus ARM: tegra: Correct PL310 Auxiliary Control Register initialization kernel/cpu_pm: Fix uninitted local in cpu_pm alpha: fix memory barriers so that they conform to the specification dm crypt: avoid truncating the logical block size sparc64: fix misuses of access_process_vm() in genregs32_[sg]et() sparc32: fix register window handling in genregs32_[gs]et() gnss: sirf: fix error return code in sirf_probe() pinctrl: samsung: Save/restore eint_mask over suspend for EINT_TYPE GPIOs pinctrl: samsung: Correct setting of eint wakeup mask on s5pv210 power: vexpress: add suppress_bind_attrs to true igb: Report speed and duplex as unknown when device is runtime suspended media: ov5640: fix use of destroyed mutex b43_legacy: Fix connection problem with WPA3 b43: Fix connection problem with WPA3 b43legacy: Fix case where channel status is corrupted Bluetooth: hci_bcm: fix freeing not-requested IRQ media: go7007: fix a miss of snd_card_free carl9170: remove P2P_GO support e1000e: Relax condition to trigger reset for ME workaround e1000e: Disable TSO for buffer overrun workaround PCI: Program MPS for RCiEP devices ima: Call ima_calc_boot_aggregate() in ima_eventdigest_init() btrfs: fix wrong file range cleanup after an error filling dealloc range btrfs: fix error handling when submitting direct I/O bio PCI: Generalize multi-function power dependency device links PCI: Unify ACS quirk desired vs provided checking PCI: Make ACS quirk implementations more uniform serial: 8250_pci: Move Pericom IDs to pci_ids.h PCI: Add Loongson vendor ID x86/amd_nb: Add Family 19h PCI IDs PCI: vmd: Add device id for VMD device 8086:9A0B PCI: Add Amazon's Annapurna Labs vendor ID PCI: Add Genesys Logic, Inc. Vendor ID ALSA: lx6464es - add support for LX6464ESe pci express variant x86/amd_nb: Add PCI device IDs for family 17h, model 70h PCI: mediatek: Add controller support for MT7629 PCI: Enable NVIDIA HDA controllers PCI: Add NVIDIA GPU multi-function power dependencies PCI: Add Synopsys endpoint EDDA Device ID misc: pci_endpoint_test: Add support to test PCI EP in AM654x misc: pci_endpoint_test: Add the layerscape EP device support PCI: Move Rohm Vendor ID to generic list PCI: Move Synopsys HAPS platform device IDs PCI: add USR vendor id and use it in r8169 and w6692 driver x86/amd_nb: Add PCI device IDs for family 17h, model 30h hwmon/k10temp, x86/amd_nb: Consolidate shared device IDs pci:ipmi: Move IPMI PCI class id defines to pci_ids.h PCI: Remove unused NFP32xx IDs PCI: Add ACS quirk for Intel Root Complex Integrated Endpoints PCI: Add ACS quirk for iProc PAXB PCI: Avoid FLR for AMD Starship USB 3.0 PCI: Avoid FLR for AMD Matisse HD Audio & USB 3.0 PCI: Avoid Pericom USB controller OHCI/EHCI PME# defect ext4: fix race between ext4_sync_parent() and rename() ext4: fix error pointer dereference ext4: fix EXT_MAX_EXTENT/INDEX to check for zeroed eh_max evm: Fix possible memory leak in evm_calc_hmac_or_hash() ima: Directly assign the ima_default_policy pointer to ima_rules ima: Fix ima digest hash table key calculation mm: initialize deferred pages with interrupts enabled mm: thp: make the THP mapcount atomic against __split_huge_pmd_locked() btrfs: send: emit file capabilities after chown btrfs: include non-missing as a qualifier for the latest_bdev string.h: fix incompatibility between FORTIFY_SOURCE and KASAN platform/x86: intel-vbtn: Only blacklist SW_TABLET_MODE on the 9 / "Laptop" chasis-type platform/x86: intel-hid: Add a quirk to support HP Spectre X2 (2015) platform/x86: hp-wmi: Convert simple_strtoul() to kstrtou32() cpuidle: Fix three reference count leaks spi: dw: Return any value retrieved from the dma_transfer callback mmc: sdhci-esdhc-imx: fix the mask for tuning start point ixgbe: fix signed-integer-overflow warning mmc: via-sdmmc: Respect the cmd->busy_timeout from the mmc core staging: greybus: sdio: Respect the cmd->busy_timeout from the mmc core mmc: sdhci-msm: Set SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12 quirk bcache: fix refcount underflow in bcache_device_free() MIPS: Fix IRQ tracing when call handle_fpe() and handle_msa_fpe() PCI: Don't disable decoding when mmio_always_on is set macvlan: Skip loopback packets in RX handler btrfs: qgroup: mark qgroup inconsistent if we're inherting snapshot to a new qgroup m68k: mac: Don't call via_flush_cache() on Mac IIfx x86/mm: Stop printing BRK addresses crypto: stm32/crc32 - fix multi-instance crypto: stm32/crc32 - fix run-time self test issue. crypto: stm32/crc32 - fix ext4 chksum BUG_ON() mips: Add udelay lpj numbers adjustment mips: MAAR: Use more precise address mask x86/boot: Correct relocation destination on old linkers mwifiex: Fix memory corruption in dump_station rtlwifi: Fix a double free in _rtl_usb_tx_urb_setup() net/mlx5e: IPoIB, Drop multicast packets that this interface sent veth: Adjust hard_start offset on redirect XDP frames md: don't flush workqueue unconditionally in md_open mt76: avoid rx reorder buffer overflow net: qed*: Reduce RX and TX default ring count when running inside kdump kernel wcn36xx: Fix error handling path in 'wcn36xx_probe()' ath10k: Remove msdu from idr when management pkt send fails nvme: refine the Qemu Identify CNS quirk platform/x86: intel-vbtn: Also handle tablet-mode switch on "Detachable" and "Portable" chassis-types platform/x86: intel-vbtn: Do not advertise switches to userspace if they are not there platform/x86: intel-vbtn: Split keymap into buttons and switches parts platform/x86: intel-vbtn: Use acpi_evaluate_integer() xfs: fix duplicate verification from xfs_qm_dqflush() xfs: reset buffer write failure state on successful completion kgdb: Fix spurious true from in_dbg_master() mips: cm: Fix an invalid error code of INTVN_*_ERR MIPS: Truncate link address into 32bit for 32bit kernel Crypto/chcr: fix for ccm(aes) failed test xfs: clean up the error handling in xfs_swap_extents powerpc/spufs: fix copy_to_user while atomic net: allwinner: Fix use correct return type for ndo_start_xmit() media: cec: silence shift wrapping warning in __cec_s_log_addrs() net: lpc-enet: fix error return code in lpc_mii_init() drivers/perf: hisi: Fix typo in events attribute array sched/core: Fix illegal RCU from offline CPUs exit: Move preemption fixup up, move blocking operations down lib/mpi: Fix 64-bit MIPS build with Clang net: bcmgenet: set Rx mode before starting netif selftests/bpf: Fix memory leak in extract_build_id() netfilter: nft_nat: return EOPNOTSUPP if type or flags are not supported audit: fix a net reference leak in audit_list_rules_send() Bluetooth: btbcm: Add 2 missing models to subver tables MIPS: Make sparse_init() using top-down allocation media: platform: fcp: Set appropriate DMA parameters media: dvb: return -EREMOTEIO on i2c transfer failure. audit: fix a net reference leak in audit_send_reply() dt-bindings: display: mediatek: control dpi pins mode to avoid leakage e1000: Distribute switch variables for initialization tools api fs: Make xxx__mountpoint() more scalable brcmfmac: fix wrong location to get firmware feature staging: android: ion: use vmap instead of vm_map_ram net: vmxnet3: fix possible buffer overflow caused by bad DMA value in vmxnet3_get_rss() x86/kvm/hyper-v: Explicitly align hcall param for kvm_hyperv_exit spi: dw: Fix Rx-only DMA transfers mmc: meson-mx-sdio: trigger a soft reset after a timeout or CRC error batman-adv: Revert "disable ethtool link speed detection when auto negotiation off" ARM: 8978/1: mm: make act_mm() respect THREAD_SIZE btrfs: do not ignore error from btrfs_next_leaf() when inserting checksums clocksource: dw_apb_timer_of: Fix missing clockevent timers clocksource: dw_apb_timer: Make CPU-affiliation being optional spi: dw: Enable interrupts in accordance with DMA xfer mode kgdb: Prevent infinite recursive entries to the debugger kgdb: Disable WARN_CONSOLE_UNLOCKED for all kgdb Bluetooth: Add SCO fallback for invalid LMP parameters error MIPS: Loongson: Build ATI Radeon GPU driver as module ixgbe: Fix XDP redirect on archs with PAGE_SIZE above 4K arm64: insn: Fix two bugs in encoding 32-bit logical immediates spi: dw: Zero DMA Tx and Rx configurations on stack arm64: cacheflush: Fix KGDB trap detection efi/libstub/x86: Work around LLVM ELF quirk build regression net: ena: fix error returning in ena_com_get_hash_function() net: atlantic: make hw_get_regs optional spi: pxa2xx: Apply CS clk quirk to BXT objtool: Ignore empty alternatives media: si2157: Better check for running tuner in init crypto: ccp -- don't "select" CONFIG_DMADEVICES drm: bridge: adv7511: Extend list of audio sample rates ACPI: GED: use correct trigger type field in _Exx / _Lxx handling KVM: arm64: Synchronize sysreg state on injecting an AArch32 exception xen/pvcalls-back: test for errors when calling backend_connect() mmc: sdio: Fix potential NULL pointer error in mmc_sdio_init_card() ARM: dts: at91: sama5d2_ptc_ek: fix sdmmc0 node description mmc: sdhci-msm: Clear tuning done flag while hs400 tuning agp/intel: Reinforce the barrier after GTT updates perf: Add cond_resched() to task_function_call() fat: don't allow to mount if the FAT length == 0 mm/slub: fix a memory leak in sysfs_slab_add() drm/vkms: Hold gem object while still in-use Smack: slab-out-of-bounds in vsscanf ath9k: Fix general protection fault in ath9k_hif_usb_rx_cb ath9x: Fix stack-out-of-bounds Write in ath9k_hif_usb_rx_cb ath9k: Fix use-after-free Write in ath9k_htc_rx_msg ath9k: Fix use-after-free Read in ath9k_wmi_ctrl_rx scsi: megaraid_sas: TM command refire leads to controller firmware crash KVM: arm64: Make vcpu_cp1x() work on Big Endian hosts KVM: MIPS: Fix VPN2_MASK definition for variable cpu_vmbits KVM: MIPS: Define KVM_ENTRYHI_ASID to cpu_asid_mask(&boot_cpu_data) KVM: nVMX: Consult only the "basic" exit reason when routing nested exit KVM: nSVM: leave ASID aside in copy_vmcb_control_area KVM: nSVM: fix condition for filtering async PF video: fbdev: w100fb: Fix a potential double free. proc: Use new_inode not new_inode_pseudo ovl: initialize error in ovl_copy_xattr selftests/net: in rxtimestamp getopt_long needs terminating null entry crypto: virtio: Fix dest length calculation in __virtio_crypto_skcipher_do_req() crypto: virtio: Fix src/dst scatterlist calculation in __virtio_crypto_skcipher_do_req() crypto: virtio: Fix use-after-free in virtio_crypto_skcipher_finalize_req() spi: pxa2xx: Fix runtime PM ref imbalance on probe error spi: pxa2xx: Balance runtime PM enable/disable on error spi: bcm2835: Fix controller unregister order spi: pxa2xx: Fix controller unregister order spi: Fix controller unregister order spi: No need to assign dummy value in spi_unregister_controller() x86/speculation: PR_SPEC_FORCE_DISABLE enforcement for indirect branches. x86/speculation: Avoid force-disabling IBPB based on STIBP and enhanced IBRS. x86/speculation: Add support for STIBP always-on preferred mode x86/speculation: Change misspelled STIPB to STIBP KVM: x86: only do L1TF workaround on affected processors KVM: x86/mmu: Consolidate "is MMIO SPTE" code kvm: x86: Fix L1TF mitigation for shadow MMU KVM: x86: Fix APIC page invalidation race x86/{mce,mm}: Unmap the entire page if the whole page is affected and poisoned ALSA: pcm: disallow linking stream to itself crypto: cavium/nitrox - Fix 'nitrox_get_first_device()' when ndevlist is fully iterated PM: runtime: clk: Fix clk_pm_runtime_get() error path spi: bcm-qspi: when tx/rx buffer is NULL set to 0 spi: bcm2835aux: Fix controller unregister order spi: dw: Fix controller unregister order nilfs2: fix null pointer dereference at nilfs_segctor_do_construct() cgroup, blkcg: Prepare some symbols for module and !CONFIG_CGROUP usages ACPI: PM: Avoid using power resources if there are none for D0 ACPI: GED: add support for _Exx / _Lxx handler methods ACPI: CPPC: Fix reference count leak in acpi_cppc_processor_probe() ACPI: sysfs: Fix reference count leak in acpi_sysfs_add_hotplug_profile() ALSA: usb-audio: Add vendor, product and profile name for HP Thunderbolt Dock ALSA: usb-audio: Fix inconsistent card PM state after resume ALSA: hda/realtek - add a pintbl quirk for several Lenovo machines ALSA: es1688: Add the missed snd_card_free() efi/efivars: Add missing kobject_put() in sysfs entry creation error path x86/reboot/quirks: Add MacBook6,1 reboot quirk x86/speculation: Prevent rogue cross-process SSBD shutdown x86/PCI: Mark Intel C620 MROMs as having non-compliant BARs x86_64: Fix jiffies ODR violation btrfs: tree-checker: Check level for leaves and nodes aio: fix async fsync creds mm: add kvfree_sensitive() for freeing sensitive data objects perf probe: Accept the instance number of kretprobe event x86/cpu/amd: Make erratum #1054 a legacy erratum RDMA/uverbs: Make the event_queue fds return POLLERR when disassociated ath9k_htc: Silence undersized packet warnings powerpc/xive: Clear the page tables for the ESB IO mapping drivers/net/ibmvnic: Update VNIC protocol version reporting Input: synaptics - add a second working PNP_ID for Lenovo T470s sched/fair: Don't NUMA balance for kthreads ARM: 8977/1: ptrace: Fix mask for thumb breakpoint hook Input: mms114 - fix handling of mms345l crypto: talitos - fix ECB and CBC algs ivsize btrfs: Detect unbalanced tree with empty leaf before crashing btree operations btrfs: merge btrfs_find_device and find_device lib: Reduce user_access_begin() boundaries in strncpy_from_user() and strnlen_user() x86: uaccess: Inhibit speculation past access_ok() in user_access_begin() arch/openrisc: Fix issues with access_ok() Fix 'acccess_ok()' on alpha and SH make 'user_access_begin()' do 'access_ok()' selftests: bpf: fix use of undeclared RET_IF macro tun: correct header offsets in napi frags mode vxlan: Avoid infinite loop when suppressing NS messages with invalid options bridge: Avoid infinite loop when suppressing NS messages with invalid options net_failover: fixed rollback in net_failover_open() ipv6: fix IPV6_ADDRFORM operation logic writeback: Drop I_DIRTY_TIME_EXPIRE writeback: Fix sync livelock due to b_dirty_time processing writeback: Avoid skipping inode writeback writeback: Protect inode->i_io_list with inode->i_lock Revert "writeback: Avoid skipping inode writeback" ANDROID: gki_defconfig: increase vbus_draw to 500mA fscrypt: remove stale definition fs-verity: remove unnecessary extern keywords fs-verity: fix all kerneldoc warnings fscrypt: add support for IV_INO_LBLK_32 policies fscrypt: make test_dummy_encryption use v2 by default fscrypt: support test_dummy_encryption=v2 fscrypt: add fscrypt_add_test_dummy_key() linux/parser.h: add include guards fscrypt: remove unnecessary extern keywords fscrypt: name all function parameters fscrypt: fix all kerneldoc warnings ANDROID: Update the ABI ANDROID: GKI: power: power-supply: Add POWER_SUPPLY_PROP_CHARGER_STATUS property ANDROID: GKI: add dev to usb_gsi_request ANDROID: GKI: dma-buf: add dent_count to dma_buf ANDROID: Update the ABI xml and whitelist ANDROID: GKI: update whitelist ANDROID: extcon: Export symbol of `extcon_get_edev_name` ANDROID: kbuild: merge more sections with LTO UPSTREAM: timekeeping/vsyscall: Update VDSO data unconditionally ANDROID: GKI: Revert "genetlink: disallow subscribing to unknown mcast groups" BACKPORT: usb: musb: Add support for MediaTek musb controller UPSTREAM: usb: musb: Add musb_clearb/w() interface UPSTREAM: usb: musb: Add noirq type of dma create interface UPSTREAM: usb: musb: Add get/set toggle hooks UPSTREAM: dt-bindings: usb: musb: Add support for MediaTek musb controller FROMGIT: driver core: Remove unnecessary is_fwnode_dev variable in device_add() FROMGIT: driver core: Remove check in driver_deferred_probe_force_trigger() FROMGIT: of: platform: Batch fwnode parsing when adding all top level devices FROMGIT: BACKPORT: driver core: fw_devlink: Add support for batching fwnode parsing BACKPORT: driver core: Look for waiting consumers only for a fwnode's primary device BACKPORT: driver core: Add device links from fwnode only for the primary device Linux 4.19.128 Revert "net/mlx5: Annotate mutex destroy for root ns" uprobes: ensure that uprobe->offset and ->ref_ctr_offset are properly aligned x86/speculation: Add Ivy Bridge to affected list x86/speculation: Add SRBDS vulnerability and mitigation documentation x86/speculation: Add Special Register Buffer Data Sampling (SRBDS) mitigation x86/cpu: Add 'table' argument to cpu_matches() x86/cpu: Add a steppings field to struct x86_cpu_id nvmem: qfprom: remove incorrect write support CDC-ACM: heed quirk also in error handling staging: rtl8712: Fix IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK tty: hvc_console, fix crashes on parallel open/close vt: keyboard: avoid signed integer overflow in k_ascii usb: musb: Fix runtime PM imbalance on error usb: musb: start session in resume for host port iio: vcnl4000: Fix i2c swapped word reading. USB: serial: option: add Telit LE910C1-EUX compositions USB: serial: usb_wwan: do not resubmit rx urb on fatal errors USB: serial: qcserial: add DW5816e QDL support net: check untrusted gso_size at kernel entry vsock: fix timeout in vsock_accept() NFC: st21nfca: add missed kfree_skb() in an error path net: usb: qmi_wwan: add Telit LE910C1-EUX composition l2tp: do not use inet_hash()/inet_unhash() l2tp: add sk_family checks to l2tp_validate_socket devinet: fix memleak in inetdev_init() Revert "ANDROID: Remove default y on BRIDGE_IGMP_SNOOPING" ANDROID: Update the ABI xml and whitelist ANDROID: GKI: update whitelist ANDROID: arch: arm64: vdso: export the symbols for time() ANDROID: Incremental fs: Remove dependency on PKCS7_MESSAGE_PARSER ANDROID: dm-bow: Add block_size option f2fs: attach IO flags to the missing cases f2fs: add node_io_flag for bio flags likewise data_io_flag f2fs: remove unused parameter of f2fs_put_rpages_mapping() f2fs: handle readonly filesystem in f2fs_ioc_shutdown() f2fs: avoid utf8_strncasecmp() with unstable name f2fs: don't return vmalloc() memory from f2fs_kmalloc() ANDROID: GKI: set CONFIG_BLK_DEV_LOOP_MIN_COUNT to 16 ANDROID: Incremental fs: Cache successful hash calculations ANDROID: Incremental fs: Fix four error-path bugs Linux 4.19.127 net: smsc911x: Fix runtime PM imbalance on error net: ethernet: stmmac: Enable interface clocks on probe for IPQ806x net/ethernet/freescale: rework quiesce/activate for ucc_geth null_blk: return error for invalid zone size s390/mm: fix set_huge_pte_at() for empty ptes drm/edid: Add Oculus Rift S to non-desktop list net: bmac: Fix read of MAC address from ROM x86/mmiotrace: Use cpumask_available() for cpumask_var_t variables i2c: altera: Fix race between xfer_msg and isr thread evm: Fix RCU list related warnings ARC: [plat-eznps]: Restrict to CONFIG_ISA_ARCOMPACT ARC: Fix ICCM & DCCM runtime size checks s390/ftrace: save traced function caller spi: dw: use "smp_mb()" to avoid sending spi data error powerpc/powernv: Avoid re-registration of imc debugfs directory scsi: hisi_sas: Check sas_port before using it drm/i915: fix port checks for MST support on gen >= 11 airo: Fix read overflows sending packets net: dsa: mt7530: set CPU port to fallback mode scsi: ufs: Release clock if DMA map fails mmc: fix compilation of user API kernel/relay.c: handle alloc_percpu returning NULL in relay_open p54usb: add AirVasT USB stick device-id HID: i2c-hid: add Schneider SCL142ALM to descriptor override HID: sony: Fix for broken buttons on DS3 USB dongles mm: Fix mremap not considering huge pmd devmap libnvdimm: Fix endian conversion issues  Revert "cgroup: Add memory barriers to plug cgroup_rstat_updated() race window" f2fs: fix retry logic in f2fs_write_cache_pages() ANDROID: Update ABI representation Linux 4.19.126 mm/vmalloc.c: don't dereference possible NULL pointer in __vunmap() netfilter: nf_conntrack_pptp: fix compilation warning with W=1 build bonding: Fix reference count leak in bond_sysfs_slave_add. crypto: chelsio/chtls: properly set tp->lsndtime qlcnic: fix missing release in qlcnic_83xx_interrupt_test. xsk: Add overflow check for u64 division, stored into u32 bnxt_en: Fix accumulation of bp->net_stats_prev. esp6: get the right proto for transport mode in esp6_gso_encap netfilter: nf_conntrack_pptp: prevent buffer overflows in debug code netfilter: nfnetlink_cthelper: unbreak userspace helper support netfilter: ipset: Fix subcounter update skip netfilter: nft_reject_bridge: enable reject with bridge vlan ip_vti: receive ipip packet by calling ip_tunnel_rcv vti4: eliminated some duplicate code. xfrm: fix error in comment xfrm: fix a NULL-ptr deref in xfrm_local_error xfrm: fix a warning in xfrm_policy_insert_list xfrm interface: fix oops when deleting a x-netns interface xfrm: call xfrm_output_gso when inner_protocol is set in xfrm_output xfrm: allow to accept packets with ipv6 NEXTHDR_HOP in xfrm_input copy_xstate_to_kernel(): don't leave parts of destination uninitialized x86/dma: Fix max PFN arithmetic overflow on 32 bit systems mac80211: mesh: fix discovery timer re-arming issue / crash RDMA/core: Fix double destruction of uobject mmc: core: Fix recursive locking issue in CQE recovery path parisc: Fix kernel panic in mem_init() iommu: Fix reference count leak in iommu_group_alloc. include/asm-generic/topology.h: guard cpumask_of_node() macro argument fs/binfmt_elf.c: allocate initialized memory in fill_thread_core_info() mm: remove VM_BUG_ON(PageSlab()) from page_mapcount() IB/ipoib: Fix double free of skb in case of multicast traffic in CM mode libceph: ignore pool overlay and cache logic on redirects ALSA: hda/realtek - Add new codec supported for ALC287 ALSA: usb-audio: Quirks for Gigabyte TRX40 Aorus Master onboard audio exec: Always set cap_ambient in cap_bprm_set_creds ALSA: usb-audio: mixer: volume quirk for ESS Technology Asus USB DAC ALSA: hda/realtek - Add a model for Thinkpad T570 without DAC workaround ALSA: hwdep: fix a left shifting 1 by 31 UB bug RDMA/pvrdma: Fix missing pci disable in pvrdma_pci_probe() mmc: block: Fix use-after-free issue for rpmb ARM: dts: bcm: HR2: Fix PPI interrupt types ARM: dts: bcm2835-rpi-zero-w: Fix led polarity ARM: dts/imx6q-bx50v3: Set display interface clock parents IB/qib: Call kobject_put() when kobject_init_and_add() fails gpio: exar: Fix bad handling for ida_simple_get error path ARM: uaccess: fix DACR mismatch with nested exceptions ARM: uaccess: integrate uaccess_save and uaccess_restore ARM: uaccess: consolidate uaccess asm to asm/uaccess-asm.h ARM: 8843/1: use unified assembler in headers ARM: 8970/1: decompressor: increase tag size Input: synaptics-rmi4 - fix error return code in rmi_driver_probe() Input: synaptics-rmi4 - really fix attn_data use-after-free Input: i8042 - add ThinkPad S230u to i8042 reset list Input: dlink-dir685-touchkeys - fix a typo in driver name Input: xpad - add custom init packet for Xbox One S controllers Input: evdev - call input_flush_device() on release(), not flush() Input: usbtouchscreen - add support for BonXeon TP samples: bpf: Fix build error cifs: Fix null pointer check in cifs_read riscv: stacktrace: Fix undefined reference to `walk_stackframe' IB/i40iw: Remove bogus call to netdev_master_upper_dev_get() net: freescale: select CONFIG_FIXED_PHY where needed usb: gadget: legacy: fix redundant initialization warnings usb: dwc3: pci: Enable extcon driver for Intel Merrifield cachefiles: Fix race between read_waiter and read_copier involving op->to_do gfs2: move privileged user check to gfs2_quota_lock_check net: microchip: encx24j600: add missed kthread_stop ALSA: usb-audio: add mapping for ASRock TRX40 Creator gpio: tegra: mask GPIO IRQs during IRQ shutdown ARM: dts: rockchip: fix pinctrl sub nodename for spi in rk322x.dtsi ARM: dts: rockchip: swap clock-names of gpu nodes arm64: dts: rockchip: swap interrupts interrupt-names rk3399 gpu node arm64: dts: rockchip: fix status for &gmac2phy in rk3328-evb.dts ARM: dts: rockchip: fix phy nodename for rk3228-evb mlxsw: spectrum: Fix use-after-free of split/unsplit/type_set in case reload fails net/mlx4_core: fix a memory leak bug. net: sun: fix missing release regions in cas_init_one(). net/mlx5: Annotate mutex destroy for root ns net/mlx5e: Update netdev txq on completions during closure sctp: Start shutdown on association restart if in SHUTDOWN-SENT state and socket is closed sctp: Don't add the shutdown timer if its already been added r8152: support additional Microsoft Surface Ethernet Adapter variant net sched: fix reporting the first-time use timestamp net: revert "net: get rid of an signed integer overflow in ip_idents_reserve()" net: qrtr: Fix passing invalid reference to qrtr_local_enqueue() net/mlx5: Add command entry handling completion net: ipip: fix wrong address family in init error path net: inet_csk: Fix so_reuseport bind-address cache in tb->fast* __netif_receive_skb_core: pass skb by reference net: dsa: mt7530: fix roaming from DSA user ports dpaa_eth: fix usage as DSA master, try 3 ax25: fix setsockopt(SO_BINDTODEVICE) ANDROID: modules: fix lockprove warning FROMGIT: USB: dummy-hcd: use configurable endpoint naming scheme UPSTREAM: usb: raw-gadget: fix null-ptr-deref when reenabling endpoints UPSTREAM: usb: raw-gadget: documentation updates UPSTREAM: usb: raw-gadget: support stalling/halting/wedging endpoints UPSTREAM: usb: raw-gadget: fix gadget endpoint selection UPSTREAM: usb: raw-gadget: improve uapi headers comments UPSTREAM: usb: raw-gadget: fix return value of ep read ioctls UPSTREAM: usb: raw-gadget: fix raw_event_queue_fetch locking UPSTREAM: usb: raw-gadget: Fix copy_to/from_user() checks f2fs: fix wrong discard space f2fs: compress: don't compress any datas after cp stop f2fs: remove unneeded return value of __insert_discard_tree() f2fs: fix wrong value of tracepoint parameter f2fs: protect new segment allocation in expand_inode_data f2fs: code cleanup by removing ifdef macro surrounding writeback: Avoid skipping inode writeback ANDROID: GKI: Update the ABI ANDROID: GKI: update whitelist ANDROID: GKI: support mm_event for FS/IO/UFS path ANDROID: net: bpf: permit redirect from ingress L3 to egress L2 devices at near max mtu FROMGIT: driver core: Update device link status correctly for SYNC_STATE_ONLY links UPSTREAM: driver core: Fix handling of SYNC_STATE_ONLY + STATELESS device links BACKPORT: driver core: Fix SYNC_STATE_ONLY device link implementation ANDROID: Bulk update the ABI xml and qcom whitelist Revert "ANDROID: Incremental fs: Avoid continually recalculating hashes" f2fs: avoid inifinite loop to wait for flushing node pages at cp_error f2fs: compress: fix zstd data corruption f2fs: add compressed/gc data read IO stat f2fs: fix potential use-after-free issue f2fs: compress: don't handle non-compressed data in workqueue f2fs: remove redundant assignment to variable err f2fs: refactor resize_fs to avoid meta updates in progress f2fs: use round_up to enhance calculation f2fs: introduce F2FS_IOC_RESERVE_COMPRESS_BLOCKS f2fs: Avoid double lock for cp_rwsem during checkpoint f2fs: report delalloc reserve as non-free in statfs for project quota f2fs: Fix wrong stub helper update_sit_info f2fs: compress: let lz4 compressor handle output buffer budget properly f2fs: remove blk_plugging in block_operations f2fs: introduce F2FS_IOC_RELEASE_COMPRESS_BLOCKS f2fs: shrink spinlock coverage f2fs: correctly fix the parent inode number during fsync() f2fs: introduce mempool for {,de}compress intermediate page allocation f2fs: introduce f2fs_bmap_compress() f2fs: support fiemap on compressed inode f2fs: support partial truncation on compressed inode f2fs: remove redundant compress inode check f2fs: use strcmp() in parse_options() f2fs: Use the correct style for SPDX License Identifier Conflicts: Documentation/devicetree/bindings Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.txt Documentation/devicetree/bindings/usb/dwc3.txt drivers/media/v4l2-core/v4l2-ctrls.c drivers/mmc/core/queue.c drivers/mmc/host/sdhci-msm.c drivers/scsi/ufs/ufs-qcom.c drivers/slimbus/qcom-ngd-ctrl.c drivers/usb/gadget/composite.c fs/crypto/keyring.c fs/f2fs/data.c include/linux/fs.h include/linux/usb/gadget.h include/uapi/linux/v4l2-controls.h kernel/sched/cpufreq_schedutil.c kernel/sched/fair.c kernel/time/tick-sched.c mm/vmalloc.c net/netlink/genetlink.c net/qrtr/qrtr.c sound/core/compress_offload.c sound/soc/soc-compress.c Fixed errors: drivers/scsi/ufs/ufshcd.c drivers/soc/qcom/rq_stats.c Change-Id: I06ea6a6c3f239045e2947f27af617aa6f523bfdb Signed-off-by: Srinivasarao P <spathi@codeaurora.org>
2020-10-14 08:33:38 -06:00
if (rq_info.init == 1 && wake_callback &&
tick_do_timer_cpu == smp_processor_id()) {
/*
* wakeup user if needed
*/
wake_callback();
}
}
else
ts->next_tick = 0;
/* No need to reprogram if we are in idle or full dynticks mode */
if (unlikely(ts->tick_stopped))
return HRTIMER_NORESTART;
hrtimer_forward(timer, now, tick_period);
return HRTIMER_RESTART;
}
static int sched_skew_tick;
static int __init skew_tick(char *str)
{
get_option(&str, &sched_skew_tick);
return 0;
}
early_param("skew_tick", skew_tick);
/**
* tick_setup_sched_timer - setup the tick emulation timer
*/
void tick_setup_sched_timer(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now = ktime_get();
/*
* Emulate tick processing via per-CPU hrtimers:
*/
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
ts->sched_timer.function = tick_sched_timer;
/* Get the next period (per-CPU) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
/* Offset the tick to avert jiffies_lock contention. */
if (sched_skew_tick) {
u64 offset = ktime_to_ns(tick_period) >> 1;
do_div(offset, num_possible_cpus());
offset *= smp_processor_id();
hrtimer_add_expires_ns(&ts->sched_timer, offset);
}
hrtimer_forward(&ts->sched_timer, now, tick_period);
hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
timer: Reduce timer migration overhead if disabled Eric reported that the timer_migration sysctl is not really nice performance wise as it needs to check at every timer insertion whether the feature is enabled or not. Further the check does not live in the timer code, so we have an extra function call which checks an extra cache line to figure out that it is disabled. We can do better and store that information in the per cpu (hr)timer bases. I pondered to use a static key, but that's a nightmare to update from the nohz code and the timer base cache line is hot anyway when we select a timer base. The old logic enabled the timer migration unconditionally if CONFIG_NO_HZ was set even if nohz was disabled on the kernel command line. With this modification, we start off with migration disabled. The user visible sysctl is still set to enabled. If the kernel switches to NOHZ migration is enabled, if the user did not disable it via the sysctl prior to the switch. If nohz=off is on the kernel command line, migration stays disabled no matter what. Before: 47.76% hog [.] main 14.84% [kernel] [k] _raw_spin_lock_irqsave 9.55% [kernel] [k] _raw_spin_unlock_irqrestore 6.71% [kernel] [k] mod_timer 6.24% [kernel] [k] lock_timer_base.isra.38 3.76% [kernel] [k] detach_if_pending 3.71% [kernel] [k] del_timer 2.50% [kernel] [k] internal_add_timer 1.51% [kernel] [k] get_nohz_timer_target 1.28% [kernel] [k] __internal_add_timer 0.78% [kernel] [k] timerfn 0.48% [kernel] [k] wake_up_nohz_cpu After: 48.10% hog [.] main 15.25% [kernel] [k] _raw_spin_lock_irqsave 9.76% [kernel] [k] _raw_spin_unlock_irqrestore 6.50% [kernel] [k] mod_timer 6.44% [kernel] [k] lock_timer_base.isra.38 3.87% [kernel] [k] detach_if_pending 3.80% [kernel] [k] del_timer 2.67% [kernel] [k] internal_add_timer 1.33% [kernel] [k] __internal_add_timer 0.73% [kernel] [k] timerfn 0.54% [kernel] [k] wake_up_nohz_cpu Reported-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Viresh Kumar <viresh.kumar@linaro.org> Cc: John Stultz <john.stultz@linaro.org> Cc: Joonwoo Park <joonwoop@codeaurora.org> Cc: Wenbo Wang <wenbo.wang@memblaze.com> Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2015-05-26 16:50:33 -06:00
tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
}
#endif /* HIGH_RES_TIMERS */
nohz: Rename CONFIG_NO_HZ to CONFIG_NO_HZ_COMMON We are planning to convert the dynticks Kconfig options layout into a choice menu. The user must be able to easily pick any of the following implementations: constant periodic tick, idle dynticks, full dynticks. As this implies a mutual exclusion, the two dynticks implementions need to converge on the selection of a common Kconfig option in order to ease the sharing of a common infrastructure. It would thus seem pretty natural to reuse CONFIG_NO_HZ to that end. It already implements all the idle dynticks code and the full dynticks depends on all that code for now. So ideally the choice menu would propose CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED then both would select CONFIG_NO_HZ. On the other hand we want to stay backward compatible: if CONFIG_NO_HZ is set in an older config file, we want to enable CONFIG_NO_HZ_IDLE by default. But we can't afford both at the same time or we run into a circular dependency: 1) CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED both select CONFIG_NO_HZ 2) If CONFIG_NO_HZ is set, we default to CONFIG_NO_HZ_IDLE We might be able to support that from Kconfig/Kbuild but it may not be wise to introduce such a confusing behaviour. So to solve this, create a new CONFIG_NO_HZ_COMMON option which gathers the common code between idle and full dynticks (that common code for now is simply the idle dynticks code) and select it from their referring Kconfig. Then we'll later create CONFIG_NO_HZ_IDLE and map CONFIG_NO_HZ to it for backward compatibility. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Christoph Lameter <cl@linux.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Gilad Ben Yossef <gilad@benyossef.com> Cc: Hakan Akkan <hakanakkan@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Kevin Hilman <khilman@linaro.org> Cc: Li Zhong <zhong@linux.vnet.ibm.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Gleixner <tglx@linutronix.de>
2011-08-10 15:21:01 -06:00
#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
void tick_cancel_sched_timer(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
# ifdef CONFIG_HIGH_RES_TIMERS
if (ts->sched_timer.base)
hrtimer_cancel(&ts->sched_timer);
# endif
memset(ts, 0, sizeof(*ts));
}
#endif
/**
* Async notification about clocksource changes
*/
void tick_clock_notify(void)
{
int cpu;
for_each_possible_cpu(cpu)
set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}
/*
* Async notification about clock event changes
*/
void tick_oneshot_notify(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
set_bit(0, &ts->check_clocks);
}
/**
* Check, if a change happened, which makes oneshot possible.
*
* Called cyclic from the hrtimer softirq (driven by the timer
* softirq) allow_nohz signals, that we can switch into low-res nohz
* mode, because high resolution timers are disabled (either compile
* or runtime). Called with interrupts disabled.
*/
int tick_check_oneshot_change(int allow_nohz)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
if (!test_and_clear_bit(0, &ts->check_clocks))
return 0;
if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
return 0;
if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
return 0;
if (!allow_nohz)
return 1;
tick_nohz_switch_to_nohz();
return 0;
}
ktime_t *get_next_event_cpu(unsigned int cpu)
{
return &(per_cpu(tick_cpu_device, cpu).evtdev->next_event);
}
EXPORT_SYMBOL_GPL(get_next_event_cpu);