Revert "cpuidle: Quickly notice prediction failure for repeat mode"
Revert commit69a37bea
(cpuidle: Quickly notice prediction failure for repeat mode), because it has been identified as the source of a significant performance regression in v3.8 and later as explained by Jeremy Eder: We believe we've identified a particular commit to the cpuidle code that seems to be impacting performance of variety of workloads. The simplest way to reproduce is using netperf TCP_RR test, so we're using that, on a pair of Sandy Bridge based servers. We also have data from a large database setup where performance is also measurably/positively impacted, though that test data isn't easily share-able. Included below are test results from 3 test kernels: kernel reverts ----------------------------------------------------------- 1) vanilla upstream (no reverts) 2) perfteam2 revertse11538d1f0
3) test reverts69a37beabf
e11538d1f0
In summary, netperf TCP_RR numbers improve by approximately 4% after reverting69a37beabf
. When69a37beabf
is included, C0 residency never seems to get above 40%. Taking that patch out gets C0 near 100% quite often, and performance increases. The below data are histograms representing the %c0 residency @ 1-second sample rates (using turbostat), while under netperf test. - If you look at the first 4 histograms, you can see %c0 residency almost entirely in the 30,40% bin. - The last pair, which reverts69a37beabf
, shows %c0 in the 80,90,100% bins. Below each kernel name are netperf TCP_RR trans/s numbers for the particular kernel that can be disclosed publicly, comparing the 3 test kernels. We ran a 4th test with the vanilla kernel where we've also set /dev/cpu_dma_latency=0 to show overall impact boosting single-threaded TCP_RR performance over 11% above baseline. 3.10-rc2 vanilla RX + c0 lock (/dev/cpu_dma_latency=0): TCP_RR trans/s 54323.78 ----------------------------------------------------------- 3.10-rc2 vanilla RX (no reverts) TCP_RR trans/s 48192.47 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 59]: *********************************************************** 40.0000 - 50.0000 [ 1]: * 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: Sender %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 11]: *********** 40.0000 - 50.0000 [ 49]: ************************************************* 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: ----------------------------------------------------------- 3.10-rc2 perfteam2 RX (reverts commite11538d1f0
) TCP_RR trans/s 49698.69 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 1]: * 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 59]: *********************************************************** 40.0000 - 50.0000 [ 0]: 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: Sender %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 2]: ** 40.0000 - 50.0000 [ 58]: ********************************************************** 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 0]: ----------------------------------------------------------- 3.10-rc2 test RX (reverts69a37beabf
ande11538d1f0
) TCP_RR trans/s 47766.95 Receiver %c0 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 1]: * 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 27]: *************************** 40.0000 - 50.0000 [ 2]: ** 50.0000 - 60.0000 [ 0]: 60.0000 - 70.0000 [ 2]: ** 70.0000 - 80.0000 [ 0]: 80.0000 - 90.0000 [ 0]: 90.0000 - 100.0000 [ 28]: **************************** Sender: 0.0000 - 10.0000 [ 1]: * 10.0000 - 20.0000 [ 0]: 20.0000 - 30.0000 [ 0]: 30.0000 - 40.0000 [ 11]: *********** 40.0000 - 50.0000 [ 0]: 50.0000 - 60.0000 [ 1]: * 60.0000 - 70.0000 [ 0]: 70.0000 - 80.0000 [ 3]: *** 80.0000 - 90.0000 [ 7]: ******* 90.0000 - 100.0000 [ 38]: ************************************** These results demonstrate gaining back the tendency of the CPU to stay in more responsive, performant C-states (and thus yield measurably better performance), by reverting commit69a37beabf
. Requested-by: Jeremy Eder <jeder@redhat.com> Tested-by: Len Brown <len.brown@intel.com> Cc: 3.8+ <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
This commit is contained in:
parent
228b30234f
commit
148519120c
3 changed files with 6 additions and 82 deletions
|
@ -28,13 +28,6 @@
|
|||
#define MAX_INTERESTING 50000
|
||||
#define STDDEV_THRESH 400
|
||||
|
||||
/* 60 * 60 > STDDEV_THRESH * INTERVALS = 400 * 8 */
|
||||
#define MAX_DEVIATION 60
|
||||
|
||||
static DEFINE_PER_CPU(struct hrtimer, menu_hrtimer);
|
||||
static DEFINE_PER_CPU(int, hrtimer_status);
|
||||
/* menu hrtimer mode */
|
||||
enum {MENU_HRTIMER_STOP, MENU_HRTIMER_REPEAT};
|
||||
|
||||
/*
|
||||
* Concepts and ideas behind the menu governor
|
||||
|
@ -198,42 +191,17 @@ static u64 div_round64(u64 dividend, u32 divisor)
|
|||
return div_u64(dividend + (divisor / 2), divisor);
|
||||
}
|
||||
|
||||
/* Cancel the hrtimer if it is not triggered yet */
|
||||
void menu_hrtimer_cancel(void)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
struct hrtimer *hrtmr = &per_cpu(menu_hrtimer, cpu);
|
||||
|
||||
/* The timer is still not time out*/
|
||||
if (per_cpu(hrtimer_status, cpu)) {
|
||||
hrtimer_cancel(hrtmr);
|
||||
per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_STOP;
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(menu_hrtimer_cancel);
|
||||
|
||||
/* Call back for hrtimer is triggered */
|
||||
static enum hrtimer_restart menu_hrtimer_notify(struct hrtimer *hrtimer)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
|
||||
per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_STOP;
|
||||
|
||||
return HRTIMER_NORESTART;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try detecting repeating patterns by keeping track of the last 8
|
||||
* intervals, and checking if the standard deviation of that set
|
||||
* of points is below a threshold. If it is... then use the
|
||||
* average of these 8 points as the estimated value.
|
||||
*/
|
||||
static u32 get_typical_interval(struct menu_device *data)
|
||||
static void get_typical_interval(struct menu_device *data)
|
||||
{
|
||||
int i = 0, divisor = 0;
|
||||
uint64_t max = 0, avg = 0, stddev = 0;
|
||||
int64_t thresh = LLONG_MAX; /* Discard outliers above this value. */
|
||||
unsigned int ret = 0;
|
||||
|
||||
again:
|
||||
|
||||
|
@ -274,16 +242,13 @@ static u32 get_typical_interval(struct menu_device *data)
|
|||
if (((avg > stddev * 6) && (divisor * 4 >= INTERVALS * 3))
|
||||
|| stddev <= 20) {
|
||||
data->predicted_us = avg;
|
||||
ret = 1;
|
||||
return ret;
|
||||
return;
|
||||
|
||||
} else if ((divisor * 4) > INTERVALS * 3) {
|
||||
/* Exclude the max interval */
|
||||
thresh = max - 1;
|
||||
goto again;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -298,9 +263,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
|
|||
int i;
|
||||
int multiplier;
|
||||
struct timespec t;
|
||||
int repeat = 0, low_predicted = 0;
|
||||
int cpu = smp_processor_id();
|
||||
struct hrtimer *hrtmr = &per_cpu(menu_hrtimer, cpu);
|
||||
|
||||
if (data->needs_update) {
|
||||
menu_update(drv, dev);
|
||||
|
@ -335,7 +297,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
|
|||
data->predicted_us = div_round64(data->expected_us * data->correction_factor[data->bucket],
|
||||
RESOLUTION * DECAY);
|
||||
|
||||
repeat = get_typical_interval(data);
|
||||
get_typical_interval(data);
|
||||
|
||||
/*
|
||||
* We want to default to C1 (hlt), not to busy polling
|
||||
|
@ -356,10 +318,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
|
|||
|
||||
if (s->disabled || su->disable)
|
||||
continue;
|
||||
if (s->target_residency > data->predicted_us) {
|
||||
low_predicted = 1;
|
||||
if (s->target_residency > data->predicted_us)
|
||||
continue;
|
||||
}
|
||||
if (s->exit_latency > latency_req)
|
||||
continue;
|
||||
if (s->exit_latency * multiplier > data->predicted_us)
|
||||
|
@ -369,28 +329,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
|
|||
data->exit_us = s->exit_latency;
|
||||
}
|
||||
|
||||
/* not deepest C-state chosen for low predicted residency */
|
||||
if (low_predicted) {
|
||||
unsigned int timer_us = 0;
|
||||
|
||||
/*
|
||||
* Set a timer to detect whether this sleep is much
|
||||
* longer than repeat mode predicted. If the timer
|
||||
* triggers, the code will evaluate whether to put
|
||||
* the CPU into a deeper C-state.
|
||||
* The timer is cancelled on CPU wakeup.
|
||||
*/
|
||||
timer_us = 2 * (data->predicted_us + MAX_DEVIATION);
|
||||
|
||||
if (repeat && (4 * timer_us < data->expected_us)) {
|
||||
RCU_NONIDLE(hrtimer_start(hrtmr,
|
||||
ns_to_ktime(1000 * timer_us),
|
||||
HRTIMER_MODE_REL_PINNED));
|
||||
/* In repeat case, menu hrtimer is started */
|
||||
per_cpu(hrtimer_status, cpu) = MENU_HRTIMER_REPEAT;
|
||||
}
|
||||
}
|
||||
|
||||
return data->last_state_idx;
|
||||
}
|
||||
|
||||
|
@ -481,9 +419,6 @@ static int menu_enable_device(struct cpuidle_driver *drv,
|
|||
struct cpuidle_device *dev)
|
||||
{
|
||||
struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
|
||||
struct hrtimer *t = &per_cpu(menu_hrtimer, dev->cpu);
|
||||
hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
||||
t->function = menu_hrtimer_notify;
|
||||
|
||||
memset(data, 0, sizeof(struct menu_device));
|
||||
|
||||
|
|
|
@ -174,10 +174,4 @@ static inline void tick_nohz_task_switch(struct task_struct *tsk) { }
|
|||
#endif
|
||||
|
||||
|
||||
# ifdef CONFIG_CPU_IDLE_GOV_MENU
|
||||
extern void menu_hrtimer_cancel(void);
|
||||
# else
|
||||
static inline void menu_hrtimer_cancel(void) {}
|
||||
# endif /* CONFIG_CPU_IDLE_GOV_MENU */
|
||||
|
||||
#endif
|
||||
|
|
|
@ -827,13 +827,10 @@ void tick_nohz_irq_exit(void)
|
|||
{
|
||||
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
|
||||
|
||||
if (ts->inidle) {
|
||||
/* Cancel the timer because CPU already waken up from the C-states*/
|
||||
menu_hrtimer_cancel();
|
||||
if (ts->inidle)
|
||||
__tick_nohz_idle_enter(ts);
|
||||
} else {
|
||||
else
|
||||
tick_nohz_full_stop_tick(ts);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -931,8 +928,6 @@ void tick_nohz_idle_exit(void)
|
|||
|
||||
ts->inidle = 0;
|
||||
|
||||
/* Cancel the timer because CPU already waken up from the C-states*/
|
||||
menu_hrtimer_cancel();
|
||||
if (ts->idle_active || ts->tick_stopped)
|
||||
now = ktime_get();
|
||||
|
||||
|
|
Loading…
Reference in a new issue