Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler changes from Ingo Molnar: "The main changes are: - irqtime accounting cleanups and enhancements. (Frederic Weisbecker) - schedstat debugging enhancements, make it more broadly runtime available. (Josh Poimboeuf) - More work on asymmetric topology/capacity scheduling. (Morten Rasmussen) - sched/wait fixes and cleanups. (Oleg Nesterov) - PELT (per entity load tracking) improvements. (Peter Zijlstra) - Rewrite and enhance select_idle_siblings(). (Peter Zijlstra) - sched/numa enhancements/fixes (Rik van Riel) - sched/cputime scalability improvements (Stanislaw Gruszka) - Load calculation arithmetics fixes. (Dietmar Eggemann) - sched/deadline enhancements (Tommaso Cucinotta) - Fix utilization accounting when switching to the SCHED_NORMAL policy. (Vincent Guittot) - ... plus misc cleanups and enhancements" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits) sched/irqtime: Consolidate irqtime flushing code sched/irqtime: Consolidate accounting synchronization with u64_stats API u64_stats: Introduce IRQs disabled helpers sched/irqtime: Remove needless IRQs disablement on kcpustat update sched/irqtime: No need for preempt-safe accessors sched/fair: Fix min_vruntime tracking sched/debug: Add SCHED_WARN_ON() sched/core: Fix set_user_nice() sched/fair: Introduce set_curr_task() helper sched/core, ia64: Rename set_curr_task() sched/core: Fix incorrect utilization accounting when switching to fair class sched/core: Optimize SCHED_SMT sched/core: Rewrite and improve select_idle_siblings() sched/core: Replace sd_busy/nr_busy_cpus with sched_domain_shared sched/core: Introduce 'struct sched_domain_shared' sched/core: Restructure destroy_sched_domain() sched/core: Remove unused @cpu argument from destroy_sched_domain*() sched/wait: Introduce init_wait_entry() sched/wait: Avoid abort_exclusive_wait() in __wait_on_bit_lock() sched/wait: Avoid abort_exclusive_wait() in ___wait_event() ...
2016-10-03 13:39:00 -07:00 · 2016-10-03 13:39:00 -07:00 · af79ad2b1f
commit af79ad2b1f
parent e606d81d2d 447976ef4f
24 changed files with 1221 additions and 769 deletions
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@ -16,6 +16,7 @@ CONTENTS
   4.1 System-wide settings
   4.2 Task interface
   4.3 Default behavior
+   4.4 Behavior of sched_yield()
 5. Tasks CPU affinity
   5.1 SCHED_DEADLINE and cpusets HOWTO
 6. Future plans
@ -426,6 +427,23 @@ CONTENTS
 Finally, notice that in order not to jeopardize the admission control a
 -deadline task cannot fork.

+
+4.4 Behavior of sched_yield()
+-----------------------------
+
+ When a SCHED_DEADLINE task calls sched_yield(), it gives up its
+ remaining runtime and is immediately throttled, until the next
+ period, when its runtime will be replenished (a special flag
+ dl_yielded is set and used to handle correctly throttling and runtime
+ replenishment after a call to sched_yield()).
+
+ This behavior of sched_yield() allows the task to wake-up exactly at
+ the beginning of the next period. Also, this may be useful in the
+ future with bandwidth reclaiming mechanisms, where sched_yield() will
+ make the leftoever runtime available for reclamation by other
+ SCHED_DEADLINE tasks.
+
+
 5. Tasks CPU affinity
 =====================

--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@ -986,7 +986,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
 	int cpu = smp_processor_id();

 	previous_current = curr_task(cpu);
-	set_curr_task(cpu, current);
+	ia64_set_curr_task(cpu, current);
 	if ((p = strchr(current->comm, ' ')))
 		*p = '\0';

@ -1360,14 +1360,14 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
 				cpumask_clear_cpu(i, &mca_cpu);	/* wake next cpu */
 				while (monarch_cpu != -1)
 					cpu_relax();	/* spin until last cpu leaves */
-				set_curr_task(cpu, previous_current);
+				ia64_set_curr_task(cpu, previous_current);
 				ia64_mc_info.imi_rendez_checkin[cpu]
 						= IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
 				return;
 			}
 		}
 	}
-	set_curr_task(cpu, previous_current);
+	ia64_set_curr_task(cpu, previous_current);
 	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
 	monarch_cpu = -1;	/* This frees the slaves and previous monarchs */
 }
@ -1729,7 +1729,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 		NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1);

 		mprintk("Slave on cpu %d returning to normal service.\n", cpu);
-		set_curr_task(cpu, previous_current);
+		ia64_set_curr_task(cpu, previous_current);
 		ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
 		atomic_dec(&slaves);
 		return;
@ -1756,7 +1756,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,

 	mprintk("\nINIT dump complete.  Monarch on cpu %d returning to normal service.\n", cpu);
 	atomic_dec(&monarchs);
-	set_curr_task(cpu, previous_current);
+	ia64_set_curr_task(cpu, previous_current);
 	monarch_cpu = -1;
 	return;
 }
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@ -471,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	return false;
 }

-static struct sched_domain_topology_level numa_inside_package_topology[] = {
+static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
 #ifdef CONFIG_SCHED_SMT
 	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
 #endif
@ -480,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = {
 #endif
 	{ NULL, },
 };
+
+static struct sched_domain_topology_level x86_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
 /*
- * set_sched_topology() sets the topology internal to a CPU.  The
- * NUMA topologies are layered on top of it to build the full
- * system topology.
- *
- * If NUMA nodes are observed to occur within a CPU package, this
- * function should be called.  It forces the sched domain code to
- * only use the SMT level for the CPU portion of the topology.
- * This essentially falls back to relying on NUMA information
- * from the SRAT table to describe the entire system topology
- * (except for hyperthreads).
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours and Intel Cluster-on-Die have this.
 */
-static void primarily_use_numa_for_topology(void)
-{
-	set_sched_topology(numa_inside_package_topology);
-}
+static bool x86_has_numa_in_package;

 void set_cpu_sibling_map(int cpu)
 {
@ -558,7 +559,7 @@ void set_cpu_sibling_map(int cpu)
 				c->booted_cores = cpu_data(i).booted_cores;
 		}
 		if (match_die(c, o) && !topology_same_node(c, o))
-			primarily_use_numa_for_topology();
+			x86_has_numa_in_package = true;
 	}

 	threads = cpumask_weight(topology_sibling_cpumask(cpu));
@ -1304,6 +1305,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
 	}
+
+	/*
+	 * Set 'default' x86 topology, this matches default_topology() in that
+	 * it has NUMA nodes as a topology level. See also
+	 * native_smp_cpus_done().
+	 *
+	 * Must be done before set_cpus_sibling_map() is ran.
+	 */
+	set_sched_topology(x86_topology);
+
 	set_cpu_sibling_map(0);

 	switch (smp_sanity_check(max_cpus)) {
@ -1370,6 +1381,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 {
 	pr_debug("Boot done\n");

+	if (x86_has_numa_in_package)
+		set_sched_topology(x86_numa_in_package_topology);
+
 	nmi_selftest();
 	impress_friends();
 	setup_ioapic_dest();
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@ -259,17 +259,14 @@ static inline void might_fault(void) { }
 extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(int state);
 __printf(1, 2)
-void panic(const char *fmt, ...)
-	__noreturn __cold;
+void panic(const char *fmt, ...) __noreturn __cold;
 void nmi_panic(struct pt_regs *regs, const char *msg);
 extern void oops_enter(void);
 extern void oops_exit(void);
 void print_oops_end_marker(void);
 extern int oops_may_print(void);
-void do_exit(long error_code)
-	__noreturn;
-void complete_and_exit(struct completion *, long)
-	__noreturn;
+void do_exit(long error_code) __noreturn;
+void complete_and_exit(struct completion *, long) __noreturn;

 /* Internal, do not use. */
 int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@ -448,6 +448,8 @@ static inline void io_schedule(void)
 	io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
 }

+void __noreturn do_task_dead(void);
+
 struct nsproxy;
 struct user_namespace;

@ -1022,7 +1024,8 @@ extern void wake_up_q(struct wake_q_head *head);
 #define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
 #define SD_BALANCE_WAKE		0x0010  /* Balance on wakeup */
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
-#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu power */
+#define SD_ASYM_CPUCAPACITY	0x0040  /* Groups have different max cpu capacities */
+#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu capacity */
 #define SD_SHARE_POWERDOMAIN	0x0100	/* Domain members share power domain */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
@ -1064,6 +1067,12 @@ extern int sched_domain_level_max;

 struct sched_group;

+struct sched_domain_shared {
+	atomic_t	ref;
+	atomic_t	nr_busy_cpus;
+	int		has_idle_cores;
+};
+
 struct sched_domain {
 	/* These fields must be setup */
 	struct sched_domain *parent;	/* top domain must be null terminated */
@ -1094,6 +1103,8 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long next_decay_max_lb_cost;

+	u64 avg_scan_cost;		/* select_idle_sibling */
+
 #ifdef CONFIG_SCHEDSTATS
 	/* load_balance() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
@ -1132,6 +1143,7 @@ struct sched_domain {
 		void *private;		/* used during construction */
 		struct rcu_head rcu;	/* used during destruction */
 	};
+	struct sched_domain_shared *shared;

 	unsigned int span_weight;
 	/*
@ -1165,6 +1177,7 @@ typedef int (*sched_domain_flags_f)(void);

 struct sd_data {
 	struct sched_domain **__percpu sd;
+	struct sched_domain_shared **__percpu sds;
 	struct sched_group **__percpu sg;
 	struct sched_group_capacity **__percpu sgc;
 };
@ -2568,7 +2581,7 @@ static inline bool is_idle_task(const struct task_struct *p)
 	return p->pid == 0;
 }
 extern struct task_struct *curr_task(int cpu);
-extern void set_curr_task(int cpu, struct task_struct *p);
+extern void ia64_set_curr_task(int cpu, struct task_struct *p);

 void yield(void);

@ -3206,7 +3219,11 @@ static inline int signal_pending_state(long state, struct task_struct *p)
 * cond_resched_lock() will drop the spinlock before scheduling,
 * cond_resched_softirq() will enable bhs before scheduling.
 */
+#ifndef CONFIG_PREEMPT
 extern int _cond_resched(void);
+#else
+static inline int _cond_resched(void) { return 0; }
+#endif

 #define cond_resched() ({			\
 	___might_sleep(__FILE__, __LINE__, 0);	\
@ -3236,6 +3253,15 @@ static inline void cond_resched_rcu(void)
 #endif
 }

+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	return p->preempt_disable_ip;
+#else
+	return 0;
+#endif
+}
+
 /*
 * Does a critical section need to be broken due to another
 * task waiting?: (technically does not depend on CONFIG_PREEMPT,
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@ -103,29 +103,40 @@ static inline void u64_stats_update_end_raw(struct u64_stats_sync *syncp)
 #endif
 }

-static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
+static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	return read_seqcount_begin(&syncp->seq);
 #else
-#if BITS_PER_LONG==32
+	return 0;
+#endif
+}
+
+static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
+{
+#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
 	preempt_disable();
 #endif
-	return 0;
+	return __u64_stats_fetch_begin(syncp);
+}
+
+static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
+					 unsigned int start)
+{
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	return read_seqcount_retry(&syncp->seq, start);
+#else
+	return false;
 #endif
 }

 static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
 					 unsigned int start)
 {
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
-	return read_seqcount_retry(&syncp->seq, start);
-#else
-#if BITS_PER_LONG==32
+#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
 	preempt_enable();
 #endif
-	return false;
-#endif
+	return __u64_stats_fetch_retry(syncp, start);
 }

 /*
@ -136,27 +147,19 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
 */
 static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp)
 {
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
-	return read_seqcount_begin(&syncp->seq);
-#else
-#if BITS_PER_LONG==32
+#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
 	local_irq_disable();
 #endif
-	return 0;
-#endif
+	return __u64_stats_fetch_begin(syncp);
 }

 static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp,
-					 unsigned int start)
+					     unsigned int start)
 {
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
-	return read_seqcount_retry(&syncp->seq, start);
-#else
-#if BITS_PER_LONG==32
+#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
 	local_irq_enable();
 #endif
-	return false;
-#endif
+	return __u64_stats_fetch_retry(syncp, start);
 }

 #endif /* _LINUX_U64_STATS_SYNC_H */
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@ -248,6 +248,8 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 	(!__builtin_constant_p(state) ||				\
 		state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE)	\

+extern void init_wait_entry(wait_queue_t *__wait, int flags);
+
 /*
 * The below macro ___wait_event() has an explicit shadow of the __ret
 * variable when used from the wait_event_*() macros.
@ -266,12 +268,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 	wait_queue_t __wait;						\
 	long __ret = ret;	/* explicit shadow */			\
 									\
-	INIT_LIST_HEAD(&__wait.task_list);				\
-	if (exclusive)							\
-		__wait.flags = WQ_FLAG_EXCLUSIVE;			\
-	else								\
-		__wait.flags = 0;					\
-									\
+	init_wait_entry(&__wait, exclusive ? WQ_FLAG_EXCLUSIVE : 0);	\
 	for (;;) {							\
 		long __int = prepare_to_wait_event(&wq, &__wait, state);\
 									\
@ -280,12 +277,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 									\
 		if (___wait_is_interruptible(state) && __int) {		\
 			__ret = __int;					\
-			if (exclusive) {				\
-				abort_exclusive_wait(&wq, &__wait,	\
-						     state, NULL);	\
-				goto __out;				\
-			}						\
-			break;						\
+			goto __out;					\
 		}							\
 									\
 		cmd;							\
@ -989,7 +981,6 @@ void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
 void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
 long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state);
 void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
-void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key);
 long wait_woken(wait_queue_t *wait, unsigned mode, long timeout);
 int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
--- a/kernel/exit.c
+++ b/kernel/exit.c
@ -725,7 +725,7 @@ static void check_stack_usage(void)
 static inline void check_stack_usage(void) {}
 #endif

-void do_exit(long code)
+void __noreturn do_exit(long code)
 {
 	struct task_struct *tsk = current;
 	int group_dead;
@ -882,29 +882,7 @@ void do_exit(long code)
 	exit_rcu();
 	TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));

-	/*
-	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
-	 * when the following two conditions become true.
-	 *   - There is race condition of mmap_sem (It is acquired by
-	 *     exit_mm()), and
-	 *   - SMI occurs before setting TASK_RUNINNG.
-	 *     (or hypervisor of virtual machine switches to other guest)
-	 *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
-	 *
-	 * To avoid it, we have to wait for releasing tsk->pi_lock which
-	 * is held by try_to_wake_up()
-	 */
-	smp_mb();
-	raw_spin_unlock_wait(&tsk->pi_lock);
-
-	/* causes final put_task_struct in finish_task_switch(). */
-	tsk->state = TASK_DEAD;
-	tsk->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */
-	schedule();
-	BUG();
-	/* Avoid "noreturn function does return".  */
-	for (;;)
-		cpu_relax();	/* For when BUG is null */
+	do_task_dead();
 }
 EXPORT_SYMBOL_GPL(do_exit);

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@ -1070,8 +1070,12 @@ static int migration_cpu_stop(void *data)
 	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
 	 * we're holding p->pi_lock.
 	 */
-	if (task_rq(p) == rq && task_on_rq_queued(p))
-		rq = __migrate_task(rq, p, arg->dest_cpu);
+	if (task_rq(p) == rq) {
+		if (task_on_rq_queued(p))
+			rq = __migrate_task(rq, p, arg->dest_cpu);
+		else
+			p->wake_cpu = arg->dest_cpu;
+	}
 	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock(&p->pi_lock);

@ -1112,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)

 	p->sched_class->set_cpus_allowed(p, new_mask);

-	if (running)
-		p->sched_class->set_curr_task(rq);
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE);
+	if (running)
+		set_curr_task(rq, p);
 }

 /*
@ -1272,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
 		/*
 		 * Task isn't running anymore; make it appear like we migrated
 		 * it before it went to sleep. This means on wakeup we make the
-		 * previous cpu our targer instead of where it really is.
+		 * previous cpu our target instead of where it really is.
 		 */
 		p->wake_cpu = cpu;
 	}
@ -1636,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
-#ifdef CONFIG_SCHEDSTATS
-	struct rq *rq = this_rq();
+	struct rq *rq;
+
+	if (!schedstat_enabled())
+		return;
+
+	rq = this_rq();

 #ifdef CONFIG_SMP
-	int this_cpu = smp_processor_id();
-
-	if (cpu == this_cpu) {
-		schedstat_inc(rq, ttwu_local);
-		schedstat_inc(p, se.statistics.nr_wakeups_local);
+	if (cpu == rq->cpu) {
+		schedstat_inc(rq->ttwu_local);
+		schedstat_inc(p->se.statistics.nr_wakeups_local);
 	} else {
 		struct sched_domain *sd;

-		schedstat_inc(p, se.statistics.nr_wakeups_remote);
+		schedstat_inc(p->se.statistics.nr_wakeups_remote);
 		rcu_read_lock();
-		for_each_domain(this_cpu, sd) {
+		for_each_domain(rq->cpu, sd) {
 			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-				schedstat_inc(sd, ttwu_wake_remote);
+				schedstat_inc(sd->ttwu_wake_remote);
 				break;
 			}
 		}
@ -1660,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 	}

 	if (wake_flags & WF_MIGRATED)
-		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
+		schedstat_inc(p->se.statistics.nr_wakeups_migrate);
 #endif /* CONFIG_SMP */

-	schedstat_inc(rq, ttwu_count);
-	schedstat_inc(p, se.statistics.nr_wakeups);
+	schedstat_inc(rq->ttwu_count);
+	schedstat_inc(p->se.statistics.nr_wakeups);

 	if (wake_flags & WF_SYNC)
-		schedstat_inc(p, se.statistics.nr_wakeups_sync);
-
-#endif /* CONFIG_SCHEDSTATS */
+		schedstat_inc(p->se.statistics.nr_wakeups_sync);
 }

 static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
@ -2091,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)

 	ttwu_queue(p, cpu, wake_flags);
 stat:
-	if (schedstat_enabled())
-		ttwu_stat(p, cpu, wake_flags);
+	ttwu_stat(p, cpu, wake_flags);
 out:
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

@ -2102,6 +2104,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 /**
 * try_to_wake_up_local - try to wake up a local task with rq lock held
 * @p: the thread to be awakened
+ * @cookie: context's cookie for pinning
 *
 * Put @p on the run-queue if it's not already there. The caller must
 * ensure that this_rq() is locked, @p is bound to this_rq() and not
@ -2140,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);

 	ttwu_do_wakeup(rq, p, 0, cookie);
-	if (schedstat_enabled())
-		ttwu_stat(p, smp_processor_id(), 0);
+	ttwu_stat(p, smp_processor_id(), 0);
 out:
 	raw_spin_unlock(&p->pi_lock);
 }
@ -3199,6 +3201,9 @@ static inline void preempt_latency_stop(int val) { }
 */
 static noinline void __schedule_bug(struct task_struct *prev)
 {
+	/* Save this before calling printk(), since that will clobber it */
+	unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
+
 	if (oops_in_progress)
 		return;

@ -3209,13 +3214,12 @@ static noinline void __schedule_bug(struct task_struct *prev)
 	print_modules();
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
-#ifdef CONFIG_DEBUG_PREEMPT
-	if (in_atomic_preempt_off()) {
+	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+	    && in_atomic_preempt_off()) {
 		pr_err("Preemption disabled at:");
-		print_ip_sym(current->preempt_disable_ip);
+		print_ip_sym(preempt_disable_ip);
 		pr_cont("\n");
 	}
-#endif
 	if (panic_on_warn)
 		panic("scheduling while atomic\n");

@ -3241,7 +3245,7 @@ static inline void schedule_debug(struct task_struct *prev)

 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));

-	schedstat_inc(this_rq(), sched_count);
+	schedstat_inc(this_rq()->sched_count);
 }

 /*
@ -3334,17 +3338,6 @@ static void __sched notrace __schedule(bool preempt)
 	rq = cpu_rq(cpu);
 	prev = rq->curr;

-	/*
-	 * do_exit() calls schedule() with preemption disabled as an exception;
-	 * however we must fix that up, otherwise the next task will see an
-	 * inconsistent (higher) preempt count.
-	 *
-	 * It also avoids the below schedule_debug() test from complaining
-	 * about this.
-	 */
-	if (unlikely(prev->state == TASK_DEAD))
-		preempt_enable_no_resched_notrace();
-
 	schedule_debug(prev);

 	if (sched_feat(HRTICK))
@ -3412,6 +3405,33 @@ static void __sched notrace __schedule(bool preempt)
 }
 STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */

+void __noreturn do_task_dead(void)
+{
+	/*
+	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+	 * when the following two conditions become true.
+	 *   - There is race condition of mmap_sem (It is acquired by
+	 *     exit_mm()), and
+	 *   - SMI occurs before setting TASK_RUNINNG.
+	 *     (or hypervisor of virtual machine switches to other guest)
+	 *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
+	 *
+	 * To avoid it, we have to wait for releasing tsk->pi_lock which
+	 * is held by try_to_wake_up()
+	 */
+	smp_mb();
+	raw_spin_unlock_wait(&current->pi_lock);
+
+	/* causes final put_task_struct in finish_task_switch(). */
+	__set_current_state(TASK_DEAD);
+	current->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */
+	__schedule(false);
+	BUG();
+	/* Avoid "noreturn function does return".  */
+	for (;;)
+		cpu_relax();	/* For when BUG is null */
+}
+
 static inline void sched_submit_work(struct task_struct *tsk)
 {
 	if (!tsk->state || tsk_is_pi_blocked(tsk))
@ -3694,10 +3714,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)

 	p->prio = prio;

-	if (running)
-		p->sched_class->set_curr_task(rq);
 	if (queued)
 		enqueue_task(rq, p, queue_flag);
+	if (running)
+		set_curr_task(rq, p);

 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
@ -3711,7 +3731,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)

 void set_user_nice(struct task_struct *p, long nice)
 {
-	int old_prio, delta, queued;
+	bool queued, running;
+	int old_prio, delta;
 	struct rq_flags rf;
 	struct rq *rq;

@ -3733,8 +3754,11 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	queued = task_on_rq_queued(p);
+	running = task_current(rq, p);
 	if (queued)
 		dequeue_task(rq, p, DEQUEUE_SAVE);
+	if (running)
+		put_prev_task(rq, p);

 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@ -3751,6 +3775,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		if (delta < 0 || (delta > 0 && task_running(rq, p)))
 			resched_curr(rq);
 	}
+	if (running)
+		set_curr_task(rq, p);
 out_unlock:
 	task_rq_unlock(rq, p, &rf);
 }
@ -4250,8 +4276,6 @@ static int __sched_setscheduler(struct task_struct *p,
 	prev_class = p->sched_class;
 	__setscheduler(rq, p, attr, pi);

-	if (running)
-		p->sched_class->set_curr_task(rq);
 	if (queued) {
 		/*
 		 * We enqueue to tail when the priority of a task is
@ -4262,6 +4286,8 @@ static int __sched_setscheduler(struct task_struct *p,

 		enqueue_task(rq, p, queue_flags);
 	}
+	if (running)
+		set_curr_task(rq, p);

 	check_class_changed(rq, p, prev_class, oldprio);
 	preempt_disable(); /* avoid rq from going away on us */
@ -4853,7 +4879,7 @@ SYSCALL_DEFINE0(sched_yield)
 {
 	struct rq *rq = this_rq_lock();

-	schedstat_inc(rq, yld_count);
+	schedstat_inc(rq->yld_count);
 	current->sched_class->yield_task(rq);

 	/*
@ -4870,6 +4896,7 @@ SYSCALL_DEFINE0(sched_yield)
 	return 0;
 }

+#ifndef CONFIG_PREEMPT
 int __sched _cond_resched(void)
 {
 	if (should_resched(0)) {
@ -4879,6 +4906,7 @@ int __sched _cond_resched(void)
 	return 0;
 }
 EXPORT_SYMBOL(_cond_resched);
+#endif

 /*
 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
@ -5004,7 +5032,7 @@ int __sched yield_to(struct task_struct *p, bool preempt)

 	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
 	if (yielded) {
-		schedstat_inc(rq, yld_count);
+		schedstat_inc(rq->yld_count);
 		/*
 		 * Make p's CPU reschedule; pick_next_entity takes care of
 		 * fairness.
@ -5424,10 +5452,10 @@ void sched_setnuma(struct task_struct *p, int nid)

 	p->numa_preferred_nid = nid;

-	if (running)
-		p->sched_class->set_curr_task(rq);
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE);
+	if (running)
+		set_curr_task(rq, p);
 	task_rq_unlock(rq, p, &rf);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@ -5724,6 +5752,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 	}
 }
 #else /* !CONFIG_SCHED_DEBUG */
+
+# define sched_debug_enabled 0
 # define sched_domain_debug(sd, cpu) do { } while (0)
 static inline bool sched_debug(void)
 {
@ -5742,6 +5772,7 @@ static int sd_degenerate(struct sched_domain *sd)
 			 SD_BALANCE_FORK |
 			 SD_BALANCE_EXEC |
 			 SD_SHARE_CPUCAPACITY |
+			 SD_ASYM_CPUCAPACITY |
 			 SD_SHARE_PKG_RESOURCES |
 			 SD_SHARE_POWERDOMAIN)) {
 		if (sd->groups != sd->groups->next)
@ -5772,6 +5803,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_BALANCE_NEWIDLE |
 				SD_BALANCE_FORK |
 				SD_BALANCE_EXEC |
+				SD_ASYM_CPUCAPACITY |
 				SD_SHARE_CPUCAPACITY |
 				SD_SHARE_PKG_RESOURCES |
 				SD_PREFER_SIBLING |
@ -5916,10 +5948,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
 	} while (sg != first);
 }

-static void free_sched_domain(struct rcu_head *rcu)
+static void destroy_sched_domain(struct sched_domain *sd)
 {
-	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-
 	/*
 	 * If its an overlapping domain it has private groups, iterate and
 	 * nuke them all.
@ -5930,18 +5960,26 @@ static void free_sched_domain(struct rcu_head *rcu)
 		kfree(sd->groups->sgc);
 		kfree(sd->groups);
 	}
+	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
+		kfree(sd->shared);
 	kfree(sd);
 }

-static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+static void destroy_sched_domains_rcu(struct rcu_head *rcu)
 {
-	call_rcu(&sd->rcu, free_sched_domain);
+	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+	while (sd) {
+		struct sched_domain *parent = sd->parent;
+		destroy_sched_domain(sd);
+		sd = parent;
+	}
 }

-static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+static void destroy_sched_domains(struct sched_domain *sd)
 {
-	for (; sd; sd = sd->parent)
-		destroy_sched_domain(sd, cpu);
+	if (sd)
+		call_rcu(&sd->rcu, destroy_sched_domains_rcu);
 }

 /*
@ -5956,14 +5994,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);

 static void update_top_cache_domain(int cpu)
 {
+	struct sched_domain_shared *sds = NULL;
 	struct sched_domain *sd;
-	struct sched_domain *busy_sd = NULL;
 	int id = cpu;
 	int size = 1;

@ -5971,13 +6009,13 @@ static void update_top_cache_domain(int cpu)
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
-		busy_sd = sd->parent; /* sd_busy */
+		sds = sd->shared;
 	}
-	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);

 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
 	per_cpu(sd_llc_id, cpu) = id;
+	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

 	sd = lowest_flag_domain(cpu, SD_NUMA);
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
@ -6013,7 +6051,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 			 */
 			if (parent->flags & SD_PREFER_SIBLING)
 				tmp->flags |= SD_PREFER_SIBLING;
-			destroy_sched_domain(parent, cpu);
+			destroy_sched_domain(parent);
 		} else
 			tmp = tmp->parent;
 	}
@ -6021,7 +6059,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 	if (sd && sd_degenerate(sd)) {
 		tmp = sd;
 		sd = sd->parent;
-		destroy_sched_domain(tmp, cpu);
+		destroy_sched_domain(tmp);
 		if (sd)
 			sd->child = NULL;
 	}
@ -6031,7 +6069,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 	rq_attach_root(rq, rd);
 	tmp = rq->sd;
 	rcu_assign_pointer(rq->sd, sd);
-	destroy_sched_domains(tmp, cpu);
+	destroy_sched_domains(tmp);

 	update_top_cache_domain(cpu);
 }
@ -6274,7 +6312,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 		return;

 	update_group_capacity(sd, cpu);
-	atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
 }

 /*
@ -6362,6 +6399,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
 	*per_cpu_ptr(sdd->sd, cpu) = NULL;

+	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
+		*per_cpu_ptr(sdd->sds, cpu) = NULL;
+
 	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;

@ -6381,26 +6421,37 @@ static int sched_domains_curr_level;
 /*
 * SD_flags allowed in topology descriptions.
 *
- * SD_SHARE_CPUCAPACITY      - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA                - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function:
 *
- * Odd one out:
- * SD_ASYM_PACKING        - describes SMT quirks
+ *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
+ *   SD_SHARE_PKG_RESOURCES - describes shared caches
+ *   SD_NUMA                - describes NUMA topologies
+ *   SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
+ *
+ * Odd one out, which beside describing the topology has a quirk also
+ * prescribes the desired behaviour that goes along with it:
+ *
+ *   SD_ASYM_PACKING        - describes SMT quirks
 */
 #define TOPOLOGY_SD_FLAGS		\
 	(SD_SHARE_CPUCAPACITY |		\
 	 SD_SHARE_PKG_RESOURCES |	\
 	 SD_NUMA |			\
 	 SD_ASYM_PACKING |		\
+	 SD_ASYM_CPUCAPACITY |		\
 	 SD_SHARE_POWERDOMAIN)

 static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl,
+	const struct cpumask *cpu_map,
+	struct sched_domain *child, int cpu)
 {
-	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
-	int sd_weight, sd_flags = 0;
+	struct sd_data *sdd = &tl->data;
+	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+	int sd_id, sd_weight, sd_flags = 0;

 #ifdef CONFIG_NUMA
 	/*
@ -6449,15 +6500,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
 		.smt_gain		= 0,
 		.max_newidle_lb_cost	= 0,
 		.next_decay_max_lb_cost	= jiffies,
+		.child			= child,
 #ifdef CONFIG_SCHED_DEBUG
 		.name			= tl->name,
 #endif
 	};

+	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+	sd_id = cpumask_first(sched_domain_span(sd));
+
 	/*
 	 * Convert topological properties into behaviour.
 	 */

+	if (sd->flags & SD_ASYM_CPUCAPACITY) {
+		struct sched_domain *t = sd;
+
+		for_each_lower_domain(t)
+			t->flags |= SD_BALANCE_WAKE;
+	}
+
 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
 		sd->flags |= SD_PREFER_SIBLING;
 		sd->imbalance_pct = 110;
@ -6489,7 +6551,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
 		sd->idle_idx = 1;
 	}

-	sd->private = &tl->data;
+	/*
+	 * For all levels sharing cache; connect a sched_domain_shared
+	 * instance.
+	 */
+	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+		atomic_inc(&sd->shared->ref);
+		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
+	}
+
+	sd->private = sdd;

 	return sd;
 }
@ -6516,6 +6588,9 @@ static struct sched_domain_topology_level *sched_domain_topology =

 void set_sched_topology(struct sched_domain_topology_level *tl)
 {
+	if (WARN_ON_ONCE(sched_smp_initialized))
+		return;
+
 	sched_domain_topology = tl;
 }

@ -6796,6 +6871,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 		if (!sdd->sd)
 			return -ENOMEM;

+		sdd->sds = alloc_percpu(struct sched_domain_shared *);
+		if (!sdd->sds)
+			return -ENOMEM;
+
 		sdd->sg = alloc_percpu(struct sched_group *);
 		if (!sdd->sg)
 			return -ENOMEM;
@ -6806,6 +6885,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)

 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
+			struct sched_domain_shared *sds;
 			struct sched_group *sg;
 			struct sched_group_capacity *sgc;

@ -6816,6 +6896,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)

 			*per_cpu_ptr(sdd->sd, j) = sd;

+			sds = kzalloc_node(sizeof(struct sched_domain_shared),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sds)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sds, j) = sds;
+
 			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sg)
@ -6855,6 +6942,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
 				kfree(*per_cpu_ptr(sdd->sd, j));
 			}

+			if (sdd->sds)
+				kfree(*per_cpu_ptr(sdd->sds, j));
 			if (sdd->sg)
 				kfree(*per_cpu_ptr(sdd->sg, j));
 			if (sdd->sgc)
@ -6862,6 +6951,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
 		}
 		free_percpu(sdd->sd);
 		sdd->sd = NULL;
+		free_percpu(sdd->sds);
+		sdd->sds = NULL;
 		free_percpu(sdd->sg);
 		sdd->sg = NULL;
 		free_percpu(sdd->sgc);
@ -6873,16 +6964,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)
 {
-	struct sched_domain *sd = sd_init(tl, cpu);
-	if (!sd)
-		return child;
+	struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);

-	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
 	if (child) {
 		sd->level = child->level + 1;
 		sched_domain_level_max = max(sched_domain_level_max, sd->level);
 		child->parent = sd;
-		sd->child = child;

 		if (!cpumask_subset(sched_domain_span(child),
 				    sched_domain_span(sd))) {
@ -6913,6 +7000,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 	enum s_alloc alloc_state;
 	struct sched_domain *sd;
 	struct s_data d;
+	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;

 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@ -6963,11 +7051,22 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
+		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
+
+		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
 		cpu_attach_domain(sd, d.rd, i);
 	}
 	rcu_read_unlock();

+	if (rq && sched_debug_enabled) {
+		pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+	}
+
 	ret = 0;
 error:
 	__free_domain_allocs(&d, alloc_state, cpu_map);
@ -7326,6 +7425,22 @@ int sched_cpu_dying(unsigned int cpu)
 }
 #endif

+#ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
+
+static void sched_init_smt(void)
+{
+	/*
+	 * We've enumerated all CPUs and will assume that if any CPU
+	 * has SMT siblings, CPU0 will too.
+	 */
+	if (cpumask_weight(cpu_smt_mask(0)) > 1)
+		static_branch_enable(&sched_smt_present);
+}
+#else
+static inline void sched_init_smt(void) { }
+#endif
+
 void __init sched_init_smp(void)
 {
 	cpumask_var_t non_isolated_cpus;
@ -7355,6 +7470,9 @@ void __init sched_init_smp(void)

 	init_sched_rt_class();
 	init_sched_dl_class();
+
+	sched_init_smt();
+
 	sched_smp_initialized = true;
 }

@ -7392,6 +7510,7 @@ static struct kmem_cache *task_group_cache __read_mostly;
 #endif

 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
+DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);

 void __init sched_init(void)
 {
@ -7428,6 +7547,8 @@ void __init sched_init(void)
 	for_each_possible_cpu(i) {
 		per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
 			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
+		per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
 	}
 #endif /* CONFIG_CPUMASK_OFFSTACK */

@ -7530,21 +7651,12 @@ void __init sched_init(void)

 	set_load_weight(&init_task);

-#ifdef CONFIG_PREEMPT_NOTIFIERS
-	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
-#endif
-
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
 	atomic_inc(&init_mm.mm_count);
 	enter_lazy_tlb(&init_mm, current);

-	/*
-	 * During early bootup we pretend to be a normal task:
-	 */
-	current->sched_class = &fair_sched_class;
-
 	/*
 	 * Make us the idle thread. Technically, schedule() should not be
 	 * called from this thread, however somewhere below it might be,
@ -7599,6 +7711,7 @@ EXPORT_SYMBOL(__might_sleep);
 void ___might_sleep(const char *file, int line, int preempt_offset)
 {
 	static unsigned long prev_jiffy;	/* ratelimiting */
+	unsigned long preempt_disable_ip;

 	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
@ -7609,6 +7722,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
 		return;
 	prev_jiffy = jiffies;

+	/* Save this before calling printk(), since that will clobber it */
+	preempt_disable_ip = get_preempt_disable_ip(current);
+
 	printk(KERN_ERR
 		"BUG: sleeping function called from invalid context at %s:%d\n",
 			file, line);
@ -7623,14 +7739,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
 	debug_show_held_locks(current);
 	if (irqs_disabled())
 		print_irqtrace_events(current);
-#ifdef CONFIG_DEBUG_PREEMPT
-	if (!preempt_count_equals(preempt_offset)) {
+	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
+	    && !preempt_count_equals(preempt_offset)) {
 		pr_err("Preemption disabled at:");
-		print_ip_sym(current->preempt_disable_ip);
+		print_ip_sym(preempt_disable_ip);
 		pr_cont("\n");
 	}
-#endif
 	dump_stack();
+	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
 EXPORT_SYMBOL(___might_sleep);
 #endif
@ -7651,12 +7767,10 @@ void normalize_rt_tasks(void)
 		if (p->flags & PF_KTHREAD)
 			continue;

-		p->se.exec_start		= 0;
-#ifdef CONFIG_SCHEDSTATS
-		p->se.statistics.wait_start	= 0;
-		p->se.statistics.sleep_start	= 0;
-		p->se.statistics.block_start	= 0;
-#endif
+		p->se.exec_start = 0;
+		schedstat_set(p->se.statistics.wait_start,  0);
+		schedstat_set(p->se.statistics.sleep_start, 0);
+		schedstat_set(p->se.statistics.block_start, 0);

 		if (!dl_task(p) && !rt_task(p)) {
 			/*
@ -7717,7 +7831,7 @@ struct task_struct *curr_task(int cpu)
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
-void set_curr_task(int cpu, struct task_struct *p)
+void ia64_set_curr_task(int cpu, struct task_struct *p)
 {
 	cpu_curr(cpu) = p;
 }
@ -7848,10 +7962,10 @@ void sched_move_task(struct task_struct *tsk)

 	sched_change_group(tsk, TASK_MOVE_GROUP);

-	if (unlikely(running))
-		tsk->sched_class->set_curr_task(rq);
 	if (queued)
 		enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
+	if (unlikely(running))
+		set_curr_task(rq, tsk);

 	task_rq_unlock(rq, tsk, &rf);
 }
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@ -31,56 +31,81 @@ static inline int right_child(int i)
 	return (i << 1) + 2;
 }

-static void cpudl_exchange(struct cpudl *cp, int a, int b)
+static void cpudl_heapify_down(struct cpudl *cp, int idx)
 {
-	int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
+	int l, r, largest;

-	swap(cp->elements[a].cpu, cp->elements[b].cpu);
-	swap(cp->elements[a].dl , cp->elements[b].dl );
+	int orig_cpu = cp->elements[idx].cpu;
+	u64 orig_dl = cp->elements[idx].dl;

-	swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
+	if (left_child(idx) >= cp->size)
+		return;
+
+	/* adapted from lib/prio_heap.c */
+	while(1) {
+		u64 largest_dl;
+		l = left_child(idx);
+		r = right_child(idx);
+		largest = idx;
+		largest_dl = orig_dl;
+
+		if ((l < cp->size) && dl_time_before(orig_dl,
+						cp->elements[l].dl)) {
+			largest = l;
+			largest_dl = cp->elements[l].dl;
+		}
+		if ((r < cp->size) && dl_time_before(largest_dl,
+						cp->elements[r].dl))
+			largest = r;
+
+		if (largest == idx)
+			break;
+
+		/* pull largest child onto idx */
+		cp->elements[idx].cpu = cp->elements[largest].cpu;
+		cp->elements[idx].dl = cp->elements[largest].dl;
+		cp->elements[cp->elements[idx].cpu].idx = idx;
+		idx = largest;
+	}
+	/* actual push down of saved original values orig_* */
+	cp->elements[idx].cpu = orig_cpu;
+	cp->elements[idx].dl = orig_dl;
+	cp->elements[cp->elements[idx].cpu].idx = idx;
+}
+
+static void cpudl_heapify_up(struct cpudl *cp, int idx)
+{
+	int p;
+
+	int orig_cpu = cp->elements[idx].cpu;
+	u64 orig_dl = cp->elements[idx].dl;
+
+	if (idx == 0)
+		return;
+
+	do {
+		p = parent(idx);
+		if (dl_time_before(orig_dl, cp->elements[p].dl))
+			break;
+		/* pull parent onto idx */
+		cp->elements[idx].cpu = cp->elements[p].cpu;
+		cp->elements[idx].dl = cp->elements[p].dl;
+		cp->elements[cp->elements[idx].cpu].idx = idx;
+		idx = p;
+	} while (idx != 0);
+	/* actual push up of saved original values orig_* */
+	cp->elements[idx].cpu = orig_cpu;
+	cp->elements[idx].dl = orig_dl;
+	cp->elements[cp->elements[idx].cpu].idx = idx;
 }

 static void cpudl_heapify(struct cpudl *cp, int idx)
 {
-	int l, r, largest;
-
-	/* adapted from lib/prio_heap.c */
-	while(1) {
-		l = left_child(idx);
-		r = right_child(idx);
-		largest = idx;
-
-		if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
-							cp->elements[l].dl))
-			largest = l;
-		if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
-							cp->elements[r].dl))
-			largest = r;
-		if (largest == idx)
-			break;
-
-		/* Push idx down the heap one level and bump one up */
-		cpudl_exchange(cp, largest, idx);
-		idx = largest;
-	}
-}
-
-static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
-{
-	WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
-
-	if (dl_time_before(new_dl, cp->elements[idx].dl)) {
-		cp->elements[idx].dl = new_dl;
-		cpudl_heapify(cp, idx);
-	} else {
-		cp->elements[idx].dl = new_dl;
-		while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
-					cp->elements[idx].dl)) {
-			cpudl_exchange(cp, idx, parent(idx));
-			idx = parent(idx);
-		}
-	}
+	if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
+				cp->elements[idx].dl))
+		cpudl_heapify_up(cp, idx);
+	else
+		cpudl_heapify_down(cp, idx);
 }

 static inline int cpudl_maximum(struct cpudl *cp)
@ -119,6 +144,45 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 	return best_cpu;
 }

+/*
+ * cpudl_clear - remove a cpu from the cpudl max-heap
+ * @cp: the cpudl max-heap context
+ * @cpu: the target cpu
+ *
+ * Notes: assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpudl_clear(struct cpudl *cp, int cpu)
+{
+	int old_idx, new_cpu;
+	unsigned long flags;
+
+	WARN_ON(!cpu_present(cpu));
+
+	raw_spin_lock_irqsave(&cp->lock, flags);
+
+	old_idx = cp->elements[cpu].idx;
+	if (old_idx == IDX_INVALID) {
+		/*
+		 * Nothing to remove if old_idx was invalid.
+		 * This could happen if a rq_offline_dl is
+		 * called for a CPU without -dl tasks running.
+		 */
+	} else {
+		new_cpu = cp->elements[cp->size - 1].cpu;
+		cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
+		cp->elements[old_idx].cpu = new_cpu;
+		cp->size--;
+		cp->elements[new_cpu].idx = old_idx;
+		cp->elements[cpu].idx = IDX_INVALID;
+		cpudl_heapify(cp, old_idx);
+
+		cpumask_set_cpu(cpu, cp->free_cpus);
+	}
+	raw_spin_unlock_irqrestore(&cp->lock, flags);
+}
+
 /*
 * cpudl_set - update the cpudl max-heap
 * @cp: the cpudl max-heap context
@ -129,55 +193,28 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 *
 * Returns: (void)
 */
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
 {
-	int old_idx, new_cpu;
+	int old_idx;
 	unsigned long flags;

 	WARN_ON(!cpu_present(cpu));

 	raw_spin_lock_irqsave(&cp->lock, flags);
+
 	old_idx = cp->elements[cpu].idx;
-	if (!is_valid) {
-		/* remove item */
-		if (old_idx == IDX_INVALID) {
-			/*
-			 * Nothing to remove if old_idx was invalid.
-			 * This could happen if a rq_offline_dl is
-			 * called for a CPU without -dl tasks running.
-			 */
-			goto out;
-		}
-		new_cpu = cp->elements[cp->size - 1].cpu;
-		cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
-		cp->elements[old_idx].cpu = new_cpu;
-		cp->size--;
-		cp->elements[new_cpu].idx = old_idx;
-		cp->elements[cpu].idx = IDX_INVALID;
-		while (old_idx > 0 && dl_time_before(
-				cp->elements[parent(old_idx)].dl,
-				cp->elements[old_idx].dl)) {
-			cpudl_exchange(cp, old_idx, parent(old_idx));
-			old_idx = parent(old_idx);
-		}
-		cpumask_set_cpu(cpu, cp->free_cpus);
-                cpudl_heapify(cp, old_idx);
-
-		goto out;
-	}
-
 	if (old_idx == IDX_INVALID) {
-		cp->size++;
-		cp->elements[cp->size - 1].dl = dl;
-		cp->elements[cp->size - 1].cpu = cpu;
-		cp->elements[cpu].idx = cp->size - 1;
-		cpudl_change_key(cp, cp->size - 1, dl);
+		int new_idx = cp->size++;
+		cp->elements[new_idx].dl = dl;
+		cp->elements[new_idx].cpu = cpu;
+		cp->elements[cpu].idx = new_idx;
+		cpudl_heapify_up(cp, new_idx);
 		cpumask_clear_cpu(cpu, cp->free_cpus);
 	} else {
-		cpudl_change_key(cp, old_idx, dl);
+		cp->elements[old_idx].dl = dl;
+		cpudl_heapify(cp, old_idx);
 	}

-out:
 	raw_spin_unlock_irqrestore(&cp->lock, flags);
 }

--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@ -23,7 +23,8 @@ struct cpudl {
 #ifdef CONFIG_SMP
 int cpudl_find(struct cpudl *cp, struct task_struct *p,
 	       struct cpumask *later_mask);
-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
+void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
+void cpudl_clear(struct cpudl *cp, int cpu);
 int cpudl_init(struct cpudl *cp);
 void cpudl_set_freecpu(struct cpudl *cp, int cpu);
 void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@ -23,10 +23,8 @@
 * task when irq is in progress while we read rq->clock. That is a worthy
 * compromise in place of having locks on each irq in account_system_time.
 */
-DEFINE_PER_CPU(u64, cpu_hardirq_time);
-DEFINE_PER_CPU(u64, cpu_softirq_time);
+DEFINE_PER_CPU(struct irqtime, cpu_irqtime);

-static DEFINE_PER_CPU(u64, irq_start_time);
 static int sched_clock_irqtime;

 void enable_sched_clock_irqtime(void)
@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void)
 	sched_clock_irqtime = 0;
 }

-#ifndef CONFIG_64BIT
-DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-#endif /* CONFIG_64BIT */
-
 /*
 * Called before incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
 void irqtime_account_irq(struct task_struct *curr)
 {
+	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
 	s64 delta;
 	int cpu;

@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr)
 		return;

 	cpu = smp_processor_id();
-	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
-	__this_cpu_add(irq_start_time, delta);
+	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
+	irqtime->irq_start_time += delta;

-	irq_time_write_begin();
+	u64_stats_update_begin(&irqtime->sync);
 	/*
 	 * We do not account for softirq time from ksoftirqd here.
 	 * We want to continue accounting softirq time to ksoftirqd thread
@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr)
 	 * that do not consume any time, but still wants to run.
 	 */
 	if (hardirq_count())
-		__this_cpu_add(cpu_hardirq_time, delta);
+		irqtime->hardirq_time += delta;
 	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
-		__this_cpu_add(cpu_softirq_time, delta);
+		irqtime->softirq_time += delta;

-	irq_time_write_end();
+	u64_stats_update_end(&irqtime->sync);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);

-static cputime_t irqtime_account_hi_update(cputime_t maxtime)
+static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime)
 {
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
-	unsigned long flags;
 	cputime_t irq_cputime;

-	local_irq_save(flags);
-	irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
-		      cpustat[CPUTIME_IRQ];
+	irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx];
 	irq_cputime = min(irq_cputime, maxtime);
-	cpustat[CPUTIME_IRQ] += irq_cputime;
-	local_irq_restore(flags);
+	cpustat[idx] += irq_cputime;
+
 	return irq_cputime;
 }

+static cputime_t irqtime_account_hi_update(cputime_t maxtime)
+{
+	return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time),
+				      CPUTIME_IRQ, maxtime);
+}
+
 static cputime_t irqtime_account_si_update(cputime_t maxtime)
 {
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
-	unsigned long flags;
-	cputime_t softirq_cputime;
-
-	local_irq_save(flags);
-	softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
-			  cpustat[CPUTIME_SOFTIRQ];
-	softirq_cputime = min(softirq_cputime, maxtime);
-	cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
-	local_irq_restore(flags);
-	return softirq_cputime;
+	return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time),
+				      CPUTIME_SOFTIRQ, maxtime);
 }

 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
@ -295,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max)
 {
 	cputime_t accounted;

+	/* Shall be converted to a lockdep-enabled lightweight check */
+	WARN_ON_ONCE(!irqs_disabled());
+
 	accounted = steal_account_process_time(max);

 	if (accounted < max)
@ -306,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max)
 	return accounted;
 }

+#ifdef CONFIG_64BIT
+static inline u64 read_sum_exec_runtime(struct task_struct *t)
+{
+	return t->se.sum_exec_runtime;
+}
+#else
+static u64 read_sum_exec_runtime(struct task_struct *t)
+{
+	u64 ns;
+	struct rq_flags rf;
+	struct rq *rq;
+
+	rq = task_rq_lock(t, &rf);
+	ns = t->se.sum_exec_runtime;
+	task_rq_unlock(rq, t, &rf);
+
+	return ns;
+}
+#endif
+
 /*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
@ -318,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 	unsigned int seq, nextseq;
 	unsigned long flags;

+	/*
+	 * Update current task runtime to account pending time since last
+	 * scheduler action or thread_group_cputime() call. This thread group
+	 * might have other running tasks on different CPUs, but updating
+	 * their runtime can affect syscall performance, so we skip account
+	 * those pending times and rely only on values updated on tick or
+	 * other scheduler action.
+	 */
+	if (same_thread_group(current, tsk))
+		(void) task_sched_runtime(current);
+
 	rcu_read_lock();
 	/* Attempt a lockless read on the first round. */
 	nextseq = 0;
@ -332,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 			task_cputime(t, &utime, &stime);
 			times->utime += utime;
 			times->stime += stime;
-			times->sum_exec_runtime += task_sched_runtime(t);
+			times->sum_exec_runtime += read_sum_exec_runtime(t);
 		}
 		/* If lockless access failed, take the lock. */
 		nextseq = 1;
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
 static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
 {
 	struct rq *later_rq = NULL;
-	bool fallback = false;

 	later_rq = find_lock_later_rq(p, rq);
-
 	if (!later_rq) {
 		int cpu;

@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 		 * If we cannot preempt any rq, fall back to pick any
 		 * online cpu.
 		 */
-		fallback = true;
 		cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
 		if (cpu >= nr_cpu_ids) {
 			/*
@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 		double_lock_balance(rq, later_rq);
 	}

-	/*
-	 * By now the task is replenished and enqueued; migrate it.
-	 */
-	deactivate_task(rq, p, 0);
 	set_task_cpu(p, later_rq->cpu);
-	activate_task(later_rq, p, 0);
-
-	if (!fallback)
-		resched_curr(later_rq);
-
 	double_unlock_balance(later_rq, rq);

 	return later_rq;
@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
 * one, and to (try to!) reconcile itself with its own scheduling
 * parameters.
 */
-static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
-				       struct sched_dl_entity *pi_se)
+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
 {
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);

+	WARN_ON(dl_se->dl_boosted);
 	WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));

 	/*
@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
 	 * future; in fact, we must consider execution overheads (time
 	 * spent on hardirq context, etc.).
 	 */
-	dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
-	dl_se->runtime = pi_se->dl_runtime;
+	dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
+	dl_se->runtime = dl_se->dl_runtime;
 }

 /*
@ -641,6 +629,24 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 		goto unlock;
 	}

+#ifdef CONFIG_SMP
+	if (unlikely(!rq->online)) {
+		/*
+		 * If the runqueue is no longer available, migrate the
+		 * task elsewhere. This necessarily changes rq.
+		 */
+		lockdep_unpin_lock(&rq->lock, rf.cookie);
+		rq = dl_task_offline_migration(rq, p);
+		rf.cookie = lockdep_pin_lock(&rq->lock);
+
+		/*
+		 * Now that the task has been migrated to the new RQ and we
+		 * have that locked, proceed as normal and enqueue the task
+		 * there.
+		 */
+	}
+#endif
+
 	enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
 	if (dl_task(rq->curr))
 		check_preempt_curr_dl(rq, p, 0);
@ -648,22 +654,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 		resched_curr(rq);

 #ifdef CONFIG_SMP
-	/*
-	 * Perform balancing operations here; after the replenishments.  We
-	 * cannot drop rq->lock before this, otherwise the assertion in
-	 * start_dl_timer() about not missing updates is not true.
-	 *
-	 * If we find that the rq the task was on is no longer available, we
-	 * need to select a new rq.
-	 *
-	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
-	 */
-	if (unlikely(!rq->online)) {
-		lockdep_unpin_lock(&rq->lock, rf.cookie);
-		rq = dl_task_offline_migration(rq, p);
-		rf.cookie = lockdep_pin_lock(&rq->lock);
-	}
-
 	/*
 	 * Queueing this task back might have overloaded rq, check if we need
 	 * to kick someone away.
@ -797,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
 	if (dl_rq->earliest_dl.curr == 0 ||
 	    dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
 		dl_rq->earliest_dl.curr = deadline;
-		cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
+		cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
 	}
 }

@ -812,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
 	if (!dl_rq->dl_nr_running) {
 		dl_rq->earliest_dl.curr = 0;
 		dl_rq->earliest_dl.next = 0;
-		cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+		cpudl_clear(&rq->rd->cpudl, rq->cpu);
 	} else {
 		struct rb_node *leftmost = dl_rq->rb_leftmost;
 		struct sched_dl_entity *entry;

 		entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
 		dl_rq->earliest_dl.curr = entry->deadline;
-		cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
+		cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
 	}
 }

@ -1670,7 +1660,7 @@ static void rq_online_dl(struct rq *rq)

 	cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
 	if (rq->dl.dl_nr_running > 0)
-		cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
+		cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
 }

 /* Assumes rq->lock is held */
@ -1679,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq)
 	if (rq->dl.overloaded)
 		dl_clear_overload(rq);

-	cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+	cpudl_clear(&rq->rd->cpudl, rq->cpu);
 	cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
 }

@ -1722,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
-	if (dl_time_before(p->dl.deadline, rq_clock(rq)))
-		setup_new_dl_entity(&p->dl, &p->dl);

-	if (task_on_rq_queued(p) && rq->curr != p) {
+	/* If p is not queued we will update its parameters at next wakeup. */
+	if (!task_on_rq_queued(p))
+		return;
+
+	/*
+	 * If p is boosted we already updated its params in
+	 * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
+	 * p's deadline being now already after rq_clock(rq).
+	 */
+	if (dl_time_before(p->dl.deadline, rq_clock(rq)))
+		setup_new_dl_entity(&p->dl);
+
+	if (rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
 			queue_push_tasks(rq);
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group

 #define P(F) \
 	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
+#define P_SCHEDSTAT(F) \
+	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)schedstat_val(F))
 #define PN(F) \
 	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN_SCHEDSTAT(F) \
+	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))

 	if (!se)
 		return;
@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 	PN(se->exec_start);
 	PN(se->vruntime);
 	PN(se->sum_exec_runtime);
-#ifdef CONFIG_SCHEDSTATS
 	if (schedstat_enabled()) {
-		PN(se->statistics.wait_start);
-		PN(se->statistics.sleep_start);
-		PN(se->statistics.block_start);
-		PN(se->statistics.sleep_max);
-		PN(se->statistics.block_max);
-		PN(se->statistics.exec_max);
-		PN(se->statistics.slice_max);
-		PN(se->statistics.wait_max);
-		PN(se->statistics.wait_sum);
-		P(se->statistics.wait_count);
+		PN_SCHEDSTAT(se->statistics.wait_start);
+		PN_SCHEDSTAT(se->statistics.sleep_start);
+		PN_SCHEDSTAT(se->statistics.block_start);
+		PN_SCHEDSTAT(se->statistics.sleep_max);
+		PN_SCHEDSTAT(se->statistics.block_max);
+		PN_SCHEDSTAT(se->statistics.exec_max);
+		PN_SCHEDSTAT(se->statistics.slice_max);
+		PN_SCHEDSTAT(se->statistics.wait_max);
+		PN_SCHEDSTAT(se->statistics.wait_sum);
+		P_SCHEDSTAT(se->statistics.wait_count);
 	}
-#endif
 	P(se->load.weight);
 #ifdef CONFIG_SMP
 	P(se->avg.load_avg);
 	P(se->avg.util_avg);
 #endif
+
+#undef PN_SCHEDSTAT
 #undef PN
+#undef P_SCHEDSTAT
 #undef P
 }
 #endif
@ -429,9 +434,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 		p->prio);

 	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-		SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),
+		SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
 		SPLIT_NS(p->se.sum_exec_runtime),
-		SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime)));
+		SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));

 #ifdef CONFIG_NUMA_BALANCING
 	SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
@ -626,9 +631,7 @@ do {									\
 #undef P64
 #endif

-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
-
+#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, schedstat_val(rq->n));
 	if (schedstat_enabled()) {
 		P(yld_count);
 		P(sched_count);
@ -636,9 +639,8 @@ do {									\
 		P(ttwu_count);
 		P(ttwu_local);
 	}
-
 #undef P
-#endif
+
 	spin_lock_irqsave(&sched_debug_lock, flags);
 	print_cfs_stats(m, cpu);
 	print_rt_stats(m, cpu);
@ -868,10 +870,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
 #define P(F) \
 	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define P_SCHEDSTAT(F) \
+	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
 #define __PN(F) \
 	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
 #define PN(F) \
 	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#define PN_SCHEDSTAT(F) \
+	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))

 	PN(se.exec_start);
 	PN(se.vruntime);
@ -881,37 +887,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)

 	P(se.nr_migrations);

-#ifdef CONFIG_SCHEDSTATS
 	if (schedstat_enabled()) {
 		u64 avg_atom, avg_per_cpu;

-		PN(se.statistics.sum_sleep_runtime);
-		PN(se.statistics.wait_start);
-		PN(se.statistics.sleep_start);
-		PN(se.statistics.block_start);
-		PN(se.statistics.sleep_max);
-		PN(se.statistics.block_max);
-		PN(se.statistics.exec_max);
-		PN(se.statistics.slice_max);
-		PN(se.statistics.wait_max);
-		PN(se.statistics.wait_sum);
-		P(se.statistics.wait_count);
-		PN(se.statistics.iowait_sum);
-		P(se.statistics.iowait_count);
-		P(se.statistics.nr_migrations_cold);
-		P(se.statistics.nr_failed_migrations_affine);
-		P(se.statistics.nr_failed_migrations_running);
-		P(se.statistics.nr_failed_migrations_hot);
-		P(se.statistics.nr_forced_migrations);
-		P(se.statistics.nr_wakeups);
-		P(se.statistics.nr_wakeups_sync);
-		P(se.statistics.nr_wakeups_migrate);
-		P(se.statistics.nr_wakeups_local);
-		P(se.statistics.nr_wakeups_remote);
-		P(se.statistics.nr_wakeups_affine);
-		P(se.statistics.nr_wakeups_affine_attempts);
-		P(se.statistics.nr_wakeups_passive);
-		P(se.statistics.nr_wakeups_idle);
+		PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
+		PN_SCHEDSTAT(se.statistics.wait_start);
+		PN_SCHEDSTAT(se.statistics.sleep_start);
+		PN_SCHEDSTAT(se.statistics.block_start);
+		PN_SCHEDSTAT(se.statistics.sleep_max);
+		PN_SCHEDSTAT(se.statistics.block_max);
+		PN_SCHEDSTAT(se.statistics.exec_max);
+		PN_SCHEDSTAT(se.statistics.slice_max);
+		PN_SCHEDSTAT(se.statistics.wait_max);
+		PN_SCHEDSTAT(se.statistics.wait_sum);
+		P_SCHEDSTAT(se.statistics.wait_count);
+		PN_SCHEDSTAT(se.statistics.iowait_sum);
+		P_SCHEDSTAT(se.statistics.iowait_count);
+		P_SCHEDSTAT(se.statistics.nr_migrations_cold);
+		P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
+		P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
+		P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
+		P_SCHEDSTAT(se.statistics.nr_forced_migrations);
+		P_SCHEDSTAT(se.statistics.nr_wakeups);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_local);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
+		P_SCHEDSTAT(se.statistics.nr_wakeups_idle);

 		avg_atom = p->se.sum_exec_runtime;
 		if (nr_switches)
@ -930,7 +935,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 		__PN(avg_atom);
 		__PN(avg_per_cpu);
 	}
-#endif
+
 	__P(nr_switches);
 	SEQ_printf(m, "%-45s:%21Ld\n",
 		   "nr_voluntary_switches", (long long)p->nvcsw);
@ -947,8 +952,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 #endif
 	P(policy);
 	P(prio);
+#undef PN_SCHEDSTAT
 #undef PN
 #undef __PN
+#undef P_SCHEDSTAT
 #undef P
 #undef __P

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@ -27,8 +27,8 @@ static struct task_struct *
 pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
 {
 	put_prev_task(rq, prev);
-
-	schedstat_inc(rq, sched_goidle);
+	update_idle_core(rq);
+	schedstat_inc(rq->sched_goidle);
 	return rq->idle;
 }

--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@ -2,6 +2,7 @@
 #include <linux/sched.h>
 #include <linux/sched/sysctl.h>
 #include <linux/sched/rt.h>
+#include <linux/u64_stats_sync.h>
 #include <linux/sched/deadline.h>
 #include <linux/binfmts.h>
 #include <linux/mutex.h>
@ -15,6 +16,12 @@
 #include "cpudeadline.h"
 #include "cpuacct.h"

+#ifdef CONFIG_SCHED_DEBUG
+#define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
+#else
+#define SCHED_WARN_ON(x)	((void)(x))
+#endif
+
 struct rq;
 struct cpuidle_state;

@ -565,6 +572,8 @@ struct root_domain {
 	 */
 	cpumask_var_t rto_mask;
 	struct cpupri cpupri;
+
+	unsigned long max_cpu_capacity;
 };

 extern struct root_domain def_root_domain;
@ -597,7 +606,6 @@ struct rq {
 #ifdef CONFIG_SMP
 	unsigned long last_load_update_tick;
 #endif /* CONFIG_SMP */
-	u64 nohz_stamp;
 	unsigned long nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
 #ifdef CONFIG_NO_HZ_FULL
@ -723,6 +731,23 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }

+
+#ifdef CONFIG_SCHED_SMT
+
+extern struct static_key_false sched_smt_present;
+
+extern void __update_idle_core(struct rq *rq);
+
+static inline void update_idle_core(struct rq *rq)
+{
+	if (static_branch_unlikely(&sched_smt_present))
+		__update_idle_core(rq);
+}
+
+#else
+static inline void update_idle_core(struct rq *rq) { }
+#endif
+
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
@ -857,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_busy);
 DECLARE_PER_CPU(struct sched_domain *, sd_asym);

 struct sched_group_capacity {
@ -870,10 +895,6 @@ struct sched_group_capacity {
 	unsigned int capacity;
 	unsigned long next_update;
 	int imbalance; /* XXX unrelated to capacity but shared group state */
-	/*
-	 * Number of busy cpus in this group.
-	 */
-	atomic_t nr_busy_cpus;

 	unsigned long cpumask[0]; /* iteration mask */
 };
@ -1260,6 +1281,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 	prev->sched_class->put_prev_task(rq, prev);
 }

+static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
+{
+	curr->sched_class->set_curr_task(rq);
+}
+
 #define sched_class_highest (&stop_sched_class)
 #define for_each_class(class) \
   for (class = sched_class_highest; class; class = class->next)
@ -1290,7 +1316,7 @@ static inline void idle_set_state(struct rq *rq,

 static inline struct cpuidle_state *idle_get_state(struct rq *rq)
 {
-	WARN_ON(!rcu_read_lock_held());
+	SCHED_WARN_ON(!rcu_read_lock_held());
 	return rq->idle_state;
 }
 #else
@ -1710,52 +1736,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
 #endif

 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+struct irqtime {
+	u64			hardirq_time;
+	u64			softirq_time;
+	u64			irq_start_time;
+	struct u64_stats_sync	sync;
+};

-DECLARE_PER_CPU(u64, cpu_hardirq_time);
-DECLARE_PER_CPU(u64, cpu_softirq_time);
-
-#ifndef CONFIG_64BIT
-DECLARE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
-	__this_cpu_inc(irq_time_seq.sequence);
-	smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
-	smp_wmb();
-	__this_cpu_inc(irq_time_seq.sequence);
-}
+DECLARE_PER_CPU(struct irqtime, cpu_irqtime);

 static inline u64 irq_time_read(int cpu)
 {
-	u64 irq_time;
-	unsigned seq;
+	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+	unsigned int seq;
+	u64 total;

 	do {
-		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
-		irq_time = per_cpu(cpu_softirq_time, cpu) +
-			   per_cpu(cpu_hardirq_time, cpu);
-	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+		seq = __u64_stats_fetch_begin(&irqtime->sync);
+		total = irqtime->softirq_time + irqtime->hardirq_time;
+	} while (__u64_stats_fetch_retry(&irqtime->sync, seq));

-	return irq_time;
+	return total;
 }
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
-	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
-}
-#endif /* CONFIG_64BIT */
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */

 #ifdef CONFIG_CPU_FREQ
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 	if (rq)
 		rq->rq_sched_info.run_delay += delta;
 }
-# define schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
-# define schedstat_inc(rq, field)	do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
-# define schedstat_add(rq, field, amt)	do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
-# define schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0)
-# define schedstat_val(rq, field)	((schedstat_enabled()) ? (rq)->field : 0)
+#define schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
+#define schedstat_inc(var)		do { if (schedstat_enabled()) { var++; } } while (0)
+#define schedstat_add(var, amt)		do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define schedstat_set(var, val)		do { if (schedstat_enabled()) { var = (val); } } while (0)
+#define schedstat_val(var)		(var)
+#define schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)

 #else /* !CONFIG_SCHEDSTATS */
 static inline void
@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {}
-# define schedstat_enabled()		0
-# define schedstat_inc(rq, field)	do { } while (0)
-# define schedstat_add(rq, field, amt)	do { } while (0)
-# define schedstat_set(var, val)	do { } while (0)
-# define schedstat_val(rq, field)	0
-#endif
+#define schedstat_enabled()		0
+#define schedstat_inc(var)		do { } while (0)
+#define schedstat_add(var, amt)		do { } while (0)
+#define schedstat_set(var, val)		do { } while (0)
+#define schedstat_val(var)		0
+#define schedstat_val_or_zero(var)	0
+#endif /* CONFIG_SCHEDSTATS */

 #ifdef CONFIG_SCHED_INFO
 static inline void sched_info_reset_dequeued(struct task_struct *t)
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
 }
 EXPORT_SYMBOL(prepare_to_wait_exclusive);

+void init_wait_entry(wait_queue_t *wait, int flags)
+{
+	wait->flags = flags;
+	wait->private = current;
+	wait->func = autoremove_wake_function;
+	INIT_LIST_HEAD(&wait->task_list);
+}
+EXPORT_SYMBOL(init_wait_entry);
+
 long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
 {
 	unsigned long flags;
-
-	if (signal_pending_state(state, current))
-		return -ERESTARTSYS;
-
-	wait->private = current;
-	wait->func = autoremove_wake_function;
+	long ret = 0;

 	spin_lock_irqsave(&q->lock, flags);
-	if (list_empty(&wait->task_list)) {
-		if (wait->flags & WQ_FLAG_EXCLUSIVE)
-			__add_wait_queue_tail(q, wait);
-		else
-			__add_wait_queue(q, wait);
+	if (unlikely(signal_pending_state(state, current))) {
+		/*
+		 * Exclusive waiter must not fail if it was selected by wakeup,
+		 * it should "consume" the condition we were waiting for.
+		 *
+		 * The caller will recheck the condition and return success if
+		 * we were already woken up, we can not miss the event because
+		 * wakeup locks/unlocks the same q->lock.
+		 *
+		 * But we need to ensure that set-condition + wakeup after that
+		 * can't see us, it should wake up another exclusive waiter if
+		 * we fail.
+		 */
+		list_del_init(&wait->task_list);
+		ret = -ERESTARTSYS;
+	} else {
+		if (list_empty(&wait->task_list)) {
+			if (wait->flags & WQ_FLAG_EXCLUSIVE)
+				__add_wait_queue_tail(q, wait);
+			else
+				__add_wait_queue(q, wait);
+		}
+		set_current_state(state);
 	}
-	set_current_state(state);
 	spin_unlock_irqrestore(&q->lock, flags);

-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(prepare_to_wait_event);

@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
 }
 EXPORT_SYMBOL(finish_wait);

-/**
- * abort_exclusive_wait - abort exclusive waiting in a queue
- * @q: waitqueue waited on
- * @wait: wait descriptor
- * @mode: runstate of the waiter to be woken
- * @key: key to identify a wait bit queue or %NULL
- *
- * Sets current thread back to running state and removes
- * the wait descriptor from the given waitqueue if still
- * queued.
- *
- * Wakes up the next waiter if the caller is concurrently
- * woken up through the queue.
- *
- * This prevents waiter starvation where an exclusive waiter
- * aborts and is woken up concurrently and no one wakes up
- * the next waiter.
- */
-void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
-			unsigned int mode, void *key)
-{
-	unsigned long flags;
-
-	__set_current_state(TASK_RUNNING);
-	spin_lock_irqsave(&q->lock, flags);
-	if (!list_empty(&wait->task_list))
-		list_del_init(&wait->task_list);
-	else if (waitqueue_active(q))
-		__wake_up_locked_key(q, mode, key);
-	spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL(abort_exclusive_wait);
-
 int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
 	int ret = default_wake_function(wait, mode, sync, key);
@ -425,20 +413,29 @@ int __sched
 __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
 			wait_bit_action_f *action, unsigned mode)
 {
-	do {
-		int ret;
+	int ret = 0;

+	for (;;) {
 		prepare_to_wait_exclusive(wq, &q->wait, mode);
-		if (!test_bit(q->key.bit_nr, q->key.flags))
-			continue;
-		ret = action(&q->key, mode);
-		if (!ret)
-			continue;
-		abort_exclusive_wait(wq, &q->wait, mode, &q->key);
-		return ret;
-	} while (test_and_set_bit(q->key.bit_nr, q->key.flags));
-	finish_wait(wq, &q->wait);
-	return 0;
+		if (test_bit(q->key.bit_nr, q->key.flags)) {
+			ret = action(&q->key, mode);
+			/*
+			 * See the comment in prepare_to_wait_event().
+			 * finish_wait() does not necessarily takes wq->lock,
+			 * but test_and_set_bit() implies mb() which pairs with
+			 * smp_mb__after_atomic() before wake_up_page().
+			 */
+			if (ret)
+				finish_wait(wq, &q->wait);
+		}
+		if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
+			if (!ret)
+				finish_wait(wq, &q->wait);
+			return 0;
+		} else if (ret) {
+			return ret;
+		}
+	}
 }
 EXPORT_SYMBOL(__wait_on_bit_lock);

--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data)

 		if (kthread_should_park()) {
 			__set_current_state(TASK_RUNNING);
-			preempt_enable();
 			if (ht->park && td->status == HP_THREAD_ACTIVE) {
 				BUG_ON(td->cpu != smp_processor_id());
 				ht->park(td->cpu);
 				td->status = HP_THREAD_PARKED;
 			}
+			preempt_enable();
 			kthread_parkme();
 			/* We might have been woken for stop */
 			continue;
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@ -121,6 +121,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
 	cpu_stop_init_done(&done, 1);
 	if (!cpu_stop_queue_work(cpu, &work))
 		return -ENOENT;
+	/*
+	 * In case @cpu == smp_proccessor_id() we can avoid a sleep+wakeup
+	 * cycle by doing a preemption:
+	 */
+	cond_resched();
 	wait_for_completion(&done.completion);
 	return done.ret;
 }
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@ -1165,7 +1165,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
 	}

 	/* See similar comment in do_numa_page for explanation */
-	if (!(vma->vm_flags & VM_WRITE))
+	if (!pmd_write(pmd))
 		flags |= TNF_NO_GROUP;

 	/*
--- a/mm/memory.c
+++ b/mm/memory.c
@ -3395,7 +3395,7 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
 	 * pte_dirty has unpredictable behaviour between PTE scan updates,
 	 * background writeback, dirty balancing and application behaviour.
 	 */
-	if (!(vma->vm_flags & VM_WRITE))
+	if (!pte_write(pte))
 		flags |= TNF_NO_GROUP;

 	/*
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@ -175,6 +175,7 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func,
 		"__stack_chk_fail",
 		"panic",
 		"do_exit",
+		"do_task_dead",
 		"__module_put_and_exit",
 		"complete_and_exit",
 		"kvm_spurious_fault",