From 95e904c7da715aa2dbfb595da66b63de37a0bb04 Mon Sep 17 00:00:00 2001
From: Rabin Vincent
Date: Sun, 11 May 2008 05:55:33 +0530
Subject: [PATCH 1/5] sched: fix defined-but-unused warning

Fix this warning, which appears with !CONFIG_SMP:

  kernel/sched.c:1216: warning: `init_hrtick' defined but not used

Signed-off-by: Rabin Vincent
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index eaf6751e7612..04228524d160 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1127,6 +1127,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+#ifdef CONFIG_SMP
 static void hotplug_hrtick_disable(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -1182,6 +1183,7 @@ static void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
+#endif /* CONFIG_SMP */
 
 static void init_rq_hrtick(struct rq *rq)
 {

From 49307fd6f72bdd68cc2bd23e7da0bcfecf8087c9 Mon Sep 17 00:00:00 2001
From: Dario Faggioli
Date: Wed, 18 Jun 2008 09:18:38 +0200
Subject: [PATCH 2/5] sched: NULL pointer dereference while setting sched_rt_period_us
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When CONFIG_RT_GROUP_SCHED and CONFIG_CGROUP_SCHED are enabled, with:

  echo 10000 > /proc/sys/kernel/sched_rt_period_us

We get this:

BUG: unable to handle kernel NULL pointer dereference at 0000008c
[ 947.682233] IP: [] __rt_schedulable+0x12/0x160
[ 947.683123] *pde = 00000000
[ 947.683782] Oops: 0000 [#1]
[ 947.684307] Modules linked in:
[ 947.684308]
[ 947.684308] Pid: 2359, comm: bash Not tainted (2.6.26-rc6 #8)
[ 947.684308] EIP: 0060:[] EFLAGS: 00000246 CPU: 0
[ 947.684308] EIP is at __rt_schedulable+0x12/0x160
[ 947.684308] EAX: 00000000 EBX: 00000000 ECX: 00000000 EDX: 00000001
[ 947.684308] ESI: c0521db4 EDI: 00000001 EBP: c6cc9f00 ESP: c6cc9ed0
[ 947.684308] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
[ 947.684308] Process bash (pid: 2359, ti=c6cc8000 task=c7a54f00 task.ti=c6cc8000)
[ 947.684308] Stack: c0222790 00000000 080f8c08 c0521db4 c6cc9f00 00000001 00000000 00000000
[ 947.684308]        c6cc9f9c 00000000 c0521db4 00000001 c6cc9f28 c0216d40 00000000 00000000
[ 947.684308]        c6cc9f9c 000f4240 000e7ef0 ffffffff c0521db4 c79dfb60 c6cc9f58 c02af2cc
[ 947.684308] Call Trace:
[ 947.684308] [] ? do_proc_dointvec_conv+0x0/0x50
[ 947.684308] [] ? sched_rt_handler+0x80/0x110
[ 947.684308] [] ? proc_sys_call_handler+0x9c/0xb0
[ 947.684308] [] ? proc_sys_write+0x1a/0x20
[ 947.684308] [] ? vfs_write+0x96/0x160
[ 947.684308] [] ? proc_sys_write+0x0/0x20
[ 947.684308] [] ? sys_write+0x3d/0x70
[ 947.684308] [] ? sysenter_past_esp+0x6a/0x91
[ 947.684308] =======================
[ 947.684308] Code: 24 04 e8 62 b1 0e 00 89 c7 89 f8 8b 5d f4 8b 75 f8 8b 7d fc 89 ec 5d c3 90 55 89 e5 57 56 53 83 ec 24 89 45 ec 89 55 e4 89 4d e8 <8b> b8 8c 00 00 00 85 ff 0f 84 c9 00 00 00 8b 57 24 39 55 e8 8b
[ 947.684308] EIP: [] __rt_schedulable+0x12/0x160 SS:ESP 0068:c6cc9ed0

We think the following patch solves the issue.
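As a purely illustrative aside (not part of the fix itself): the oops shows
EAX: 00000000, consistent with __rt_schedulable() being handed a NULL
task_group from the sysctl path and faulting on the unguarded tg->parent
load. Below is a minimal standalone userspace C sketch of the same guard;
the struct and function names are invented for the sketch only, and the
actual change is just the one-liner in the diff that follows.

#include <stdio.h>

struct group {				/* toy stand-in for struct task_group */
	struct group *parent;
};

static struct group *parent_of(struct group *tg)
{
	/* guarded lookup: a NULL group simply has no parent */
	return tg ? tg->parent : NULL;
}

int main(void)
{
	struct group root  = { .parent = NULL };
	struct group child = { .parent = &root };

	printf("child's parent: %p\n", (void *)parent_of(&child));
	printf("NULL group's parent: %p\n", (void *)parent_of(NULL)); /* no fault */
	return 0;
}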
Signed-off-by: Dario Faggioli
Signed-off-by: Michael Trimarchi
Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 04228524d160..320e9a43d4cc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8350,7 +8350,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 #ifdef CONFIG_CGROUP_SCHED
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-	struct task_group *tgi, *parent = tg->parent;
+	struct task_group *tgi, *parent = tg ? tg->parent : NULL;
 	unsigned long total = 0;
 
 	if (!parent) {

From 7ea56616ba6b3d67a4892728182e38ae162ea3e7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 19 Jun 2008 09:06:56 +0200
Subject: [PATCH 3/5] sched: rt-group: fix hierarchy

Don't re-set the entity's runqueue to the wrong rq after we've set it to
the right one.

Signed-off-by: Peter Zijlstra
Tested-by: Daniel K.
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 320e9a43d4cc..ce375cb551e9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7628,7 +7628,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 	else
 		rt_se->rt_rq = parent->my_q;
 
-	rt_se->rt_rq = &rq->rt;
 	rt_se->my_q = rt_rq;
 	rt_se->parent = parent;
 	INIT_LIST_HEAD(&rt_se->run_list);

From ad2a3f13b7258a5daaaeb8cff9f835aac468b71d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 19 Jun 2008 09:06:57 +0200
Subject: [PATCH 4/5] sched: rt-group: hierarchy aware throttle

The bandwidth throttle code dequeues a group when it runs out of quota,
and re-queues it once the period rolls over and the quota gets refreshed.

Sadly, it failed to take the hierarchy into consideration. Share more of
the enqueue/dequeue code with regular task operations.

Also, some operations such as sched_setscheduler() can dequeue/enqueue
tasks that are in throttled runqueues; we should not inadvertently
re-enqueue empty runqueues, so check for that.

Signed-off-by: Peter Zijlstra
Tested-by: Daniel K.
Signed-off-by: Ingo Molnar
---
 kernel/sched_rt.c | 59 ++++++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 26 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3432d573205d..837241568d76 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -449,13 +449,19 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 #endif
 }
 
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
 	struct rt_rq *group_rq = group_rt_rq(rt_se);
 
-	if (group_rq && rt_rq_throttled(group_rq))
+	/*
+	 * Don't enqueue the group if it's throttled, or when empty.
+	 * The latter is a consequence of the former when a child group
+	 * gets throttled and the current group doesn't have any other
+	 * active members.
+	 */
+	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
 		return;
 
 	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -464,7 +470,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
 	inc_rt_tasks(rt_se, rt_rq);
 }
 
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
 {
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
@@ -480,11 +486,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
  * Because the prio of an upper entry depends on the lower
  * entries, we must remove entries top - down.
  */
-static void dequeue_rt_stack(struct task_struct *p)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 {
-	struct sched_rt_entity *rt_se, *back = NULL;
+	struct sched_rt_entity *back = NULL;
 
-	rt_se = &p->rt;
 	for_each_sched_rt_entity(rt_se) {
 		rt_se->back = back;
 		back = rt_se;
@@ -492,7 +497,26 @@ static void dequeue_rt_stack(struct task_struct *p)
 
 	for (rt_se = back; rt_se; rt_se = rt_se->back) {
 		if (on_rt_rq(rt_se))
-			dequeue_rt_entity(rt_se);
+			__dequeue_rt_entity(rt_se);
+	}
+}
+
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
+{
+	dequeue_rt_stack(rt_se);
+	for_each_sched_rt_entity(rt_se)
+		__enqueue_rt_entity(rt_se);
+}
+
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+{
+	dequeue_rt_stack(rt_se);
+
+	for_each_sched_rt_entity(rt_se) {
+		struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+		if (rt_rq && rt_rq->rt_nr_running)
+			__enqueue_rt_entity(rt_se);
 	}
 }
 
@@ -506,32 +530,15 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 	if (wakeup)
 		rt_se->timeout = 0;
 
-	dequeue_rt_stack(p);
-
-	/*
-	 * enqueue everybody, bottom - up.
-	 */
-	for_each_sched_rt_entity(rt_se)
-		enqueue_rt_entity(rt_se);
+	enqueue_rt_entity(rt_se);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
-	struct rt_rq *rt_rq;
 
 	update_curr_rt(rq);
-
-	dequeue_rt_stack(p);
-
-	/*
-	 * re-enqueue all non-empty rt_rq entities.
-	 */
-	for_each_sched_rt_entity(rt_se) {
-		rt_rq = group_rt_rq(rt_se);
-		if (rt_rq && rt_rq->rt_nr_running)
-			enqueue_rt_entity(rt_se);
-	}
+	dequeue_rt_entity(rt_se);
 }
 
 /*

From 15a8641eadb492ef7c5489faa25256967bdfd303 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 19 Jun 2008 09:06:59 +0200
Subject: [PATCH 5/5] sched: rt-group: fix RR buglet

In task_tick_rt() we first call update_curr_rt(), which can dequeue a
runqueue because it has run out of runtime, and then we try to requeue
it if it has also exhausted its RR quota.

Obviously, requeueing something that is no longer on the runqueue will
not have the expected result.

Signed-off-by: Peter Zijlstra
Tested-by: Daniel K.
Signed-off-by: Ingo Molnar
---
 kernel/sched_rt.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 837241568d76..1dad5bbb59b6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -549,8 +549,10 @@ static void
 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
 {
 	struct rt_prio_array *array = &rt_rq->active;
+	struct list_head *queue = array->queue + rt_se_prio(rt_se);
 
-	list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+	if (on_rt_rq(rt_se))
+		list_move_tail(&rt_se->run_list, queue);
 }
 
 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
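
A closing illustration, not part of any of the patches above: a standalone
userspace C sketch of the two guards this series introduces. In patch 4, the
bottom-up enqueue walk skips a group-level entity whose own runqueue is
throttled or empty; in patch 5, a round-robin requeue becomes a no-op for an
entity that is no longer queued. All names below are invented for the sketch;
the real code operates on struct sched_rt_entity and struct rt_rq via
for_each_sched_rt_entity(), group_rt_rq(), on_rt_rq() and list_move_tail().

#include <stdio.h>
#include <stdbool.h>

/* Toy per-level state: either a plain task or a group owning its own queue. */
struct level {
	const char *name;
	struct level *parent;
	bool is_group;
	bool throttled;		/* stand-in for rt_rq_throttled(group_rq) */
	int nr_running;		/* stand-in for group_rq->rt_nr_running   */
	bool queued;		/* stand-in for on_rt_rq(rt_se)           */
};

/* Patch 4's guard: skip throttled or empty groups during the enqueue walk. */
static void enqueue_level(struct level *l)
{
	if (l->is_group && (l->throttled || l->nr_running == 0)) {
		printf("skip    %s (throttled=%d nr_running=%d)\n",
		       l->name, l->throttled, l->nr_running);
		return;
	}
	l->queued = true;
	printf("enqueue %s\n", l->name);
}

/* Bottom-up walk over the hierarchy, as enqueue_rt_entity() now does. */
static void enqueue_stack(struct level *leaf)
{
	for (struct level *l = leaf; l; l = l->parent)
		enqueue_level(l);
}

/* Patch 5's guard: only round-robin an entity that is still queued. */
static void requeue_level(struct level *l)
{
	if (l->queued)
		printf("requeue %s at tail of its priority queue\n", l->name);
	else
		printf("requeue %s skipped, not on a runqueue\n", l->name);
}

int main(void)
{
	struct level root = { "root", NULL,  true,  false, 1, false };
	struct level mid  = { "mid",  &root, true,  true,  0, false };
	struct level task = { "task", &mid,  false, false, 0, false };

	enqueue_stack(&task);	/* "mid" is skipped: throttled and empty   */
	requeue_level(&task);	/* still queued, so the RR move happens    */
	requeue_level(&mid);	/* dequeued by throttling, move is a no-op */
	return 0;
}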