Merge git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched

* git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched: (96 commits)
  sched: keep total / count stats in addition to the max for
  sched, futex: detach sched.h and futex.h
  sched: fix: don't take a mutex from interrupt context
  sched: print backtrace of running tasks too
  printk: use ktime_get()
  softlockup: fix signedness
  sched: latencytop support
  sched: fix goto retry in pick_next_task_rt()
  timers: don't #error on higher HZ values
  sched: monitor clock underflows in /proc/sched_debug
  sched: fix rq->clock warps on frequency changes
  sched: fix, always create kernel threads with normal priority
  debug: clean up kernel/profile.c
  sched: remove the !PREEMPT_BKL code
  sched: make PREEMPT_BKL the default
  debug: track and print last unloaded module in the oops trace
  debug: show being-loaded/being-unloaded indicator for modules
  sched: rt-watchdog: fix .rlim_max = RLIM_INFINITY
  sched: rt-group: reduce rescheduling
  hrtimer: unlock hrtimer_wakeup
  ...
Linus Torvalds 2008-01-25 13:42:32 -08:00
commit 0008bf5440
81 changed files with 6324 additions and 1838 deletions


@ -9,8 +9,8 @@ The first thing resembling RCU was published in 1980, when Kung and Lehman
[Kung80] recommended use of a garbage collector to defer destruction
of nodes in a parallel binary search tree in order to simplify its
implementation. This works well in environments that have garbage
collectors, but current production garbage collectors incur significant
read-side overhead.
collectors, but most production garbage collectors incur significant
overhead.
In 1982, Manber and Ladner [Manber82,Manber84] recommended deferring
destruction until all threads running at that time have terminated, again
@ -99,16 +99,25 @@ locking, reduces contention, reduces memory latency for readers, and
parallelizes pipeline stalls and memory latency for writers. However,
these techniques still impose significant read-side overhead in the
form of memory barriers. Researchers at Sun worked along similar lines
in the same timeframe [HerlihyLM02,HerlihyLMS03]. These techniques
can be thought of as inside-out reference counts, where the count is
represented by the number of hazard pointers referencing a given data
structure (rather than the more conventional counter field within the
data structure itself).
in the same timeframe [HerlihyLM02]. These techniques can be thought
of as inside-out reference counts, where the count is represented by the
number of hazard pointers referencing a given data structure (rather than
the more conventional counter field within the data structure itself).
By the same token, RCU can be thought of as a "bulk reference count",
where some form of reference counter covers all references by a given CPU
or thread during a set timeframe. This timeframe is related to, but
not necessarily exactly the same as, an RCU grace period. In classic
RCU, the reference counter is the per-CPU bit in the "bitmask" field,
and each such bit covers all references that might have been made by
the corresponding CPU during the prior grace period. Of course, RCU
can be thought of in other terms as well.
In 2003, the K42 group described how RCU could be used to create
hot-pluggable implementations of operating-system functions. Later that
year saw a paper describing an RCU implementation of System V IPC
[Arcangeli03], and an introduction to RCU in Linux Journal [McKenney03a].
hot-pluggable implementations of operating-system functions [Appavoo03a].
Later that year saw a paper describing an RCU implementation of System
V IPC [Arcangeli03], and an introduction to RCU in Linux Journal
[McKenney03a].
2004 has seen a Linux-Journal article on use of RCU in dcache
[McKenney04a], a performance comparison of locking to RCU on several
@ -117,10 +126,19 @@ number of operating-system kernels [PaulEdwardMcKenneyPhD], a paper
describing how to make RCU safe for soft-realtime applications [Sarma04c],
and a paper describing SELinux performance with RCU [JamesMorris04b].
2005 has seen further adaptation of RCU to realtime use, permitting
2005 brought further adaptation of RCU to realtime use, permitting
preemption of RCU realtime critical sections [PaulMcKenney05a,
PaulMcKenney05b].
2006 saw the first best-paper award for an RCU paper [ThomasEHart2006a],
as well as further work on efficient implementations of preemptible
RCU [PaulEMcKenney2006b], but priority-boosting of RCU read-side critical
sections proved elusive. An RCU implementation permitting general
blocking in read-side critical sections appeared [PaulEMcKenney2006c],
and Robert Olsson described an RCU-protected trie-hash combination
[RobertOlsson2006a].
Bibtex Entries
@article{Kung80
@ -203,6 +221,41 @@ Bibtex Entries
,Address="New Orleans, LA"
}
@conference{Pu95a,
Author = "Calton Pu and Tito Autrey and Andrew Black and Charles Consel and
Crispin Cowan and Jon Inouye and Lakshmi Kethana and Jonathan Walpole and
Ke Zhang",
Title = "Optimistic Incremental Specialization: Streamlining a Commercial
Operating System",
Booktitle = "15\textsuperscript{th} ACM Symposium on
Operating Systems Principles (SOSP'95)",
address = "Copper Mountain, CO",
month="December",
year="1995",
pages="314-321",
annotation="
Uses a replugger, but with a flag to signal when people are
using the resource at hand. Only one reader at a time.
"
}
@conference{Cowan96a,
Author = "Crispin Cowan and Tito Autrey and Charles Krasic and
Calton Pu and Jonathan Walpole",
Title = "Fast Concurrent Dynamic Linking for an Adaptive Operating System",
Booktitle = "International Conference on Configurable Distributed Systems
(ICCDS'96)",
address = "Annapolis, MD",
month="May",
year="1996",
pages="108",
isbn="0-8186-7395-8",
annotation="
Uses a replugger, but with a counter to signal when people are
using the resource at hand. Allows multiple readers.
"
}
@techreport{Slingwine95
,author="John D. Slingwine and Paul E. McKenney"
,title="Apparatus and Method for Achieving Reduced Overhead Mutual
@ -312,6 +365,49 @@ Andrea Arcangeli and Andi Kleen and Orran Krieger and Rusty Russell"
[Viewed June 23, 2004]"
}
@conference{Michael02a
,author="Maged M. Michael"
,title="Safe Memory Reclamation for Dynamic Lock-Free Objects Using Atomic
Reads and Writes"
,Year="2002"
,Month="August"
,booktitle="{Proceedings of the 21\textsuperscript{st} Annual ACM
Symposium on Principles of Distributed Computing}"
,pages="21-30"
,annotation="
Each thread keeps an array of pointers to items that it is
currently referencing. Sort of an inside-out garbage collection
mechanism, but one that requires the accessing code to explicitly
state its needs. Also requires read-side memory barriers on
most architectures.
"
}
@conference{Michael02b
,author="Maged M. Michael"
,title="High Performance Dynamic Lock-Free Hash Tables and List-Based Sets"
,Year="2002"
,Month="August"
,booktitle="{Proceedings of the 14\textsuperscript{th} Annual ACM
Symposium on Parallel
Algorithms and Architecture}"
,pages="73-82"
,annotation="
Like the title says...
"
}
@InProceedings{HerlihyLM02
,author={Maurice Herlihy and Victor Luchangco and Mark Moir}
,title="The Repeat Offender Problem: A Mechanism for Supporting Dynamic-Sized,
Lock-Free Data Structures"
,booktitle={Proceedings of 16\textsuperscript{th} International
Symposium on Distributed Computing}
,year=2002
,month="October"
,pages="339-353"
}
@article{Appavoo03a
,author="J. Appavoo and K. Hui and C. A. N. Soules and R. W. Wisniewski and
D. M. {Da Silva} and O. Krieger and M. A. Auslander and D. J. Edelsohn and
@ -447,3 +543,95 @@ Oregon Health and Sciences University"
Realtime turns into making RCU yet more realtime friendly.
"
}
@conference{ThomasEHart2006a
,Author="Thomas E. Hart and Paul E. McKenney and Angela Demke Brown"
,Title="Making Lockless Synchronization Fast: Performance Implications
of Memory Reclamation"
,Booktitle="20\textsuperscript{th} {IEEE} International Parallel and
Distributed Processing Symposium"
,month="April"
,year="2006"
,day="25-29"
,address="Rhodes, Greece"
,annotation="
Compares QSBR (AKA "classic RCU"), HPBR, EBR, and lock-free
reference counting.
"
}
@Conference{PaulEMcKenney2006b
,Author="Paul E. McKenney and Dipankar Sarma and Ingo Molnar and
Suparna Bhattacharya"
,Title="Extending RCU for Realtime and Embedded Workloads"
,Booktitle="{Ottawa Linux Symposium}"
,Month="July"
,Year="2006"
,pages="v2 123-138"
,note="Available:
\url{http://www.linuxsymposium.org/2006/view_abstract.php?content_key=184}
\url{http://www.rdrop.com/users/paulmck/RCU/OLSrtRCU.2006.08.11a.pdf}
[Viewed January 1, 2007]"
,annotation="
Described how to improve the -rt implementation of realtime RCU.
"
}
@unpublished{PaulEMcKenney2006c
,Author="Paul E. McKenney"
,Title="Sleepable {RCU}"
,month="October"
,day="9"
,year="2006"
,note="Available:
\url{http://lwn.net/Articles/202847/}
Revised:
\url{http://www.rdrop.com/users/paulmck/RCU/srcu.2007.01.14a.pdf}
[Viewed August 21, 2006]"
,annotation="
LWN article introducing SRCU.
"
}
@unpublished{RobertOlsson2006a
,Author="Robert Olsson and Stefan Nilsson"
,Title="{TRASH}: A dynamic {LC}-trie and hash data structure"
,month="August"
,day="18"
,year="2006"
,note="Available:
\url{http://www.nada.kth.se/~snilsson/public/papers/trash/trash.pdf}
[Viewed February 24, 2007]"
,annotation="
RCU-protected dynamic trie-hash combination.
"
}
@unpublished{ThomasEHart2007a
,Author="Thomas E. Hart and Paul E. McKenney and Angela Demke Brown and Jonathan Walpole"
,Title="Performance of memory reclamation for lockless synchronization"
,journal="J. Parallel Distrib. Comput."
,year="2007"
,note="To appear in J. Parallel Distrib. Comput.
\url{doi=10.1016/j.jpdc.2007.04.010}"
,annotation={
Compares QSBR (AKA "classic RCU"), HPBR, EBR, and lock-free
reference counting. Journal version of ThomasEHart2006a.
}
}
@unpublished{PaulEMcKenney2007QRCUspin
,Author="Paul E. McKenney"
,Title="Using Promela and Spin to verify parallel algorithms"
,month="August"
,day="1"
,year="2007"
,note="Available:
\url{http://lwn.net/Articles/243851/}
[Viewed September 8, 2007]"
,annotation="
LWN article describing Promela and spin, and also using Oleg
Nesterov's QRCU as an example (with Paul McKenney's fastpath).
"
}


@ -36,6 +36,14 @@ o How can the updater tell when a grace period has completed
executed in user mode, or executed in the idle loop, we can
safely free up that item.
Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the
same effect, but require that the readers manipulate CPU-local
counters. These counters allow limited types of blocking
within RCU read-side critical sections. SRCU also uses
CPU-local counters, and permits general blocking within
RCU read-side critical sections. These two variants of
RCU detect grace periods by sampling these counters.
o If I am running on a uniprocessor kernel, which can only do one
thing at a time, why should I wait for a grace period?
@ -46,7 +54,10 @@ o How can I see where RCU is currently used in the Linux kernel?
Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu",
"rcu_read_lock_bh", "rcu_read_unlock_bh", "call_rcu_bh",
"srcu_read_lock", "srcu_read_unlock", "synchronize_rcu",
"synchronize_net", and "synchronize_srcu".
"synchronize_net", "synchronize_srcu", and the other RCU
primitives. Or grab one of the cscope databases from:
http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html
o What guidelines should I follow when writing code that uses RCU?
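For orientation, the primitives listed above combine into a simple
read-side / update-side pattern. A minimal sketch follows (the foo
structure and the foo_get_a()/foo_update_a() functions are illustrative
names, not from the kernel tree; error handling is omitted):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
        int a;
        struct rcu_head rcu;
};

static struct foo *gbl_foo;
static DEFINE_SPINLOCK(foo_lock);

/* Reader: cheap, and must not block under classic RCU. */
int foo_get_a(void)
{
        struct foo *fp;
        int ret;

        rcu_read_lock();
        fp = rcu_dereference(gbl_foo);
        ret = fp->a;
        rcu_read_unlock();
        return ret;
}

/* Updater: publish a new version, wait for pre-existing readers to
 * finish, then free the old version. */
void foo_update_a(int new_a)
{
        struct foo *new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
        struct foo *old_fp;

        spin_lock(&foo_lock);
        old_fp = gbl_foo;
        *new_fp = *old_fp;
        new_fp->a = new_a;
        rcu_assign_pointer(gbl_foo, new_fp);
        spin_unlock(&foo_lock);

        synchronize_rcu();
        kfree(old_fp);
}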
@ -67,7 +78,11 @@ o I hear that RCU is patented? What is with that?
o I hear that RCU needs work in order to support realtime kernels?
Yes, work in progress.
This work is largely completed. Realtime-friendly RCU can be
enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter.
However, work is in progress for enabling priority boosting of
preempted RCU read-side critical sections. This is needed if you
have CPU-bound realtime threads.
o Where can I find more information on RCU?


@ -46,12 +46,13 @@ stat_interval The number of seconds between output of torture
shuffle_interval
The number of seconds to keep the test threads affinitied
to a particular subset of the CPUs. Used in conjunction
with test_no_idle_hz.
to a particular subset of the CPUs, defaults to 5 seconds.
Used in conjunction with test_no_idle_hz.
test_no_idle_hz Whether or not to test the ability of RCU to operate in
a kernel that disables the scheduling-clock interrupt to
idle CPUs. Boolean parameter, "1" to test, "0" otherwise.
Defaults to omitting this test.
torture_type The type of RCU to test: "rcu" for the rcu_read_lock() API,
"rcu_sync" for rcu_read_lock() with synchronous reclamation,
@ -82,8 +83,6 @@ be evident. ;-)
The entries are as follows:
o "ggp": The number of counter flips (or batches) since boot.
o "rtc": The hexadecimal address of the structure currently visible
to readers.
@ -117,8 +116,8 @@ o "Reader Pipe": Histogram of "ages" of structures seen by readers.
o "Reader Batch": Another histogram of "ages" of structures seen
by readers, but in terms of counter flips (or batches) rather
than in terms of grace periods. The legal number of non-zero
entries is again two. The reason for this separate view is
that it is easier to get the third entry to show up in the
entries is again two. The reason for this separate view is that
it is sometimes easier to get the third entry to show up in the
"Reader Batch" list than in the "Reader Pipe" list.
o "Free-Block Circulation": Shows the number of torture structures


@ -109,12 +109,13 @@ Never use anything other than cpumask_t to represent bitmap of CPUs.
for_each_cpu_mask(x,mask) - Iterate over some random collection of cpu mask.
#include <linux/cpu.h>
lock_cpu_hotplug() and unlock_cpu_hotplug():
get_online_cpus() and put_online_cpus():
The above calls are used to inhibit cpu hotplug operations. While holding the
cpucontrol mutex, cpu_online_map will not change. If you merely need to avoid
cpus going away, you could also use preempt_disable() and preempt_enable()
for those sections. Just remember the critical section cannot call any
The above calls are used to inhibit cpu hotplug operations. While the
cpu_hotplug.refcount is non-zero, the cpu_online_map will not change.
If you merely need to avoid cpus going away, you could also use
preempt_disable() and preempt_enable() for those sections.
Just remember the critical section cannot call any
function that can sleep or schedule this process away. The preempt_disable()
will work as long as stop_machine_run() is used to take a cpu down.
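A hedged sketch of the refcount-based interface described above (the
function name and the printk() body are illustrative only):

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>

/* Walk the online CPUs with cpu_online_map held stable.  Unlike the
 * preempt_disable() variant, the region between get_online_cpus()
 * and put_online_cpus() may sleep. */
static void report_online_cpus(void)
{
        int cpu;

        get_online_cpus();
        for_each_online_cpu(cpu)
                printk(KERN_INFO "cpu %d is online\n", cpu);
        put_online_cpus();
}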


@ -79,17 +79,6 @@ static unsigned long dummy_gettimeoffset(void)
}
#endif
/*
* An implementation of printk_clock() independent from
* sched_clock(). This avoids non-bootable kernels when
* printk_clock is enabled.
*/
unsigned long long printk_clock(void)
{
return (unsigned long long)(jiffies - INITIAL_JIFFIES) *
(1000000000 / HZ);
}
static unsigned long next_rtc_update;
/*


@ -71,8 +71,6 @@ unsigned long __per_cpu_offset[NR_CPUS];
EXPORT_SYMBOL(__per_cpu_offset);
#endif
extern void ia64_setup_printk_clock(void);
DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
unsigned long ia64_cycles_per_usec;
@ -507,8 +505,6 @@ setup_arch (char **cmdline_p)
/* process SAL system table: */
ia64_sal_init(__va(efi.sal_systab));
ia64_setup_printk_clock();
#ifdef CONFIG_SMP
cpu_physical_id(0) = hard_smp_processor_id();
#endif


@ -344,33 +344,6 @@ udelay (unsigned long usecs)
}
EXPORT_SYMBOL(udelay);
static unsigned long long ia64_itc_printk_clock(void)
{
if (ia64_get_kr(IA64_KR_PER_CPU_DATA))
return sched_clock();
return 0;
}
static unsigned long long ia64_default_printk_clock(void)
{
return (unsigned long long)(jiffies_64 - INITIAL_JIFFIES) *
(1000000000/HZ);
}
unsigned long long (*ia64_printk_clock)(void) = &ia64_default_printk_clock;
unsigned long long printk_clock(void)
{
return ia64_printk_clock();
}
void __init
ia64_setup_printk_clock(void)
{
if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT))
ia64_printk_clock = ia64_itc_printk_clock;
}
/* IA64 doesn't cache the timezone */
void update_vsyscall_tz(void)
{


@ -64,7 +64,6 @@ extern void sn_timer_init(void);
extern unsigned long last_time_offset;
extern void (*ia64_mark_idle) (int);
extern void snidle(int);
extern unsigned long long (*ia64_printk_clock)(void);
unsigned long sn_rtc_cycles_per_second;
EXPORT_SYMBOL(sn_rtc_cycles_per_second);
@ -360,14 +359,6 @@ sn_scan_pcdp(void)
static unsigned long sn2_rtc_initial;
static unsigned long long ia64_sn2_printk_clock(void)
{
unsigned long rtc_now = rtc_time();
return (rtc_now - sn2_rtc_initial) *
(1000000000 / sn_rtc_cycles_per_second);
}
/**
* sn_setup - SN platform setup routine
* @cmdline_p: kernel command line
@ -468,8 +459,6 @@ void __init sn_setup(char **cmdline_p)
platform_intr_list[ACPI_INTERRUPT_CPEI] = IA64_CPE_VECTOR;
ia64_printk_clock = ia64_sn2_printk_clock;
printk("SGI SAL version %x.%02x\n", version >> 8, version & 0x00FF);
/*


@ -58,13 +58,13 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,
if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
return -EFAULT;
lock_cpu_hotplug();
get_online_cpus();
read_lock(&tasklist_lock);
p = find_process_by_pid(pid);
if (!p) {
read_unlock(&tasklist_lock);
unlock_cpu_hotplug();
put_online_cpus();
return -ESRCH;
}
@ -106,7 +106,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,
out_unlock:
put_task_struct(p);
unlock_cpu_hotplug();
put_online_cpus();
return retval;
}
@ -125,7 +125,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
if (len < real_len)
return -EINVAL;
lock_cpu_hotplug();
get_online_cpus();
read_lock(&tasklist_lock);
retval = -ESRCH;
@ -140,7 +140,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
out_unlock:
read_unlock(&tasklist_lock);
unlock_cpu_hotplug();
put_online_cpus();
if (retval)
return retval;
if (copy_to_user(user_mask_ptr, &mask, real_len))


@ -153,7 +153,7 @@ static int pseries_add_processor(struct device_node *np)
for (i = 0; i < nthreads; i++)
cpu_set(i, tmp);
lock_cpu_hotplug();
cpu_maps_update_begin();
BUG_ON(!cpus_subset(cpu_present_map, cpu_possible_map));
@ -190,7 +190,7 @@ static int pseries_add_processor(struct device_node *np)
}
err = 0;
out_unlock:
unlock_cpu_hotplug();
cpu_maps_update_done();
return err;
}
@ -211,7 +211,7 @@ static void pseries_remove_processor(struct device_node *np)
nthreads = len / sizeof(u32);
lock_cpu_hotplug();
cpu_maps_update_begin();
for (i = 0; i < nthreads; i++) {
for_each_present_cpu(cpu) {
if (get_hard_smp_processor_id(cpu) != intserv[i])
@ -225,7 +225,7 @@ static void pseries_remove_processor(struct device_node *np)
printk(KERN_WARNING "Could not find cpu to remove "
"with physical id 0x%x\n", intserv[i]);
}
unlock_cpu_hotplug();
cpu_maps_update_done();
}
static int pseries_smp_notifier(struct notifier_block *nb,


@ -382,7 +382,7 @@ static void do_event_scan_all_cpus(long delay)
{
int cpu;
lock_cpu_hotplug();
get_online_cpus();
cpu = first_cpu(cpu_online_map);
for (;;) {
set_cpus_allowed(current, cpumask_of_cpu(cpu));
@ -390,15 +390,15 @@ static void do_event_scan_all_cpus(long delay)
set_cpus_allowed(current, CPU_MASK_ALL);
/* Drop hotplug lock, and sleep for the specified delay */
unlock_cpu_hotplug();
put_online_cpus();
msleep_interruptible(delay);
lock_cpu_hotplug();
get_online_cpus();
cpu = next_cpu(cpu, cpu_online_map);
if (cpu == NR_CPUS)
break;
}
unlock_cpu_hotplug();
put_online_cpus();
}
static int rtasd(void *unused)


@ -349,7 +349,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
replace = -1;
/* No CPU hotplug when we change MTRR entries */
lock_cpu_hotplug();
get_online_cpus();
/* Search for existing MTRR */
mutex_lock(&mtrr_mutex);
for (i = 0; i < num_var_ranges; ++i) {
@ -405,7 +405,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
error = i;
out:
mutex_unlock(&mtrr_mutex);
unlock_cpu_hotplug();
put_online_cpus();
return error;
}
@ -495,7 +495,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
max = num_var_ranges;
/* No CPU hotplug when we change MTRR entries */
lock_cpu_hotplug();
get_online_cpus();
mutex_lock(&mtrr_mutex);
if (reg < 0) {
/* Search for existing MTRR */
@ -536,7 +536,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
error = reg;
out:
mutex_unlock(&mtrr_mutex);
unlock_cpu_hotplug();
put_online_cpus();
return error;
}
/**


@ -283,7 +283,7 @@ sysret_careful:
sysret_signal:
TRACE_IRQS_ON
sti
testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
testl $_TIF_DO_NOTIFY_MASK,%edx
jz 1f
/* Really a signal */
@ -377,7 +377,7 @@ int_very_careful:
jmp int_restore_rest
int_signal:
testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
testl $_TIF_DO_NOTIFY_MASK,%edx
jz 1f
movq %rsp,%rdi # &ptregs -> arg1
xorl %esi,%esi # oldset -> arg2
@ -603,7 +603,7 @@ retint_careful:
jmp retint_check
retint_signal:
testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
testl $_TIF_DO_NOTIFY_MASK,%edx
jz retint_swapgs
TRACE_IRQS_ON
sti


@ -436,7 +436,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
return -EINVAL;
}
lock_cpu_hotplug();
get_online_cpus();
mutex_lock(&microcode_mutex);
user_buffer = (void __user *) buf;
@ -447,7 +447,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
ret = (ssize_t)len;
mutex_unlock(&microcode_mutex);
unlock_cpu_hotplug();
put_online_cpus();
return ret;
}
@ -658,14 +658,14 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
old = current->cpus_allowed;
lock_cpu_hotplug();
get_online_cpus();
set_cpus_allowed(current, cpumask_of_cpu(cpu));
mutex_lock(&microcode_mutex);
if (uci->valid)
err = cpu_request_microcode(cpu);
mutex_unlock(&microcode_mutex);
unlock_cpu_hotplug();
put_online_cpus();
set_cpus_allowed(current, old);
}
if (err)
@ -817,9 +817,9 @@ static int __init microcode_init (void)
return PTR_ERR(microcode_pdev);
}
lock_cpu_hotplug();
get_online_cpus();
error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
unlock_cpu_hotplug();
put_online_cpus();
if (error) {
microcode_dev_exit();
platform_device_unregister(microcode_pdev);
@ -839,9 +839,9 @@ static void __exit microcode_exit (void)
unregister_hotcpu_notifier(&mc_cpu_notifier);
lock_cpu_hotplug();
get_online_cpus();
sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
unlock_cpu_hotplug();
put_online_cpus();
platform_device_unregister(microcode_pdev);
}


@ -658,6 +658,9 @@ void do_notify_resume(struct pt_regs *regs, void *_unused,
/* deal with pending signal delivery */
if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
do_signal(regs);
if (thread_info_flags & _TIF_HRTICK_RESCHED)
hrtick_resched();
clear_thread_flag(TIF_IRET);
}


@ -480,6 +480,9 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
/* deal with pending signal delivery */
if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
do_signal(regs);
if (thread_info_flags & _TIF_HRTICK_RESCHED)
hrtick_resched();
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)


@ -33,6 +33,19 @@ static void save_stack_address(void *data, unsigned long addr)
trace->entries[trace->nr_entries++] = addr;
}
static void save_stack_address_nosched(void *data, unsigned long addr)
{
struct stack_trace *trace = (struct stack_trace *)data;
if (in_sched_functions(addr))
return;
if (trace->skip > 0) {
trace->skip--;
return;
}
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = addr;
}
static const struct stacktrace_ops save_stack_ops = {
.warning = save_stack_warning,
.warning_symbol = save_stack_warning_symbol,
@ -40,6 +53,13 @@ static const struct stacktrace_ops save_stack_ops = {
.address = save_stack_address,
};
static const struct stacktrace_ops save_stack_ops_nosched = {
.warning = save_stack_warning,
.warning_symbol = save_stack_warning_symbol,
.stack = save_stack_stack,
.address = save_stack_address_nosched,
};
/*
* Save stack-backtrace addresses into a stack_trace buffer.
*/
@ -50,3 +70,10 @@ void save_stack_trace(struct stack_trace *trace)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
EXPORT_SYMBOL(save_stack_trace);
void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace);
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}


@ -459,7 +459,7 @@ void __init lguest_arch_host_init(void)
/* We don't need the complexity of CPUs coming and going while we're
* doing this. */
lock_cpu_hotplug();
get_online_cpus();
if (cpu_has_pge) { /* We have a broader idea of "global". */
/* Remember that this was originally set (for cleanup). */
cpu_had_pge = 1;
@ -469,20 +469,20 @@ void __init lguest_arch_host_init(void)
/* Turn off the feature in the global feature set. */
clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
}
unlock_cpu_hotplug();
put_online_cpus();
};
/*:*/
void __exit lguest_arch_host_fini(void)
{
/* If we had PGE before we started, turn it back on now. */
lock_cpu_hotplug();
get_online_cpus();
if (cpu_had_pge) {
set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
/* adjust_pge's argument "1" means set PGE. */
on_each_cpu(adjust_pge, (void *)1, 0, 1);
}
unlock_cpu_hotplug();
put_online_cpus();
}


@ -29,12 +29,12 @@ static void sclp_cpu_capability_notify(struct work_struct *work)
struct sys_device *sysdev;
printk(KERN_WARNING TAG "cpu capability changed.\n");
lock_cpu_hotplug();
get_online_cpus();
for_each_online_cpu(cpu) {
sysdev = get_cpu_sysdev(cpu);
kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
}
unlock_cpu_hotplug();
put_online_cpus();
}
static void sclp_conf_receiver_fn(struct evbuf_header *evbuf)


@ -2130,4 +2130,3 @@ source "fs/nls/Kconfig"
source "fs/dlm/Kconfig"
endmenu


@ -310,6 +310,77 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
}
#endif
#ifdef CONFIG_LATENCYTOP
static int lstats_show_proc(struct seq_file *m, void *v)
{
int i;
struct task_struct *task = m->private;
seq_puts(m, "Latency Top version : v0.1\n");
for (i = 0; i < 32; i++) {
if (task->latency_record[i].backtrace[0]) {
int q;
seq_printf(m, "%i %li %li ",
task->latency_record[i].count,
task->latency_record[i].time,
task->latency_record[i].max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
char sym[KSYM_NAME_LEN];
char *c;
if (!task->latency_record[i].backtrace[q])
break;
if (task->latency_record[i].backtrace[q] == ULONG_MAX)
break;
sprint_symbol(sym, task->latency_record[i].backtrace[q]);
c = strchr(sym, '+');
if (c)
*c = 0;
seq_printf(m, "%s ", sym);
}
seq_printf(m, "\n");
}
}
return 0;
}
static int lstats_open(struct inode *inode, struct file *file)
{
int ret;
struct seq_file *m;
struct task_struct *task = get_proc_task(inode);
ret = single_open(file, lstats_show_proc, NULL);
if (!ret) {
m = file->private_data;
m->private = task;
}
return ret;
}
static ssize_t lstats_write(struct file *file, const char __user *buf,
size_t count, loff_t *offs)
{
struct seq_file *m;
struct task_struct *task;
m = file->private_data;
task = m->private;
clear_all_latency_tracing(task);
return count;
}
static const struct file_operations proc_lstats_operations = {
.open = lstats_open,
.read = seq_read,
.write = lstats_write,
.llseek = seq_lseek,
.release = single_release,
};
#endif
/* The badness from the OOM killer */
unsigned long badness(struct task_struct *p, unsigned long uptime);
static int proc_oom_score(struct task_struct *task, char *buffer)
@ -1020,6 +1091,7 @@ static const struct file_operations proc_fault_inject_operations = {
};
#endif
#ifdef CONFIG_SCHED_DEBUG
/*
* Print out various scheduling related per-task fields:
@ -2230,6 +2302,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
REG("latency", S_IRUGO, lstats),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
REG("cpuset", S_IRUGO, cpuset),
#endif
@ -2555,6 +2630,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
REG("latency", S_IRUGO, lstats),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
REG("cpuset", S_IRUGO, cpuset),
#endif


@ -44,8 +44,8 @@
#define RLIMIT_NICE 13 /* max nice prio allowed to raise to
0-39 for nice level 19 .. -20 */
#define RLIMIT_RTPRIO 14 /* maximum realtime priority */
#define RLIM_NLIMITS 15
#define RLIMIT_RTTIME 15 /* timeout for RT tasks in us */
#define RLIM_NLIMITS 16
/*
* SuS says limits have to be unsigned.
@ -86,6 +86,7 @@
[RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \
[RLIMIT_NICE] = { 0, 0 }, \
[RLIMIT_RTPRIO] = { 0, 0 }, \
[RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \
}
#endif /* __KERNEL__ */


@ -132,6 +132,7 @@ static inline struct thread_info *current_thread_info(void)
#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */
#define TIF_SECCOMP 7 /* secure computing */
#define TIF_RESTORE_SIGMASK 8 /* restore signal mask in do_signal() */
#define TIF_HRTICK_RESCHED 9 /* reprogram hrtick timer */
#define TIF_MEMDIE 16
#define TIF_DEBUG 17 /* uses debug registers */
#define TIF_IO_BITMAP 18 /* uses I/O bitmap */
@ -147,6 +148,7 @@ static inline struct thread_info *current_thread_info(void)
#define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1<<TIF_SECCOMP)
#define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
#define _TIF_HRTICK_RESCHED (1<<TIF_HRTICK_RESCHED)
#define _TIF_DEBUG (1<<TIF_DEBUG)
#define _TIF_IO_BITMAP (1<<TIF_IO_BITMAP)
#define _TIF_FREEZE (1<<TIF_FREEZE)


@ -115,6 +115,7 @@ static inline struct thread_info *stack_thread_info(void)
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
#define TIF_HRTICK_RESCHED 11 /* reprogram hrtick timer */
/* 16 free */
#define TIF_IA32 17 /* 32bit process */
#define TIF_FORK 18 /* ret_from_fork */
@ -133,6 +134,7 @@ static inline struct thread_info *stack_thread_info(void)
#define _TIF_SECCOMP (1<<TIF_SECCOMP)
#define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
#define _TIF_MCE_NOTIFY (1<<TIF_MCE_NOTIFY)
#define _TIF_HRTICK_RESCHED (1<<TIF_HRTICK_RESCHED)
#define _TIF_IA32 (1<<TIF_IA32)
#define _TIF_FORK (1<<TIF_FORK)
#define _TIF_ABI_PENDING (1<<TIF_ABI_PENDING)
@ -146,6 +148,9 @@ static inline struct thread_info *stack_thread_info(void)
/* work to do on any return to user space */
#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
#define _TIF_DO_NOTIFY_MASK \
(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP)


@ -71,18 +71,27 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)
int cpu_up(unsigned int cpu);
extern void cpu_hotplug_init(void);
#else
static inline int register_cpu_notifier(struct notifier_block *nb)
{
return 0;
}
static inline void unregister_cpu_notifier(struct notifier_block *nb)
{
}
static inline void cpu_hotplug_init(void)
{
}
#endif /* CONFIG_SMP */
extern struct sysdev_class cpu_sysdev_class;
extern void cpu_maps_update_begin(void);
extern void cpu_maps_update_done(void);
#ifdef CONFIG_HOTPLUG_CPU
/* Stop CPUs going up and down. */
@ -97,8 +106,8 @@ static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex)
mutex_unlock(cpu_hp_mutex);
}
extern void lock_cpu_hotplug(void);
extern void unlock_cpu_hotplug(void);
extern void get_online_cpus(void);
extern void put_online_cpus(void);
#define hotcpu_notifier(fn, pri) { \
static struct notifier_block fn##_nb = \
{ .notifier_call = fn, .priority = pri }; \
@ -115,8 +124,8 @@ static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex)
static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex)
{ }
#define lock_cpu_hotplug() do { } while (0)
#define unlock_cpu_hotplug() do { } while (0)
#define get_online_cpus() do { } while (0)
#define put_online_cpus() do { } while (0)
#define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
/* These aren't inline functions due to a GCC bug. */
#define register_hotcpu_notifier(nb) ({ (void)(nb); 0; })


@ -47,6 +47,7 @@ struct task_struct;
#ifdef CONFIG_LOCKDEP
extern void debug_show_all_locks(void);
extern void __debug_show_held_locks(struct task_struct *task);
extern void debug_show_held_locks(struct task_struct *task);
extern void debug_check_no_locks_freed(const void *from, unsigned long len);
extern void debug_check_no_locks_held(struct task_struct *task);
@ -55,6 +56,10 @@ static inline void debug_show_all_locks(void)
{
}
static inline void __debug_show_held_locks(struct task_struct *task)
{
}
static inline void debug_show_held_locks(struct task_struct *task)
{
}


@ -1,8 +1,12 @@
#ifndef _LINUX_FUTEX_H
#define _LINUX_FUTEX_H
#include <linux/sched.h>
#include <linux/compiler.h>
#include <linux/types.h>
struct inode;
struct mm_struct;
struct task_struct;
union ktime;
/* Second argument to futex syscall */


@ -72,11 +72,7 @@
#define in_softirq() (softirq_count())
#define in_interrupt() (irq_count())
#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
# define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked())
#else
# define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
#endif
#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
#ifdef CONFIG_PREEMPT
# define PREEMPT_CHECK_OFFSET 1


@ -115,10 +115,8 @@ struct hrtimer {
enum hrtimer_restart (*function)(struct hrtimer *);
struct hrtimer_clock_base *base;
unsigned long state;
#ifdef CONFIG_HIGH_RES_TIMERS
enum hrtimer_cb_mode cb_mode;
struct list_head cb_entry;
#endif
#ifdef CONFIG_TIMER_STATS
void *start_site;
char start_comm[16];
@ -194,10 +192,10 @@ struct hrtimer_cpu_base {
spinlock_t lock;
struct lock_class_key lock_key;
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
struct list_head cb_pending;
#ifdef CONFIG_HIGH_RES_TIMERS
ktime_t expires_next;
int hres_active;
struct list_head cb_pending;
unsigned long nr_events;
#endif
};
@ -217,6 +215,11 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
return timer->base->get_time();
}
static inline int hrtimer_is_hres_active(struct hrtimer *timer)
{
return timer->base->cpu_base->hres_active;
}
/*
* The resolution of the clocks. The resolution value is returned in
* the clock_getres() system call to give application programmers an
@ -248,6 +251,10 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
return timer->base->softirq_time;
}
static inline int hrtimer_is_hres_active(struct hrtimer *timer)
{
return 0;
}
#endif
extern ktime_t ktime_get(void);
@ -310,6 +317,7 @@ extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
/* Soft interrupt function to run the hrtimer queues: */
extern void hrtimer_run_queues(void);
extern void hrtimer_run_pending(void);
/* Bootup initialization: */
extern void __init hrtimers_init(void);


@ -132,9 +132,12 @@ extern struct group_info init_groups;
.cpus_allowed = CPU_MASK_ALL, \
.mm = NULL, \
.active_mm = &init_mm, \
.run_list = LIST_HEAD_INIT(tsk.run_list), \
.rt = { \
.run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
.time_slice = HZ, \
.nr_cpus_allowed = NR_CPUS, \
}, \
.ioprio = 0, \
.time_slice = HZ, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
.ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \


@ -256,6 +256,7 @@ enum
#ifdef CONFIG_HIGH_RES_TIMERS
HRTIMER_SOFTIRQ,
#endif
RCU_SOFTIRQ, /* Preferably RCU should always be the last softirq */
};
/* softirq mask and active fields moved to irq_cpustat_t in


@ -29,6 +29,12 @@
# define SHIFT_HZ 9
#elif HZ >= 768 && HZ < 1536
# define SHIFT_HZ 10
#elif HZ >= 1536 && HZ < 3072
# define SHIFT_HZ 11
#elif HZ >= 3072 && HZ < 6144
# define SHIFT_HZ 12
#elif HZ >= 6144 && HZ < 12288
# define SHIFT_HZ 13
#else
# error You lose.
#endif


@ -105,8 +105,8 @@ struct user;
* supposed to.
*/
#ifdef CONFIG_PREEMPT_VOLUNTARY
extern int cond_resched(void);
# define might_resched() cond_resched()
extern int _cond_resched(void);
# define might_resched() _cond_resched()
#else
# define might_resched() do { } while (0)
#endif


@ -0,0 +1,44 @@
/*
* latencytop.h: Infrastructure for displaying latency
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
*/
#ifndef _INCLUDE_GUARD_LATENCYTOP_H_
#define _INCLUDE_GUARD_LATENCYTOP_H_
#ifdef CONFIG_LATENCYTOP
#define LT_SAVECOUNT 32
#define LT_BACKTRACEDEPTH 12
struct latency_record {
unsigned long backtrace[LT_BACKTRACEDEPTH];
unsigned int count;
unsigned long time;
unsigned long max;
};
struct task_struct;
void account_scheduler_latency(struct task_struct *task, int usecs, int inter);
void clear_all_latency_tracing(struct task_struct *p);
#else
static inline void
account_scheduler_latency(struct task_struct *task, int usecs, int inter)
{
}
static inline void clear_all_latency_tracing(struct task_struct *p)
{
}
#endif
#endif


@ -207,9 +207,7 @@ static inline int notifier_to_errno(int ret)
#define CPU_DOWN_PREPARE 0x0005 /* CPU (unsigned)v going down */
#define CPU_DOWN_FAILED 0x0006 /* CPU (unsigned)v NOT going down */
#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */
#define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */
#define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */
#define CPU_DYING 0x000A /* CPU (unsigned)v not running any task,
#define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task,
* not handling interrupts, soon dead */
/* Used for CPU hotplug events occurring while tasks are frozen due to a suspend


@ -0,0 +1,164 @@
/*
* Read-Copy Update mechanism for mutual exclusion (classic version)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright IBM Corporation, 2001
*
* Author: Dipankar Sarma <dipankar@in.ibm.com>
*
* Based on the original work by Paul McKenney <paulmck@us.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
* http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
* http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
*
*/
#ifndef __LINUX_RCUCLASSIC_H
#define __LINUX_RCUCLASSIC_H
#ifdef __KERNEL__
#include <linux/cache.h>
#include <linux/spinlock.h>
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
/* Global control variables for rcupdate callback mechanism. */
struct rcu_ctrlblk {
long cur; /* Current batch number. */
long completed; /* Number of the last completed batch */
int next_pending; /* Is the next batch already waiting? */
int signaled;
spinlock_t lock ____cacheline_internodealigned_in_smp;
cpumask_t cpumask; /* CPUs that need to switch in order */
/* for current batch to proceed. */
} ____cacheline_internodealigned_in_smp;
/* Is batch a before batch b ? */
static inline int rcu_batch_before(long a, long b)
{
return (a - b) < 0;
}
/* Is batch a after batch b ? */
static inline int rcu_batch_after(long a, long b)
{
return (a - b) > 0;
}
/*
* Per-CPU data for Read-Copy Update.
* nxtlist - new callbacks are added here
* curlist - current batch for which quiescent cycle started if any
*/
struct rcu_data {
/* 1) quiescent state handling : */
long quiescbatch; /* Batch # for grace period */
int passed_quiesc; /* User-mode/idle loop etc. */
int qs_pending; /* core waits for quiesc state */
/* 2) batch handling */
long batch; /* Batch # for current RCU batch */
struct rcu_head *nxtlist;
struct rcu_head **nxttail;
long qlen; /* # of queued callbacks */
struct rcu_head *curlist;
struct rcu_head **curtail;
struct rcu_head *donelist;
struct rcu_head **donetail;
long blimit; /* Upper limit on a processed batch */
int cpu;
struct rcu_head barrier;
};
DECLARE_PER_CPU(struct rcu_data, rcu_data);
DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
/*
* Increment the quiescent state counter.
* The counter is a bit degenerated: We do not need to know
* how many quiescent states passed, just if there was at least
* one since the start of the grace period. Thus just a flag.
*/
static inline void rcu_qsctr_inc(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
rdp->passed_quiesc = 1;
}
static inline void rcu_bh_qsctr_inc(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
rdp->passed_quiesc = 1;
}
extern int rcu_pending(int cpu);
extern int rcu_needs_cpu(int cpu);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern struct lockdep_map rcu_lock_map;
# define rcu_read_acquire() \
lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_)
# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_)
#else
# define rcu_read_acquire() do { } while (0)
# define rcu_read_release() do { } while (0)
#endif
#define __rcu_read_lock() \
do { \
preempt_disable(); \
__acquire(RCU); \
rcu_read_acquire(); \
} while (0)
#define __rcu_read_unlock() \
do { \
rcu_read_release(); \
__release(RCU); \
preempt_enable(); \
} while (0)
#define __rcu_read_lock_bh() \
do { \
local_bh_disable(); \
__acquire(RCU_BH); \
rcu_read_acquire(); \
} while (0)
#define __rcu_read_unlock_bh() \
do { \
rcu_read_release(); \
__release(RCU_BH); \
local_bh_enable(); \
} while (0)
#define __synchronize_sched() synchronize_rcu()
extern void __rcu_init(void);
extern void rcu_check_callbacks(int cpu, int user);
extern void rcu_restart_cpu(int cpu);
extern long rcu_batches_completed(void);
extern long rcu_batches_completed_bh(void);
#endif /* __KERNEL__ */
#endif /* __LINUX_RCUCLASSIC_H */


@ -15,7 +15,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2001
* Copyright IBM Corporation, 2001
*
* Author: Dipankar Sarma <dipankar@in.ibm.com>
*
@ -53,96 +53,18 @@ struct rcu_head {
void (*func)(struct rcu_head *head);
};
#ifdef CONFIG_CLASSIC_RCU
#include <linux/rcuclassic.h>
#else /* #ifdef CONFIG_CLASSIC_RCU */
#include <linux/rcupreempt.h>
#endif /* #else #ifdef CONFIG_CLASSIC_RCU */
#define RCU_HEAD_INIT { .next = NULL, .func = NULL }
#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
#define INIT_RCU_HEAD(ptr) do { \
(ptr)->next = NULL; (ptr)->func = NULL; \
} while (0)
/* Global control variables for rcupdate callback mechanism. */
struct rcu_ctrlblk {
long cur; /* Current batch number. */
long completed; /* Number of the last completed batch */
int next_pending; /* Is the next batch already waiting? */
int signaled;
spinlock_t lock ____cacheline_internodealigned_in_smp;
cpumask_t cpumask; /* CPUs that need to switch in order */
/* for current batch to proceed. */
} ____cacheline_internodealigned_in_smp;
/* Is batch a before batch b ? */
static inline int rcu_batch_before(long a, long b)
{
return (a - b) < 0;
}
/* Is batch a after batch b ? */
static inline int rcu_batch_after(long a, long b)
{
return (a - b) > 0;
}
/*
* Per-CPU data for Read-Copy Update.
* nxtlist - new callbacks are added here
* curlist - current batch for which quiescent cycle started if any
*/
struct rcu_data {
/* 1) quiescent state handling : */
long quiescbatch; /* Batch # for grace period */
int passed_quiesc; /* User-mode/idle loop etc. */
int qs_pending; /* core waits for quiesc state */
/* 2) batch handling */
long batch; /* Batch # for current RCU batch */
struct rcu_head *nxtlist;
struct rcu_head **nxttail;
long qlen; /* # of queued callbacks */
struct rcu_head *curlist;
struct rcu_head **curtail;
struct rcu_head *donelist;
struct rcu_head **donetail;
long blimit; /* Upper limit on a processed batch */
int cpu;
struct rcu_head barrier;
};
DECLARE_PER_CPU(struct rcu_data, rcu_data);
DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
/*
* Increment the quiescent state counter.
* The counter is a bit degenerated: We do not need to know
* how many quiescent states passed, just if there was at least
* one since the start of the grace period. Thus just a flag.
*/
static inline void rcu_qsctr_inc(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
rdp->passed_quiesc = 1;
}
static inline void rcu_bh_qsctr_inc(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
rdp->passed_quiesc = 1;
}
extern int rcu_pending(int cpu);
extern int rcu_needs_cpu(int cpu);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern struct lockdep_map rcu_lock_map;
# define rcu_read_acquire() lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_)
# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_)
#else
# define rcu_read_acquire() do { } while (0)
# define rcu_read_release() do { } while (0)
#endif
/**
* rcu_read_lock - mark the beginning of an RCU read-side critical section.
*
@ -172,24 +94,13 @@ extern struct lockdep_map rcu_lock_map;
*
* It is illegal to block while in an RCU read-side critical section.
*/
#define rcu_read_lock() \
do { \
preempt_disable(); \
__acquire(RCU); \
rcu_read_acquire(); \
} while(0)
#define rcu_read_lock() __rcu_read_lock()
/**
* rcu_read_unlock - marks the end of an RCU read-side critical section.
*
* See rcu_read_lock() for more information.
*/
#define rcu_read_unlock() \
do { \
rcu_read_release(); \
__release(RCU); \
preempt_enable(); \
} while(0)
/*
* So where is rcu_write_lock()? It does not exist, as there is no
@ -200,6 +111,7 @@ extern struct lockdep_map rcu_lock_map;
* used as well. RCU does not care how the writers keep out of each
* others' way, as long as they do so.
*/
#define rcu_read_unlock() __rcu_read_unlock()
/**
* rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section
@ -212,24 +124,14 @@ extern struct lockdep_map rcu_lock_map;
* can use just rcu_read_lock().
*
*/
#define rcu_read_lock_bh() \
do { \
local_bh_disable(); \
__acquire(RCU_BH); \
rcu_read_acquire(); \
} while(0)
#define rcu_read_lock_bh() __rcu_read_lock_bh()
/*
* rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section
*
* See rcu_read_lock_bh() for more information.
*/
#define rcu_read_unlock_bh() \
do { \
rcu_read_release(); \
__release(RCU_BH); \
local_bh_enable(); \
} while(0)
#define rcu_read_unlock_bh() __rcu_read_unlock_bh()
/*
* Prevent the compiler from merging or refetching accesses. The compiler
@ -293,21 +195,52 @@ extern struct lockdep_map rcu_lock_map;
* In "classic RCU", these two guarantees happen to be one and
* the same, but can differ in realtime RCU implementations.
*/
#define synchronize_sched() synchronize_rcu()
#define synchronize_sched() __synchronize_sched()
extern void rcu_init(void);
extern void rcu_check_callbacks(int cpu, int user);
extern void rcu_restart_cpu(int cpu);
/**
* call_rcu - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual update function to be invoked after the grace period
*
* The update function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*/
extern void call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *head));
/**
* call_rcu_bh - Queue an RCU callback for invocation after a quicker grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual update function to be invoked after the grace period
*
* The update function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_bh() assumes
* that the read-side critical sections end on completion of a softirq
* handler. This means that read-side critical sections in process
* context must not be interrupted by softirqs. This interface is to be
* used when most of the read-side critical sections are in softirq context.
* RCU read-side critical sections are delimited by :
* - rcu_read_lock() and rcu_read_unlock(), if in interrupt context.
* OR
* - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
* These may be nested.
*/
extern void call_rcu_bh(struct rcu_head *head,
void (*func)(struct rcu_head *head));
/* Exported common interfaces */
extern void synchronize_rcu(void);
extern void rcu_barrier(void);
extern long rcu_batches_completed(void);
extern long rcu_batches_completed_bh(void);
/* Exported interfaces */
extern void FASTCALL(call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *head)));
extern void FASTCALL(call_rcu_bh(struct rcu_head *head,
void (*func)(struct rcu_head *head)));
extern void synchronize_rcu(void);
extern void rcu_barrier(void);
/* Internal to kernel */
extern void rcu_init(void);
extern int rcu_needs_cpu(int cpu);
#endif /* __KERNEL__ */
#endif /* __LINUX_RCUPDATE_H */
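The call_rcu() kernel-doc above describes the asynchronous
deferred-free pattern. The sketch below shows a typical caller; struct
elem, elem_free_rcu() and elem_del() are hypothetical names, and the
element is assumed to sit on an RCU-protected list whose writers hold
their own lock:

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct elem {
        struct list_head list;
        struct rcu_head rcu;
        int key;
};

static void elem_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct elem, rcu));
}

/* Unlink under the writers' lock (not shown), then defer the kfree()
 * until all pre-existing RCU readers have finished. */
static void elem_del(struct elem *e)
{
        list_del_rcu(&e->list);
        call_rcu(&e->rcu, elem_free_rcu);
}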


@ -0,0 +1,86 @@
/*
* Read-Copy Update mechanism for mutual exclusion (RT implementation)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2006
*
* Author: Paul McKenney <paulmck@us.ibm.com>
*
* Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
* http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
* http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
*
*/
#ifndef __LINUX_RCUPREEMPT_H
#define __LINUX_RCUPREEMPT_H
#ifdef __KERNEL__
#include <linux/cache.h>
#include <linux/spinlock.h>
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
#define rcu_qsctr_inc(cpu)
#define rcu_bh_qsctr_inc(cpu)
#define call_rcu_bh(head, rcu) call_rcu(head, rcu)
extern void __rcu_read_lock(void);
extern void __rcu_read_unlock(void);
extern int rcu_pending(int cpu);
extern int rcu_needs_cpu(int cpu);
#define __rcu_read_lock_bh() { rcu_read_lock(); local_bh_disable(); }
#define __rcu_read_unlock_bh() { local_bh_enable(); rcu_read_unlock(); }
extern void __synchronize_sched(void);
extern void __rcu_init(void);
extern void rcu_check_callbacks(int cpu, int user);
extern void rcu_restart_cpu(int cpu);
extern long rcu_batches_completed(void);
/*
* Return the number of RCU batches processed thus far. Useful for debug
* and statistics. The _bh variant is identical to straight RCU.
*/
static inline long rcu_batches_completed_bh(void)
{
return rcu_batches_completed();
}
#ifdef CONFIG_RCU_TRACE
struct rcupreempt_trace;
extern long *rcupreempt_flipctr(int cpu);
extern long rcupreempt_data_completed(void);
extern int rcupreempt_flip_flag(int cpu);
extern int rcupreempt_mb_flag(int cpu);
extern char *rcupreempt_try_flip_state_name(void);
extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu);
#endif
struct softirq_action;
#endif /* __KERNEL__ */
#endif /* __LINUX_RCUPREEMPT_H */


@ -0,0 +1,99 @@
/*
* Read-Copy Update mechanism for mutual exclusion (RT implementation)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2006
*
* Author: Paul McKenney <paulmck@us.ibm.com>
*
* Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
* http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
* http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
*
* For detailed explanation of the Preemptible Read-Copy Update mechanism see -
* http://lwn.net/Articles/253651/
*/
#ifndef __LINUX_RCUPREEMPT_TRACE_H
#define __LINUX_RCUPREEMPT_TRACE_H
#ifdef __KERNEL__
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/atomic.h>
/*
* PREEMPT_RCU data structures.
*/
struct rcupreempt_trace {
long next_length;
long next_add;
long wait_length;
long wait_add;
long done_length;
long done_add;
long done_remove;
atomic_t done_invoked;
long rcu_check_callbacks;
atomic_t rcu_try_flip_1;
atomic_t rcu_try_flip_e1;
long rcu_try_flip_i1;
long rcu_try_flip_ie1;
long rcu_try_flip_g1;
long rcu_try_flip_a1;
long rcu_try_flip_ae1;
long rcu_try_flip_a2;
long rcu_try_flip_z1;
long rcu_try_flip_ze1;
long rcu_try_flip_z2;
long rcu_try_flip_m1;
long rcu_try_flip_me1;
long rcu_try_flip_m2;
};
#ifdef CONFIG_RCU_TRACE
#define RCU_TRACE(fn, arg) fn(arg);
#else
#define RCU_TRACE(fn, arg)
#endif
extern void rcupreempt_trace_move2done(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_invoke(struct rcupreempt_trace *trace);
extern void rcupreempt_trace_next_add(struct rcupreempt_trace *trace);
#endif /* __KERNEL__ */
#endif /* __LINUX_RCUPREEMPT_TRACE_H */


@ -78,7 +78,6 @@ struct sched_param {
#include <linux/proportions.h>
#include <linux/seccomp.h>
#include <linux/rcupdate.h>
#include <linux/futex.h>
#include <linux/rtmutex.h>
#include <linux/time.h>
@ -88,11 +87,13 @@ struct sched_param {
#include <linux/hrtimer.h>
#include <linux/task_io_accounting.h>
#include <linux/kobject.h>
#include <linux/latencytop.h>
#include <asm/processor.h>
struct exec_domain;
struct futex_pi_state;
struct robust_list_head;
struct bio;
/*
@ -230,6 +231,8 @@ static inline int select_nohz_load_balancer(int cpu)
}
#endif
extern unsigned long rt_needs_cpu(int cpu);
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
@ -257,13 +260,19 @@ extern void trap_init(void);
extern void account_process_tick(struct task_struct *task, int user);
extern void update_process_times(int user);
extern void scheduler_tick(void);
extern void hrtick_resched(void);
extern void sched_show_task(struct task_struct *p);
#ifdef CONFIG_DETECT_SOFTLOCKUP
extern void softlockup_tick(void);
extern void spawn_softlockup_task(void);
extern void touch_softlockup_watchdog(void);
extern void touch_all_softlockup_watchdogs(void);
extern int softlockup_thresh;
extern unsigned long softlockup_thresh;
extern unsigned long sysctl_hung_task_check_count;
extern unsigned long sysctl_hung_task_timeout_secs;
extern unsigned long sysctl_hung_task_warnings;
#else
static inline void softlockup_tick(void)
{
@ -822,6 +831,7 @@ struct sched_class {
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
void (*yield_task) (struct rq *rq);
int (*select_task_rq)(struct task_struct *p, int sync);
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
@ -837,11 +847,25 @@ struct sched_class {
int (*move_one_task) (struct rq *this_rq, int this_cpu,
struct rq *busiest, struct sched_domain *sd,
enum cpu_idle_type idle);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
#endif
void (*set_curr_task) (struct rq *rq);
void (*task_tick) (struct rq *rq, struct task_struct *p);
void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
void (*task_new) (struct rq *rq, struct task_struct *p);
void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask);
void (*join_domain)(struct rq *rq);
void (*leave_domain)(struct rq *rq);
void (*switched_from) (struct rq *this_rq, struct task_struct *task,
int running);
void (*switched_to) (struct rq *this_rq, struct task_struct *task,
int running);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
int oldprio, int running);
};
struct load_weight {
@ -871,6 +895,8 @@ struct sched_entity {
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 wait_max;
u64 wait_count;
u64 wait_sum;
u64 sleep_start;
u64 sleep_max;
@ -909,6 +935,21 @@ struct sched_entity {
#endif
};
struct sched_rt_entity {
struct list_head run_list;
unsigned int time_slice;
unsigned long timeout;
int nr_cpus_allowed;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_rt_entity *parent;
/* rq on which this entity is (to be) queued: */
struct rt_rq *rt_rq;
/* rq "owned" by this entity/group: */
struct rt_rq *my_q;
#endif
};
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@ -925,9 +966,9 @@ struct task_struct {
#endif
int prio, static_prio, normal_prio;
struct list_head run_list;
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
@ -951,7 +992,11 @@ struct task_struct {
unsigned int policy;
cpumask_t cpus_allowed;
unsigned int time_slice;
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
int rcu_flipctr_idx;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
@ -1041,6 +1086,11 @@ struct task_struct {
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
#ifdef CONFIG_DETECT_SOFTLOCKUP
/* hung task detection */
unsigned long last_switch_timestamp;
unsigned long last_switch_count;
#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
@ -1173,6 +1223,10 @@ struct task_struct {
int make_it_fail;
#endif
struct prop_local_single dirties;
#ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
};
/*
@ -1453,6 +1507,12 @@ extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_rt_period;
extern unsigned int sysctl_sched_rt_ratio;
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
extern unsigned int sysctl_sched_min_bal_int_shares;
extern unsigned int sysctl_sched_max_bal_int_shares;
#endif
int sched_nr_latency_handler(struct ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length,
@ -1845,7 +1905,18 @@ static inline int need_resched(void)
* cond_resched_lock() will drop the spinlock before scheduling,
* cond_resched_softirq() will enable bhs before scheduling.
*/
extern int cond_resched(void);
#ifdef CONFIG_PREEMPT
static inline int cond_resched(void)
{
return 0;
}
#else
extern int _cond_resched(void);
static inline int cond_resched(void)
{
return _cond_resched();
}
#endif
extern int cond_resched_lock(spinlock_t * lock);
extern int cond_resched_softirq(void);
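The hunk above turns cond_resched() into a compile-time no-op when CONFIG_PREEMPT is set, since involuntary preemption already bounds latency. A minimal sketch of the usual call site, assuming a hypothetical long-running loop:

#include <linux/sched.h>
#include <linux/list.h>

static void example_process_items(struct list_head *items)
{
	struct list_head *pos;

	list_for_each(pos, items) {
		/* ... expensive per-item work ... */

		/* Reschedules if needed; compiles to 0 under CONFIG_PREEMPT. */
		cond_resched();
	}
}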

View file

@ -17,22 +17,10 @@ extern void __lockfunc __release_kernel_lock(void);
__release_kernel_lock(); \
} while (0)
/*
* Non-SMP kernels will never block on the kernel lock,
* so we are better off returning a constant zero from
* reacquire_kernel_lock() so that the compiler can see
* it at compile-time.
*/
#if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_BKL)
# define return_value_on_smp return
#else
# define return_value_on_smp
#endif
static inline int reacquire_kernel_lock(struct task_struct *task)
{
if (unlikely(task->lock_depth >= 0))
return_value_on_smp __reacquire_kernel_lock();
return __reacquire_kernel_lock();
return 0;
}

View file

@ -9,10 +9,13 @@ struct stack_trace {
};
extern void save_stack_trace(struct stack_trace *trace);
extern void save_stack_trace_tsk(struct task_struct *tsk,
struct stack_trace *trace);
extern void print_stack_trace(struct stack_trace *trace, int spaces);
#else
# define save_stack_trace(trace) do { } while (0)
# define save_stack_trace_tsk(tsk, trace) do { } while (0)
# define print_stack_trace(trace, spaces) do { } while (0)
#endif
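print_stack_trace() is the new addition here; a minimal sketch of feeding it a trace captured with save_stack_trace(), assuming a hypothetical caller and an arbitrary depth:

#include <linux/stacktrace.h>

#define EXAMPLE_DEPTH 16

static void example_dump_current_stack(void)
{
	unsigned long entries[EXAMPLE_DEPTH];
	struct stack_trace trace = {
		.nr_entries	= 0,
		.max_entries	= EXAMPLE_DEPTH,
		.entries	= entries,
		.skip		= 0,		/* do not skip any top frames */
	};

	save_stack_trace(&trace);
	print_stack_trace(&trace, 0);		/* second argument: indentation */
}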

View file

@ -5,7 +5,7 @@
*
* Copyright (C) 2002, IBM Corp.
*
* All rights reserved.
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -103,6 +103,7 @@
.forkexec_idx = 0, \
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_NEWIDLE \
| SD_BALANCE_FORK \
| SD_BALANCE_EXEC \
| SD_WAKE_AFFINE \
| SD_WAKE_IDLE \
@ -134,6 +135,7 @@
.forkexec_idx = 1, \
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_NEWIDLE \
| SD_BALANCE_FORK \
| SD_BALANCE_EXEC \
| SD_WAKE_AFFINE \
| SD_WAKE_IDLE \
@ -165,6 +167,7 @@
.forkexec_idx = 1, \
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_NEWIDLE \
| SD_BALANCE_FORK \
| SD_BALANCE_EXEC \
| SD_WAKE_AFFINE \
| BALANCE_FOR_PKG_POWER,\

View file

@ -763,3 +763,31 @@ source "block/Kconfig"
config PREEMPT_NOTIFIERS
bool
choice
prompt "RCU implementation type:"
default CLASSIC_RCU
config CLASSIC_RCU
bool "Classic RCU"
help
This option selects the classic RCU implementation that is
designed for best read-side performance on non-realtime
systems.
Say Y if you are unsure.
config PREEMPT_RCU
bool "Preemptible RCU"
depends on PREEMPT
help
This option reduces the latency of the kernel by making certain
          RCU sections preemptible. Normally RCU code is non-preemptible; if
          this option is selected, then read-only RCU sections become
          preemptible. This helps latency, but may expose bugs due to
now-naive assumptions about each RCU read-side critical section
remaining on a given CPU through its execution.
Say N if you are unsure.
endchoice
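Both choices provide the same read-side API; what changes under PREEMPT_RCU is that the region between rcu_read_lock() and rcu_read_unlock() may be preempted and resume on another CPU. A minimal reader sketch, with a hypothetical RCU-protected pointer:

#include <linux/rcupdate.h>

struct example_cfg {
	int threshold;
};

static struct example_cfg *example_cfg_ptr;	/* updated with rcu_assign_pointer() */

static int example_read_threshold(void)
{
	struct example_cfg *cfg;
	int val = -1;

	rcu_read_lock();			/* preemptible if PREEMPT_RCU=y */
	cfg = rcu_dereference(example_cfg_ptr);
	if (cfg)
		val = cfg->threshold;
	rcu_read_unlock();

	return val;
}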

View file

@ -607,6 +607,7 @@ asmlinkage void __init start_kernel(void)
vfs_caches_init_early();
cpuset_init_early();
mem_init();
cpu_hotplug_init();
kmem_cache_init();
setup_per_cpu_pageset();
numa_policy_init();

View file

@ -54,3 +54,5 @@ config HZ
default 300 if HZ_300
default 1000 if HZ_1000
config SCHED_HRTICK
def_bool HIGH_RES_TIMERS && X86

View file

@ -52,14 +52,13 @@ config PREEMPT
endchoice
config PREEMPT_BKL
bool "Preempt The Big Kernel Lock"
depends on SMP || PREEMPT
config RCU_TRACE
bool "Enable tracing for RCU - currently stats in debugfs"
select DEBUG_FS
default y
help
This option reduces the latency of the kernel by making the
big kernel lock preemptible.
This option provides tracing in RCU which presents stats
          in debugfs for debugging the RCU implementation.
Say Y here if you are building a kernel for a desktop system.
          Say Y here if you want to enable RCU tracing.
Say N if you are unsure.

View file

@ -52,11 +52,17 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
ifeq ($(CONFIG_PREEMPT_RCU),y)
obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
endif
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_MARKERS) += marker.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is

View file

@ -15,9 +15,8 @@
#include <linux/stop_machine.h>
#include <linux/mutex.h>
/* This protects CPUs going up and down... */
/* Serializes the updates to cpu_online_map, cpu_present_map */
static DEFINE_MUTEX(cpu_add_remove_lock);
static DEFINE_MUTEX(cpu_bitmask_lock);
static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
*/
static int cpu_hotplug_disabled;
static struct {
struct task_struct *active_writer;
struct mutex lock; /* Synchronizes accesses to refcount, */
/*
* Also blocks the new readers during
* an ongoing cpu hotplug operation.
*/
int refcount;
wait_queue_head_t writer_queue;
} cpu_hotplug;
#define writer_exists() (cpu_hotplug.active_writer != NULL)
void __init cpu_hotplug_init(void)
{
cpu_hotplug.active_writer = NULL;
mutex_init(&cpu_hotplug.lock);
cpu_hotplug.refcount = 0;
init_waitqueue_head(&cpu_hotplug.writer_queue);
}
#ifdef CONFIG_HOTPLUG_CPU
/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
static struct task_struct *recursive;
static int recursive_depth;
void lock_cpu_hotplug(void)
void get_online_cpus(void)
{
struct task_struct *tsk = current;
if (tsk == recursive) {
static int warnings = 10;
if (warnings) {
printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
WARN_ON(1);
warnings--;
}
recursive_depth++;
might_sleep();
if (cpu_hotplug.active_writer == current)
return;
}
mutex_lock(&cpu_bitmask_lock);
recursive = tsk;
}
EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
void unlock_cpu_hotplug(void)
}
EXPORT_SYMBOL_GPL(get_online_cpus);
void put_online_cpus(void)
{
WARN_ON(recursive != current);
if (recursive_depth) {
recursive_depth--;
if (cpu_hotplug.active_writer == current)
return;
}
recursive = NULL;
mutex_unlock(&cpu_bitmask_lock);
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.refcount--;
if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
wake_up(&cpu_hotplug.writer_queue);
mutex_unlock(&cpu_hotplug.lock);
}
EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
EXPORT_SYMBOL_GPL(put_online_cpus);
#endif /* CONFIG_HOTPLUG_CPU */
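A minimal sketch of the reader side of the new interface, assuming a hypothetical caller that walks the online map and must not race with hot-unplug:

#include <linux/cpu.h>
#include <linux/cpumask.h>

static void example_visit_online_cpus(void)
{
	int cpu;

	get_online_cpus();	/* holds off cpu_hotplug_begin() until released */
	for_each_online_cpu(cpu) {
		/* ... per-CPU work that must not race with CPU removal ... */
	}
	put_online_cpus();
}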
/*
 * The following two APIs must be used when attempting
 * to serialize the updates to cpu_online_map and cpu_present_map.
*/
void cpu_maps_update_begin(void)
{
mutex_lock(&cpu_add_remove_lock);
}
void cpu_maps_update_done(void)
{
mutex_unlock(&cpu_add_remove_lock);
}
/*
* This ensures that the hotplug operation can begin only when the
* refcount goes to zero.
*
* Note that during a cpu-hotplug operation, the new readers, if any,
* will be blocked by the cpu_hotplug.lock
*
 * Since cpu_hotplug_begin() is always called after invoking
 * cpu_maps_update_begin(), we can be sure that only one writer is active.
*
* Note that theoretically, there is a possibility of a livelock:
* - Refcount goes to zero, last reader wakes up the sleeping
* writer.
* - Last reader unlocks the cpu_hotplug.lock.
* - A new reader arrives at this moment, bumps up the refcount.
* - The writer acquires the cpu_hotplug.lock finds the refcount
* non zero and goes to sleep again.
*
* However, this is very difficult to achieve in practice since
 * get_online_cpus() is not an API that is called all that often.
*
*/
static void cpu_hotplug_begin(void)
{
DECLARE_WAITQUEUE(wait, current);
mutex_lock(&cpu_hotplug.lock);
cpu_hotplug.active_writer = current;
add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait);
while (cpu_hotplug.refcount) {
set_current_state(TASK_UNINTERRUPTIBLE);
mutex_unlock(&cpu_hotplug.lock);
schedule();
mutex_lock(&cpu_hotplug.lock);
}
remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
}
static void cpu_hotplug_done(void)
{
cpu_hotplug.active_writer = NULL;
mutex_unlock(&cpu_hotplug.lock);
}
/* Need to know about CPUs going up/down? */
int __cpuinit register_cpu_notifier(struct notifier_block *nb)
{
int ret;
mutex_lock(&cpu_add_remove_lock);
cpu_maps_update_begin();
ret = raw_notifier_chain_register(&cpu_chain, nb);
mutex_unlock(&cpu_add_remove_lock);
cpu_maps_update_done();
return ret;
}
@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier);
void unregister_cpu_notifier(struct notifier_block *nb)
{
mutex_lock(&cpu_add_remove_lock);
cpu_maps_update_begin();
raw_notifier_chain_unregister(&cpu_chain, nb);
mutex_unlock(&cpu_add_remove_lock);
cpu_maps_update_done();
}
EXPORT_SYMBOL(unregister_cpu_notifier);
@ -147,7 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
if (!cpu_online(cpu))
return -EINVAL;
raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
cpu_hotplug_begin();
err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
hcpu, -1, &nr_calls);
if (err == NOTIFY_BAD) {
@ -166,9 +236,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
cpu_clear(cpu, tmp);
set_cpus_allowed(current, tmp);
mutex_lock(&cpu_bitmask_lock);
p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
mutex_unlock(&cpu_bitmask_lock);
if (IS_ERR(p) || cpu_online(cpu)) {
/* CPU didn't die: tell everyone. Can't complain. */
@ -202,7 +270,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
out_allowed:
set_cpus_allowed(current, old_allowed);
out_release:
raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
cpu_hotplug_done();
return err;
}
@ -210,13 +278,13 @@ int cpu_down(unsigned int cpu)
{
int err = 0;
mutex_lock(&cpu_add_remove_lock);
cpu_maps_update_begin();
if (cpu_hotplug_disabled)
err = -EBUSY;
else
err = _cpu_down(cpu, 0);
mutex_unlock(&cpu_add_remove_lock);
cpu_maps_update_done();
return err;
}
#endif /*CONFIG_HOTPLUG_CPU*/
@ -231,7 +299,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
if (cpu_online(cpu) || !cpu_present(cpu))
return -EINVAL;
raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
cpu_hotplug_begin();
ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
-1, &nr_calls);
if (ret == NOTIFY_BAD) {
@ -243,9 +311,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
}
/* Arch-specific enabling code. */
mutex_lock(&cpu_bitmask_lock);
ret = __cpu_up(cpu);
mutex_unlock(&cpu_bitmask_lock);
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
@ -257,7 +323,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
if (ret != 0)
__raw_notifier_call_chain(&cpu_chain,
CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
cpu_hotplug_done();
return ret;
}
@ -275,13 +341,13 @@ int __cpuinit cpu_up(unsigned int cpu)
return -EINVAL;
}
mutex_lock(&cpu_add_remove_lock);
cpu_maps_update_begin();
if (cpu_hotplug_disabled)
err = -EBUSY;
else
err = _cpu_up(cpu, 0);
mutex_unlock(&cpu_add_remove_lock);
cpu_maps_update_done();
return err;
}
@ -292,7 +358,7 @@ int disable_nonboot_cpus(void)
{
int cpu, first_cpu, error = 0;
mutex_lock(&cpu_add_remove_lock);
cpu_maps_update_begin();
first_cpu = first_cpu(cpu_online_map);
/* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
@ -319,7 +385,7 @@ int disable_nonboot_cpus(void)
} else {
printk(KERN_ERR "Non-boot CPUs are not disabled\n");
}
mutex_unlock(&cpu_add_remove_lock);
cpu_maps_update_done();
return error;
}
@ -328,7 +394,7 @@ void enable_nonboot_cpus(void)
int cpu, error;
/* Allow everyone to use the CPU hotplug again */
mutex_lock(&cpu_add_remove_lock);
cpu_maps_update_begin();
cpu_hotplug_disabled = 0;
if (cpus_empty(frozen_cpus))
goto out;
@ -344,6 +410,6 @@ void enable_nonboot_cpus(void)
}
cpus_clear(frozen_cpus);
out:
mutex_unlock(&cpu_add_remove_lock);
cpu_maps_update_done();
}
#endif /* CONFIG_PM_SLEEP_SMP */

View file

@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
*
* Call with cgroup_mutex held. May take callback_mutex during
* call due to the kfifo_alloc() and kmalloc() calls. May nest
* a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
* a call to the get_online_cpus()/put_online_cpus() pair.
* Must not be called holding callback_mutex, because we must not
* call lock_cpu_hotplug() while holding callback_mutex. Elsewhere
* the kernel nests callback_mutex inside lock_cpu_hotplug() calls.
* call get_online_cpus() while holding callback_mutex. Elsewhere
* the kernel nests callback_mutex inside get_online_cpus() calls.
* So the reverse nesting would risk an ABBA deadlock.
*
* The three key local variables below are:
@ -691,9 +691,9 @@ static void rebuild_sched_domains(void)
rebuild:
/* Have scheduler rebuild sched domains */
lock_cpu_hotplug();
get_online_cpus();
partition_sched_domains(ndoms, doms);
unlock_cpu_hotplug();
put_online_cpus();
done:
if (q && !IS_ERR(q))
@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create(
*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
* will call rebuild_sched_domains(). The lock_cpu_hotplug()
* will call rebuild_sched_domains(). The get_online_cpus()
* call in rebuild_sched_domains() must not be made while holding
* callback_mutex. Elsewhere the kernel nests callback_mutex inside
* lock_cpu_hotplug() calls. So the reverse nesting would risk an
* get_online_cpus() calls. So the reverse nesting would risk an
* ABBA deadlock.
*/
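The comment above fixes the nesting order: callback_mutex is only ever taken inside a get_online_cpus() section, never around one. A schematic sketch of the legal ordering, assuming the code lives in kernel/cpuset.c where callback_mutex is visible and with a hypothetical update body:

static void example_cpuset_update(void)
{
	get_online_cpus();			/* outer: CPU hotplug read side */
	mutex_lock(&callback_mutex);		/* inner: cpuset callback_mutex */

	/* ... read or modify cpuset state here ... */

	mutex_unlock(&callback_mutex);
	put_online_cpus();	/* reversing this nesting risks the ABBA deadlock */
}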

View file

@ -1045,6 +1045,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
copy_flags(clone_flags, p);
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
#ifdef CONFIG_PREEMPT_RCU
p->rcu_read_lock_nesting = 0;
p->rcu_flipctr_idx = 0;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
@ -1059,6 +1063,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->prev_utime = cputime_zero;
p->prev_stime = cputime_zero;
#ifdef CONFIG_DETECT_SOFTLOCKUP
p->last_switch_count = 0;
p->last_switch_timestamp = 0;
#endif
#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
p->wchar = 0; /* I/O counter: bytes written */
@ -1196,6 +1205,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
clear_all_latency_tracing(p);
/* Our parent execution domain becomes current domain
These must match for thread signalling to apply */
@ -1237,6 +1247,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 * parent's CPU). This avoids a lot of nasty races.
*/
p->cpus_allowed = current->cpus_allowed;
p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
!cpu_online(task_cpu(p))))
set_task_cpu(p, smp_processor_id());

View file

@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div)
}
#endif /* BITS_PER_LONG >= 64 */
/*
* Check, whether the timer is on the callback pending list
*/
static inline int hrtimer_cb_pending(const struct hrtimer *timer)
{
return timer->state & HRTIMER_STATE_PENDING;
}
/*
* Remove a timer from the callback pending list
*/
static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
{
list_del_init(&timer->cb_entry);
}
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
@ -493,22 +509,6 @@ void hres_timers_resume(void)
retrigger_next_event(NULL);
}
/*
* Check, whether the timer is on the callback pending list
*/
static inline int hrtimer_cb_pending(const struct hrtimer *timer)
{
return timer->state & HRTIMER_STATE_PENDING;
}
/*
* Remove a timer from the callback pending list
*/
static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
{
list_del_init(&timer->cb_entry);
}
/*
* Initialize the high resolution related parts of cpu_base
*/
@ -516,7 +516,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
{
base->expires_next.tv64 = KTIME_MAX;
base->hres_active = 0;
INIT_LIST_HEAD(&base->cb_pending);
}
/*
@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
*/
static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
{
INIT_LIST_HEAD(&timer->cb_entry);
}
/*
@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
{
return 0;
}
static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
static inline int hrtimer_reprogram(struct hrtimer *timer,
struct hrtimer_clock_base *base)
{
return 0;
}
#endif /* CONFIG_HIGH_RES_TIMERS */
@ -1001,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
clock_id = CLOCK_MONOTONIC;
timer->base = &cpu_base->clock_base[clock_id];
INIT_LIST_HEAD(&timer->cb_entry);
hrtimer_init_timer_hres(timer);
#ifdef CONFIG_TIMER_STATS
@ -1030,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
}
EXPORT_SYMBOL_GPL(hrtimer_get_res);
static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
{
spin_lock_irq(&cpu_base->lock);
while (!list_empty(&cpu_base->cb_pending)) {
enum hrtimer_restart (*fn)(struct hrtimer *);
struct hrtimer *timer;
int restart;
timer = list_entry(cpu_base->cb_pending.next,
struct hrtimer, cb_entry);
timer_stats_account_hrtimer(timer);
fn = timer->function;
__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
spin_unlock_irq(&cpu_base->lock);
restart = fn(timer);
spin_lock_irq(&cpu_base->lock);
timer->state &= ~HRTIMER_STATE_CALLBACK;
if (restart == HRTIMER_RESTART) {
BUG_ON(hrtimer_active(timer));
/*
* Enqueue the timer, allow reprogramming of the event
* device
*/
enqueue_hrtimer(timer, timer->base, 1);
} else if (hrtimer_active(timer)) {
/*
* If the timer was rearmed on another CPU, reprogram
* the event device.
*/
if (timer->base->first == &timer->node)
hrtimer_reprogram(timer, timer->base);
}
}
spin_unlock_irq(&cpu_base->lock);
}
static void __run_hrtimer(struct hrtimer *timer)
{
struct hrtimer_clock_base *base = timer->base;
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
timer_stats_account_hrtimer(timer);
fn = timer->function;
if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
/*
* Used for scheduler timers, avoid lock inversion with
* rq->lock and tasklist_lock.
*
* These timers are required to deal with enqueue expiry
* themselves and are not allowed to migrate.
*/
spin_unlock(&cpu_base->lock);
restart = fn(timer);
spin_lock(&cpu_base->lock);
} else
restart = fn(timer);
/*
* Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
* reprogramming of the event hardware. This happens at the end of this
* function anyway.
*/
if (restart != HRTIMER_NORESTART) {
BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
enqueue_hrtimer(timer, base, 0);
}
timer->state &= ~HRTIMER_STATE_CALLBACK;
}
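__run_hrtimer() only drops the base lock for HRTIMER_CB_IRQSAFE_NO_SOFTIRQ callbacks; HRTIMER_CB_SOFTIRQ timers are instead queued on cb_pending and run from run_hrtimer_pending() above. A minimal sketch of arming such a timer, with hypothetical names and an arbitrary one-second relative expiry:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer;

static enum hrtimer_restart example_timer_fn(struct hrtimer *timer)
{
	/* ... deferred work, runs from the softirq pending list ... */
	return HRTIMER_NORESTART;		/* do not re-arm */
}

static void example_arm_timer(void)
{
	hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	example_timer.function = example_timer_fn;
#ifdef CONFIG_HIGH_RES_TIMERS
	example_timer.cb_mode = HRTIMER_CB_SOFTIRQ;	/* run via cb_pending */
#endif
	hrtimer_start(&example_timer, ktime_set(1, 0), HRTIMER_MODE_REL);
}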
#ifdef CONFIG_HIGH_RES_TIMERS
/*
@ -1087,21 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
continue;
}
__remove_hrtimer(timer, base,
HRTIMER_STATE_CALLBACK, 0);
timer_stats_account_hrtimer(timer);
/*
* Note: We clear the CALLBACK bit after
* enqueue_hrtimer to avoid reprogramming of
* the event hardware. This happens at the end
* of this function anyway.
*/
if (timer->function(timer) != HRTIMER_NORESTART) {
BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
enqueue_hrtimer(timer, base, 0);
}
timer->state &= ~HRTIMER_STATE_CALLBACK;
__run_hrtimer(timer);
}
spin_unlock(&cpu_base->lock);
base++;
@ -1122,98 +1189,11 @@ void hrtimer_interrupt(struct clock_event_device *dev)
static void run_hrtimer_softirq(struct softirq_action *h)
{
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
spin_lock_irq(&cpu_base->lock);
while (!list_empty(&cpu_base->cb_pending)) {
enum hrtimer_restart (*fn)(struct hrtimer *);
struct hrtimer *timer;
int restart;
timer = list_entry(cpu_base->cb_pending.next,
struct hrtimer, cb_entry);
timer_stats_account_hrtimer(timer);
fn = timer->function;
__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
spin_unlock_irq(&cpu_base->lock);
restart = fn(timer);
spin_lock_irq(&cpu_base->lock);
timer->state &= ~HRTIMER_STATE_CALLBACK;
if (restart == HRTIMER_RESTART) {
BUG_ON(hrtimer_active(timer));
/*
* Enqueue the timer, allow reprogramming of the event
* device
*/
enqueue_hrtimer(timer, timer->base, 1);
} else if (hrtimer_active(timer)) {
/*
* If the timer was rearmed on another CPU, reprogram
* the event device.
*/
if (timer->base->first == &timer->node)
hrtimer_reprogram(timer, timer->base);
}
}
spin_unlock_irq(&cpu_base->lock);
run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
}
#endif /* CONFIG_HIGH_RES_TIMERS */
/*
* Expire the per base hrtimer-queue:
*/
static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
int index)
{
struct rb_node *node;
struct hrtimer_clock_base *base = &cpu_base->clock_base[index];
if (!base->first)
return;
if (base->get_softirq_time)
base->softirq_time = base->get_softirq_time();
spin_lock_irq(&cpu_base->lock);
while ((node = base->first)) {
struct hrtimer *timer;
enum hrtimer_restart (*fn)(struct hrtimer *);
int restart;
timer = rb_entry(node, struct hrtimer, node);
if (base->softirq_time.tv64 <= timer->expires.tv64)
break;
#ifdef CONFIG_HIGH_RES_TIMERS
WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ);
#endif
timer_stats_account_hrtimer(timer);
fn = timer->function;
__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
spin_unlock_irq(&cpu_base->lock);
restart = fn(timer);
spin_lock_irq(&cpu_base->lock);
timer->state &= ~HRTIMER_STATE_CALLBACK;
if (restart != HRTIMER_NORESTART) {
BUG_ON(hrtimer_active(timer));
enqueue_hrtimer(timer, base, 0);
}
}
spin_unlock_irq(&cpu_base->lock);
}
/*
* Called from timer softirq every jiffy, expire hrtimers:
*
@ -1221,10 +1201,9 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
* softirq context in case the hrtimer initialization failed or has
* not been done yet.
*/
void hrtimer_run_queues(void)
void hrtimer_run_pending(void)
{
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
int i;
if (hrtimer_hres_active())
return;
@ -1238,8 +1217,54 @@ void hrtimer_run_queues(void)
* deadlock vs. xtime_lock.
*/
if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
if (hrtimer_switch_to_hres())
return;
hrtimer_switch_to_hres();
run_hrtimer_pending(cpu_base);
}
/*
* Called from hardirq context every jiffy
*/
static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
int index)
{
struct rb_node *node;
struct hrtimer_clock_base *base = &cpu_base->clock_base[index];
if (!base->first)
return;
if (base->get_softirq_time)
base->softirq_time = base->get_softirq_time();
spin_lock(&cpu_base->lock);
while ((node = base->first)) {
struct hrtimer *timer;
timer = rb_entry(node, struct hrtimer, node);
if (base->softirq_time.tv64 <= timer->expires.tv64)
break;
if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
__remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
list_add_tail(&timer->cb_entry,
&base->cpu_base->cb_pending);
continue;
}
__run_hrtimer(timer);
}
spin_unlock(&cpu_base->lock);
}
void hrtimer_run_queues(void)
{
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
int i;
if (hrtimer_hres_active())
return;
hrtimer_get_softirq_time(cpu_base);
@ -1268,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
sl->timer.function = hrtimer_wakeup;
sl->task = task;
#ifdef CONFIG_HIGH_RES_TIMERS
sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
#endif
}
@ -1279,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
do {
set_current_state(TASK_INTERRUPTIBLE);
hrtimer_start(&t->timer, t->timer.expires, mode);
if (!hrtimer_active(&t->timer))
t->task = NULL;
if (likely(t->task))
schedule();
@ -1389,6 +1416,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
cpu_base->clock_base[i].cpu_base = cpu_base;
INIT_LIST_HEAD(&cpu_base->cb_pending);
hrtimer_init_hres(cpu_base);
}

View file

@ -15,6 +15,8 @@
#include <linux/mutex.h>
#include <asm/semaphore.h>
#define KTHREAD_NICE_LEVEL (-5)
static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create)
if (pid < 0) {
create->result = ERR_PTR(pid);
} else {
struct sched_param param = { .sched_priority = 0 };
wait_for_completion(&create->started);
read_lock(&tasklist_lock);
create->result = find_task_by_pid(pid);
read_unlock(&tasklist_lock);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
*/
sched_setscheduler(create->result, SCHED_NORMAL, &param);
set_user_nice(create->result, KTHREAD_NICE_LEVEL);
set_cpus_allowed(create->result, CPU_MASK_ALL);
}
complete(&create->done);
}
@ -221,7 +231,7 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
set_user_nice(tsk, -5);
set_user_nice(tsk, KTHREAD_NICE_LEVEL);
set_cpus_allowed(tsk, CPU_MASK_ALL);
current->flags |= PF_NOFREEZE;
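With this change, a thread spawned via kthreadd always starts out SCHED_NORMAL at KTHREAD_NICE_LEVEL with a full CPU mask, regardless of what root has done to kthreadd itself. A minimal caller sketch, with hypothetical names:

#include <linux/kthread.h>
#include <linux/sched.h>

static int example_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* ... background work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

static struct task_struct *example_start_thread(void)
{
	/* The new thread gets SCHED_NORMAL / nice -5, not kthreadd's settings. */
	return kthread_run(example_thread_fn, NULL, "example_kthread");
}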

239
kernel/latencytop.c Normal file
View file

@ -0,0 +1,239 @@
/*
* latencytop.c: Latency display infrastructure
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include <linux/latencytop.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
static DEFINE_SPINLOCK(latency_lock);
#define MAXLR 128
static struct latency_record latency_record[MAXLR];
int latencytop_enabled;
void clear_all_latency_tracing(struct task_struct *p)
{
unsigned long flags;
if (!latencytop_enabled)
return;
spin_lock_irqsave(&latency_lock, flags);
memset(&p->latency_record, 0, sizeof(p->latency_record));
p->latency_record_count = 0;
spin_unlock_irqrestore(&latency_lock, flags);
}
static void clear_global_latency_tracing(void)
{
unsigned long flags;
spin_lock_irqsave(&latency_lock, flags);
memset(&latency_record, 0, sizeof(latency_record));
spin_unlock_irqrestore(&latency_lock, flags);
}
static void __sched
account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
{
int firstnonnull = MAXLR + 1;
int i;
if (!latencytop_enabled)
return;
/* skip kernel threads for now */
if (!tsk->mm)
return;
for (i = 0; i < MAXLR; i++) {
int q;
int same = 1;
/* Nothing stored: */
if (!latency_record[i].backtrace[0]) {
if (firstnonnull > i)
firstnonnull = i;
continue;
}
for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
if (latency_record[i].backtrace[q] !=
lat->backtrace[q])
same = 0;
if (same && lat->backtrace[q] == 0)
break;
if (same && lat->backtrace[q] == ULONG_MAX)
break;
}
if (same) {
latency_record[i].count++;
latency_record[i].time += lat->time;
if (lat->time > latency_record[i].max)
latency_record[i].max = lat->time;
return;
}
}
i = firstnonnull;
if (i >= MAXLR - 1)
return;
	/* Allocated a new one: */
memcpy(&latency_record[i], lat, sizeof(struct latency_record));
}
static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
{
struct stack_trace trace;
memset(&trace, 0, sizeof(trace));
trace.max_entries = LT_BACKTRACEDEPTH;
trace.entries = &lat->backtrace[0];
trace.skip = 0;
save_stack_trace_tsk(tsk, &trace);
}
void __sched
account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
{
unsigned long flags;
int i, q;
struct latency_record lat;
if (!latencytop_enabled)
return;
/* Long interruptible waits are generally user requested... */
if (inter && usecs > 5000)
return;
memset(&lat, 0, sizeof(lat));
lat.count = 1;
lat.time = usecs;
lat.max = usecs;
store_stacktrace(tsk, &lat);
spin_lock_irqsave(&latency_lock, flags);
account_global_scheduler_latency(tsk, &lat);
/*
	 * Short-term hack: if we are over 32 records we stop; in the future we will recycle them:
*/
tsk->latency_record_count++;
if (tsk->latency_record_count >= LT_SAVECOUNT)
goto out_unlock;
for (i = 0; i < LT_SAVECOUNT ; i++) {
struct latency_record *mylat;
int same = 1;
mylat = &tsk->latency_record[i];
for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
if (mylat->backtrace[q] !=
lat.backtrace[q])
same = 0;
if (same && lat.backtrace[q] == 0)
break;
if (same && lat.backtrace[q] == ULONG_MAX)
break;
}
if (same) {
mylat->count++;
mylat->time += lat.time;
if (lat.time > mylat->max)
mylat->max = lat.time;
goto out_unlock;
}
}
/* Allocated a new one: */
i = tsk->latency_record_count;
memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
out_unlock:
spin_unlock_irqrestore(&latency_lock, flags);
}
static int lstats_show(struct seq_file *m, void *v)
{
int i;
seq_puts(m, "Latency Top version : v0.1\n");
for (i = 0; i < MAXLR; i++) {
if (latency_record[i].backtrace[0]) {
int q;
seq_printf(m, "%i %li %li ",
latency_record[i].count,
latency_record[i].time,
latency_record[i].max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
char sym[KSYM_NAME_LEN];
char *c;
if (!latency_record[i].backtrace[q])
break;
if (latency_record[i].backtrace[q] == ULONG_MAX)
break;
sprint_symbol(sym, latency_record[i].backtrace[q]);
c = strchr(sym, '+');
if (c)
*c = 0;
seq_printf(m, "%s ", sym);
}
seq_printf(m, "\n");
}
}
return 0;
}
static ssize_t
lstats_write(struct file *file, const char __user *buf, size_t count,
loff_t *offs)
{
clear_global_latency_tracing();
return count;
}
static int lstats_open(struct inode *inode, struct file *filp)
{
return single_open(filp, lstats_show, NULL);
}
static struct file_operations lstats_fops = {
.open = lstats_open,
.read = seq_read,
.write = lstats_write,
.llseek = seq_lseek,
.release = single_release,
};
static int __init init_lstats_procfs(void)
{
struct proc_dir_entry *pe;
pe = create_proc_entry("latency_stats", 0644, NULL);
if (!pe)
return -ENOMEM;
pe->proc_fops = &lstats_fops;
return 0;
}
__initcall(init_lstats_procfs);
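init_lstats_procfs() exposes the global records as /proc/latency_stats: a version header followed by one "count time max backtrace..." line per record, and any write to the file clears the records. A small user-space sketch for dumping it (hypothetical, minimal error handling):

#include <stdio.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/latency_stats", "r");

	if (!f)
		return 1;	/* kernel likely built without CONFIG_LATENCYTOP */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}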

View file

@ -3206,7 +3206,11 @@ void debug_show_all_locks(void)
EXPORT_SYMBOL_GPL(debug_show_all_locks);
void debug_show_held_locks(struct task_struct *task)
/*
* Careful: only use this function if you are sure that
* the task cannot run in parallel!
*/
void __debug_show_held_locks(struct task_struct *task)
{
if (unlikely(!debug_locks)) {
printk("INFO: lockdep is turned off.\n");
@ -3214,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task)
}
lockdep_print_held_locks(task);
}
EXPORT_SYMBOL_GPL(__debug_show_held_locks);
void debug_show_held_locks(struct task_struct *task)
{
__debug_show_held_locks(task);
}
EXPORT_SYMBOL_GPL(debug_show_held_locks);

View file

@ -496,6 +496,8 @@ static struct module_attribute modinfo_##field = { \
MODINFO_ATTR(version);
MODINFO_ATTR(srcversion);
static char last_unloaded_module[MODULE_NAME_LEN+1];
#ifdef CONFIG_MODULE_UNLOAD
/* Init the unload section of the module. */
static void module_unload_init(struct module *mod)
@ -719,6 +721,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
mod->exit();
mutex_lock(&module_mutex);
}
/* Store the name of the last unloaded module for diagnostic purposes */
sprintf(last_unloaded_module, mod->name);
free_module(mod);
out:
@ -2357,21 +2361,30 @@ static void m_stop(struct seq_file *m, void *p)
mutex_unlock(&module_mutex);
}
static char *taint_flags(unsigned int taints, char *buf)
static char *module_flags(struct module *mod, char *buf)
{
int bx = 0;
if (taints) {
if (mod->taints ||
mod->state == MODULE_STATE_GOING ||
mod->state == MODULE_STATE_COMING) {
buf[bx++] = '(';
if (taints & TAINT_PROPRIETARY_MODULE)
if (mod->taints & TAINT_PROPRIETARY_MODULE)
buf[bx++] = 'P';
if (taints & TAINT_FORCED_MODULE)
if (mod->taints & TAINT_FORCED_MODULE)
buf[bx++] = 'F';
/*
* TAINT_FORCED_RMMOD: could be added.
* TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
* apply to modules.
*/
/* Show a - for module-is-being-unloaded */
if (mod->state == MODULE_STATE_GOING)
buf[bx++] = '-';
/* Show a + for module-is-being-loaded */
if (mod->state == MODULE_STATE_COMING)
buf[bx++] = '+';
buf[bx++] = ')';
}
buf[bx] = '\0';
@ -2398,7 +2411,7 @@ static int m_show(struct seq_file *m, void *p)
/* Taints info */
if (mod->taints)
seq_printf(m, " %s", taint_flags(mod->taints, buf));
seq_printf(m, " %s", module_flags(mod, buf));
seq_printf(m, "\n");
return 0;
@ -2493,7 +2506,9 @@ void print_modules(void)
printk("Modules linked in:");
list_for_each_entry(mod, &modules, list)
printk(" %s%s", mod->name, taint_flags(mod->taints, buf));
printk(" %s%s", mod->name, module_flags(mod, buf));
if (last_unloaded_module[0])
printk(" [last unloaded: %s]", last_unloaded_module);
printk("\n");
}

View file

@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk,
{
int maxfire;
struct list_head *timers = tsk->cpu_timers;
struct signal_struct *const sig = tsk->signal;
maxfire = 20;
tsk->it_prof_expires = cputime_zero;
@ -1011,6 +1012,35 @@ static void check_thread_timers(struct task_struct *tsk,
t->firing = 1;
list_move_tail(&t->entry, firing);
}
/*
* Check for the special case thread timers.
*/
if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) {
unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max;
unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur;
if (hard != RLIM_INFINITY &&
tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
/*
* At the hard limit, we just die.
* No need to calculate anything else now.
*/
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) {
/*
* At the soft limit, send a SIGXCPU every second.
*/
if (sig->rlim[RLIMIT_RTTIME].rlim_cur
< sig->rlim[RLIMIT_RTTIME].rlim_max) {
sig->rlim[RLIMIT_RTTIME].rlim_cur +=
USEC_PER_SEC;
}
__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
}
}
}
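The checks above implement the new RLIMIT_RTTIME resource: rt.timeout accumulates scheduler ticks of real-time runtime without sleeping, the soft limit raises SIGXCPU roughly once per second (the code bumps rlim_cur by USEC_PER_SEC each time), and the hard limit kills the task. A user-space sketch of setting the limit in microseconds, assuming the libc headers already define RLIMIT_RTTIME (values arbitrary):

#include <sys/resource.h>

static int example_cap_rt_runtime(void)
{
	struct rlimit rl = {
		.rlim_cur = 500000,	/* soft: SIGXCPU after ~0.5 s of RT CPU time */
		.rlim_max = 2000000,	/* hard: SIGKILL after ~2 s */
	};

	return setrlimit(RLIMIT_RTTIME, &rl);
}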
/*

View file

@ -573,11 +573,6 @@ static int __init printk_time_setup(char *str)
__setup("time", printk_time_setup);
__attribute__((weak)) unsigned long long printk_clock(void)
{
return sched_clock();
}
/* Check if we have any console registered that can be called early in boot. */
static int have_callable_console(void)
{
@ -628,30 +623,57 @@ asmlinkage int printk(const char *fmt, ...)
/* cpu currently holding logbuf_lock */
static volatile unsigned int printk_cpu = UINT_MAX;
const char printk_recursion_bug_msg [] =
KERN_CRIT "BUG: recent printk recursion!\n";
static int printk_recursion_bug;
asmlinkage int vprintk(const char *fmt, va_list args)
{
unsigned long flags;
int printed_len;
char *p;
static char printk_buf[1024];
static int log_level_unknown = 1;
static char printk_buf[1024];
unsigned long flags;
int printed_len = 0;
int this_cpu;
char *p;
boot_delay_msec();
preempt_disable();
if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
/* If a crash is occurring during printk() on this CPU,
* make sure we can't deadlock */
zap_locks();
/* This stops the holder of console_sem just where we want him */
raw_local_irq_save(flags);
this_cpu = smp_processor_id();
/*
* Ouch, printk recursed into itself!
*/
if (unlikely(printk_cpu == this_cpu)) {
/*
* If a crash is occurring during printk() on this CPU,
* then try to get the crash message out but make sure
		 * we can't deadlock. Otherwise just return to avoid the
		 * recursion - but flag it so that it can be printed at
		 * the next appropriate moment:
*/
if (!oops_in_progress) {
printk_recursion_bug = 1;
goto out_restore_irqs;
}
zap_locks();
}
lockdep_off();
spin_lock(&logbuf_lock);
printk_cpu = smp_processor_id();
printk_cpu = this_cpu;
if (printk_recursion_bug) {
printk_recursion_bug = 0;
strcpy(printk_buf, printk_recursion_bug_msg);
printed_len = sizeof(printk_recursion_bug_msg);
}
/* Emit the output into the temporary buffer */
printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
printed_len += vscnprintf(printk_buf + printed_len,
sizeof(printk_buf), fmt, args);
/*
* Copy the output into log_buf. If the caller didn't provide
@ -680,7 +702,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
loglev_char = default_message_loglevel
+ '0';
}
t = printk_clock();
t = 0;
if (system_state != SYSTEM_BOOTING)
t = ktime_to_ns(ktime_get());
nanosec_rem = do_div(t, 1000000000);
tlen = sprintf(tbuf,
"<%c>[%5lu.%06lu] ",
@ -744,6 +768,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
printk_cpu = UINT_MAX;
spin_unlock(&logbuf_lock);
lockdep_on();
out_restore_irqs:
raw_local_irq_restore(flags);
}

View file

@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
static DEFINE_MUTEX(profile_flip_mutex);
#endif /* CONFIG_SMP */
static int __init profile_setup(char * str)
static int __init profile_setup(char *str)
{
static char __initdata schedstr[] = "schedule";
static char __initdata sleepstr[] = "sleep";
@ -104,28 +104,28 @@ __setup("profile=", profile_setup);
void __init profile_init(void)
{
if (!prof_on)
if (!prof_on)
return;
/* only text is profiled */
prof_len = (_etext - _stext) >> prof_shift;
prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
}
/* Profile event notifications */
#ifdef CONFIG_PROFILING
static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
void profile_task_exit(struct task_struct * task)
void profile_task_exit(struct task_struct *task)
{
blocking_notifier_call_chain(&task_exit_notifier, 0, task);
}
int profile_handoff_task(struct task_struct * task)
int profile_handoff_task(struct task_struct *task)
{
int ret;
ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr)
blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
}
int task_handoff_register(struct notifier_block * n)
int task_handoff_register(struct notifier_block *n)
{
return atomic_notifier_chain_register(&task_free_notifier, n);
}
EXPORT_SYMBOL_GPL(task_handoff_register);
int task_handoff_unregister(struct notifier_block * n)
int task_handoff_unregister(struct notifier_block *n)
{
return atomic_notifier_chain_unregister(&task_free_notifier, n);
}
EXPORT_SYMBOL_GPL(task_handoff_unregister);
int profile_event_register(enum profile_type type, struct notifier_block * n)
int profile_event_register(enum profile_type type, struct notifier_block *n)
{
int err = -EINVAL;
switch (type) {
case PROFILE_TASK_EXIT:
err = blocking_notifier_chain_register(
&task_exit_notifier, n);
break;
case PROFILE_MUNMAP:
err = blocking_notifier_chain_register(
&munmap_notifier, n);
break;
}
return err;
}
int profile_event_unregister(enum profile_type type, struct notifier_block * n)
{
int err = -EINVAL;
switch (type) {
case PROFILE_TASK_EXIT:
err = blocking_notifier_chain_unregister(
&task_exit_notifier, n);
break;
case PROFILE_MUNMAP:
err = blocking_notifier_chain_unregister(
&munmap_notifier, n);
break;
case PROFILE_TASK_EXIT:
err = blocking_notifier_chain_register(
&task_exit_notifier, n);
break;
case PROFILE_MUNMAP:
err = blocking_notifier_chain_register(
&munmap_notifier, n);
break;
}
return err;
}
EXPORT_SYMBOL_GPL(profile_event_register);
int profile_event_unregister(enum profile_type type, struct notifier_block *n)
{
int err = -EINVAL;
switch (type) {
case PROFILE_TASK_EXIT:
err = blocking_notifier_chain_unregister(
&task_exit_notifier, n);
break;
case PROFILE_MUNMAP:
err = blocking_notifier_chain_unregister(
&munmap_notifier, n);
break;
}
return err;
}
EXPORT_SYMBOL_GPL(profile_event_unregister);
int register_timer_hook(int (*hook)(struct pt_regs *))
{
@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *))
timer_hook = hook;
return 0;
}
EXPORT_SYMBOL_GPL(register_timer_hook);
void unregister_timer_hook(int (*hook)(struct pt_regs *))
{
@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
/* make sure all CPUs see the NULL hook */
synchronize_sched(); /* Allow ongoing interrupts to complete. */
}
EXPORT_SYMBOL_GPL(register_timer_hook);
EXPORT_SYMBOL_GPL(unregister_timer_hook);
EXPORT_SYMBOL_GPL(task_handoff_register);
EXPORT_SYMBOL_GPL(task_handoff_unregister);
EXPORT_SYMBOL_GPL(profile_event_register);
EXPORT_SYMBOL_GPL(profile_event_unregister);
#endif /* CONFIG_PROFILING */
@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
}
break;
out_free:
out_free:
page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
per_cpu(cpu_profile_hits, cpu)[1] = NULL;
__free_page(page);
@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits)
atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !CONFIG_SMP */
EXPORT_SYMBOL_GPL(profile_hits);
void profile_tick(int type)
@ -427,7 +424,7 @@ void profile_tick(int type)
#include <asm/uaccess.h>
#include <asm/ptrace.h>
static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
return len;
}
static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer,
unsigned long count, void *data)
static int prof_cpu_mask_write_proc(struct file *file,
const char __user *buffer, unsigned long count, void *data)
{
cpumask_t *mask = (cpumask_t *)data;
unsigned long full_count = count, err;
@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
struct proc_dir_entry *entry;
/* create /proc/irq/prof_cpu_mask */
if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir)))
entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
if (!entry)
return;
entry->data = (void *)&prof_cpu_mask;
entry->read_proc = prof_cpu_mask_read_proc;
@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
unsigned long p = *ppos;
ssize_t read;
char * pnt;
char *pnt;
unsigned int sample_step = 1 << prof_shift;
profile_flip_buffers();
@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
read = 0;
while (p < sizeof(unsigned int) && count > 0) {
if (put_user(*((char *)(&sample_step)+p),buf))
if (put_user(*((char *)(&sample_step)+p), buf))
return -EFAULT;
buf++; p++; count--; read++;
}
pnt = (char *)prof_buffer + p - sizeof(atomic_t);
if (copy_to_user(buf,(void *)pnt,count))
if (copy_to_user(buf, (void *)pnt, count))
return -EFAULT;
read += count;
*ppos += read;
@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
extern int setup_profiling_timer (unsigned int multiplier);
extern int setup_profiling_timer(unsigned int multiplier);
if (count == sizeof(int)) {
unsigned int multiplier;
@ -591,7 +589,8 @@ static int __init create_proc_profile(void)
return 0;
if (create_hash_tables())
return -1;
if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL)))
entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
if (!entry)
return 0;
entry->proc_fops = &proc_profile_operations;
entry->size = (1+prof_len) * sizeof(atomic_t);

575
kernel/rcuclassic.c Normal file
View file

@ -0,0 +1,575 @@
/*
* Read-Copy Update mechanism for mutual exclusion
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright IBM Corporation, 2001
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
*
* Based on the original work by Paul McKenney <paulmck@us.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
* http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
* http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
EXPORT_SYMBOL_GPL(rcu_lock_map);
#endif
/* Definition for rcupdate control block. */
static struct rcu_ctrlblk rcu_ctrlblk = {
.cur = -300,
.completed = -300,
.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.cur = -300,
.completed = -300,
.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
static int blimit = 10;
static int qhimark = 10000;
static int qlowmark = 100;
#ifdef CONFIG_SMP
static void force_quiescent_state(struct rcu_data *rdp,
struct rcu_ctrlblk *rcp)
{
int cpu;
cpumask_t cpumask;
set_need_resched();
if (unlikely(!rcp->signaled)) {
rcp->signaled = 1;
/*
* Don't send IPI to itself. With irqs disabled,
* rdp->cpu is the current cpu.
*/
cpumask = rcp->cpumask;
cpu_clear(rdp->cpu, cpumask);
for_each_cpu_mask(cpu, cpumask)
smp_send_reschedule(cpu);
}
}
#else
static inline void force_quiescent_state(struct rcu_data *rdp,
struct rcu_ctrlblk *rcp)
{
set_need_resched();
}
#endif
/**
* call_rcu - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual update function to be invoked after the grace period
*
* The update function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*/
void call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = &__get_cpu_var(rcu_data);
*rdp->nxttail = head;
rdp->nxttail = &head->next;
if (unlikely(++rdp->qlen > qhimark)) {
rdp->blimit = INT_MAX;
force_quiescent_state(rdp, &rcu_ctrlblk);
}
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu);
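The usual pattern for call_rcu(): embed the rcu_head in the protected object, unlink the object from readers' view, and free it from the callback once the grace period has elapsed. A minimal sketch with hypothetical structure names:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_node {
	int key;
	struct rcu_head rcu;
};

static void example_free_cb(struct rcu_head *head)
{
	/* Runs after every reader that could still see the node has finished. */
	kfree(container_of(head, struct example_node, rcu));
}

static void example_retire(struct example_node *node)
{
	/* Caller has already removed 'node' from the RCU-protected structure. */
	call_rcu(&node->rcu, example_free_cb);
}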
/**
* call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual update function to be invoked after the grace period
*
* The update function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_bh() assumes
* that the read-side critical sections end on completion of a softirq
* handler. This means that read-side critical sections in process
* context must not be interrupted by softirqs. This interface is to be
* used when most of the read-side critical sections are in softirq context.
* RCU read-side critical sections are delimited by rcu_read_lock() and
 * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh()
 * and rcu_read_unlock_bh(), if in process context. These may be nested.
*/
void call_rcu_bh(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = &__get_cpu_var(rcu_bh_data);
*rdp->nxttail = head;
rdp->nxttail = &head->next;
if (unlikely(++rdp->qlen > qhimark)) {
rdp->blimit = INT_MAX;
force_quiescent_state(rdp, &rcu_bh_ctrlblk);
}
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
/*
* Return the number of RCU batches processed thus far. Useful
* for debug and statistics.
*/
long rcu_batches_completed(void)
{
return rcu_ctrlblk.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed);
/*
 * Return the number of bottom-half RCU batches processed thus far.
 * Useful for debug and statistics.
*/
long rcu_batches_completed_bh(void)
{
return rcu_bh_ctrlblk.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
/* Raises the softirq for processing rcu_callbacks. */
static inline void raise_rcu_softirq(void)
{
raise_softirq(RCU_SOFTIRQ);
/*
* The smp_mb() here is required to ensure that this cpu's
* __rcu_process_callbacks() reads the most recently updated
* value of rcu->cur.
*/
smp_mb();
}
/*
* Invoke the completed RCU callbacks. They are expected to be in
* a per-cpu list.
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
struct rcu_head *next, *list;
int count = 0;
list = rdp->donelist;
while (list) {
next = list->next;
prefetch(next);
list->func(list);
list = next;
if (++count >= rdp->blimit)
break;
}
rdp->donelist = list;
local_irq_disable();
rdp->qlen -= count;
local_irq_enable();
if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
rdp->blimit = blimit;
if (!rdp->donelist)
rdp->donetail = &rdp->donelist;
else
raise_rcu_softirq();
}
/*
* Grace period handling:
 * The grace period handling consists of two steps:
* - A new grace period is started.
* This is done by rcu_start_batch. The start is not broadcasted to
* all cpus, they must pick this up by comparing rcp->cur with
* rdp->quiescbatch. All cpus are recorded in the
* rcu_ctrlblk.cpumask bitmap.
* - All cpus must go through a quiescent state.
* Since the start of the grace period is not broadcasted, at least two
* calls to rcu_check_quiescent_state are required:
* The first call just notices that a new grace period is running. The
* following calls check if there was a quiescent state since the beginning
* of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
* the bitmap is empty, then the grace period is completed.
 * rcu_check_quiescent_state calls rcu_start_batch() to start the next grace
* period (if necessary).
*/
/*
* Register a new batch of callbacks, and start it up if there is currently no
* active batch and the batch to be registered has not already occurred.
* Caller must hold rcu_ctrlblk.lock.
*/
static void rcu_start_batch(struct rcu_ctrlblk *rcp)
{
if (rcp->next_pending &&
rcp->completed == rcp->cur) {
rcp->next_pending = 0;
/*
* next_pending == 0 must be visible in
* __rcu_process_callbacks() before it can see new value of cur.
*/
smp_wmb();
rcp->cur++;
/*
* Accessing nohz_cpu_mask before incrementing rcp->cur needs a
* barrier. Otherwise it can cause tickless idle CPUs to be
* included in rcp->cpumask, which will extend grace periods
* unnecessarily.
*/
smp_mb();
cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
rcp->signaled = 0;
}
}
/*
* cpu went through a quiescent state since the beginning of the grace period.
* Clear it from the cpu mask and complete the grace period if it was the last
* cpu. Start another grace period if someone has further entries pending
*/
static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
{
cpu_clear(cpu, rcp->cpumask);
if (cpus_empty(rcp->cpumask)) {
/* batch completed ! */
rcp->completed = rcp->cur;
rcu_start_batch(rcp);
}
}
/*
* Check if the cpu has gone through a quiescent state (say context
* switch). If so, and if it hasn't already done so in this RCU
* quiescent cycle, then indicate that it has done so.
*/
static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
if (rdp->quiescbatch != rcp->cur) {
/* start new grace period: */
rdp->qs_pending = 1;
rdp->passed_quiesc = 0;
rdp->quiescbatch = rcp->cur;
return;
}
/* Grace period already completed for this cpu?
* qs_pending is checked instead of the actual bitmap to avoid
* cacheline thrashing.
*/
if (!rdp->qs_pending)
return;
/*
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
if (!rdp->passed_quiesc)
return;
rdp->qs_pending = 0;
spin_lock(&rcp->lock);
/*
* rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
* during cpu startup. Ignore the quiescent state.
*/
if (likely(rdp->quiescbatch == rcp->cur))
cpu_quiet(rdp->cpu, rcp);
spin_unlock(&rcp->lock);
}
#ifdef CONFIG_HOTPLUG_CPU
/* Warning! Helper for rcu_offline_cpu(). Do not use elsewhere without reviewing
* the locking requirements: the list it is pulling from has to belong to a CPU
* which is dead and hence not processing interrupts.
*/
static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
struct rcu_head **tail)
{
local_irq_disable();
*this_rdp->nxttail = list;
if (list)
this_rdp->nxttail = tail;
local_irq_enable();
}
static void __rcu_offline_cpu(struct rcu_data *this_rdp,
struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
/* If the CPU going offline owns the grace period,
* we can block indefinitely waiting for it, so flush
* it here.
*/
spin_lock_bh(&rcp->lock);
if (rcp->cur != rcp->completed)
cpu_quiet(rdp->cpu, rcp);
spin_unlock_bh(&rcp->lock);
rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
}
static void rcu_offline_cpu(int cpu)
{
struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
&per_cpu(rcu_data, cpu));
__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
&per_cpu(rcu_bh_data, cpu));
put_cpu_var(rcu_data);
put_cpu_var(rcu_bh_data);
}
#else
static void rcu_offline_cpu(int cpu)
{
}
#endif
/*
* This does the RCU processing work from softirq context.
*/
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
*rdp->donetail = rdp->curlist;
rdp->donetail = rdp->curtail;
rdp->curlist = NULL;
rdp->curtail = &rdp->curlist;
}
if (rdp->nxtlist && !rdp->curlist) {
local_irq_disable();
rdp->curlist = rdp->nxtlist;
rdp->curtail = rdp->nxttail;
rdp->nxtlist = NULL;
rdp->nxttail = &rdp->nxtlist;
local_irq_enable();
/*
* start the next batch of callbacks
*/
/* determine batch number */
rdp->batch = rcp->cur + 1;
/* see the comment and corresponding wmb() in
* the rcu_start_batch()
*/
smp_rmb();
if (!rcp->next_pending) {
/* and start it/schedule start if it's a new batch */
spin_lock(&rcp->lock);
rcp->next_pending = 1;
rcu_start_batch(rcp);
spin_unlock(&rcp->lock);
}
}
rcu_check_quiescent_state(rcp, rdp);
if (rdp->donelist)
rcu_do_batch(rdp);
}
static void rcu_process_callbacks(struct softirq_action *unused)
{
__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
}
static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
/* This cpu has pending rcu entries and the grace period
* for them has completed.
*/
if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
return 1;
/* This cpu has no pending entries, but there are new entries */
if (!rdp->curlist && rdp->nxtlist)
return 1;
/* This cpu has finished callbacks to invoke */
if (rdp->donelist)
return 1;
/* The rcu core waits for a quiescent state from the cpu */
if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
return 1;
/* nothing to do */
return 0;
}
/*
* Check to see if there is any immediate RCU-related work to be done
* by the current CPU, returning 1 if so. This function is part of the
* RCU implementation; it is -not- an exported member of the RCU API.
*/
int rcu_pending(int cpu)
{
return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
}
/*
* Check to see if any future RCU-related work will need to be done
* by the current CPU, even if none need be done immediately, returning
* 1 if so. This function is part of the RCU implementation; it is -not-
* an exported member of the RCU API.
*/
int rcu_needs_cpu(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
}
void rcu_check_callbacks(int cpu, int user)
{
if (user ||
(idle_cpu(cpu) && !in_softirq() &&
hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
rcu_qsctr_inc(cpu);
rcu_bh_qsctr_inc(cpu);
} else if (!in_softirq())
rcu_bh_qsctr_inc(cpu);
raise_rcu_softirq();
}
static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
memset(rdp, 0, sizeof(*rdp));
rdp->curtail = &rdp->curlist;
rdp->nxttail = &rdp->nxtlist;
rdp->donetail = &rdp->donelist;
rdp->quiescbatch = rcp->completed;
rdp->qs_pending = 0;
rdp->cpu = cpu;
rdp->blimit = blimit;
}
static void __cpuinit rcu_online_cpu(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
}
static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
rcu_online_cpu(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
rcu_offline_cpu(cpu);
break;
default:
break;
}
return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata rcu_nb = {
.notifier_call = rcu_cpu_notify,
};
/*
* Initializes the RCU mechanism. Assumed to be called early, that is,
* before the local timer (SMP) or jiffy timer (uniprocessor) is set up.
* Note that rcu_qsctr and friends are implicitly
* initialized due to the choice of ``0'' for RCU_CTR_INVALID.
*/
void __init __rcu_init(void)
{
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
/* Register notifier for non-boot CPUs */
register_cpu_notifier(&rcu_nb);
}
module_param(blimit, int, 0);
module_param(qhimark, int, 0);
module_param(qlowmark, int, 0);
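/*
* Tuning note (editor's sketch, not from the original source): blimit,
* qhimark and qlowmark can be overridden at boot with the usual
* module-parameter syntax, e.g. "<objname>.blimit=20", where the exact
* prefix depends on the name of the object this file is built into.
* A larger blimit lets rcu_do_batch() invoke more callbacks per softirq
* pass, trading softirq latency for callback throughput.
*/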


@ -15,7 +15,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2001
* Copyright IBM Corporation, 2001
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
@ -35,572 +35,27 @@
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
EXPORT_SYMBOL_GPL(rcu_lock_map);
#endif
/* Definition for rcupdate control block. */
static struct rcu_ctrlblk rcu_ctrlblk = {
.cur = -300,
.completed = -300,
.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.cur = -300,
.completed = -300,
.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
/* Fake initialization required by compiler */
static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
static int blimit = 10;
static int qhimark = 10000;
static int qlowmark = 100;
static atomic_t rcu_barrier_cpu_count;
static DEFINE_MUTEX(rcu_barrier_mutex);
static struct completion rcu_barrier_completion;
#ifdef CONFIG_SMP
static void force_quiescent_state(struct rcu_data *rdp,
struct rcu_ctrlblk *rcp)
{
int cpu;
cpumask_t cpumask;
set_need_resched();
if (unlikely(!rcp->signaled)) {
rcp->signaled = 1;
/*
* Don't send IPI to itself. With irqs disabled,
* rdp->cpu is the current cpu.
*/
cpumask = rcp->cpumask;
cpu_clear(rdp->cpu, cpumask);
for_each_cpu_mask(cpu, cpumask)
smp_send_reschedule(cpu);
}
}
#else
static inline void force_quiescent_state(struct rcu_data *rdp,
struct rcu_ctrlblk *rcp)
{
set_need_resched();
}
#endif
/**
* call_rcu - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual update function to be invoked after the grace period
*
* The update function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*/
void fastcall call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = &__get_cpu_var(rcu_data);
*rdp->nxttail = head;
rdp->nxttail = &head->next;
if (unlikely(++rdp->qlen > qhimark)) {
rdp->blimit = INT_MAX;
force_quiescent_state(rdp, &rcu_ctrlblk);
}
local_irq_restore(flags);
}
/**
* call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual update function to be invoked after the grace period
*
* The update function will be invoked some time after a full grace
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_bh() assumes
* that the read-side critical sections end on completion of a softirq
* handler. This means that read-side critical sections in process
* context must not be interrupted by softirqs. This interface is to be
* used when most of the read-side critical sections are in softirq context.
* RCU read-side critical sections are delimited by rcu_read_lock() and
* rcu_read_unlock() if in interrupt context, or by rcu_read_lock_bh()
* and rcu_read_unlock_bh() if in process context. These may be nested.
*/
void fastcall call_rcu_bh(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = &__get_cpu_var(rcu_bh_data);
*rdp->nxttail = head;
rdp->nxttail = &head->next;
if (unlikely(++rdp->qlen > qhimark)) {
rdp->blimit = INT_MAX;
force_quiescent_state(rdp, &rcu_bh_ctrlblk);
}
local_irq_restore(flags);
}
/*
* Return the number of RCU batches processed thus far. Useful
* for debug and statistics.
*/
long rcu_batches_completed(void)
{
return rcu_ctrlblk.completed;
}
/*
* Return the number of RCU batches processed thus far. Useful
* for debug and statistics.
*/
long rcu_batches_completed_bh(void)
{
return rcu_bh_ctrlblk.completed;
}
static void rcu_barrier_callback(struct rcu_head *notused)
{
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
complete(&rcu_barrier_completion);
}
/*
* Called with preemption disabled, and from cross-cpu IRQ context.
*/
static void rcu_barrier_func(void *notused)
{
int cpu = smp_processor_id();
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
struct rcu_head *head;
head = &rdp->barrier;
atomic_inc(&rcu_barrier_cpu_count);
call_rcu(head, rcu_barrier_callback);
}
/**
* rcu_barrier - Wait until all the in-flight RCUs are complete.
*/
void rcu_barrier(void)
{
BUG_ON(in_interrupt());
/* Take cpucontrol mutex to protect against CPU hotplug */
mutex_lock(&rcu_barrier_mutex);
init_completion(&rcu_barrier_completion);
atomic_set(&rcu_barrier_cpu_count, 0);
on_each_cpu(rcu_barrier_func, NULL, 0, 1);
wait_for_completion(&rcu_barrier_completion);
mutex_unlock(&rcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
* Invoke the completed RCU callbacks. They are expected to be in
* a per-cpu list.
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
struct rcu_head *next, *list;
int count = 0;
list = rdp->donelist;
while (list) {
next = list->next;
prefetch(next);
list->func(list);
list = next;
if (++count >= rdp->blimit)
break;
}
rdp->donelist = list;
local_irq_disable();
rdp->qlen -= count;
local_irq_enable();
if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
rdp->blimit = blimit;
if (!rdp->donelist)
rdp->donetail = &rdp->donelist;
else
tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu));
}
/*
* Grace period handling:
* The grace period handling consists of two steps:
* - A new grace period is started.
* This is done by rcu_start_batch. The start is not broadcasted to
* all cpus, they must pick this up by comparing rcp->cur with
* rdp->quiescbatch. All cpus are recorded in the
* rcu_ctrlblk.cpumask bitmap.
* - All cpus must go through a quiescent state.
* Since the start of the grace period is not broadcasted, at least two
* calls to rcu_check_quiescent_state are required:
* The first call just notices that a new grace period is running. The
* following calls check if there was a quiescent state since the beginning
* of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
* the bitmap is empty, then the grace period is completed.
* rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
* period (if necessary).
*/
/*
* Register a new batch of callbacks, and start it up if there is currently no
* active batch and the batch to be registered has not already occurred.
* Caller must hold rcu_ctrlblk.lock.
*/
static void rcu_start_batch(struct rcu_ctrlblk *rcp)
{
if (rcp->next_pending &&
rcp->completed == rcp->cur) {
rcp->next_pending = 0;
/*
* next_pending == 0 must be visible in
* __rcu_process_callbacks() before it can see new value of cur.
*/
smp_wmb();
rcp->cur++;
/*
* Accessing nohz_cpu_mask before incrementing rcp->cur needs a
* barrier. Otherwise it can cause tickless idle CPUs to be
* included in rcp->cpumask, which will extend grace periods
* unnecessarily.
*/
smp_mb();
cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
rcp->signaled = 0;
}
}
/*
* cpu went through a quiescent state since the beginning of the grace period.
* Clear it from the cpu mask and complete the grace period if it was the last
* cpu. Start another grace period if someone has further entries pending
*/
static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
{
cpu_clear(cpu, rcp->cpumask);
if (cpus_empty(rcp->cpumask)) {
/* batch completed ! */
rcp->completed = rcp->cur;
rcu_start_batch(rcp);
}
}
/*
* Check if the cpu has gone through a quiescent state (say context
* switch). If so, and if it hasn't already done so in this RCU
* quiescent cycle, then indicate that it has done so.
*/
static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
if (rdp->quiescbatch != rcp->cur) {
/* start new grace period: */
rdp->qs_pending = 1;
rdp->passed_quiesc = 0;
rdp->quiescbatch = rcp->cur;
return;
}
/* Grace period already completed for this cpu?
* qs_pending is checked instead of the actual bitmap to avoid
* cacheline thrashing.
*/
if (!rdp->qs_pending)
return;
/*
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
if (!rdp->passed_quiesc)
return;
rdp->qs_pending = 0;
spin_lock(&rcp->lock);
/*
* rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
* during cpu startup. Ignore the quiescent state.
*/
if (likely(rdp->quiescbatch == rcp->cur))
cpu_quiet(rdp->cpu, rcp);
spin_unlock(&rcp->lock);
}
#ifdef CONFIG_HOTPLUG_CPU
/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
* locking requirements, the list it's pulling from has to belong to a cpu
* which is dead and hence not processing interrupts.
*/
static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
struct rcu_head **tail)
{
local_irq_disable();
*this_rdp->nxttail = list;
if (list)
this_rdp->nxttail = tail;
local_irq_enable();
}
static void __rcu_offline_cpu(struct rcu_data *this_rdp,
struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
/* if the cpu going offline owns the grace period
* we can block indefinitely waiting for it, so flush
* it here
*/
spin_lock_bh(&rcp->lock);
if (rcp->cur != rcp->completed)
cpu_quiet(rdp->cpu, rcp);
spin_unlock_bh(&rcp->lock);
rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
}
static void rcu_offline_cpu(int cpu)
{
struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
&per_cpu(rcu_data, cpu));
__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
&per_cpu(rcu_bh_data, cpu));
put_cpu_var(rcu_data);
put_cpu_var(rcu_bh_data);
tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu);
}
#else
static void rcu_offline_cpu(int cpu)
{
}
#endif
/*
* This does the RCU processing work from tasklet context.
*/
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
*rdp->donetail = rdp->curlist;
rdp->donetail = rdp->curtail;
rdp->curlist = NULL;
rdp->curtail = &rdp->curlist;
}
if (rdp->nxtlist && !rdp->curlist) {
local_irq_disable();
rdp->curlist = rdp->nxtlist;
rdp->curtail = rdp->nxttail;
rdp->nxtlist = NULL;
rdp->nxttail = &rdp->nxtlist;
local_irq_enable();
/*
* start the next batch of callbacks
*/
/* determine batch number */
rdp->batch = rcp->cur + 1;
/* see the comment and corresponding wmb() in
* the rcu_start_batch()
*/
smp_rmb();
if (!rcp->next_pending) {
/* and start it/schedule start if it's a new batch */
spin_lock(&rcp->lock);
rcp->next_pending = 1;
rcu_start_batch(rcp);
spin_unlock(&rcp->lock);
}
}
rcu_check_quiescent_state(rcp, rdp);
if (rdp->donelist)
rcu_do_batch(rdp);
}
static void rcu_process_callbacks(unsigned long unused)
{
__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
}
static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
/* This cpu has pending rcu entries and the grace period
* for them has completed.
*/
if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
return 1;
/* This cpu has no pending entries, but there are new entries */
if (!rdp->curlist && rdp->nxtlist)
return 1;
/* This cpu has finished callbacks to invoke */
if (rdp->donelist)
return 1;
/* The rcu core waits for a quiescent state from the cpu */
if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
return 1;
/* nothing to do */
return 0;
}
/*
* Check to see if there is any immediate RCU-related work to be done
* by the current CPU, returning 1 if so. This function is part of the
* RCU implementation; it is -not- an exported member of the RCU API.
*/
int rcu_pending(int cpu)
{
return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
}
/*
* Check to see if any future RCU-related work will need to be done
* by the current CPU, even if none need be done immediately, returning
* 1 if so. This function is part of the RCU implementation; it is -not-
* an exported member of the RCU API.
*/
int rcu_needs_cpu(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
}
void rcu_check_callbacks(int cpu, int user)
{
if (user ||
(idle_cpu(cpu) && !in_softirq() &&
hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
rcu_qsctr_inc(cpu);
rcu_bh_qsctr_inc(cpu);
} else if (!in_softirq())
rcu_bh_qsctr_inc(cpu);
tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
}
static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
memset(rdp, 0, sizeof(*rdp));
rdp->curtail = &rdp->curlist;
rdp->nxttail = &rdp->nxtlist;
rdp->donetail = &rdp->donelist;
rdp->quiescbatch = rcp->completed;
rdp->qs_pending = 0;
rdp->cpu = cpu;
rdp->blimit = blimit;
}
static void __cpuinit rcu_online_cpu(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
}
static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
rcu_online_cpu(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
rcu_offline_cpu(cpu);
break;
default:
break;
}
return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata rcu_nb = {
.notifier_call = rcu_cpu_notify,
};
/*
* Initializes rcu mechanism. Assumed to be called early.
* That is, before the local timer (SMP) or jiffy timer (uniprocessor) is set up.
* Note that rcu_qsctr and friends are implicitly
* initialized due to the choice of ``0'' for RCU_CTR_INVALID.
*/
void __init rcu_init(void)
{
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
/* Register notifier for non-boot CPUs */
register_cpu_notifier(&rcu_nb);
}
#include <linux/module.h>
struct rcu_synchronize {
struct rcu_head head;
struct completion completion;
};
static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
static atomic_t rcu_barrier_cpu_count;
static DEFINE_MUTEX(rcu_barrier_mutex);
static struct completion rcu_barrier_completion;
/* Because of FASTCALL declaration of complete, we use this wrapper */
static void wakeme_after_rcu(struct rcu_head *head)
{
@ -618,9 +73,6 @@ static void wakeme_after_rcu(struct rcu_head *head)
* read-side critical sections have completed. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*
* If your read-side code is not protected by rcu_read_lock(), do -not-
* use synchronize_rcu().
*/
void synchronize_rcu(void)
{
@ -633,12 +85,54 @@ void synchronize_rcu(void)
/* Wait for it */
wait_for_completion(&rcu.completion);
}
module_param(blimit, int, 0);
module_param(qhimark, int, 0);
module_param(qlowmark, int, 0);
EXPORT_SYMBOL_GPL(rcu_batches_completed);
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
EXPORT_SYMBOL_GPL(call_rcu);
EXPORT_SYMBOL_GPL(call_rcu_bh);
EXPORT_SYMBOL_GPL(synchronize_rcu);
static void rcu_barrier_callback(struct rcu_head *notused)
{
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
complete(&rcu_barrier_completion);
}
/*
* Called with preemption disabled, and from cross-cpu IRQ context.
*/
static void rcu_barrier_func(void *notused)
{
int cpu = smp_processor_id();
struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
atomic_inc(&rcu_barrier_cpu_count);
call_rcu(head, rcu_barrier_callback);
}
/**
* rcu_barrier - Wait until all the in-flight RCUs are complete.
*/
void rcu_barrier(void)
{
BUG_ON(in_interrupt());
/* Take cpucontrol mutex to protect against CPU hotplug */
mutex_lock(&rcu_barrier_mutex);
init_completion(&rcu_barrier_completion);
atomic_set(&rcu_barrier_cpu_count, 0);
/*
* The queueing of callbacks in all CPUs must be atomic with
* respect to RCU, otherwise one CPU may queue a callback,
* wait for a grace period, decrement barrier count and call
* complete(), while other CPUs have not yet queued anything.
* So, we need to make sure that grace periods cannot complete
* until all the callbacks are queued.
*/
rcu_read_lock();
on_each_cpu(rcu_barrier_func, NULL, 0, 1);
rcu_read_unlock();
wait_for_completion(&rcu_barrier_completion);
mutex_unlock(&rcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
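/*
* Illustrative sketch (editor's addition, not part of this commit): a
* module that queues callbacks with call_rcu() must wait for them to
* drain before its code and data go away. The function name below is
* hypothetical.
*/
#if 0 /* example usage only */
static void __exit rcu_user_example_exit(void)
{
/* ... stop queueing new call_rcu() callbacks ... */
rcu_barrier(); /* wait for all in-flight callbacks to be invoked */
/* now it is safe to destroy caches and unload this code */
}
#endif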
void __init rcu_init(void)
{
__rcu_init();
}

kernel/rcupreempt.c (new file, 953 lines)

@ -0,0 +1,953 @@
/*
* Read-Copy Update mechanism for mutual exclusion, realtime implementation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright IBM Corporation, 2006
*
* Authors: Paul E. McKenney <paulmck@us.ibm.com>
* With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
* for pushing me away from locks and towards counters, and
* to Suparna Bhattacharya for pushing me completely away
* from atomic instructions on the read side.
*
* Papers: http://www.rdrop.com/users/paulmck/RCU
*
* Design Document: http://lwn.net/Articles/253651/
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU/ *.txt
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/random.h>
#include <linux/delay.h>
#include <linux/byteorder/swabb.h>
#include <linux/cpumask.h>
#include <linux/rcupreempt_trace.h>
/*
* Macro that prevents the compiler from reordering accesses, but does
* absolutely -nothing- to prevent CPUs from reordering. This is used
* only to mediate communication between mainline code and hardware
* interrupt and NMI handlers.
*/
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
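/*
* Example (editor's illustration): ACCESS_ONCE() forces the compiler to
* emit exactly one access for the marked read or write, as in
* while (ACCESS_ONCE(flag) == 0)
* cpu_relax();
* where, without it, the compiler would be free to hoist the load of
* "flag" out of the loop.
*/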
/*
* PREEMPT_RCU data structures.
*/
/*
* GP_STAGES specifies the number of times the state machine has
* to go through all the rcu_try_flip_states (see below)
* in a single Grace Period.
*
* GP in GP_STAGES stands for Grace Period ;)
*/
#define GP_STAGES 2
struct rcu_data {
spinlock_t lock; /* Protect rcu_data fields. */
long completed; /* Number of last completed batch. */
int waitlistcount;
struct tasklet_struct rcu_tasklet;
struct rcu_head *nextlist;
struct rcu_head **nexttail;
struct rcu_head *waitlist[GP_STAGES];
struct rcu_head **waittail[GP_STAGES];
struct rcu_head *donelist;
struct rcu_head **donetail;
long rcu_flipctr[2];
#ifdef CONFIG_RCU_TRACE
struct rcupreempt_trace trace;
#endif /* #ifdef CONFIG_RCU_TRACE */
};
/*
* States for rcu_try_flip() and friends.
*/
enum rcu_try_flip_states {
/*
* Stay here if nothing is happening. Flip the counter if something
* starts happening. Denoted by "I"
*/
rcu_try_flip_idle_state,
/*
* Wait here for all CPUs to notice that the counter has flipped. This
* prevents the old set of counters from ever being incremented once
* we leave this state, which in turn is necessary because we cannot
* test any individual counter for zero -- we can only check the sum.
* Denoted by "A".
*/
rcu_try_flip_waitack_state,
/*
* Wait here for the sum of the old per-CPU counters to reach zero.
* Denoted by "Z".
*/
rcu_try_flip_waitzero_state,
/*
* Wait here for each of the other CPUs to execute a memory barrier.
* This is necessary to ensure that these other CPUs really have
* completed executing their RCU read-side critical sections, despite
* their CPUs wildly reordering memory. Denoted by "M".
*/
rcu_try_flip_waitmb_state,
};
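/*
* Illustration (editor's addition): a full circuit of this state machine
* visits idle (I) -> waitack (A) -> waitzero (Z) -> waitmb (M) -> idle,
* and performs exactly one counter flip (in rcu_try_flip_idle()). Per
* the GP_STAGES comment above, a grace period spans at least GP_STAGES
* such flips, so with the default of 2 a queued callback waits through
* roughly two complete circuits before it is invoked.
*/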
struct rcu_ctrlblk {
spinlock_t fliplock; /* Protect state-machine transitions. */
long completed; /* Number of last completed batch. */
enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
the rcu state machine */
};
static DEFINE_PER_CPU(struct rcu_data, rcu_data);
static struct rcu_ctrlblk rcu_ctrlblk = {
.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
.completed = 0,
.rcu_try_flip_state = rcu_try_flip_idle_state,
};
#ifdef CONFIG_RCU_TRACE
static char *rcu_try_flip_state_names[] =
{ "idle", "waitack", "waitzero", "waitmb" };
#endif /* #ifdef CONFIG_RCU_TRACE */
static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
/*
* Enum and per-CPU flag to determine when each CPU has seen
* the most recent counter flip.
*/
enum rcu_flip_flag_values {
rcu_flip_seen, /* Steady/initial state, last flip seen. */
/* Only GP detector can update. */
rcu_flipped /* Flip just completed, need confirmation. */
/* Only corresponding CPU can update. */
};
static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
= rcu_flip_seen;
/*
* Enum and per-CPU flag to determine when each CPU has executed the
* needed memory barrier to fence in memory references from its last RCU
* read-side critical section in the just-completed grace period.
*/
enum rcu_mb_flag_values {
rcu_mb_done, /* Steady/initial state, no mb()s required. */
/* Only GP detector can update. */
rcu_mb_needed /* Flip just completed, need an mb(). */
/* Only corresponding CPU can update. */
};
static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
= rcu_mb_done;
/*
* RCU_DATA_ME: find the current CPU's rcu_data structure.
* RCU_DATA_CPU: find the specified CPU's rcu_data structure.
*/
#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
/*
* Helper macro for tracing when the appropriate rcu_data is not
* cached in a local variable, but where the CPU number is so cached.
*/
#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
/*
* Helper macro for tracing when the appropriate rcu_data is not
* cached in a local variable.
*/
#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
/*
* Helper macro for tracing when the appropriate rcu_data is pointed
* to by a local variable.
*/
#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
/*
* Return the number of RCU batches processed thus far. Useful
* for debug and statistics.
*/
long rcu_batches_completed(void)
{
return rcu_ctrlblk.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed);
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
void __rcu_read_lock(void)
{
int idx;
struct task_struct *t = current;
int nesting;
nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
if (nesting != 0) {
/* An earlier rcu_read_lock() covers us, just count it. */
t->rcu_read_lock_nesting = nesting + 1;
} else {
unsigned long flags;
/*
* We disable interrupts for the following reasons:
* - If we get scheduling clock interrupt here, and we
* end up acking the counter flip, it's like a promise
* that we will never increment the old counter again.
* Thus we will break that promise if that
* scheduling clock interrupt happens between the time
* we pick the .completed field and the time that we
* increment our counter.
*
* - We don't want to be preempted out here.
*
* NMIs can still occur, of course, and might themselves
* contain rcu_read_lock().
*/
local_irq_save(flags);
/*
* Outermost nesting of rcu_read_lock(), so increment
* the current counter for the current CPU. Use volatile
* casts to prevent the compiler from reordering.
*/
idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
/*
* Now that the per-CPU counter has been incremented, we
* are protected from races with rcu_read_lock() invoked
* from NMI handlers on this CPU. We can therefore safely
* increment the nesting counter, relieving further NMIs
* of the need to increment the per-CPU counter.
*/
ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
/*
* Now that we have prevented any NMIs from storing
* to the ->rcu_flipctr_idx, we can safely use it to
* remember which counter to decrement in the matching
* rcu_read_unlock().
*/
ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
local_irq_restore(flags);
}
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);
void __rcu_read_unlock(void)
{
int idx;
struct task_struct *t = current;
int nesting;
nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
if (nesting > 1) {
/*
* We are still protected by the enclosing rcu_read_lock(),
* so simply decrement the counter.
*/
t->rcu_read_lock_nesting = nesting - 1;
} else {
unsigned long flags;
/*
* Disable local interrupts to prevent the grace-period
* detection state machine from seeing us half-done.
* NMIs can still occur, of course, and might themselves
* contain rcu_read_lock() and rcu_read_unlock().
*/
local_irq_save(flags);
/*
* Outermost nesting of rcu_read_unlock(), so we must
* decrement the current counter for the current CPU.
* This must be done carefully, because NMIs can
* occur at any point in this code, and any rcu_read_lock()
* and rcu_read_unlock() pairs in the NMI handlers
* must interact non-destructively with this code.
* Lots of volatile casts, and -very- careful ordering.
*
* Changes to this code, including this one, must be
* inspected, validated, and tested extremely carefully!!!
*/
/*
* First, pick up the index.
*/
idx = ACCESS_ONCE(t->rcu_flipctr_idx);
/*
* Now that we have fetched the counter index, it is
* safe to decrement the per-task RCU nesting counter.
* After this, any interrupts or NMIs will increment and
* decrement the per-CPU counters.
*/
ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
/*
* It is now safe to decrement the outermost per-CPU rcu_flipctr entry.
* NMIs that occur after this statement will route their
* rcu_read_lock() calls through this "else" clause, and
* will thus start incrementing the per-CPU counter on
* their own. They will also clobber ->rcu_flipctr_idx,
* but that is OK, since we have already fetched it.
*/
ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
local_irq_restore(flags);
}
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
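/*
* Illustrative reader sketch (editor's addition, not part of this commit):
* under CONFIG_PREEMPT_RCU the rcu_read_lock()/rcu_read_unlock() wrappers
* are expected to resolve to __rcu_read_lock()/__rcu_read_unlock() above.
* The structure, list and function names below are hypothetical, and the
* usual list/RCU headers are assumed.
*/
#if 0 /* example usage only */
struct example_node {
int key;
int value;
struct list_head list;
};
static int example_lookup(struct list_head *head, int key)
{
struct example_node *p;
int value = -1;
rcu_read_lock(); /* reader may be preempted, but is still counted */
list_for_each_entry_rcu(p, head, list) {
if (p->key == key) {
value = p->value;
break;
}
}
rcu_read_unlock();
return value;
}
#endif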
/*
* If a global counter flip has occurred since the last time that we
* advanced callbacks, advance them. Hardware interrupts must be
* disabled when calling this function.
*/
static void __rcu_advance_callbacks(struct rcu_data *rdp)
{
int cpu;
int i;
int wlc = 0;
if (rdp->completed != rcu_ctrlblk.completed) {
if (rdp->waitlist[GP_STAGES - 1] != NULL) {
*rdp->donetail = rdp->waitlist[GP_STAGES - 1];
rdp->donetail = rdp->waittail[GP_STAGES - 1];
RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
}
for (i = GP_STAGES - 2; i >= 0; i--) {
if (rdp->waitlist[i] != NULL) {
rdp->waitlist[i + 1] = rdp->waitlist[i];
rdp->waittail[i + 1] = rdp->waittail[i];
wlc++;
} else {
rdp->waitlist[i + 1] = NULL;
rdp->waittail[i + 1] =
&rdp->waitlist[i + 1];
}
}
if (rdp->nextlist != NULL) {
rdp->waitlist[0] = rdp->nextlist;
rdp->waittail[0] = rdp->nexttail;
wlc++;
rdp->nextlist = NULL;
rdp->nexttail = &rdp->nextlist;
RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
} else {
rdp->waitlist[0] = NULL;
rdp->waittail[0] = &rdp->waitlist[0];
}
rdp->waitlistcount = wlc;
rdp->completed = rcu_ctrlblk.completed;
}
/*
* Check to see if this CPU needs to report that it has seen
* the most recent counter flip, thereby declaring that all
* subsequent rcu_read_lock() invocations will respect this flip.
*/
cpu = raw_smp_processor_id();
if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
smp_mb(); /* Subsequent counter accesses must see new value */
per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
smp_mb(); /* Subsequent RCU read-side critical sections */
/* seen -after- acknowledgement. */
}
}
/*
* Get here when RCU is idle. Decide whether we need to
* move out of idle state, and return non-zero if so.
* "Straightforward" approach for the moment, might later
* use callback-list lengths, grace-period duration, or
* some such to determine when to exit idle state.
* Might also need a pre-idle test that does not acquire
* the lock, but let's get the simple case working first...
*/
static int
rcu_try_flip_idle(void)
{
int cpu;
RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
if (!rcu_pending(smp_processor_id())) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
return 0;
}
/*
* Do the flip.
*/
RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
/*
* Need a memory barrier so that other CPUs see the new
* counter value before they see the subsequent change of all
* the rcu_flip_flag instances to rcu_flipped.
*/
smp_mb(); /* see above block comment. */
/* Now ask each CPU for acknowledgement of the flip. */
for_each_cpu_mask(cpu, rcu_cpu_online_map)
per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
return 1;
}
/*
* Wait for CPUs to acknowledge the flip.
*/
static int
rcu_try_flip_waitack(void)
{
int cpu;
RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
for_each_cpu_mask(cpu, rcu_cpu_online_map)
if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
return 0;
}
/*
* Make sure our checks above don't bleed into subsequent
* waiting for the sum of the counters to reach zero.
*/
smp_mb(); /* see above block comment. */
RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
return 1;
}
/*
* Wait for collective ``last'' counter to reach zero,
* then tell all CPUs to do an end-of-grace-period memory barrier.
*/
static int
rcu_try_flip_waitzero(void)
{
int cpu;
int lastidx = !(rcu_ctrlblk.completed & 0x1);
int sum = 0;
/* Check to see if the sum of the "last" counters is zero. */
RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
for_each_cpu_mask(cpu, rcu_cpu_online_map)
sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
if (sum != 0) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
return 0;
}
/*
* This ensures that the other CPUs see the call for
* memory barriers -after- the sum to zero has been
* detected here
*/
smp_mb(); /* ^^^^^^^^^^^^ */
/* Call for a memory barrier from each CPU. */
for_each_cpu_mask(cpu, rcu_cpu_online_map)
per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
return 1;
}
/*
* Wait for all CPUs to do their end-of-grace-period memory barrier.
* Return 1 once all CPUs have done so, and 0 otherwise.
*/
static int
rcu_try_flip_waitmb(void)
{
int cpu;
RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
for_each_cpu_mask(cpu, rcu_cpu_online_map)
if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
return 0;
}
smp_mb(); /* Ensure that the above checks precede any following flip. */
RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
return 1;
}
/*
* Attempt a single flip of the counters. Remember, a single flip does
* -not- constitute a grace period. Instead, the interval between
* at least GP_STAGES consecutive flips is a grace period.
*
* If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
* on a large SMP, they might want to use a hierarchical organization of
* the per-CPU-counter pairs.
*/
static void rcu_try_flip(void)
{
unsigned long flags;
RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
return;
}
/*
* Take the next transition(s) through the RCU grace-period
* flip-counter state machine.
*/
switch (rcu_ctrlblk.rcu_try_flip_state) {
case rcu_try_flip_idle_state:
if (rcu_try_flip_idle())
rcu_ctrlblk.rcu_try_flip_state =
rcu_try_flip_waitack_state;
break;
case rcu_try_flip_waitack_state:
if (rcu_try_flip_waitack())
rcu_ctrlblk.rcu_try_flip_state =
rcu_try_flip_waitzero_state;
break;
case rcu_try_flip_waitzero_state:
if (rcu_try_flip_waitzero())
rcu_ctrlblk.rcu_try_flip_state =
rcu_try_flip_waitmb_state;
break;
case rcu_try_flip_waitmb_state:
if (rcu_try_flip_waitmb())
rcu_ctrlblk.rcu_try_flip_state =
rcu_try_flip_idle_state;
}
spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
}
/*
* Check to see if this CPU needs to do a memory barrier in order to
* ensure that any prior RCU read-side critical sections have committed
* their counter manipulations and critical-section memory references
* before declaring the grace period to be completed.
*/
static void rcu_check_mb(int cpu)
{
if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
smp_mb(); /* Ensure RCU read-side accesses are visible. */
per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
}
}
void rcu_check_callbacks(int cpu, int user)
{
unsigned long flags;
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
rcu_check_mb(cpu);
if (rcu_ctrlblk.completed == rdp->completed)
rcu_try_flip();
spin_lock_irqsave(&rdp->lock, flags);
RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
__rcu_advance_callbacks(rdp);
if (rdp->donelist == NULL) {
spin_unlock_irqrestore(&rdp->lock, flags);
} else {
spin_unlock_irqrestore(&rdp->lock, flags);
raise_softirq(RCU_SOFTIRQ);
}
}
/*
* Needed by dynticks, to make sure all RCU processing has finished
* when we go idle:
*/
void rcu_advance_callbacks(int cpu, int user)
{
unsigned long flags;
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
if (rcu_ctrlblk.completed == rdp->completed) {
rcu_try_flip();
if (rcu_ctrlblk.completed == rdp->completed)
return;
}
spin_lock_irqsave(&rdp->lock, flags);
RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
__rcu_advance_callbacks(rdp);
spin_unlock_irqrestore(&rdp->lock, flags);
}
#ifdef CONFIG_HOTPLUG_CPU
#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
*dsttail = srclist; \
if (srclist != NULL) { \
dsttail = srctail; \
srclist = NULL; \
srctail = &srclist;\
} \
} while (0)
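/*
* Worked example (editor's illustration): if srclist currently holds
* A->B with srctail == &B->next, and dstlist holds X with dsttail ==
* &X->next, then after rcu_offline_cpu_enqueue(srclist, srctail,
* dstlist, dsttail) the destination list is X->A->B, dsttail points at
* &B->next, and the source list is empty again with srctail pointing
* back at &srclist.
*/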
void rcu_offline_cpu(int cpu)
{
int i;
struct rcu_head *list = NULL;
unsigned long flags;
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
struct rcu_head **tail = &list;
/*
* Remove all callbacks from the newly dead CPU, retaining order.
* Otherwise rcu_barrier() will fail
*/
spin_lock_irqsave(&rdp->lock, flags);
rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
for (i = GP_STAGES - 1; i >= 0; i--)
rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
list, tail);
rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
spin_unlock_irqrestore(&rdp->lock, flags);
rdp->waitlistcount = 0;
/* Disengage the newly dead CPU from the grace-period computation. */
spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
rcu_check_mb(cpu);
if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
smp_mb(); /* Subsequent counter accesses must see new value */
per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
smp_mb(); /* Subsequent RCU read-side critical sections */
/* seen -after- acknowledgement. */
}
RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
cpu_clear(cpu, rcu_cpu_online_map);
spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
/*
* Place the removed callbacks on the current CPU's queue.
* Make them all start a new grace period: simple approach,
* in theory could starve a given set of callbacks, but
* you would need to be doing some serious CPU hotplugging
* to make this happen. If this becomes a problem, adding
* a synchronize_rcu() to the hotplug path would be a simple
* fix.
*/
rdp = RCU_DATA_ME();
spin_lock_irqsave(&rdp->lock, flags);
*rdp->nexttail = list;
if (list)
rdp->nexttail = tail;
spin_unlock_irqrestore(&rdp->lock, flags);
}
void __devinit rcu_online_cpu(int cpu)
{
unsigned long flags;
spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
cpu_set(cpu, rcu_cpu_online_map);
spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
void rcu_offline_cpu(int cpu)
{
}
void __devinit rcu_online_cpu(int cpu)
{
}
#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_process_callbacks(struct softirq_action *unused)
{
unsigned long flags;
struct rcu_head *next, *list;
struct rcu_data *rdp = RCU_DATA_ME();
spin_lock_irqsave(&rdp->lock, flags);
list = rdp->donelist;
if (list == NULL) {
spin_unlock_irqrestore(&rdp->lock, flags);
return;
}
rdp->donelist = NULL;
rdp->donetail = &rdp->donelist;
RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
spin_unlock_irqrestore(&rdp->lock, flags);
while (list) {
next = list->next;
list->func(list);
list = next;
RCU_TRACE_ME(rcupreempt_trace_invoke);
}
}
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
struct rcu_data *rdp;
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = RCU_DATA_ME();
spin_lock(&rdp->lock);
__rcu_advance_callbacks(rdp);
*rdp->nexttail = head;
rdp->nexttail = &head->next;
RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
spin_unlock(&rdp->lock);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu);
/*
* Wait until all currently running preempt_disable() code segments
* (including hardware-irq-disable segments) complete. Note that
* in -rt this does -not- necessarily result in all currently executing
* interrupt -handlers- having completed.
*/
void __synchronize_sched(void)
{
cpumask_t oldmask;
int cpu;
if (sched_getaffinity(0, &oldmask) < 0)
oldmask = cpu_possible_map;
for_each_online_cpu(cpu) {
sched_setaffinity(0, cpumask_of_cpu(cpu));
schedule();
}
sched_setaffinity(0, oldmask);
}
EXPORT_SYMBOL_GPL(__synchronize_sched);
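/*
* Illustrative sketch (editor's addition, not part of this commit): an
* updater whose readers rely on preempt_disable()/local_irq_disable()
* rather than rcu_read_lock() can use synchronize_sched(), which is
* expected to map onto __synchronize_sched() in this implementation.
* The function name below is hypothetical.
*/
#if 0 /* example usage only */
static void example_retire(void *old_ptr)
{
/* readers access old_ptr only inside preempt_disable() sections */
synchronize_sched(); /* wait for all such sections to complete */
kfree(old_ptr);
}
#endif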
/*
* Check to see if any future RCU-related work will need to be done
* by the current CPU, even if none need be done immediately, returning
* 1 if so. Assumes that notifiers would take care of handling any
* outstanding requests from the RCU core.
*
* This function is part of the RCU implementation; it is -not-
* an exported member of the RCU API.
*/
int rcu_needs_cpu(int cpu)
{
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
return (rdp->donelist != NULL ||
!!rdp->waitlistcount ||
rdp->nextlist != NULL);
}
int rcu_pending(int cpu)
{
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
/* The CPU has at least one callback queued somewhere. */
if (rdp->donelist != NULL ||
!!rdp->waitlistcount ||
rdp->nextlist != NULL)
return 1;
/* The RCU core needs an acknowledgement from this CPU. */
if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
(per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
return 1;
/* This CPU has fallen behind the global grace-period number. */
if (rdp->completed != rcu_ctrlblk.completed)
return 1;
/* Nothing needed from this CPU. */
return 0;
}
static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
rcu_online_cpu(cpu);
break;
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
rcu_offline_cpu(cpu);
break;
default:
break;
}
return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata rcu_nb = {
.notifier_call = rcu_cpu_notify,
};
void __init __rcu_init(void)
{
int cpu;
int i;
struct rcu_data *rdp;
printk(KERN_NOTICE "Preemptible RCU implementation.\n");
for_each_possible_cpu(cpu) {
rdp = RCU_DATA_CPU(cpu);
spin_lock_init(&rdp->lock);
rdp->completed = 0;
rdp->waitlistcount = 0;
rdp->nextlist = NULL;
rdp->nexttail = &rdp->nextlist;
for (i = 0; i < GP_STAGES; i++) {
rdp->waitlist[i] = NULL;
rdp->waittail[i] = &rdp->waitlist[i];
}
rdp->donelist = NULL;
rdp->donetail = &rdp->donelist;
rdp->rcu_flipctr[0] = 0;
rdp->rcu_flipctr[1] = 0;
}
register_cpu_notifier(&rcu_nb);
/*
* We don't need protection against CPU-Hotplug here
* since
* a) If a CPU comes online while we are iterating over the
* cpu_online_map below, we would only end up making a
* duplicate call to rcu_online_cpu() which sets the corresponding
* CPU's mask in the rcu_cpu_online_map.
*
* b) A CPU cannot go offline at this point in time since the user
* does not have access to the sysfs interface, nor do we
* suspend the system.
*/
for_each_online_cpu(cpu)
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
}
/*
* Deprecated, use synchronize_rcu() or synchronize_sched() instead.
*/
void synchronize_kernel(void)
{
synchronize_rcu();
}
#ifdef CONFIG_RCU_TRACE
long *rcupreempt_flipctr(int cpu)
{
return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
}
EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
int rcupreempt_flip_flag(int cpu)
{
return per_cpu(rcu_flip_flag, cpu);
}
EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
int rcupreempt_mb_flag(int cpu)
{
return per_cpu(rcu_mb_flag, cpu);
}
EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
char *rcupreempt_try_flip_state_name(void)
{
return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
}
EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
{
struct rcu_data *rdp = RCU_DATA_CPU(cpu);
return &rdp->trace;
}
EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
#endif /* #ifdef CONFIG_RCU_TRACE */

kernel/rcupreempt_trace.c (new file, 330 lines)

@ -0,0 +1,330 @@
/*
* Read-Copy Update tracing for realtime implementation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright IBM Corporation, 2006
*
* Papers: http://www.rdrop.com/users/paulmck/RCU
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU/ *.txt
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/rcupreempt_trace.h>
#include <linux/debugfs.h>
static struct mutex rcupreempt_trace_mutex;
static char *rcupreempt_trace_buf;
#define RCUPREEMPT_TRACE_BUF_SIZE 4096
void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
{
trace->done_length += trace->wait_length;
trace->done_add += trace->wait_length;
trace->wait_length = 0;
}
void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
{
trace->wait_length += trace->next_length;
trace->wait_add += trace->next_length;
trace->next_length = 0;
}
void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
{
atomic_inc(&trace->rcu_try_flip_1);
}
void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
{
atomic_inc(&trace->rcu_try_flip_e1);
}
void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_i1++;
}
void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_ie1++;
}
void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_g1++;
}
void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_a1++;
}
void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_ae1++;
}
void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_a2++;
}
void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_z1++;
}
void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_ze1++;
}
void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_z2++;
}
void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_m1++;
}
void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_me1++;
}
void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
{
trace->rcu_try_flip_m2++;
}
void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
{
trace->rcu_check_callbacks++;
}
void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
{
trace->done_remove += trace->done_length;
trace->done_length = 0;
}
void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
{
atomic_inc(&trace->done_invoked);
}
void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
{
trace->next_add++;
trace->next_length++;
}
static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
{
struct rcupreempt_trace *cp;
int cpu;
memset(sp, 0, sizeof(*sp));
for_each_possible_cpu(cpu) {
cp = rcupreempt_trace_cpu(cpu);
sp->next_length += cp->next_length;
sp->next_add += cp->next_add;
sp->wait_length += cp->wait_length;
sp->wait_add += cp->wait_add;
sp->done_length += cp->done_length;
sp->done_add += cp->done_add;
sp->done_remove += cp->done_remove;
atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked));
sp->rcu_check_callbacks += cp->rcu_check_callbacks;
atomic_set(&sp->rcu_try_flip_1,
atomic_read(&cp->rcu_try_flip_1));
atomic_set(&sp->rcu_try_flip_e1,
atomic_read(&cp->rcu_try_flip_e1));
sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
}
}
static ssize_t rcustats_read(struct file *filp, char __user *buffer,
size_t count, loff_t *ppos)
{
struct rcupreempt_trace trace;
ssize_t bcount;
int cnt = 0;
rcupreempt_trace_sum(&trace);
mutex_lock(&rcupreempt_trace_mutex);
snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
"ggp=%ld rcc=%ld\n",
rcu_batches_completed(),
trace.rcu_check_callbacks);
snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
"na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
"1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
"z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
trace.next_add, trace.next_length,
trace.wait_add, trace.wait_length,
trace.done_add, trace.done_length,
trace.done_remove, atomic_read(&trace.done_invoked),
atomic_read(&trace.rcu_try_flip_1),
atomic_read(&trace.rcu_try_flip_e1),
trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
trace.rcu_try_flip_g1,
trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
trace.rcu_try_flip_a2,
trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
trace.rcu_try_flip_z2,
trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
trace.rcu_try_flip_m2);
bcount = simple_read_from_buffer(buffer, count, ppos,
rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
mutex_unlock(&rcupreempt_trace_mutex);
return bcount;
}
static ssize_t rcugp_read(struct file *filp, char __user *buffer,
size_t count, loff_t *ppos)
{
long oldgp = rcu_batches_completed();
ssize_t bcount;
mutex_lock(&rcupreempt_trace_mutex);
synchronize_rcu();
snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
"oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
bcount = simple_read_from_buffer(buffer, count, ppos,
rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
mutex_unlock(&rcupreempt_trace_mutex);
return bcount;
}
static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
size_t count, loff_t *ppos)
{
int cnt = 0;
int cpu;
int f = rcu_batches_completed() & 0x1;
ssize_t bcount;
mutex_lock(&rcupreempt_trace_mutex);
cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
"CPU last cur F M\n");
for_each_online_cpu(cpu) {
long *flipctr = rcupreempt_flipctr(cpu);
cnt += snprintf(&rcupreempt_trace_buf[cnt],
RCUPREEMPT_TRACE_BUF_SIZE - cnt,
"%3d %4ld %3ld %d %d\n",
cpu,
flipctr[!f],
flipctr[f],
rcupreempt_flip_flag(cpu),
rcupreempt_mb_flag(cpu));
}
cnt += snprintf(&rcupreempt_trace_buf[cnt],
RCUPREEMPT_TRACE_BUF_SIZE - cnt,
"ggp = %ld, state = %s\n",
rcu_batches_completed(),
rcupreempt_try_flip_state_name());
cnt += snprintf(&rcupreempt_trace_buf[cnt],
RCUPREEMPT_TRACE_BUF_SIZE - cnt,
"\n");
bcount = simple_read_from_buffer(buffer, count, ppos,
rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
mutex_unlock(&rcupreempt_trace_mutex);
return bcount;
}
static struct file_operations rcustats_fops = {
.owner = THIS_MODULE,
.read = rcustats_read,
};
static struct file_operations rcugp_fops = {
.owner = THIS_MODULE,
.read = rcugp_read,
};
static struct file_operations rcuctrs_fops = {
.owner = THIS_MODULE,
.read = rcuctrs_read,
};
static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
static int rcupreempt_debugfs_init(void)
{
rcudir = debugfs_create_dir("rcu", NULL);
if (!rcudir)
goto out;
statdir = debugfs_create_file("rcustats", 0444, rcudir,
NULL, &rcustats_fops);
if (!statdir)
goto free_out;
gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
if (!gpdir)
goto free_out;
ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
NULL, &rcuctrs_fops);
if (!ctrsdir)
goto free_out;
return 0;
free_out:
if (statdir)
debugfs_remove(statdir);
if (gpdir)
debugfs_remove(gpdir);
debugfs_remove(rcudir);
out:
return 1;
}
static int __init rcupreempt_trace_init(void)
{
mutex_init(&rcupreempt_trace_mutex);
rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
if (!rcupreempt_trace_buf)
return 1;
return rcupreempt_debugfs_init();
}
static void __exit rcupreempt_trace_cleanup(void)
{
debugfs_remove(statdir);
debugfs_remove(gpdir);
debugfs_remove(ctrsdir);
debugfs_remove(rcudir);
kfree(rcupreempt_trace_buf);
}
module_init(rcupreempt_trace_init);
module_exit(rcupreempt_trace_cleanup);
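
The tracing code above is a textbook read-only debugfs interface: take a private mutex, format a snapshot into a preallocated buffer, and let simple_read_from_buffer() handle the copy-out and file offset. A minimal, self-contained sketch of the same pattern follows; the names (example_dir, example_read, example_buf) and the value being reported are hypothetical and not part of this merge.

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/fs.h>

static DEFINE_MUTEX(example_mutex);
static char example_buf[64];
static struct dentry *example_dir, *example_file;

/* Format a snapshot under the mutex, then let the VFS helper copy it out. */
static ssize_t example_read(struct file *filp, char __user *buffer,
			    size_t count, loff_t *ppos)
{
	ssize_t bcount;

	mutex_lock(&example_mutex);
	snprintf(example_buf, sizeof(example_buf), "value=%d\n", 42);
	bcount = simple_read_from_buffer(buffer, count, ppos,
					 example_buf, strlen(example_buf));
	mutex_unlock(&example_mutex);
	return bcount;
}

static struct file_operations example_fops = {
	.owner	= THIS_MODULE,
	.read	= example_read,
};

static int __init example_init(void)
{
	example_dir = debugfs_create_dir("example", NULL);
	if (!example_dir)
		return -ENOMEM;
	example_file = debugfs_create_file("stats", 0444, example_dir,
					   NULL, &example_fops);
	if (!example_file) {
		debugfs_remove(example_dir);
		return -ENOMEM;
	}
	return 0;
}
module_init(example_init);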


@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void)
cpumask_t tmp_mask = CPU_MASK_ALL;
int i;
lock_cpu_hotplug();
get_online_cpus();
/* No point in shuffling if there is only one online CPU (ex: UP) */
if (num_online_cpus() == 1) {
unlock_cpu_hotplug();
put_online_cpus();
return;
}
@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void)
else
rcu_idle_cpu--;
unlock_cpu_hotplug();
put_online_cpus();
}
/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the

File diff suppressed because it is too large.


@ -179,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu)
PN(prev_clock_raw);
P(clock_warps);
P(clock_overflows);
P(clock_underflows);
P(clock_deep_idle_events);
PN(clock_max_delta);
P(cpu_load[0]);
@ -299,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.exec_max);
PN(se.slice_max);
PN(se.wait_max);
PN(se.wait_sum);
P(se.wait_count);
P(sched_info.bkl_count);
P(se.nr_migrations);
P(se.nr_migrations_cold);
@ -366,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p)
{
#ifdef CONFIG_SCHEDSTATS
p->se.wait_max = 0;
p->se.wait_sum = 0;
p->se.wait_count = 0;
p->se.sleep_max = 0;
p->se.sum_sleep_runtime = 0;
p->se.block_max = 0;


@ -20,6 +20,8 @@
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
*/
#include <linux/latencytop.h>
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
@ -248,8 +250,8 @@ static u64 __sched_period(unsigned long nr_running)
unsigned long nr_latency = sched_nr_latency;
if (unlikely(nr_running > nr_latency)) {
period = sysctl_sched_min_granularity;
period *= nr_running;
do_div(period, nr_latency);
}
return period;
@ -383,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
schedstat_set(se->wait_max, max(se->wait_max,
rq_of(cfs_rq)->clock - se->wait_start));
schedstat_set(se->wait_count, se->wait_count + 1);
schedstat_set(se->wait_sum, se->wait_sum +
rq_of(cfs_rq)->clock - se->wait_start);
schedstat_set(se->wait_start, 0);
}
@ -434,6 +439,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_SCHEDSTATS
if (se->sleep_start) {
u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
struct task_struct *tsk = task_of(se);
if ((s64)delta < 0)
delta = 0;
@ -443,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->sleep_start = 0;
se->sum_sleep_runtime += delta;
account_scheduler_latency(tsk, delta >> 10, 1);
}
if (se->block_start) {
u64 delta = rq_of(cfs_rq)->clock - se->block_start;
struct task_struct *tsk = task_of(se);
if ((s64)delta < 0)
delta = 0;
@ -462,11 +471,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
* time that the task spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
struct task_struct *tsk = task_of(se);
profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
delta >> 20);
}
account_scheduler_latency(tsk, delta >> 10, 0);
}
#endif
}
@ -642,13 +651,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
cfs_rq->curr = NULL;
}
static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
* validating it and just reschedule.
*/
if (queued)
return resched_task(rq_of(cfs_rq)->curr);
/*
* don't let the period tick interfere with the hrtick preemption
*/
if (!sched_feat(DOUBLE_TICK) &&
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
check_preempt_tick(cfs_rq, curr);
}
@ -690,7 +715,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline int
@ -707,6 +732,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}
#define GROUP_IMBALANCE_PCT 20
#else /* CONFIG_FAIR_GROUP_SCHED */
#define for_each_sched_entity(se) \
@ -752,6 +779,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SCHED_HRTICK
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
int requeue = rq->curr == p;
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
WARN_ON(task_rq(p) != rq);
if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
if (delta < 0) {
if (rq->curr == p)
resched_task(p);
return;
}
/*
* Don't schedule slices shorter than 10000ns, that just
* doesn't make sense. Rely on vruntime for fairness.
*/
if (!requeue)
delta = max(10000LL, delta);
hrtick_start(rq, delta, requeue);
}
}
#else
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
#endif
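
To see what hrtick_start_fair() above actually programs, take some purely illustrative numbers (not from this merge): suppose sched_slice() yields a 4 ms slice and the entity has run 1.5 ms since prev_sum_exec_runtime was last sampled.

	slice = 4000000 ns, ran = 1500000 ns
	delta = slice - ran = 2500000 ns    -> the hrtick fires in 2.5 ms
	delta < 0                           -> slice used up; if p is current it is rescheduled immediately
	delta < 10000 ns (and !requeue)     -> clamped up to 10000 ns before arming the timer

So the high-resolution timer is armed only for the remainder of the current slice, and never for less than 10 us.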
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@ -760,15 +824,28 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
struct sched_entity *se = &p->se,
*topse = NULL; /* Highest schedulable entity */
int incload = 1;
for_each_sched_entity(se) {
if (se->on_rq)
topse = se;
if (se->on_rq) {
incload = 0;
break;
}
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, wakeup);
wakeup = 1;
}
/* Increment cpu load if we just enqueued the first task of a group on
* 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
* at the highest grouping level.
*/
if (incload)
inc_cpu_load(rq, topse->load.weight);
hrtick_start_fair(rq, rq->curr);
}
/*
@ -779,16 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
struct sched_entity *se = &p->se,
*topse = NULL; /* Highest schedulable entity */
int decload = 1;
for_each_sched_entity(se) {
topse = se;
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, sleep);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
if (cfs_rq->load.weight) {
if (parent_entity(se))
decload = 0;
break;
}
sleep = 1;
}
/* Decrement cpu load if we just dequeued the last task of a group on
* 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
* at the highest grouping level.
*/
if (decload)
dec_cpu_load(rq, topse->load.weight);
hrtick_start_fair(rq, rq->curr);
}
/*
@ -835,6 +926,154 @@ static void yield_task_fair(struct rq *rq)
se->vruntime = rightmost->vruntime + 1;
}
/*
* wake_idle() will wake a task on an idle cpu if task->cpu is
* not idle and an idle cpu is available. The span of cpus to
* search starts with cpus closest then further out as needed,
* so we always favor a closer, idle cpu.
*
* Returns the CPU we should wake onto.
*/
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
cpumask_t tmp;
struct sched_domain *sd;
int i;
/*
* If it is idle, then it is the best cpu to run this task.
*
* This cpu is also the best, if it has more than one task already.
Siblings must also be busy (in most cases) as they didn't already
pick up the extra load from this cpu, and hence we need not check
sibling runqueue info. This will avoid the checks and cache miss
penalties associated with that.
*/
if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
return cpu;
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_IDLE) {
cpus_and(tmp, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
if (idle_cpu(i)) {
if (i != task_cpu(p)) {
schedstat_inc(p,
se.nr_wakeups_idle);
}
return i;
}
}
} else {
break;
}
}
return cpu;
}
#else
static inline int wake_idle(int cpu, struct task_struct *p)
{
return cpu;
}
#endif
#ifdef CONFIG_SMP
static int select_task_rq_fair(struct task_struct *p, int sync)
{
int cpu, this_cpu;
struct rq *rq;
struct sched_domain *sd, *this_sd = NULL;
int new_cpu;
cpu = task_cpu(p);
rq = task_rq(p);
this_cpu = smp_processor_id();
new_cpu = cpu;
if (cpu == this_cpu)
goto out_set_cpu;
for_each_domain(this_cpu, sd) {
if (cpu_isset(cpu, sd->span)) {
this_sd = sd;
break;
}
}
if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
goto out_set_cpu;
/*
* Check for affine wakeup and passive balancing possibilities.
*/
if (this_sd) {
int idx = this_sd->wake_idx;
unsigned int imbalance;
unsigned long load, this_load;
imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
load = source_load(cpu, idx);
this_load = target_load(this_cpu, idx);
new_cpu = this_cpu; /* Wake to this CPU if we can */
if (this_sd->flags & SD_WAKE_AFFINE) {
unsigned long tl = this_load;
unsigned long tl_per_task;
/*
* Attract cache-cold tasks on sync wakeups:
*/
if (sync && !task_hot(p, rq->clock, this_sd))
goto out_set_cpu;
schedstat_inc(p, se.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* of the current CPU:
*/
if (sync)
tl -= current->se.load.weight;
if ((tl <= load &&
tl + target_load(cpu, idx) <= tl_per_task) ||
100*(tl + p->se.load.weight) <= imbalance*load) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
* there is no bad imbalance.
*/
schedstat_inc(this_sd, ttwu_move_affine);
schedstat_inc(p, se.nr_wakeups_affine);
goto out_set_cpu;
}
}
/*
* Start passive balancing when half the imbalance_pct
* limit is reached.
*/
if (this_sd->flags & SD_WAKE_BALANCE) {
if (imbalance*this_load <= 100*load) {
schedstat_inc(this_sd, ttwu_move_balance);
schedstat_inc(p, se.nr_wakeups_passive);
goto out_set_cpu;
}
}
}
new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
return wake_idle(new_cpu, p);
}
#endif /* CONFIG_SMP */
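
A hedged numeric walk-through of the wake-affine test in select_task_rq_fair() above, using invented values: with imbalance_pct = 125 the code computes imbalance = 100 + (125 - 100) / 2 = 112. Say the waking task has load weight 1024 (a nice-0 task), the previous CPU's load is load = 2048, and this CPU's load is tl = 1024 (no sync adjustment).

	100 * (tl + p->se.load.weight) = 100 * 2048 = 204800
	imbalance * load               = 112 * 2048 = 229376

Since 204800 <= 229376, the second clause of the test holds: ttwu_move_affine is counted and new_cpu stays this_cpu (wake_idle() may still divert the task to a nearby idle CPU afterwards). Had neither clause held, new_cpu would fall back to the task's previous CPU.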
/*
* Preempt the current task with a newly woken task if needed:
*/
@ -876,6 +1115,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
@ -887,7 +1127,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
return task_of(se);
p = task_of(se);
hrtick_start_fair(rq, p);
return p;
}
/*
@ -944,25 +1187,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr;
struct task_struct *p;
if (!cfs_rq->nr_running)
return MAX_PRIO;
curr = cfs_rq->curr;
if (!curr)
curr = __pick_next_entity(cfs_rq);
p = task_of(curr);
return p->prio;
}
#endif
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
@ -972,28 +1196,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
struct cfs_rq *busy_cfs_rq;
long rem_load_move = max_load_move;
struct rq_iterator cfs_rq_iterator;
unsigned long load_moved;
cfs_rq_iterator.start = load_balance_start_fair;
cfs_rq_iterator.next = load_balance_next_fair;
for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
#ifdef CONFIG_FAIR_GROUP_SCHED
struct cfs_rq *this_cfs_rq;
long imbalance;
unsigned long maxload;
struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
unsigned long maxload, task_load, group_weight;
unsigned long thisload, per_task_load;
struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
task_load = busy_cfs_rq->load.weight;
group_weight = se->load.weight;
imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
if (imbalance <= 0)
/*
* 'group_weight' is contributed by tasks of total weight
* 'task_load'. To move 'rem_load_move' worth of weight only,
* we need to move a maximum task load of:
*
* maxload = (remload / group_weight) * task_load;
*/
maxload = (rem_load_move * task_load) / group_weight;
if (!maxload || !task_load)
continue;
/* Don't pull more than imbalance/2 */
imbalance /= 2;
maxload = min(rem_load_move, imbalance);
per_task_load = task_load / busy_cfs_rq->nr_running;
/*
balance_tasks will try to forcibly move at least one task if
possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
maxload is less than GROUP_IMBALANCE_PCT% of the per_task_load.
*/
if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
continue;
*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
/* Disable priority-based load balance */
*this_best_prio = 0;
thisload = this_cfs_rq->load.weight;
#else
# define maxload rem_load_move
#endif
@ -1002,11 +1243,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
* load_balance_[start|next]_fair iterators
*/
cfs_rq_iterator.arg = busy_cfs_rq;
rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
load_moved = balance_tasks(this_rq, this_cpu, busiest,
maxload, sd, idle, all_pinned,
this_best_prio,
&cfs_rq_iterator);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* load_moved holds the task load that was moved. The
* effective (group) weight moved would be:
* load_moved_eff = load_moved/task_load * group_weight;
*/
load_moved = (group_weight * load_moved) / task_load;
/* Adjust shares on both cpus to reflect load_moved */
group_weight -= load_moved;
set_se_shares(se, group_weight);
se = busy_cfs_rq->tg->se[this_cpu];
if (!thisload)
group_weight = load_moved;
else
group_weight = se->load.weight + load_moved;
set_se_shares(se, group_weight);
#endif
rem_load_move -= load_moved;
if (rem_load_move <= 0)
break;
}
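
Working the group-scheduling arithmetic above with invented numbers: suppose the busiest CPU's cfs_rq for a group holds four nice-0 tasks, so task_load = 4096, the group's sched_entity weight there is group_weight = 2048, and rem_load_move = 1024.

	maxload        = (1024 * 4096) / 2048 = 2048     (task weight we may pull)
	per_task_load  = 4096 / 4 = 1024; 100 * 2048 >= 20 * 1024, so the group is not skipped
	if balance_tasks() moves load_moved = 2048 of task weight (two tasks):
	load_moved_eff = (2048 * 2048) / 4096 = 1024     (group weight that actually moved)

The busiest CPU's group entity is then reshared down by 1024 and this CPU's group entity up by 1024, so the group's total weight stays constant, while rem_load_move is decremented by the effective group weight moved (1024) rather than the raw task weight.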
@ -1042,14 +1305,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
/*
* scheduler tick hitting a task of our scheduling class:
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr)
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se);
entity_tick(cfs_rq, se, queued);
}
}
@ -1087,6 +1350,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
resched_task(rq->curr);
}
/*
* Priority of the task has changed. Check to see if we preempt
* the current task.
*/
static void prio_changed_fair(struct rq *rq, struct task_struct *p,
int oldprio, int running)
{
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
if (running) {
if (p->prio > oldprio)
resched_task(rq->curr);
} else
check_preempt_curr(rq, p);
}
/*
* We switched to the sched_fair class.
*/
static void switched_to_fair(struct rq *rq, struct task_struct *p,
int running)
{
/*
* We were most likely switched from sched_rt, so
* kick off the schedule if running, otherwise just see
* if we can still preempt the current task.
*/
if (running)
resched_task(rq->curr);
else
check_preempt_curr(rq, p);
}
/* Account for a task changing its policy or group.
*
* This routine is mostly called to set cfs_rq->curr field when a task
@ -1108,6 +1407,9 @@ static const struct sched_class fair_sched_class = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
#endif /* CONFIG_SMP */
.check_preempt_curr = check_preempt_wakeup,
@ -1122,6 +1424,9 @@ static const struct sched_class fair_sched_class = {
.set_curr_task = set_curr_task_fair,
.task_tick = task_tick_fair,
.task_new = task_new_fair,
.prio_changed = prio_changed_fair,
.switched_to = switched_to_fair,
};
#ifdef CONFIG_SCHED_DEBUG
@ -1132,7 +1437,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
#ifdef CONFIG_FAIR_GROUP_SCHED
print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
#endif
rcu_read_lock();
for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}
#endif
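
The debug dump above now walks the leaf cfs_rq list inside rcu_read_lock()/rcu_read_unlock(), matching the change of for_each_leaf_cfs_rq() to list_for_each_entry_rcu(). The generic reader-side pattern, sketched here with a hypothetical struct item that is not part of this merge:

#include <linux/list.h>
#include <linux/rcupdate.h>

struct item {
	struct list_head list;
	int val;
};

static LIST_HEAD(item_list);

/* Reader: traversal is safe against concurrent list_add_rcu()/list_del_rcu(). */
static int example_sum(void)
{
	struct item *it;
	int sum = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(it, &item_list, list)
		sum += it->val;
	rcu_read_unlock();
	return sum;
}

Writers still need their own mutual exclusion (a spinlock or mutex) when modifying the list; RCU only makes the read-side traversal above safe.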


@ -5,6 +5,12 @@
* handled in sched_fair.c)
*/
#ifdef CONFIG_SMP
static int select_task_rq_idle(struct task_struct *p, int sync)
{
return task_cpu(p); /* IDLE tasks are never migrated */
}
#endif /* CONFIG_SMP */
/*
* Idle tasks are unconditionally rescheduled:
*/
@ -55,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
}
#endif
static void task_tick_idle(struct rq *rq, struct task_struct *curr)
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
@ -63,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq)
{
}
static void switched_to_idle(struct rq *rq, struct task_struct *p,
int running)
{
/* Can this actually happen?? */
if (running)
resched_task(rq->curr);
else
check_preempt_curr(rq, p);
}
static void prio_changed_idle(struct rq *rq, struct task_struct *p,
int oldprio, int running)
{
/* This can happen for hot plug CPUS */
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
if (running) {
if (p->prio > oldprio)
resched_task(rq->curr);
} else
check_preempt_curr(rq, p);
}
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
@ -72,6 +105,9 @@ const struct sched_class idle_sched_class = {
/* dequeue is not valid, we print a debug message there: */
.dequeue_task = dequeue_task_idle,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
#endif /* CONFIG_SMP */
.check_preempt_curr = check_preempt_curr_idle,
@ -85,5 +121,9 @@ const struct sched_class idle_sched_class = {
.set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
/* no .task_new for idle tasks */
};

File diff suppressed because it is too large.


@ -8,6 +8,7 @@
*/
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
@ -23,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp);
static DEFINE_PER_CPU(unsigned long, print_timestamp);
static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
static int did_panic;
int softlockup_thresh = 10;
static int __read_mostly did_panic;
unsigned long __read_mostly softlockup_thresh = 60;
static int
softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@ -45,7 +46,7 @@ static struct notifier_block panic_block = {
*/
static unsigned long get_timestamp(int this_cpu)
{
return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */
return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
}
void touch_softlockup_watchdog(void)
@ -100,11 +101,7 @@ void softlockup_tick(void)
now = get_timestamp(this_cpu);
/* Wake up the high-prio watchdog task every second: */
if (now > (touch_timestamp + 1))
wake_up_process(per_cpu(watchdog_task, this_cpu));
/* Warn about unreasonable 10+ seconds delays: */
/* Warn about unreasonable delays: */
if (now <= (touch_timestamp + softlockup_thresh))
return;
@ -121,12 +118,94 @@ void softlockup_tick(void)
spin_unlock(&print_lock);
}
/*
* Have a reasonable limit on the number of tasks checked:
*/
unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
/*
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
unsigned long __read_mostly sysctl_hung_task_warnings = 10;
/*
* Only do the hung-tasks check on one CPU:
*/
static int check_cpu __read_mostly = -1;
static void check_hung_task(struct task_struct *t, unsigned long now)
{
unsigned long switch_count = t->nvcsw + t->nivcsw;
if (t->flags & PF_FROZEN)
return;
if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
t->last_switch_count = switch_count;
t->last_switch_timestamp = now;
return;
}
if ((long)(now - t->last_switch_timestamp) <
sysctl_hung_task_timeout_secs)
return;
if (sysctl_hung_task_warnings < 0)
return;
sysctl_hung_task_warnings--;
/*
* Ok, the task did not get scheduled for more than 2 minutes,
* complain:
*/
printk(KERN_ERR "INFO: task %s:%d blocked for more than "
"%ld seconds.\n", t->comm, t->pid,
sysctl_hung_task_timeout_secs);
printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
__debug_show_held_locks(t);
t->last_switch_timestamp = now;
touch_nmi_watchdog();
}
/*
Check whether a TASK_UNINTERRUPTIBLE task does not get woken up for
* a really long time (120 seconds). If that happens, print out
* a warning.
*/
static void check_hung_uninterruptible_tasks(int this_cpu)
{
int max_count = sysctl_hung_task_check_count;
unsigned long now = get_timestamp(this_cpu);
struct task_struct *g, *t;
/*
* If the system crashed already then all bets are off,
* do not report extra hung tasks:
*/
if ((tainted & TAINT_DIE) || did_panic)
return;
read_lock(&tasklist_lock);
do_each_thread(g, t) {
if (!--max_count)
break;
if (t->state & TASK_UNINTERRUPTIBLE)
check_hung_task(t, now);
} while_each_thread(g, t);
read_unlock(&tasklist_lock);
}
/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
int this_cpu = (long)__bind_cpu;
sched_setscheduler(current, SCHED_FIFO, &param);
@ -135,13 +214,18 @@ static int watchdog(void *__bind_cpu)
/*
* Run briefly once per second to reset the softlockup timestamp.
* If this gets delayed for more than 10 seconds then the
* If this gets delayed for more than 60 seconds then the
* debug-printout triggers in softlockup_tick().
*/
while (!kthread_should_stop()) {
set_current_state(TASK_INTERRUPTIBLE);
touch_softlockup_watchdog();
schedule();
msleep_interruptible(10000);
if (this_cpu != check_cpu)
continue;
if (sysctl_hung_task_timeout_secs)
check_hung_uninterruptible_tasks(this_cpu);
}
return 0;
@ -171,6 +255,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
check_cpu = any_online_cpu(cpu_online_map);
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
@ -181,6 +266,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(watchdog_task, hotcpu),
any_online_cpu(cpu_online_map));
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
if (hotcpu == check_cpu) {
cpumask_t temp_cpu_online_map = cpu_online_map;
cpu_clear(hotcpu, temp_cpu_online_map);
check_cpu = any_online_cpu(temp_cpu_online_map);
}
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
p = per_cpu(watchdog_task, hotcpu);


@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
int ret;
/* No CPUs can come up or down during this. */
lock_cpu_hotplug();
get_online_cpus();
p = __stop_machine_run(fn, data, cpu);
if (!IS_ERR(p))
ret = kthread_stop(p);
else
ret = PTR_ERR(p);
unlock_cpu_hotplug();
put_online_cpus();
return ret;
}
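
The swap from lock_cpu_hotplug()/unlock_cpu_hotplug() to get_online_cpus()/put_online_cpus() seen here repeats throughout this merge (rcutorture, slab, workqueue, the flow cache). The new calls are a refcounted read-side guard against CPU hotplug; a minimal sketch of the calling pattern, using a hypothetical helper name:

#include <linux/cpu.h>

/* Hypothetical helper: count online CPUs while holding off hotplug. */
static int example_count_online(void)
{
	int cpu, n = 0;

	get_online_cpus();	/* cpu_up()/cpu_down() are held off until we release */
	for_each_online_cpu(cpu)
		n++;
	put_online_cpus();
	return n;
}

Because the guard is a refcount rather than a mutex, read-side sections like this may nest.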


@ -81,6 +81,7 @@ extern int compat_log;
extern int maps_protect;
extern int sysctl_stat_interval;
extern int audit_argv_kb;
extern int latencytop_enabled;
/* Constants used for minimum and maximum */
#ifdef CONFIG_DETECT_SOFTLOCKUP
@ -306,9 +307,43 @@ static struct ctl_table kern_table[] = {
.procname = "sched_nr_migrate",
.data = &sysctl_sched_nr_migrate,
.maxlen = sizeof(unsigned int),
.mode = 644,
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_rt_period_ms",
.data = &sysctl_sched_rt_period,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_rt_ratio",
.data = &sysctl_sched_rt_ratio,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_min_bal_int_shares",
.data = &sysctl_sched_min_bal_int_shares,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_max_bal_int_shares",
.data = &sysctl_sched_max_bal_int_shares,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif
#endif
{
.ctl_name = CTL_UNNUMBERED,
@ -382,6 +417,15 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dointvec_taint,
},
#endif
#ifdef CONFIG_LATENCYTOP
{
.procname = "latencytop",
.data = &latencytop_enabled,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif
#ifdef CONFIG_SECURITY_CAPABILITIES
{
.procname = "cap-bound",
@ -728,13 +772,40 @@ static struct ctl_table kern_table[] = {
.ctl_name = CTL_UNNUMBERED,
.procname = "softlockup_thresh",
.data = &softlockup_thresh,
.maxlen = sizeof(int),
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.proc_handler = &proc_doulongvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &one,
.extra2 = &sixty,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_check_count",
.data = &sysctl_hung_task_check_count,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &proc_doulongvec_minmax,
.strategy = &sysctl_intvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_timeout_secs",
.data = &sysctl_hung_task_timeout_secs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &proc_doulongvec_minmax,
.strategy = &sysctl_intvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_warnings",
.data = &sysctl_hung_task_warnings,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &proc_doulongvec_minmax,
.strategy = &sysctl_intvec,
},
#endif
#ifdef CONFIG_COMPAT
{

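The sysctl entries added above all follow one template: CTL_UNNUMBERED, a /proc/sys/kernel name, a pointer to the variable, its size, a 0644 mode, and proc_dointvec() (or proc_doulongvec_minmax() for unsigned long values). A hedged sketch of one such entry for a hypothetical example_knob, registered with register_sysctl_table():

#include <linux/sysctl.h>

static int example_knob;			/* hypothetical tunable */

static struct ctl_table example_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "example_knob",
		.data		= &example_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }			/* terminator */
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
	example_header = register_sysctl_table(example_table);
	return example_header ? 0 : -ENOMEM;
}

Registered this way the knob appears at the top level of /proc/sys; placing it under kernel/ requires an enclosing directory entry, as kern_table does above.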

@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void)
void tick_nohz_stop_sched_tick(void)
{
unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
unsigned long rt_jiffies;
struct tick_sched *ts;
ktime_t last_update, expires, now, delta;
struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void)
next_jiffies = get_next_timer_interrupt(last_jiffies);
delta_jiffies = next_jiffies - last_jiffies;
rt_jiffies = rt_needs_cpu(cpu);
if (rt_jiffies && rt_jiffies < delta_jiffies)
delta_jiffies = rt_jiffies;
if (rcu_needs_cpu(cpu))
delta_jiffies = 1;
/*
@ -509,7 +514,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
{
struct tick_sched *ts =
container_of(timer, struct tick_sched, sched_timer);
struct hrtimer_cpu_base *base = timer->base->cpu_base;
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
int cpu = smp_processor_id();
@ -547,15 +551,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
touch_softlockup_watchdog();
ts->idle_jiffies++;
}
/*
* update_process_times() might take tasklist_lock, hence
* drop the base lock. sched-tick hrtimers are per-CPU and
* never accessible by userspace APIs, so this is safe to do.
*/
spin_unlock(&base->lock);
update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING);
spin_lock(&base->lock);
}
/* Do not restart, when we are in the idle loop */


@ -896,7 +896,7 @@ static void run_timer_softirq(struct softirq_action *h)
{
tvec_base_t *base = __get_cpu_var(tvec_bases);
hrtimer_run_queues();
hrtimer_run_pending();
if (time_after_eq(jiffies, base->timer_jiffies))
__run_timers(base);
@ -907,6 +907,7 @@ static void run_timer_softirq(struct softirq_action *h)
*/
void run_local_timers(void)
{
hrtimer_run_queues();
raise_softirq(TIMER_SOFTIRQ);
softlockup_tick();
}


@ -319,7 +319,7 @@ void free_uid(struct user_struct *up)
struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
{
struct hlist_head *hashent = uidhashentry(ns, uid);
struct user_struct *up;
struct user_struct *up, *new;
/* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
* atomic.
@ -331,13 +331,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
spin_unlock_irq(&uidhash_lock);
if (!up) {
struct user_struct *new;
new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
if (!new) {
uids_mutex_unlock();
return NULL;
}
if (!new)
goto out_unlock;
new->uid = uid;
atomic_set(&new->__count, 1);
@ -353,28 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
#endif
new->locked_shm = 0;
if (alloc_uid_keyring(new, current) < 0) {
kmem_cache_free(uid_cachep, new);
uids_mutex_unlock();
return NULL;
}
if (alloc_uid_keyring(new, current) < 0)
goto out_free_user;
if (sched_create_user(new) < 0) {
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
uids_mutex_unlock();
return NULL;
}
if (sched_create_user(new) < 0)
goto out_put_keys;
if (uids_user_create(new)) {
sched_destroy_user(new);
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
uids_mutex_unlock();
return NULL;
}
if (uids_user_create(new))
goto out_destroy_sched;
/*
* Before adding this, check whether we raced
@ -402,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
uids_mutex_unlock();
return up;
out_destroy_sched:
sched_destroy_user(new);
out_put_keys:
key_put(new->uid_keyring);
key_put(new->session_keyring);
out_free_user:
kmem_cache_free(uid_cachep, new);
out_unlock:
uids_mutex_unlock();
return NULL;
}
void switch_uid(struct user_struct *new_user)


@ -67,9 +67,8 @@ struct workqueue_struct {
#endif
};
/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
threads to each one as cpus come/go. */
static DEFINE_MUTEX(workqueue_mutex);
/* Serializes the accesses to the list of workqueues. */
static DEFINE_SPINLOCK(workqueue_lock);
static LIST_HEAD(workqueues);
static int singlethread_cpu __read_mostly;
@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
* Returns zero on success.
* Returns -ve errno on failure.
*
* Appears to be racy against CPU hotplug.
*
* schedule_on_each_cpu() is very slow.
*/
int schedule_on_each_cpu(work_func_t func)
@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func)
if (!works)
return -ENOMEM;
preempt_disable(); /* CPU hotplug */
get_online_cpus();
for_each_online_cpu(cpu) {
struct work_struct *work = per_cpu_ptr(works, cpu);
@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func)
set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
}
preempt_enable();
flush_workqueue(keventd_wq);
put_online_cpus();
free_percpu(works);
return 0;
}
@ -750,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
err = create_workqueue_thread(cwq, singlethread_cpu);
start_workqueue_thread(cwq, -1);
} else {
mutex_lock(&workqueue_mutex);
get_online_cpus();
spin_lock(&workqueue_lock);
list_add(&wq->list, &workqueues);
spin_unlock(&workqueue_lock);
for_each_possible_cpu(cpu) {
cwq = init_cpu_workqueue(wq, cpu);
@ -760,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
err = create_workqueue_thread(cwq, cpu);
start_workqueue_thread(cwq, cpu);
}
mutex_unlock(&workqueue_mutex);
put_online_cpus();
}
if (err) {
@ -775,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
{
/*
* Our caller is either destroy_workqueue() or CPU_DEAD,
* workqueue_mutex protects cwq->thread
* get_online_cpus() protects cwq->thread.
*/
if (cwq->thread == NULL)
return;
@ -810,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
struct cpu_workqueue_struct *cwq;
int cpu;
mutex_lock(&workqueue_mutex);
get_online_cpus();
spin_lock(&workqueue_lock);
list_del(&wq->list);
mutex_unlock(&workqueue_mutex);
spin_unlock(&workqueue_lock);
put_online_cpus();
for_each_cpu_mask(cpu, *cpu_map) {
cwq = per_cpu_ptr(wq->cpu_wq, cpu);
@ -835,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
action &= ~CPU_TASKS_FROZEN;
switch (action) {
case CPU_LOCK_ACQUIRE:
mutex_lock(&workqueue_mutex);
return NOTIFY_OK;
case CPU_LOCK_RELEASE:
mutex_unlock(&workqueue_mutex);
return NOTIFY_OK;
case CPU_UP_PREPARE:
cpu_set(cpu, cpu_populated_map);
@ -854,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
case CPU_UP_PREPARE:
if (!create_workqueue_thread(cwq, cpu))
break;
printk(KERN_ERR "workqueue for %i failed\n", cpu);
printk(KERN_ERR "workqueue [%s] for %i failed\n",
wq->name, cpu);
return NOTIFY_BAD;
case CPU_ONLINE:


@ -517,4 +517,18 @@ config FAULT_INJECTION_STACKTRACE_FILTER
help
Provide stacktrace filter for fault-injection capabilities
config LATENCYTOP
bool "Latency measuring infrastructure"
select FRAME_POINTER if !MIPS
select KALLSYMS
select KALLSYMS_ALL
select STACKTRACE
select SCHEDSTATS
select SCHED_DEBUG
depends on X86 || X86_64
help
Enable this option if you want to use the LatencyTOP tool
to find out which userspace is blocking on what kernel operations.
source "samples/Kconfig"


@ -9,7 +9,6 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#ifdef CONFIG_PREEMPT_BKL
/*
* The 'big kernel semaphore'
*
@ -86,128 +85,6 @@ void __lockfunc unlock_kernel(void)
up(&kernel_sem);
}
#else
/*
* The 'big kernel lock'
*
* This spinlock is taken and released recursively by lock_kernel()
* and unlock_kernel(). It is transparently dropped and reacquired
* over schedule(). It is used to protect legacy code that hasn't
* been migrated to a proper locking design yet.
*
* Don't use in new code.
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag);
/*
* Acquire/release the underlying lock from the scheduler.
*
* This is called with preemption disabled, and should
* return an error value if it cannot get the lock and
* TIF_NEED_RESCHED gets set.
*
* If it successfully gets the lock, it should increment
* the preemption count like any spinlock does.
*
* (This works on UP too - _raw_spin_trylock will never
* return false in that case)
*/
int __lockfunc __reacquire_kernel_lock(void)
{
while (!_raw_spin_trylock(&kernel_flag)) {
if (test_thread_flag(TIF_NEED_RESCHED))
return -EAGAIN;
cpu_relax();
}
preempt_disable();
return 0;
}
void __lockfunc __release_kernel_lock(void)
{
_raw_spin_unlock(&kernel_flag);
preempt_enable_no_resched();
}
/*
* These are the BKL spinlocks - we try to be polite about preemption.
* If SMP is not on (ie UP preemption), this all goes away because the
* _raw_spin_trylock() will always succeed.
*/
#ifdef CONFIG_PREEMPT
static inline void __lock_kernel(void)
{
preempt_disable();
if (unlikely(!_raw_spin_trylock(&kernel_flag))) {
/*
* If preemption was disabled even before this
* was called, there's nothing we can be polite
* about - just spin.
*/
if (preempt_count() > 1) {
_raw_spin_lock(&kernel_flag);
return;
}
/*
* Otherwise, let's wait for the kernel lock
* with preemption enabled..
*/
do {
preempt_enable();
while (spin_is_locked(&kernel_flag))
cpu_relax();
preempt_disable();
} while (!_raw_spin_trylock(&kernel_flag));
}
}
#else
/*
* Non-preemption case - just get the spinlock
*/
static inline void __lock_kernel(void)
{
_raw_spin_lock(&kernel_flag);
}
#endif
static inline void __unlock_kernel(void)
{
/*
* the BKL is not covered by lockdep, so we open-code the
* unlocking sequence (and thus avoid the dep-chain ops):
*/
_raw_spin_unlock(&kernel_flag);
preempt_enable();
}
/*
* Getting the big kernel lock.
*
* This cannot happen asynchronously, so we only need to
* worry about other CPU's.
*/
void __lockfunc lock_kernel(void)
{
int depth = current->lock_depth+1;
if (likely(!depth))
__lock_kernel();
current->lock_depth = depth;
}
void __lockfunc unlock_kernel(void)
{
BUG_ON(current->lock_depth < 0);
if (likely(--current->lock_depth < 0))
__unlock_kernel();
}
#endif
EXPORT_SYMBOL(lock_kernel);
EXPORT_SYMBOL(unlock_kernel);


@ -286,7 +286,7 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
* all the memory it needs. That way it should be able to
* exit() and clear out its resources quickly...
*/
p->time_slice = HZ;
p->rt.time_slice = HZ;
set_tsk_thread_flag(p, TIF_MEMDIE);
force_sig(SIGKILL, p);


@ -730,8 +730,7 @@ static inline void init_lock_keys(void)
#endif
/*
* 1. Guard access to the cache-chain.
* 2. Protect sanity of cpu_online_map against cpu hotplug events
* Guard access to the cache-chain.
*/
static DEFINE_MUTEX(cache_chain_mutex);
static struct list_head cache_chain;
@ -1331,12 +1330,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
int err = 0;
switch (action) {
case CPU_LOCK_ACQUIRE:
mutex_lock(&cache_chain_mutex);
break;
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
mutex_lock(&cache_chain_mutex);
err = cpuup_prepare(cpu);
mutex_unlock(&cache_chain_mutex);
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
@ -1373,9 +1371,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
#endif
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
mutex_lock(&cache_chain_mutex);
cpuup_canceled(cpu);
break;
case CPU_LOCK_RELEASE:
mutex_unlock(&cache_chain_mutex);
break;
}
@ -2170,6 +2167,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
* We use cache_chain_mutex to ensure a consistent view of
* cpu_online_map as well. Please see cpuup_callback
*/
get_online_cpus();
mutex_lock(&cache_chain_mutex);
list_for_each_entry(pc, &cache_chain, next) {
@ -2396,6 +2394,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
mutex_unlock(&cache_chain_mutex);
put_online_cpus();
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@ -2547,9 +2546,11 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
int ret;
BUG_ON(!cachep || in_interrupt());
get_online_cpus();
mutex_lock(&cache_chain_mutex);
ret = __cache_shrink(cachep);
mutex_unlock(&cache_chain_mutex);
put_online_cpus();
return ret;
}
EXPORT_SYMBOL(kmem_cache_shrink);
@ -2575,6 +2576,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
BUG_ON(!cachep || in_interrupt());
/* Find the cache in the chain of caches. */
get_online_cpus();
mutex_lock(&cache_chain_mutex);
/*
* the chain is never empty, cache_cache is never destroyed
@ -2584,6 +2586,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
slab_error(cachep, "Can't free all objects");
list_add(&cachep->next, &cache_chain);
mutex_unlock(&cache_chain_mutex);
put_online_cpus();
return;
}
@ -2592,6 +2595,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
__kmem_cache_destroy(cachep);
mutex_unlock(&cache_chain_mutex);
put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);


@ -293,7 +293,7 @@ void flow_cache_flush(void)
static DEFINE_MUTEX(flow_flush_sem);
/* Don't want cpus going down or up during this. */
lock_cpu_hotplug();
get_online_cpus();
mutex_lock(&flow_flush_sem);
atomic_set(&info.cpuleft, num_online_cpus());
init_completion(&info.completion);
@ -305,7 +305,7 @@ void flow_cache_flush(void)
wait_for_completion(&info.completion);
mutex_unlock(&flow_flush_sem);
unlock_cpu_hotplug();
put_online_cpus();
}
static void __devinit flow_cache_cpu_prepare(int cpu)