Commit graph

8375 commits

Author SHA1 Message Date
Mike Frysinger
2a9ad18deb lockdep: use new arch_is_kernel_data()
This allows lockdep to locate symbols that are in arch-specific data
sections (such as data in Blackfin on-chip SRAM regions).

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Robin Getz <rgetz@blackfin.uclinux.org>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-23 07:39:30 -07:00
Mike Frysinger
128e8db38e kallsyms: use new arch_is_kernel_text()
This allows kallsyms to locate symbols that are in arch-specific text
sections (such as text in Blackfin on-chip SRAM regions).

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Robin Getz <rgetz@blackfin.uclinux.org>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-23 07:39:30 -07:00
Jiri Pirko
1f10206cf8 getrusage: fill ru_maxrss value
Make ->ru_maxrss value in struct rusage filled accordingly to rss hiwater
mark.  This struct is filled as a parameter to getrusage syscall.
->ru_maxrss value is set to KBs which is the way it is done in BSD
systems.  /usr/bin/time (gnu time) application converts ->ru_maxrss to KBs
which seems to be incorrect behavior.  Maintainer of this util was
notified by me with the patch which corrects it and cc'ed.

To make this happen we extend struct signal_struct by two fields.  The
first one is ->maxrss which we use to store rss hiwater of the task.  The
second one is ->cmaxrss which we use to store highest rss hiwater of all
task childs.  These values are used in k_getrusage() to actually fill
->ru_maxrss.  k_getrusage() uses current rss hiwater value directly if mm
struct exists.

Note:
exec() clear mm->hiwater_rss, but doesn't clear sig->maxrss.
it is intetionally behavior. *BSD getrusage have exec() inheriting.

test programs
========================================================

getrusage.c
===========
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>
 #include <signal.h>
 #include <sys/mman.h>

 #include "common.h"

 #define err(str) perror(str), exit(1)

int main(int argc, char** argv)
{
	int status;

	printf("allocate 100MB\n");
	consume(100);

	printf("testcase1: fork inherit? \n");
	printf("  expect: initial.self ~= child.self\n");
	show_rusage("initial");
	if (__fork()) {
		wait(&status);
	} else {
		show_rusage("fork child");
		_exit(0);
	}
	printf("\n");

	printf("testcase2: fork inherit? (cont.) \n");
	printf("  expect: initial.children ~= 100MB, but child.children = 0\n");
	show_rusage("initial");
	if (__fork()) {
		wait(&status);
	} else {
		show_rusage("child");
		_exit(0);
	}
	printf("\n");

	printf("testcase3: fork + malloc \n");
	printf("  expect: child.self ~= initial.self + 50MB\n");
	show_rusage("initial");
	if (__fork()) {
		wait(&status);
	} else {
		printf("allocate +50MB\n");
		consume(50);
		show_rusage("fork child");
		_exit(0);
	}
	printf("\n");

	printf("testcase4: grandchild maxrss\n");
	printf("  expect: post_wait.children ~= 300MB\n");
	show_rusage("initial");
	if (__fork()) {
		wait(&status);
		show_rusage("post_wait");
	} else {
		system("./child -n 0 -g 300");
		_exit(0);
	}
	printf("\n");

	printf("testcase5: zombie\n");
	printf("  expect: pre_wait ~= initial, IOW the zombie process is not accounted.\n");
	printf("          post_wait ~= 400MB, IOW wait() collect child's max_rss. \n");
	show_rusage("initial");
	if (__fork()) {
		sleep(1); /* children become zombie */
		show_rusage("pre_wait");
		wait(&status);
		show_rusage("post_wait");
	} else {
		system("./child -n 400");
		_exit(0);
	}
	printf("\n");

	printf("testcase6: SIG_IGN\n");
	printf("  expect: initial ~= after_zombie (child's 500MB alloc should be ignored).\n");
	show_rusage("initial");
	signal(SIGCHLD, SIG_IGN);
	if (__fork()) {
		sleep(1); /* children become zombie */
		show_rusage("after_zombie");
	} else {
		system("./child -n 500");
		_exit(0);
	}
	printf("\n");
	signal(SIGCHLD, SIG_DFL);

	printf("testcase7: exec (without fork) \n");
	printf("  expect: initial ~= exec \n");
	show_rusage("initial");
	execl("./child", "child", "-v", NULL);

	return 0;
}

child.c
=======
 #include <sys/types.h>
 #include <unistd.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/resource.h>

 #include "common.h"

int main(int argc, char** argv)
{
	int status;
	int c;
	long consume_size = 0;
	long grandchild_consume_size = 0;
	int show = 0;

	while ((c = getopt(argc, argv, "n:g:v")) != -1) {
		switch (c) {
		case 'n':
			consume_size = atol(optarg);
			break;
		case 'v':
			show = 1;
			break;
		case 'g':

			grandchild_consume_size = atol(optarg);
			break;
		default:
			break;
		}
	}

	if (show)
		show_rusage("exec");

	if (consume_size) {
		printf("child alloc %ldMB\n", consume_size);
		consume(consume_size);
	}

	if (grandchild_consume_size) {
		if (fork()) {
			wait(&status);
		} else {
			printf("grandchild alloc %ldMB\n", grandchild_consume_size);
			consume(grandchild_consume_size);

			exit(0);
		}
	}

	return 0;
}

common.c
========
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>
 #include <signal.h>
 #include <sys/mman.h>

 #include "common.h"
 #define err(str) perror(str), exit(1)

void show_rusage(char *prefix)
{
    	int err, err2;
    	struct rusage rusage_self;
    	struct rusage rusage_children;

    	printf("%s: ", prefix);
    	err = getrusage(RUSAGE_SELF, &rusage_self);
    	if (!err)
    		printf("self %ld ", rusage_self.ru_maxrss);
    	err2 = getrusage(RUSAGE_CHILDREN, &rusage_children);
    	if (!err2)
    		printf("children %ld ", rusage_children.ru_maxrss);

    	printf("\n");
}

/* Some buggy OS need this worthless CPU waste. */
void make_pagefault(void)
{
	void *addr;
	int size = getpagesize();
	int i;

	for (i=0; i<1000; i++) {
		addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
		if (addr == MAP_FAILED)
			err("make_pagefault");
		memset(addr, 0, size);
		munmap(addr, size);
	}
}

void consume(int mega)
{
    	size_t sz = mega * 1024 * 1024;
    	void *ptr;

    	ptr = malloc(sz);
    	memset(ptr, 0, sz);
	make_pagefault();
}

pid_t __fork(void)
{
	pid_t pid;

	pid = fork();
	make_pagefault();

	return pid;
}

common.h
========
void show_rusage(char *prefix);
void make_pagefault(void);
void consume(int mega);
pid_t __fork(void);

FreeBSD result (expected result)
========================================================
allocate 100MB
testcase1: fork inherit?
  expect: initial.self ~= child.self
initial: self 103492 children 0
fork child: self 103540 children 0

testcase2: fork inherit? (cont.)
  expect: initial.children ~= 100MB, but child.children = 0
initial: self 103540 children 103540
child: self 103564 children 0

testcase3: fork + malloc
  expect: child.self ~= initial.self + 50MB
initial: self 103564 children 103564
allocate +50MB
fork child: self 154860 children 0

testcase4: grandchild maxrss
  expect: post_wait.children ~= 300MB
initial: self 103564 children 154860
grandchild alloc 300MB
post_wait: self 103564 children 308720

testcase5: zombie
  expect: pre_wait ~= initial, IOW the zombie process is not accounted.
          post_wait ~= 400MB, IOW wait() collect child's max_rss.
initial: self 103564 children 308720
child alloc 400MB
pre_wait: self 103564 children 308720
post_wait: self 103564 children 411312

testcase6: SIG_IGN
  expect: initial ~= after_zombie (child's 500MB alloc should be ignored).
initial: self 103564 children 411312
child alloc 500MB
after_zombie: self 103624 children 411312

testcase7: exec (without fork)
  expect: initial ~= exec
initial: self 103624 children 411312
exec: self 103624 children 411312

Linux result (actual test result)
========================================================
allocate 100MB
testcase1: fork inherit?
  expect: initial.self ~= child.self
initial: self 102848 children 0
fork child: self 102572 children 0

testcase2: fork inherit? (cont.)
  expect: initial.children ~= 100MB, but child.children = 0
initial: self 102876 children 102644
child: self 102572 children 0

testcase3: fork + malloc
  expect: child.self ~= initial.self + 50MB
initial: self 102876 children 102644
allocate +50MB
fork child: self 153804 children 0

testcase4: grandchild maxrss
  expect: post_wait.children ~= 300MB
initial: self 102876 children 153864
grandchild alloc 300MB
post_wait: self 102876 children 307536

testcase5: zombie
  expect: pre_wait ~= initial, IOW the zombie process is not accounted.
          post_wait ~= 400MB, IOW wait() collect child's max_rss.
initial: self 102876 children 307536
child alloc 400MB
pre_wait: self 102876 children 307536
post_wait: self 102876 children 410076

testcase6: SIG_IGN
  expect: initial ~= after_zombie (child's 500MB alloc should be ignored).
initial: self 102876 children 410076
child alloc 500MB
after_zombie: self 102880 children 410076

testcase7: exec (without fork)
  expect: initial ~= exec
initial: self 102880 children 410076
exec: self 102880 children 410076

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-23 07:39:30 -07:00
Scott James Remnant
02b51df1b0 proc connector: add event for process becoming session leader
The act of a process becoming a session leader is a useful signal to a
supervising init daemon such as Upstart.

While a daemon will normally do this as part of the process of becoming a
daemon, it is rare for its children to do so.  When the children do, it is
nearly always a sign that the child should be considered detached from the
parent and not supervised along with it.

The poster-child example is OpenSSH; the per-login children call setsid()
so that they may control the pty connected to them.  If the primary daemon
dies or is restarted, we do not want to consider the per-login children
and want to respawn the primary daemon without killing the children.

This patch adds a new PROC_SID_EVENT and associated structure to the
proc_event event_data union, it arranges for this to be emitted when the
special PIDTYPE_SID pid is set.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Scott James Remnant <scott@ubuntu.com>
Acked-by: Matt Helsley <matthltc@us.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Acked-by: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-23 07:39:29 -07:00
James Morris
88e9d34c72 seq_file: constify seq_operations
Make all seq_operations structs const, to help mitigate against
revectoring user-triggerable function pointers.

This is derived from the grsecurity patch, although generated from scratch
because it's simpler than extracting the changes from there.

Signed-off-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-23 07:39:29 -07:00
Xiao Guangrong
54fdade1c3 generic-ipi: make struct call_function_data lockless
This patch can remove spinlock from struct call_function_data, the
reasons are below:

1: add a new interface for cpumask named cpumask_test_and_clear_cpu(),
   it can atomically test and clear specific cpu, we can use it instead
   of cpumask_test_cpu() and cpumask_clear_cpu() and no need data->lock
   to protect those in generic_smp_call_function_interrupt().

2: in smp_call_function_many(), after csd_lock() return, the current's
   cfd_data is deleted from call_function list, so it not have race
   between other cpus, then cfs_data is only used in
   smp_call_function_many() that must disable preemption and not from
   a hardware interrupthandler or from a bottom half handler to call,
   only the correspond cpu can use it, so it not have race in current
   cpu, no need cfs_data->lock to protect it.

3: after 1 and 2, cfs_data->lock is only use to protect cfs_data->refs in
   generic_smp_call_function_interrupt(), so we can define cfs_data->refs
   to atomic_t, and no need cfs_data->lock any more.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
[akpm@linux-foundation.org: use atomic_dec_return()]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-23 07:39:28 -07:00
Neil Horman
c02e3f361c kmod: fix race in usermodehelper code
The user mode helper code has a race in it.  call_usermodehelper_exec()
takes an allocated subprocess_info structure, which it passes to a
workqueue, and then passes it to a kernel thread which it creates, after
which it calls complete to signal to the caller of
call_usermodehelper_exec() that it can free the subprocess_info struct.

But since we use that structure in the created thread, we can't call
complete from __call_usermodehelper(), which is where we create the kernel
thread.  We need to call complete() from within the kernel thread and then
not use subprocess_info afterward in the case of UMH_WAIT_EXEC.  Tested
successfully by me.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-23 07:39:28 -07:00
Dave Young
af91322ef3 printk: add printk_delay to make messages readable for some scenarios
When syslog is not possible, at the same time there's no serial/net
console available, it will be hard to read the printk messages.  For
example oops/panic/warning messages in shutdown phase.

Add a printk delay feature, we can make each printk message delay some
milliseconds.

Setting the delay by proc/sysctl interface: /proc/sys/kernel/printk_delay

The value range from 0 - 10000, default value is 0

[akpm@linux-foundation.org: fix a few things]
Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-23 07:39:28 -07:00
Dave Young
3a3b6ed223 printk boot_delay: rename printk_delay_msec to loops_per_msec
Rename `printk_delay_msec' to `loops_per_msec', because the patch "printk:
add printk_delay to make messages readable for some scenarios" wishes to
more appropriately use the `printk_delay_msec' identifier.

[akpm@linux-foundation.org: add a comment]
Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-23 07:39:28 -07:00
Ingo Molnar
115e8a2882 modules, tracing: Remove stale struct marker signature from module_layout()
Linus reported this new build warning:

  kernel/module.c:2951: warning: ?struct marker? declared inside parameter list
  kernel/module.c:2951: warning: its scope is only this definition or declaration, which is probably not what you want

Caused by:

  fc53776: tracing: Remove markers

module_layout() is an artificial symbol with 'significant' symbols
listed in its argument list so that it gets a proper argument types
signature that modversions can pick up to decide whether a
module is version-compatible or not. If these dont match then we
wont even look at a module.

Remove the stale marker symbol.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <alpine.LFD.2.01.0909210908020.4950@localhost.localdomain>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-23 10:34:21 +02:00
Linus Torvalds
342ff1a1b5 Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jikos/trivial
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jikos/trivial: (34 commits)
  trivial: fix typo in aic7xxx comment
  trivial: fix comment typo in drivers/ata/pata_hpt37x.c
  trivial: typo in kernel-parameters.txt
  trivial: fix typo in tracing documentation
  trivial: add __init/__exit macros in drivers/gpio/bt8xxgpio.c
  trivial: add __init macro/ fix of __exit macro location in ipmi_poweroff.c
  trivial: remove unnecessary semicolons
  trivial: Fix duplicated word "options" in comment
  trivial: kbuild: remove extraneous blank line after declaration of usage()
  trivial: improve help text for mm debug config options
  trivial: doc: hpfall: accept disk device to unload as argument
  trivial: doc: hpfall: reduce risk that hpfall can do harm
  trivial: SubmittingPatches: Fix reference to renumbered step
  trivial: fix typos "man[ae]g?ment" -> "management"
  trivial: media/video/cx88: add __init/__exit macros to cx88 drivers
  trivial: fix typo in CONFIG_DEBUG_FS in gcov doc
  trivial: fix missing printk space in amd_k7_smp_check
  trivial: fix typo s/ketymap/keymap/ in comment
  trivial: fix typo "to to" in multiple files
  trivial: fix typos in comments s/DGBU/DBGU/
  ...
2009-09-22 07:51:45 -07:00
Arjan van de Ven
69d25870f2 cpuidle: fix the menu governor to boost IO performance
Fix the menu idle governor which balances power savings, energy efficiency
and performance impact.

The reason for a reworked governor is that there have been serious
performance issues reported with the existing code on Nehalem server
systems.

To show this I'm sure Andrew wants to see benchmark results:
(benchmark is "fio", "no cstates" is using "idle=poll")

		no cstates	current linux	new algorithm
1 disk		107 Mb/s	85 Mb/s		105 Mb/s
2 disks		215 Mb/s	123 Mb/s	209 Mb/s
12 disks	590 Mb/s	320 Mb/s	585 Mb/s

In various power benchmark measurements, no degredation was found by our
measurement&diagnostics team.  Obviously a small percentage more power was
used in the "fio" benchmark, due to the much higher performance.

While it would be a novel idea to describe the new algorithm in this
commit message, I cheaped out and described it in comments in the code
instead.

[changes since first post: spelling fixes from akpm, review feedback,
folded menu-tng into menu.c]

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 07:17:45 -07:00
Bernd Schmidt
eb8cdec4a9 nommu: add support for Memory Protection Units (MPU)
Some architectures (like the Blackfin arch) implement some of the
"simpler" features that one would expect out of a MMU such as memory
protection.

In our case, we actually get read/write/exec protection down to the page
boundary so processes can't stomp on each other let alone the kernel.

There is a performance decrease (which depends greatly on the workload)
however as the hardware/software interaction was not optimized at design
time.

Signed-off-by: Bernd Schmidt <bernds_cb1@t-online.de>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Acked-by: David Howells <dhowells@redhat.com>
Acked-by: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 07:17:43 -07:00
KOSAKI Motohiro
28b83c5193 oom: move oom_adj value from task_struct to signal_struct
Currently, OOM logic callflow is here.

    __out_of_memory()
        select_bad_process()            for each task
            badness()                   calculate badness of one task
                oom_kill_process()      search child
                    oom_kill_task()     kill target task and mm shared tasks with it

example, process-A have two thread, thread-A and thread-B and it have very
fat memory and each thread have following oom_adj and oom_score.

     thread-A: oom_adj = OOM_DISABLE, oom_score = 0
     thread-B: oom_adj = 0,           oom_score = very-high

Then, select_bad_process() select thread-B, but oom_kill_task() refuse
kill the task because thread-A have OOM_DISABLE.  Thus __out_of_memory()
call select_bad_process() again.  but select_bad_process() select the same
task.  It mean kernel fall in livelock.

The fact is, select_bad_process() must select killable task.  otherwise
OOM logic go into livelock.

And root cause is, oom_adj shouldn't be per-thread value.  it should be
per-process value because OOM-killer kill a process, not thread.  Thus
This patch moves oomkilladj (now more appropriately named oom_adj) from
struct task_struct to struct signal_struct.  it naturally prevent
select_bad_process() choose wrong task.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 07:17:39 -07:00
Alexey Dobriyan
1a8670a29b oom: move oom_killer_enable()/oom_killer_disable to where they belong
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 07:17:38 -07:00
Jan Beulich
2c85f51d22 mm: also use alloc_large_system_hash() for the PID hash table
This is being done by allowing boot time allocations to specify that they
may want a sub-page sized amount of memory.

Overall this seems more consistent with the other hash table allocations,
and allows making two supposedly mm-only variables really mm-only
(nr_{kernel,all}_pages).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 07:17:38 -07:00
Jan Beulich
3c1596efe1 mm: don't use alloc_bootmem_low() where not strictly needed
Since alloc_bootmem() will never return inaccessible (via virtual
addressing) memory anyway, using the ..._low() variant only makes sense
when the physical address range of the allocated memory must fulfill
further constraints, espacially since on 64-bits (or more generally in all
cases where the pools the two variants allocate from are than the full
available range.

Probably the use in alloc_tce_table() could also be eliminated (based on
code inspection of pci-calgary_64.c), but that seems too risky given I
know nothing about that hardware and have no way to test it.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 07:17:38 -07:00
Andrea Arcangeli
1c2fb7a4c2 ksm: fix deadlock with munlock in exit_mmap
Rawhide users have reported hang at startup when cryptsetup is run: the
same problem can be simply reproduced by running a program int main() {
mlockall(MCL_CURRENT | MCL_FUTURE); return 0; }

The problem is that exit_mmap() applies munlock_vma_pages_all() to
clean up VM_LOCKED areas, and its current implementation (stupidly)
tries to fault in absent pages, for example where PROT_NONE prevented
them being faulted in when mlocking.  Whereas the "ksm: fix oom
deadlock" patch, knowing there's a race by which KSM might try to fault
in pages after exit_mmap() had finally zapped the range, backs out of
such faults doing nothing when its ksm_test_exit() notices mm_users 0.

So revert that part of "ksm: fix oom deadlock" which moved the
ksm_exit() call from before exit_mmap() to the middle of exit_mmap();
and remove those ksm_test_exit() checks from the page fault paths, so
allowing the munlocking to proceed without interference.

ksm_exit, if there are rmap_items still chained on this mm slot, takes
mmap_sem write side: so preventing KSM from working on an mm while
exit_mmap runs.  And KSM will bail out as soon as it notices that
mm_users is already zero, thanks to its internal ksm_test_exit checks.
So that when a task is killed by OOM killer or the user, KSM will not
indefinitely prevent it from running exit_mmap to release its memory.

This does break a part of what "ksm: fix oom deadlock" was trying to
achieve.  When unmerging KSM (echo 2 >/sys/kernel/mm/ksm), and even
when ksmd itself has to cancel a KSM page, it is possible that the
first OOM-kill victim would be the KSM process being faulted: then its
memory won't be freed until a second victim has been selected (freeing
memory for the unmerging fault to complete).

But the OOM killer is already liable to kill a second victim once the
intended victim's p->mm goes to NULL: so there's not much point in
rejecting this KSM patch before fixing that OOM behaviour.  It is very
much more important to allow KSM users to boot up, than to haggle over
an unlikely and poorly supported OOM case.

We also intend to fix munlocking to not fault pages: at which point
this patch _could_ be reverted; though that would be controversial, so
we hope to find a better solution.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Justin M. Forbes <jforbes@redhat.com>
Acked-for-now-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 07:17:32 -07:00
Hugh Dickins
9ba6929480 ksm: fix oom deadlock
There's a now-obvious deadlock in KSM's out-of-memory handling:
imagine ksmd or KSM_RUN_UNMERGE handling, holding ksm_thread_mutex,
trying to allocate a page to break KSM in an mm which becomes the
OOM victim (quite likely in the unmerge case): it's killed and goes
to exit, and hangs there waiting to acquire ksm_thread_mutex.

Clearly we must not require ksm_thread_mutex in __ksm_exit, simple
though that made everything else: perhaps use mmap_sem somehow?
And part of the answer lies in the comments on unmerge_ksm_pages:
__ksm_exit should also leave all the rmap_item removal to ksmd.

But there's a fundamental problem, that KSM relies upon mmap_sem to
guarantee the consistency of the mm it's dealing with, yet exit_mmap
tears down an mm without taking mmap_sem.  And bumping mm_users won't
help at all, that just ensures that the pages the OOM killer assumes
are on their way to being freed will not be freed.

The best answer seems to be, to move the ksm_exit callout from just
before exit_mmap, to the middle of exit_mmap: after the mm's pages
have been freed (if the mmu_gather is flushed), but before its page
tables and vma structures have been freed; and down_write,up_write
mmap_sem there to serialize with KSM's own reliance on mmap_sem.

But KSM then needs to be careful, whenever it downs mmap_sem, to
check that the mm is not already exiting: there's a danger of using
find_vma on a layout that's being torn apart, or writing into page
tables which have been freed for reuse; and even do_anonymous_page
and __do_fault need to check they're not being called by break_ksm
to reinstate a pte after zap_pte_range has zapped that page table.

Though it might be clearer to add an exiting flag, set while holding
mmap_sem in __ksm_exit, that wouldn't cover the issue of reinstating
a zapped pte.  All we need is to check whether mm_users is 0 - but
must remember that ksmd may detect that before __ksm_exit is reached.
So, ksm_test_exit(mm) added to comment such checks on mm->mm_users.

__ksm_exit now has to leave clearing up the rmap_items to ksmd,
that needs ksm_thread_mutex; but shift the exiting mm just after the
ksm_scan cursor so that it will soon be dealt with.  __ksm_enter raise
mm_count to hold the mm_struct, ksmd's exit processing (exactly like
its processing when it finds all VM_MERGEABLEs unmapped) mmdrop it,
similar procedure for KSM_RUN_UNMERGE (which has stopped ksmd).

But also give __ksm_exit a fast path: when there's no complication
(no rmap_items attached to mm and it's not at the ksm_scan cursor),
it can safely do all the exiting work itself.  This is not just an
optimization: when ksmd is not running, the raised mm_count would
otherwise leak mm_structs.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Acked-by: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 07:17:32 -07:00
Hugh Dickins
f8af4da3b4 ksm: the mm interface to ksm
This patch presents the mm interface to a dummy version of ksm.c, for
better scrutiny of that interface: the real ksm.c follows later.

When CONFIG_KSM is not set, madvise(2) reject MADV_MERGEABLE and
MADV_UNMERGEABLE with EINVAL, since that seems more helpful than
pretending that they can be serviced.  But when CONFIG_KSM=y, accept them
even if KSM is not currently running, and even on areas which KSM will not
touch (e.g.  hugetlb or shared file or special driver mappings).

Like other madvices, report ENOMEM despite success if any area in the
range is unmapped, and use EAGAIN to report out of memory.

Define vma flag VM_MERGEABLE to identify an area on which KSM may try
merging pages: leave it to ksm_madvise() to decide whether to set it.
Define mm flag MMF_VM_MERGEABLE to identify an mm which might contain
VM_MERGEABLE areas, to minimize callouts when forking or exiting.

Based upon earlier patches by Chris Wright and Izik Eidus.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Izik Eidus <ieidus@redhat.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 07:17:31 -07:00
KOSAKI Motohiro
c6a7f5728a mm: oom analysis: Show kernel stack usage in /proc/meminfo and OOM log output
The amount of memory allocated to kernel stacks can become significant and
cause OOM conditions.  However, we do not display the amount of memory
consumed by stacks.

Add code to display the amount of memory used for stacks in /proc/meminfo.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 07:17:27 -07:00
Alexey Dobriyan
6e1d5dcc2b const: mark remaining inode_operations as const
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 07:17:24 -07:00
Alexey Dobriyan
b87221de6a const: mark remaining super_operations const
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 07:17:24 -07:00
Darren Hart
0729e19614 futex: Fix wakeup race by setting TASK_INTERRUPTIBLE before queue_me()
PI futexes do not use the same plist_node_empty() test for wakeup.
It was possible for the waiter (in futex_wait_requeue_pi()) to set
TASK_INTERRUPTIBLE after the waker assigned the rtmutex to the
waiter. The waiter would then note the plist was not empty and call
schedule(). The task would not be found by any subsequeuent futex
wakeups, resulting in a userspace hang.

By moving the setting of TASK_INTERRUPTIBLE to before the call to
queue_me(), the race with the waker is eliminated. Since we no
longer call get_user() from within queue_me(), there is no need to
delay the setting of TASK_INTERRUPTIBLE until after the call to
queue_me().

The FUTEX_LOCK_PI operation is not affected as futex_lock_pi()
relies entirely on the rtmutex code to handle schedule() and
wakeup.  The requeue PI code is affected because the waiter starts
as a non-PI waiter and is woken on a PI futex.

Remove the crusty old comment about holding spinlocks() across
get_user() as we no longer do that. Correct the locking statement
with a description of why the test is performed.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20090922053038.8717.97838.stgit@Aeon>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-22 10:37:44 +02:00
Darren Hart
d8d88fbb18 futex: Correct futex_q woken state commentary
Use kernel-doc format to describe struct futex_q.

Correct the wakeup definition to eliminate the statement about
waking the waiter between the plist_del() and the q->lock_ptr = 0.

Note in the comment that PI futexes have a different definition of
the woken state.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20090922053029.8717.62798.stgit@Aeon>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-22 10:37:44 +02:00
Darren Hart
d96ee56ce0 futex: Make function kernel-doc commentary consistent
Make the existing function kernel-doc consistent throughout
futex.c, following Documentation/kernel-doc-nano-howto.txt as
closely as possible.

When unsure, at least be consistent within futex.c.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20090922053022.8717.13339.stgit@Aeon>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-22 10:37:43 +02:00
Darren Hart
d40d65c8db futex: Correct queue_me and unqueue_me commentary
The queue_me/unqueue_me commentary is oddly placed and out of date.
Clean it up and correct the inaccurate bits.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20090922053015.8717.71713.stgit@Aeon>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-22 10:37:43 +02:00
Darren Hart
56ec1607b1 futex: Correct futex_wait_requeue_pi() commentary
Correct various typos and formatting inconsistencies in the
commentary of futex_wait_requeue_pi().

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20090922052958.8717.21932.stgit@Aeon>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-22 10:37:42 +02:00
Li Zefan
79fe249c83 tracing: Fix failure path in ftrace_regex_open()
Don't forget to free trace_parser if seq_open() returned failure.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4AB86694.4040803@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-22 10:28:57 +02:00
Li Zefan
1eb90f138b tracing: Fix failure path in ftrace_graph_write()
Don't call trace_parser_put() on uninitialized trace_parser.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4AB86639.3000003@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-22 10:28:56 +02:00
Li Zefan
4ba7978e98 tracing: Check the return value of trace_get_user()
Return immediately if trace_get_user() returned failure.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4AB86614.7020803@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-22 10:28:55 +02:00
Li Zefan
3c235a337e tracing: Fix off-by-one in trace_get_user()
Leave the last slot for the tailing '\0'.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4AB865FA.5080801@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-22 10:28:53 +02:00
Andrew Morton
dedcf2971c net: fix CONFIG_NET=n build on sparc64
sparc64 allnoconfig:

arch/sparc/kernel/built-in.o(.text+0x134e0): In function `sys32_recvfrom':
: undefined reference to `compat_sys_recvfrom'
arch/sparc/kernel/built-in.o(.text+0x134e4): In function `sys32_recvfrom':
: undefined reference to `compat_sys_recvfrom'

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-09-21 11:32:27 -07:00
Linus Torvalds
43c1266ce4 Merge branch 'perfcounters-rename-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perfcounters-rename-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  perf: Tidy up after the big rename
  perf: Do the big rename: Performance Counters -> Performance Events
  perf_counter: Rename 'event' to event_id/hw_event
  perf_counter: Rename list_entry -> group_entry, counter_list -> group_list

Manually resolved some fairly trivial conflicts with the tracing tree in
include/trace/ftrace.h and kernel/trace/trace_syscalls.c.
2009-09-21 09:15:07 -07:00
Linus Torvalds
b8c7f1dc5c Merge branch 'core-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'core-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  rcu: Fix whitespace inconsistencies
  rcu: Fix thinko, actually initialize full tree
  rcu: Apply results of code inspection of kernel/rcutree_plugin.h
  rcu: Add WARN_ON_ONCE() consistency checks covering state transitions
  rcu: Fix synchronize_rcu() for TREE_PREEMPT_RCU
  rcu: Simplify rcu_read_unlock_special() quiescent-state accounting
  rcu: Add debug checks to TREE_PREEMPT_RCU for premature grace periods
  rcu: Kconfig help needs to say that TREE_PREEMPT_RCU scales down
  rcutorture: Occasionally delay readers enough to make RCU force_quiescent_state
  rcu: Initialize multi-level RCU grace periods holding locks
  rcu: Need to update rnp->gpnum if preemptable RCU is to be reliable
2009-09-21 09:06:52 -07:00
Linus Torvalds
8e4bc3dd2c Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: Simplify sys_sched_rr_get_interval() system call
  sched: Fix potential NULL derference of doms_cur
  sched: Fix raciness in runqueue_is_locked()
  sched: Re-add lost cpu_allowed check to sched_fair.c::select_task_rq_fair()
  sched: Remove unneeded indentation in sched_fair.c::place_entity()
2009-09-21 09:06:17 -07:00
Linus Torvalds
bd4c3a3441 Merge branch 'tracing-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'tracing-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  kernel/profile.c: Switch /proc/irq/prof_cpu_mask to seq_file
  tracing: Export trace_profile_buf symbols
  tracing/events: use list_for_entry_continue
  tracing: remove max_tracer_type_len
  function-graph: use ftrace_graph_funcs directly
  tracing: Remove markers
  tracing: Allocate the ftrace event profile buffer dynamically
  tracing: Factorize the events profile accounting
2009-09-21 09:05:47 -07:00
Joe Perches
a419aef8b8 trivial: remove unnecessary semicolons
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
2009-09-21 15:14:58 +02:00
Uwe Kleine-Koenig
2944fcbe03 trivial: Fix duplicated word "options" in comment
this was introduced in

	5e0a093 (tracing: fix config options to not show when automatically selected)

Signed-off-by: Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: trivial@kernel.org
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
2009-09-21 15:14:58 +02:00
Anand Gadiyar
fd589a8f0a trivial: fix typo "to to" in multiple files
Signed-off-by: Anand Gadiyar <gadiyar@ti.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
2009-09-21 15:14:55 +02:00
Robert P. J. Day
fe002a4197 trivial: Correct print_tainted routine name in comment
Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
2009-09-21 15:14:53 +02:00
Ingo Molnar
57c0c15b52 perf: Tidy up after the big rename
- provide compatibility Kconfig entry for existing PERF_COUNTERS .config's

 - provide courtesy copy of old perf_counter.h, for user-space projects

 - small indentation fixups

 - fix up MAINTAINERS

 - fix small x86 printout fallout

 - fix up small PowerPC comment fallout (use 'counter' as in register)

Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 14:34:11 +02:00
Michal Simek
1f74b1f7e5 microblaze: Enable GCOV_PROFILE_ALL
Signed-off-by: Michal Simek <monstr@monstr.eu>
2009-09-21 14:29:21 +02:00
Ingo Molnar
cdd6c482c9 perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!

In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.

Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.

All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)

The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.

Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.

User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)

This patch has been generated via the following script:

  FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')

  sed -i \
    -e 's/PERF_EVENT_/PERF_RECORD_/g' \
    -e 's/PERF_COUNTER/PERF_EVENT/g' \
    -e 's/perf_counter/perf_event/g' \
    -e 's/nb_counters/nb_events/g' \
    -e 's/swcounter/swevent/g' \
    -e 's/tpcounter_event/tp_event/g' \
    $FILES

  for N in $(find . -name perf_counter.[ch]); do
    M=$(echo $N | sed 's/perf_counter/perf_event/g')
    mv $N $M
  done

  FILES=$(find . -name perf_event.*)

  sed -i \
    -e 's/COUNTER_MASK/REG_MASK/g' \
    -e 's/COUNTER/EVENT/g' \
    -e 's/\<event\>/event_id/g' \
    -e 's/counter/event/g' \
    -e 's/Counter/Event/g' \
    $FILES

... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.

Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.

( NOTE: 'counters' are still the proper terminology when we deal
  with hardware registers - and these sed scripts are a bit
  over-eager in renaming them. I've undone some of that, but
  in case there's something left where 'counter' would be
  better than 'event' we can undo that on an individual basis
  instead of touching an otherwise nicely automated patch. )

Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 14:28:04 +02:00
Ingo Molnar
dfc65094d0 perf_counter: Rename 'event' to event_id/hw_event
In preparation to the renames, to avoid a namespace clash.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 12:54:59 +02:00
Ingo Molnar
65abc8653c perf_counter: Rename list_entry -> group_entry, counter_list -> group_list
This is in preparation of the big rename, but also makes sense
in a standalone way: 'list_entry' is a bad name as we already
have a list_entry() in list.h.

Also, the 'counter list' is too vague, it doesnt tell us the
purpose of that list.

Clarify these names to show that it's all about the group
hiearchy.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 12:54:51 +02:00
Peter Williams
0d721ceadb sched: Simplify sys_sched_rr_get_interval() system call
By removing the need for it to know details of scheduling classes.

This allows PlugSched to define orthogonal scheduling classes.

Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <06d1b89ee15a0eef82d7.1253496713@mudlark.pw.nest>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 09:53:55 +02:00
Linus Torvalds
ebc79c4f8d Merge git://git.kernel.org/pub/scm/linux/kernel/git/jaswinder/linux-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/jaswinder/linux-2.6:
  includecheck fix: x86, cpu/common.c
  includecheck fix: kernel/trace, ring_buffer.c
  includecheck fix: include/linux, ftrace.h
  includecheck fix: include/linux, page_cgroup.h
  includecheck fix: include/linux, aio.h
  includecheck fix: include/drm, drm_memory.h
  includecheck fix: include/acpi, acpi_bus.h
  includecheck fix: drivers/xen, evtchn.c
  includecheck fix: drivers/video, vgacon.c
  includecheck fix: drivers/scsi, ibmvscsi.c
  includecheck fix: drivers/scsi, libfcoe.c
  includecheck fix: x86, shadow.c
  includecheck fix: x86, traps.c
  includecheck fix: um, helper.c
  includecheck fix: s390, sys_s390.c
2009-09-20 16:02:06 -07:00
Linus Torvalds
e11c675ede Merge git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/tty-2.6: (79 commits)
  USB serial: update the console driver
  usb-serial: straighten out serial_open
  usb-serial: add missing tests and debug lines
  usb-serial: rename subroutines
  usb-serial: fix termios initialization logic
  usb-serial: acquire references when a new tty is installed
  usb-serial: change logic of serial lookups
  usb-serial: put subroutines in logical order
  usb-serial: change referencing of port and serial structures
  tty: Char: mxser, use THRE for ASPP_OQUEUE ioctl
  tty: Char: mxser, add support for CP112UL
  uartlite: support shared interrupt lines
  tty: USB: serial/mct_u232, fix tty refcnt
  tty: riscom8, fix tty refcnt
  tty: riscom8, fix shutdown declaration
  TTY: fix typos
  tty: Power: fix suspend vt regression
  tty: vt: use printk_once
  tty: handle VT specific compat ioctls in vt driver
  n_tty: move echoctl check and clean up logic
  ...
2009-09-20 15:55:17 -07:00
Linus Torvalds
467f9957d9 Merge branch 'perfcounters-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perfcounters-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (58 commits)
  perf_counter: Fix perf_copy_attr() pointer arithmetic
  perf utils: Use a define for the maximum length of a trace event
  perf: Add timechart help text and add timechart to "perf help"
  tracing, x86, cpuidle: Move the end point of a C state in the power tracer
  perf utils: Be consistent about minimum text size in the svghelper
  perf timechart: Add "perf timechart record"
  perf: Add the timechart tool
  perf: Add a SVG helper library file
  tracing, perf: Convert the power tracer into an event tracer
  perf: Add a sample_event type to the event_union
  perf: Allow perf utilities to have "callback" options without arguments
  perf: Store trace event name/id pairs in perf.data
  perf: Add a timestamp to fork events
  sched_clock: Make it NMI safe
  perf_counter: Fix up swcounter throttling
  x86, perf_counter, bts: Optimize BTS overflow handling
  perf sched: Add --input=file option to builtin-sched.c
  perf trace: Sample timestamp and cpu when using record flag
  perf tools: Increase MAX_EVENT_LENGTH
  perf tools: Fix memory leak in read_ftrace_printk()
  ...
2009-09-20 15:54:37 -07:00
Yong Zhang
cb5fd13f11 sched: Fix potential NULL derference of doms_cur
If CONFIG_CPUMASK_OFFSTACK is enabled but doms_cur alloc failed in
arch_init_sched_domains(), doms_cur will move back to
fallback_doms. But this time, fallback_doms has not been
initialized yet.

Signed-off-by: Yong Zhang <yong.zhang0@gmail.com>
Cc: a.p.zijlstra@chello.nl
LKML-Reference: <1252930816-7672-1-git-send-email-yong.zhang0@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-20 20:20:30 +02:00
Alexey Dobriyan
583a22e7c1 kernel/profile.c: Switch /proc/irq/prof_cpu_mask to seq_file
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-20 20:15:40 +02:00
Andrew Morton
89f19f04dc sched: Fix raciness in runqueue_is_locked()
runqueue_is_locked() is unavoidably racy due to a poor interface design.
It does

	cpu = get_cpu()
	ret = some_perpcu_thing(cpu);
	put_cpu(cpu);
	return ret;

Its return value is unreliable.

Fix.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <200909191855.n8JItiko022148@imap1.linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-20 20:00:32 +02:00
Peter Zijlstra
05bafda856 tracing: Export trace_profile_buf symbols
ERROR: "trace_profile_buf_nmi" [fs/jbd2/jbd2.ko] undefined!
ERROR: "trace_profile_buf" [fs/jbd2/jbd2.ko] undefined!
ERROR: "trace_profile_buf_nmi" [fs/ext4/ext4.ko] undefined!
ERROR: "trace_profile_buf" [fs/ext4/ext4.ko] undefined!
ERROR: "trace_profile_buf_nmi" [arch/x86/kvm/kvm.ko] undefined!
ERROR: "trace_profile_buf" [arch/x86/kvm/kvm.ko] undefined!

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1253442878.7542.3.camel@laptop>
[ fixed whitespace noise and checkpatch complaint ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-20 19:15:57 +02:00
Jaswinder Singh Rajput
a0f320f487 includecheck fix: kernel/trace, ring_buffer.c
fix the following 'make includecheck' warning:

  kernel/trace/ring_buffer.c: trace.h is included more than once.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Sam Ravnborg <sam@ravnborg.org>
LKML-Reference: <1247068617.4382.107.camel@ht.satnam>
2009-09-20 16:58:56 +05:30
Alan Cox
8d233558cd vt: remove power stuff from kernel/power
In the past someone gratuitiously borrowed chunks of kernel internal vt
code and dumped them in kernel/power. They have all sorts of deep relations
with the vt code so put them in the vt tree instead

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2009-09-19 13:13:25 -07:00
Alan Cox
f8a7c1a976 kfifo: Use "const" definitions
Currently kfifo cannot be used by parts of the kernel that use "const"
properly as kfifo itself does not use const for passed data blocks which
are indeed const.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2009-09-19 13:13:17 -07:00
Ian Schram
cdf8073d6b perf_counter: Fix perf_copy_attr() pointer arithmetic
There is still some weird code in per_copy_attr(). Which supposedly
checks that all bytes trailing a struct are zero.

It doesn't seem to get pointer arithmetic right. Since it
increments an iterating pointer by sizeof(unsigned long) rather
than 1.

Signed-off-by: Ian Schram <ischram@telenet.be>
[ v2: clean up the messy PTR_ALIGN logic as well. ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: <stable@kernel.org> # for v2.6.31.x
LKML-Reference: <4AB3DEE2.3030600@telenet.be>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-19 19:32:55 +02:00
Ingo Molnar
2df2881804 Merge branch 'tip/tracing/core' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into tracing/urgent 2009-09-19 19:21:15 +02:00
Li Zefan
30bd39cd62 tracing/events: use list_for_entry_continue
Simplify s_next() and t_next().

Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AB32389.1030005@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-19 11:30:40 -04:00
Li Zefan
ee6c2c1bd1 tracing: remove max_tracer_type_len
Limit the length of a tracer's name within 100 chars, and then we
don't have to play with max_tracer_type_len.

Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AB32377.9020601@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-19 11:28:19 -04:00
Li Zefan
a4ec5e0c26 function-graph: use ftrace_graph_funcs directly
No need to store ftrace_graph_funcs in file->private.

Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AB32364.7020602@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-19 11:26:54 -04:00
Mike Galbraith
3f04e8cd5b sched: Re-add lost cpu_allowed check to sched_fair.c::select_task_rq_fair()
While doing some testing, I pinned mplayer, only to find it
following X around like a puppy. Looking at commit c88d591, I found
a cpu_allowed check that went AWOL.  I plugged it back in where it
looks like it needs to go, and now when I say "sit, stay!", mplayer
obeys again.

'c88d591 sched: Merge select_task_rq_fair() and
sched_balance_self()' accidentally dropped the check, causing
wake_affine() to pull pinned tasks - put it back.

[ v2: use a cheaper version from Peter ]

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-19 17:11:31 +02:00
Ingo Molnar
be4bdbfbae Merge branch 'tracing/core-v3' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic/random-tracing into tracing/urgent 2009-09-19 12:05:25 +02:00
Arjan van de Ven
6161352142 tracing, perf: Convert the power tracer into an event tracer
This patch converts the existing power tracer into an event tracer,
so that power events (C states and frequency changes) can be
tracked via "perf".

This also removes the perl script that was used to demo the tracer;
its functionality is being replaced entirely with timechart.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090912130542.6d314860@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-19 11:42:12 +02:00
Arjan van de Ven
393b2ad8c7 perf: Add a timestamp to fork events
perf timechart needs to know when a process forked, in order to be
able to visualize properly when tasks start.

This patch adds a time field to the event structure, and fills it
in appropriately.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090912130341.51ad2de2@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-19 11:42:10 +02:00
Ingo Molnar
929bf0d015 Merge branch 'linus' into perfcounters/core
Merge reason: Bring in tracing changes we depend on.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-19 11:28:41 +02:00
Paul E. McKenney
a71fca58b7 rcu: Fix whitespace inconsistencies
Fix a number of whitespace ^Ierrors in the include/linux/rcu*
and the kernel/rcu* files.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
LKML-Reference: <20090918172819.GA24405@linux.vnet.ibm.com>
[ did more checkpatch fixlets ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-19 08:53:22 +02:00
Paul E. McKenney
49e291266d rcu: Fix thinko, actually initialize full tree
Commit de078d8 ("rcu: Need to update rnp->gpnum if preemptable RCU
is to be reliable") repeatedly and incorrectly initializes the root
rcu_node structure's ->gpnum field rather than initializing the
->gpnum field of each node in the tree.  Fix this.  Also add an
additional consistency check to catch this in the future.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
LKML-Reference: <125329262011-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-19 08:53:21 +02:00
Paul E. McKenney
e7d8842ed3 rcu: Apply results of code inspection of kernel/rcutree_plugin.h
o Drop the calls to cpu_quiet() from the online/offline code.
  These are unnecessary, since force_quiescent_state() will
  clean up, and removing them simplifies the code a bit.

o Add a warning to check that we don't enqueue the same blocked
  task twice onto the ->blocked_tasks[] lists.

o Rework the phase computation in rcu_preempt_note_context_switch()
  to be more readable, as suggested by Josh Triplett.

o Disable irqs to close a race between the scheduling clock
  interrupt and rcu_preempt_note_context_switch() WRT the
  ->rcu_read_unlock_special field.

o Add comments to rnp->lock acquisition and release within
  rcu_read_unlock_special() noting that irqs are already
  disabled.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
LKML-Reference: <12532926201851-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-19 08:53:21 +02:00
Paul E. McKenney
28ecd58020 rcu: Add WARN_ON_ONCE() consistency checks covering state transitions
o Verify that qsmask bits stay clear through GP
  initialization.

o Verify that cpu_quiet_msk_finish() is never invoked unless
  there actually is an RCU grace period in progress.

o Verify that all internal-node rcu_node structures have empty
  blocked_tasks[] lists.

o Verify that child rcu_node structure's bits remain clear after
  acquiring parent's lock.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
LKML-Reference: <12532926191947-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-19 08:53:19 +02:00
Christoph Hellwig
fc5377668c tracing: Remove markers
Now that the last users of markers have migrated to the event
tracer we can kill off the (now orphan) support code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090917173527.GA1699@lst.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-18 21:22:08 +02:00
Peter Zijlstra
def0a9b257 sched_clock: Make it NMI safe
Arjan complained about the suckyness of TSC on modern machines, and
asked if we could do something about that for PERF_SAMPLE_TIME.

Make cpu_clock() NMI safe by removing the spinlock and using
cmpxchg. This also makes it smaller and more robust.

Affects architectures that use HAVE_UNSTABLE_SCHED_CLOCK, i.e. IA64
and x86.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-18 20:47:30 +02:00
Peter Zijlstra
cf450a7355 perf_counter: Fix up swcounter throttling
/me dons the brown paper bag.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-18 20:43:22 +02:00
Markus Metzger
5622f295b5 x86, perf_counter, bts: Optimize BTS overflow handling
Draining the BTS buffer on a buffer overflow interrupt takes too
long resulting in a kernel lockup when tracing the kernel.

Restructure perf_counter sampling into sample creation and sample
output.

Prepare a single reference sample for BTS sampling and update the
from and to address fields when draining the BTS buffer. Drain the
entire BTS buffer between a single perf_output_begin() /
perf_output_end() pair.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090915130023.A16204@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-18 20:43:20 +02:00
Alexey Dobriyan
6952b61de9 headers: taskstats_kern.h trim
Remove net/genetlink.h inclusion, now sched.c won't be recompiled
because of some networking changes.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-18 09:48:52 -07:00
Linus Torvalds
a03fdb7612 Merge branch 'timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (34 commits)
  time: Prevent 32 bit overflow with set_normalized_timespec()
  clocksource: Delay clocksource down rating to late boot
  clocksource: clocksource_select must be called with mutex locked
  clocksource: Resolve cpu hotplug dead lock with TSC unstable, fix crash
  timers: Drop a function prototype
  clocksource: Resolve cpu hotplug dead lock with TSC unstable
  timer.c: Fix S/390 comments
  timekeeping: Fix invalid getboottime() value
  timekeeping: Fix up read_persistent_clock() breakage on sh
  timekeeping: Increase granularity of read_persistent_clock(), build fix
  time: Introduce CLOCK_REALTIME_COARSE
  x86: Do not unregister PIT clocksource on PIT oneshot setup/shutdown
  clocksource: Avoid clocksource watchdog circular locking dependency
  clocksource: Protect the watchdog rating changes with clocksource_mutex
  clocksource: Call clocksource_change_rating() outside of watchdog_lock
  timekeeping: Introduce read_boot_clock
  timekeeping: Increase granularity of read_persistent_clock()
  timekeeping: Update clocksource with stop_machine
  timekeeping: Add timekeeper read_clock helper functions
  timekeeping: Move NTP adjusted clock multiplier to struct timekeeper
  ...

Fix trivial conflict due to MIPS lemote -> loongson renaming.
2009-09-18 09:15:24 -07:00
Mike Galbraith
a2e7a7eb2f sched: Remove unneeded indentation in sched_fair.c::place_entity()
Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1253258365.22787.33.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-18 09:22:39 +02:00
Frederic Weisbecker
20ab4425a7 tracing: Allocate the ftrace event profile buffer dynamically
Currently the trace event profile buffer is allocated in the stack. But
this may be too much for the stack, as the events can have large
statically defined field size and can also grow with dynamic arrays.

Allocate two per cpu buffer for all profiled events. The first cpu
buffer is used to host every non-nmi context traces. It is protected
by disabling the interrupts while writing and committing the trace.

The second buffer is reserved for nmi. So that there is no race between
them and the first buffer.

The whole write/commit section is rcu protected because we release
these buffers while deactivating the last profiling trace event.

v2: Move the buffers from trace_event to be global, as pointed by
    Steven Rostedt.

v3: Fix the syscall events to handle the profiling buffer races
    by disabling interrupts, now that the buffers are globals.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
2009-09-18 07:25:44 +02:00
Frederic Weisbecker
e5e25cf47b tracing: Factorize the events profile accounting
Factorize the events enabling accounting in a common tracing core
helper. This reduces the size of the profile_enable() and
profile_disable() callbacks for each trace events.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
2009-09-18 06:14:32 +02:00
Linus Torvalds
dcbf77b9e8 Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (37 commits)
  sched: Fix SD_POWERSAVING_BALANCE|SD_PREFER_LOCAL vs SD_WAKE_AFFINE
  sched: Stop buddies from hogging the system
  sched: Add new wakeup preemption mode: WAKEUP_RUNNING
  sched: Fix TASK_WAKING & loadaverage breakage
  sched: Disable wakeup balancing
  sched: Rename flags to wake_flags
  sched: Clean up the load_idx selection in select_task_rq_fair
  sched: Optimize cgroup vs wakeup a bit
  sched: x86: Name old_perf in a unique way
  sched: Implement a gentler fair-sleepers feature
  sched: Add SD_PREFER_LOCAL
  sched: Add a few SYNC hint knobs to play with
  sched: Fix sync wakeups again
  sched: Add WF_FORK
  sched: Rename sync arguments
  sched: Rename select_task_rq() argument
  sched: Feature to disable APERF/MPERF cpu_power
  x86: sched: Provide arch implementations using aperf/mperf
  x86: Add generic aperf/mperf code
  x86: Move APERF/MPERF into a X86_FEATURE
  ...

Fix up trivial conflict in arch/x86/include/asm/processor.h due to
nearby addition of amd_get_nb_id() declaration from the EDAC merge.
2009-09-17 21:00:02 -07:00
Paul E. McKenney
16e3081191 rcu: Fix synchronize_rcu() for TREE_PREEMPT_RCU
The redirection of synchronize_sched() to synchronize_rcu() was
appropriate for TREE_RCU, but not for TREE_PREEMPT_RCU.

Fix this by creating an underlying synchronize_sched().  TREE_RCU
then redirects synchronize_rcu() to synchronize_sched(), while
TREE_PREEMPT_RCU has its own version of synchronize_rcu().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
LKML-Reference: <12528585111916-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-18 00:06:53 +02:00
Paul E. McKenney
c3422bea5f rcu: Simplify rcu_read_unlock_special() quiescent-state accounting
The earlier approach required two scheduling-clock ticks to note an
preemptable-RCU quiescent state in the situation in which the
scheduling-clock interrupt is unlucky enough to always interrupt an
RCU read-side critical section.

With this change, the quiescent state is instead noted by the
outermost rcu_read_unlock() immediately following the first
scheduling-clock tick, or, alternatively, by the first subsequent
context switch.  Therefore, this change also speeds up grace
periods.

Suggested-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
LKML-Reference: <12528585111945-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-18 00:06:33 +02:00
Paul E. McKenney
b0e165c035 rcu: Add debug checks to TREE_PREEMPT_RCU for premature grace periods
Check to make sure that there are no blocked tasks for the previous
grace period while initializing for the next grace period, verify
that rcu_preempt_qs() is given the correct CPU number and is never
called for an offline CPU.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
LKML-Reference: <12528585111986-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-18 00:06:13 +02:00
Josh Triplett
b8d57a76d9 rcutorture: Occasionally delay readers enough to make RCU force_quiescent_state
rcutorture already delays readers, but never for long enough to
make RCU force a quiescent state.  Add an occasional delay of
50ms.

Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
LKML-Reference: <12524504772607-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-18 00:05:34 +02:00
Paul E. McKenney
b835db1f9c rcu: Initialize multi-level RCU grace periods holding locks
Prior implementations initialized the root and any internal
nodes without holding locks, then initialized the leaves
holding locks.

This is a false economy, as the leaf nodes will usually greatly
outnumber the root and internal nodes.  Acquiring locks on all
nodes is conceptually much simpler as well.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
LKML-Reference: <12524504773190-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-18 00:05:14 +02:00
Paul E. McKenney
de078d875c rcu: Need to update rnp->gpnum if preemptable RCU is to be reliable
Without this patch, tasks preempted in RCU read-side critical
sections can fail to block the grace period, given that
rnp->gpnum is used to determine which rnp->blocked_tasks[]
element the preempted task is enqueued on.

Before the patch, rnp->gpnum is always zero, so preempted tasks
are always enqueued on rnp->blocked_tasks[0], which is correct
only when the current CPU has not checked into the current
grace period and the grace-period number is even, or,
similarly, if the current CPU -has- checked into the current
grace period and the grace-period number is odd.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
LKML-Reference: <12524504771622-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-18 00:04:54 +02:00
Peter Zijlstra
2667de81f3 perf_counter: Allow for a wakeup watermark
Currently we wake the mmap() consumer once every PAGE_SIZE of data
and/or once event wakeup_events when specified.

For high speed sampling this results in too many wakeups wrt. the
buffer size, hence change this.

We move the default wakeup limit to 1/4-th the buffer size, and
provide for means to manually specify this limit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-17 22:08:26 +02:00
Peter Zijlstra
850bc73ffc perf_counter: Do not throttle single swcounter events
We can have swcounter events that contribute more than a single
count per event, when used with a non-zero period, those can
generate multiple events, which is when we need throttling.

However, swcounter that contribute only a single count per event
can only come as fast as we can run code, hence don't throttle
them.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-17 22:08:25 +02:00
Li Zefan
5dd4de587f softirq: add BLOCK_IOPOLL to softirq_to_name
With BLOCK_IOPOLL_SOFTIRQ added, softirq_to_name[] and
show_softirq_name() needs to be updated.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AB20398.8070209@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-17 15:53:44 -04:00
Steven Rostedt
b375a11a23 tracing: switch function prints from %pf to %ps
For direct function pointers (like what mcount provides) PowerPC64
requires the use of %ps, otherwise nothing is printed.

This patch converts all prints of functions retrieved through mcount
to use the %ps format from the %pf.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-17 15:53:40 -04:00
Ingo Molnar
45bd00d31d Merge branch 'linus' into tracing/core
Merge reason: Pick up kernel/softirq.c update for dependent fix.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-17 20:53:10 +02:00
Peter Zijlstra
29cd8bae39 sched: Fix SD_POWERSAVING_BALANCE|SD_PREFER_LOCAL vs SD_WAKE_AFFINE
The SD_POWERSAVING_BALANCE|SD_PREFER_LOCAL code can break out of
the domain iteration early, making us miss the SD_WAKE_AFFINE bits.

Fix this by continuing iteration until there is no need for a
larger domain.

This also cleans up the cgroup stuff a bit, but not having two
update_shares() invocations.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-17 10:40:31 +02:00
Peter Zijlstra
de69a80be3 sched: Stop buddies from hogging the system
Clear buddies more agressively.

The (theoretical, haven't actually observed any of this) problem is
that when we do not select either buddy in pick_next_entity()
because they are too far ahead of the left-most task, we do not
clear the buddies.

This means that as soon as we service the left-most task, these
same buddies will be tried again on the next schedule. Now if the
left-most task was a pure hog, it wouldn't have done any wakeups
and it wouldn't have set buddies of its own. That leads to the old
buddies dominating, which would lead to bad latencies.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-17 10:40:30 +02:00
Peter Zijlstra
ad4b78bbcb sched: Add new wakeup preemption mode: WAKEUP_RUNNING
Create a new wakeup preemption mode, preempt towards tasks that run
shorter on avg. It sets next buddy to be sure we actually run the task
we preempted for.

Test results:

 root@twins:~# while :; do :; done &
 [1] 6537
 root@twins:~# while :; do :; done &
 [2] 6538
 root@twins:~# while :; do :; done &
 [3] 6539
 root@twins:~# while :; do :; done &
 [4] 6540

 root@twins:/home/peter# ./latt -c4 sleep 4
 Entries: 48 (clients=4)

 Averages:
 ------------------------------
        Max          4750 usec
        Avg           497 usec
        Stdev         737 usec

 root@twins:/home/peter# echo WAKEUP_RUNNING > /debug/sched_features

 root@twins:/home/peter# ./latt -c4 sleep 4
 Entries: 48 (clients=4)

 Averages:
 ------------------------------
        Max            14 usec
        Avg             5 usec
        Stdev           3 usec

Disabled by default - needs more testing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <new-submission>
2009-09-17 10:17:25 +02:00
Ingo Molnar
eb24073bc1 sched: Fix TASK_WAKING & loadaverage breakage
Fix this:

top - 21:54:00 up  2:59,  1 user,  load average: 432512.33, 426421.74, 417432.74

Which happens because we now set TASK_WAKING before activate_task().

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-17 09:51:20 +02:00
Linus Torvalds
ab86e5765d Merge git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core-2.6:
  Driver Core: devtmpfs - kernel-maintained tmpfs-based /dev
  debugfs: Modify default debugfs directory for debugging pktcdvd.
  debugfs: Modified default dir of debugfs for debugging UHCI.
  debugfs: Change debugfs directory of IWMC3200
  debugfs: Change debuhgfs directory of trace-events-sample.h
  debugfs: Fix mount directory of debugfs by default in events.txt
  hpilo: add poll f_op
  hpilo: add interrupt handler
  hpilo: staging for interrupt handling
  driver core: platform_device_add_data(): use kmemdup()
  Driver core: Add support for compatibility classes
  uio: add generic driver for PCI 2.3 devices
  driver-core: move dma-coherent.c from kernel to driver/base
  mem_class: fix bug
  mem_class: use minor as index instead of searching the array
  driver model: constify attribute groups
  UIO: remove 'default n' from Kconfig
  Driver core: Add accessor for device platform data
  Driver core: move dev_get/set_drvdata to drivers/base/dd.c
  Driver core: add new device to bus's list before probing
2009-09-16 08:27:10 -07:00
Linus Torvalds
6b7b352f21 Merge branch 'for-linus' of git://git.kernel.dk/linux-2.6-block
* 'for-linus' of git://git.kernel.dk/linux-2.6-block:
  block: fix linkage problem with blk_iopoll and !CONFIG_BLOCK
2009-09-16 07:46:34 -07:00
Peter Zijlstra
5a9b86f647 sched: Rename flags to wake_flags
For consistencies sake, rename the argument (again).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-16 16:44:33 +02:00
Peter Zijlstra
5158f4e442 sched: Clean up the load_idx selection in select_task_rq_fair
Clean up the code a little.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-16 16:44:32 +02:00
Peter Zijlstra
3b64089422 sched: Optimize cgroup vs wakeup a bit
We don't need to call update_shares() for each domain we iterate,
just got the largets one.

However, we should call it before wake_affine() as well, so that
that can use up-to-date values too.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-16 16:44:32 +02:00
Atsushi Tsuji
b36461da2a tracing: Fix minor bugs for __unregister_ftrace_function_probe
Fix the condition of strcmp for "*".
Also fix NULL pointer dereference when glob is NULL.

Signed-off-by: Atsushi Tsuji <a-tsuji@bk.jp.nec.com>
LKML-Reference: <4AAF6726.5090905@bk.jp.nec.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-16 09:08:54 -04:00
Andi Kleen
6a46079cf5 HWPOISON: The high level memory error handler in the VM v7
Add the high level memory handler that poisons pages
that got corrupted by hardware (typically by a two bit flip in a DIMM
or a cache) on the Linux level. The goal is to prevent everyone
from accessing these pages in the future.

This done at the VM level by marking a page hwpoisoned
and doing the appropriate action based on the type of page
it is.

The code that does this is portable and lives in mm/memory-failure.c

To quote the overview comment:

High level machine check handler. Handles pages reported by the
hardware as being corrupted usually due to a 2bit ECC memory or cache
failure.

This focuses on pages detected as corrupted in the background.
When the current CPU tries to consume corruption the currently
running process can just be killed directly instead. This implies
that if the error cannot be handled for some reason it's safe to
just ignore it because no corruption has been consumed yet. Instead
when that happens another machine check will happen.

Handles page cache pages in various states. The tricky part
here is that we can access any page asynchronous to other VM
users, because memory failures could happen anytime and anywhere,
possibly violating some of their assumptions. This is why this code
has to be extremely careful. Generally it tries to use normal locking
rules, as in get the standard locks, even if that means the
error handling takes potentially a long time.

Some of the operations here are somewhat inefficient and have non
linear algorithmic complexity, because the data structures have not
been optimized for this case. This is in particular the case
for the mapping from a vma to a process. Since this case is expected
to be rare we hope we can get away with this.

There are in principle two strategies to kill processes on poison:
- just unmap the data and wait for an actual reference before
killing
- kill as soon as corruption is detected.
Both have advantages and disadvantages and should be used
in different situations. Right now both are implemented and can
be switched with a new sysctl vm.memory_failure_early_kill
The default is early kill.

The patch does some rmap data structure walking on its own to collect
processes to kill. This is unusual because normally all rmap data structure
knowledge is in rmap.c only. I put it here for now to keep
everything together and rmap knowledge has been seeping out anyways

Includes contributions from Johannes Weiner, Chris Mason, Fengguang Wu,
Nick Piggin (who did a lot of great work) and others.

Cc: npiggin@suse.de
Cc: riel@redhat.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
2009-09-16 11:50:15 +02:00
Andi Kleen
4db96cf077 HWPOISON: Add PR_MCE_KILL prctl to control early kill behaviour per process
This allows processes to override their early/late kill
behaviour on hardware memory errors.

Typically applications which are memory error aware is
better of with early kill (see the error as soon
as possible), all others with late kill (only
see the error when the error is really impacting execution)

There's a global sysctl, but this way an application
can set its specific policy.

We're using two bits, one to signify that the process
stated its intention and that

I also made the prctl future proof by enforcing
the unused arguments are 0.

The state is inherited to children.

Note this makes us officially run out of process flags
on 32bit, but the next patch can easily add another field.

Manpage patch will be supplied separately.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
2009-09-16 11:50:14 +02:00
Ingo Molnar
51e0304ce6 sched: Implement a gentler fair-sleepers feature
Add back FAIR_SLEEPERS and GENTLE_FAIR_SLEEPERS.

FAIR_SLEEPERS is the old logic: credit sleepers with their sleep time.

GENTLE_FAIR_SLEEPERS dampens this a bit: 50% of their sleep time gets
credited.

The hope here is to still give the benefits of fair-sleepers logic
(quick wakeups, etc.) while not allow them to have 100% of their
sleep time as if they were running.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-16 09:05:20 +02:00
Peter Zijlstra
59abf02644 sched: Add SD_PREFER_LOCAL
And turn it on for NUMA and MC domains. This improves
locality in balancing decisions by keeping up to
capacity amount of tasks local before looking for idle
CPUs. (and twice the capacity if SD_POWERSAVINGS_BALANCE
is set.)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-16 08:42:40 +02:00
Jens Axboe
cb684b5bcd block: fix linkage problem with blk_iopoll and !CONFIG_BLOCK
kernel/built-in.o:(.data+0x17b0): undefined reference to `blk_iopoll_enabled'

Since the extern declaration makes the compile work, but the actual
symbol is missing when block/blk-iopoll.o isn't linked in.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2009-09-15 21:53:11 +02:00
Peter Zijlstra
e69b0f1b41 sched: Add a few SYNC hint knobs to play with
Currently we use overlap to weaken the SYNC hint, but allow it to
set the hint as well.

 echo NO_SYNC_WAKEUP > /debug/sched_features
 echo SYNC_MORE > /debug/sched_features

preserves pipe-test behaviour without using the WF_SYNC hint.

Worth playing with on more workloads...

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 19:47:23 +02:00
Peter Zijlstra
63859d4fe4 sched: Fix sync wakeups again
The sync argument rename to introduce WF_* broke stuff by missing a
local alias for an argument in __wake_up_common, fix it by using
the more descriptive wake_flags name.

This restores WF_SYNC propagation, which fixes wake_affine()
behaviour, which fixes pipe-test.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 19:47:22 +02:00
Linus Torvalds
723e9db7a4 Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc
* 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc: (134 commits)
  powerpc/nvram: Enable use Generic NVRAM driver for different size chips
  powerpc/iseries: Fix oops reading from /proc/iSeries/mf/*/cmdline
  powerpc/ps3: Workaround for flash memory I/O error
  powerpc/booke: Don't set DABR on 64-bit BookE, use DAC1 instead
  powerpc/perf_counters: Reduce stack usage of power_check_constraints
  powerpc: Fix bug where perf_counters breaks oprofile
  powerpc/85xx: Fix SMP compile error and allow NULL for smp_ops
  powerpc/irq: Improve nanodoc
  powerpc: Fix some late PowerMac G5 with PCIe ATI graphics
  powerpc/fsl-booke: Use HW PTE format if CONFIG_PTE_64BIT
  powerpc/book3e: Add missing page sizes
  powerpc/pseries: Fix to handle slb resize across migration
  powerpc/powermac: Thermal control turns system off too eagerly
  powerpc/pci: Merge ppc32 and ppc64 versions of phb_scan()
  powerpc/405ex: support cuImage via included dtb
  powerpc/405ex: provide necessary fixup function to support cuImage
  powerpc/40x: Add support for the ESTeem 195E (PPC405EP) SBC
  powerpc/44x: Add Eiger AMCC (AppliedMicro) PPC460SX evaluation board support.
  powerpc/44x: Update Arches defconfig
  powerpc/44x: Update Arches dts
  ...

Fix up conflicts in drivers/char/agp/uninorth-agp.c
2009-09-15 09:51:09 -07:00
Ming Lei
a56af87648 driver-core: move dma-coherent.c from kernel to driver/base
Placing dma-coherent.c in driver/base is better than in kernel,
since it contains code to do per-device coherent dma memory
handling.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2009-09-15 09:50:47 -07:00
Linus Torvalds
ada3fa1505 Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu: (46 commits)
  powerpc64: convert to dynamic percpu allocator
  sparc64: use embedding percpu first chunk allocator
  percpu: kill lpage first chunk allocator
  x86,percpu: use embedding for 64bit NUMA and page for 32bit NUMA
  percpu: update embedding first chunk allocator to handle sparse units
  percpu: use group information to allocate vmap areas sparsely
  vmalloc: implement pcpu_get_vm_areas()
  vmalloc: separate out insert_vmalloc_vm()
  percpu: add chunk->base_addr
  percpu: add pcpu_unit_offsets[]
  percpu: introduce pcpu_alloc_info and pcpu_group_info
  percpu: move pcpu_lpage_build_unit_map() and pcpul_lpage_dump_cfg() upward
  percpu: add @align to pcpu_fc_alloc_fn_t
  percpu: make @dyn_size mandatory for pcpu_setup_first_chunk()
  percpu: drop @static_size from first chunk allocators
  percpu: generalize first chunk allocator selection
  percpu: build first chunk allocators selectively
  percpu: rename 4k first chunk allocator to page
  percpu: improve boot messages
  percpu: fix pcpu_reclaim() locking
  ...

Fix trivial conflict as by Tejun Heo in kernel/sched.c
2009-09-15 09:39:44 -07:00
Linus Torvalds
f199fd9906 Merge branch 'perfcounters-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perfcounters-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  perf_counter: Fix buffer overflow in perf_copy_attr()
2009-09-15 09:34:27 -07:00
Steven Rostedt
6ca6cca31d tracing: optimize global_trace_clock cachelines
The prev_trace_clock_time is only read or written to when the
trace_clock_lock is taken. For better perfomance, they
should share the same cache line.

Reported-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-15 12:24:22 -04:00
Linus Torvalds
227423904c Merge branch 'x86-pat-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-pat-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  x86, pat: Fix cacheflush address in change_page_attr_set_clr()
  mm: remove !NUMA condition from PAGEFLAGS_EXTENDED condition set
  x86: Fix earlyprintk=dbgp for machines without NX
  x86, pat: Sanity check remap_pfn_range for RAM region
  x86, pat: Lookup the protection from memtype list on vm_insert_pfn()
  x86, pat: Add lookup_memtype to get the current memtype of a paddr
  x86, pat: Use page flags to track memtypes of RAM pages
  x86, pat: Generalize the use of page flag PG_uncached
  x86, pat: Add rbtree to do quick lookup in memtype tracking
  x86, pat: Add PAT reserve free to io_mapping* APIs
  x86, pat: New i/f for driver to request memtype for IO regions
  x86, pat: ioremap to follow same PAT restrictions as other PAT users
  x86, pat: Keep identity maps consistent with mmaps even when pat_disabled
  x86, mtrr: make mtrr_aps_delayed_init static bool
  x86, pat/mtrr: Rendezvous all the cpus for MTRR/PAT init
  generic-ipi: Allow cpus not yet online to call smp_call_function with irqs disabled
  x86: Fix an incorrect argument of reserve_bootmem()
  x86: Fix system crash when loading with "reservetop" parameter
2009-09-15 09:19:38 -07:00
Linus Torvalds
1aaf2e5913 Merge branch 'x86-txt-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-txt-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  x86, intel_txt: clean up the impact on generic code, unbreak non-x86
  x86, intel_txt: Handle ACPI_SLEEP without X86_TRAMPOLINE
  x86, intel_txt: Fix typos in Kconfig help
  x86, intel_txt: Factor out the code for S3 setup
  x86, intel_txt: tboot.c needs <asm/fixmap.h>
  intel_txt: Force IOMMU on for Intel TXT launch
  x86, intel_txt: Intel TXT Sx shutdown support
  x86, intel_txt: Intel TXT reboot/halt shutdown support
  x86, intel_txt: Intel TXT boot support
2009-09-15 09:19:20 -07:00
Ashwin Chaugule
7403f41f19 hrtimer: Eliminate needless reprogramming of clock events device
On NOHZ systems the following timers,

-  tick_nohz_restart_sched_tick (tick_sched_timer)
-  hrtimer_start (tick_sched_timer)

are reprogramming the clock events device far more often than needed.
No specific test case was required to observe this effect. This
occurres because there was no check to see if the currently removed or
restarted hrtimer was:

1) the one which previously armed the clock events device.
2) going to be replaced by another timer which has the same expiry time.

Avoid the reprogramming in hrtimer_force_reprogram when the new expiry
value which is evaluated from the clock bases is equal to
cpu_base->expires_next. This results in faster application startup
time by ~4%.

[ tglx: simplified initial solution ]

Signed-off-by: Ashwin Chaugule <ashwinc@quicinc.com>
LKML-Reference: <4AA00165.90609@codeaurora.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2009-09-15 17:09:44 +02:00
Peter Zijlstra
a7558e0105 sched: Add WF_FORK
Avoid the cache buddies from biasing the time distribution away
from fork()ers. Normally the next buddy will be the preferred
scheduling target, but this makes fork()s prefer to run the new
child, whereas we prefer to run the parent, since that will
generate more work.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:51:31 +02:00
Peter Zijlstra
7d47872146 sched: Rename sync arguments
In order to extend the functions to have more than 1 flag (sync),
rename the argument to flags, and explicitly define a WF_ space for
individual flags.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:51:30 +02:00
Peter Zijlstra
0763a660a8 sched: Rename select_task_rq() argument
In order to be able to rename the sync argument, we need to rename
the current flag argument.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:51:29 +02:00
Peter Zijlstra
8e6598af3f sched: Feature to disable APERF/MPERF cpu_power
I suspect a feed-back loop between cpuidle and the aperf/mperf
cpu_power bits, where when we have idle C-states lower the ratio,
which leads to lower cpu_power and then less load, which generates
more idle time, etc..

Put in a knob to disable it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:51:28 +02:00
Peter Zijlstra
d6a59aa3a2 sched: Provide arch_scale_freq_power
Provide an ach specific hook for cpufreq based scaling of
cpu_power.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
[ego@in.ibm.com: spotting bugs]
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:51:24 +02:00
Mike Galbraith
0ec9fab3d1 sched: Improve latencies and throughput
Make the idle balancer more agressive, to improve a
x264 encoding workload provided by Jason Garrett-Glaser:

 NEXT_BUDDY NO_LB_BIAS
 encoded 600 frames, 252.82 fps, 22096.60 kb/s
 encoded 600 frames, 250.69 fps, 22096.60 kb/s
 encoded 600 frames, 245.76 fps, 22096.60 kb/s

 NO_NEXT_BUDDY LB_BIAS
 encoded 600 frames, 344.44 fps, 22096.60 kb/s
 encoded 600 frames, 346.66 fps, 22096.60 kb/s
 encoded 600 frames, 352.59 fps, 22096.60 kb/s

 NO_NEXT_BUDDY NO_LB_BIAS
 encoded 600 frames, 425.75 fps, 22096.60 kb/s
 encoded 600 frames, 425.45 fps, 22096.60 kb/s
 encoded 600 frames, 422.49 fps, 22096.60 kb/s

Peter pointed out that this is better done via newidle_idx,
not via LB_BIAS, newidle balancing should look for where
there is load _now_, not where there was load 2 ticks ago.

Worst-case latencies are improved as well as no buddies
means less vruntime spread. (as per prior lkml discussions)

This change improves kbuild-peak parallelism as well.

Reported-by: Jason Garrett-Glaser <darkshikari@gmail.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1253011667.9128.16.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:51:16 +02:00
Peter Zijlstra
78e7ed53c9 sched: Tweak wake_idx
When merging select_task_rq_fair() and sched_balance_self() we lost
the use of wake_idx, restore that and set them to 0 to make wake
balancing more aggressive.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:01:07 +02:00
Peter Zijlstra
d7c33c4930 sched: Fix task affinity for select_task_rq_fair
While merging select_task_rq_fair() and sched_balance_self() I made
a mistake that leads to testing the wrong task affinty.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:01:07 +02:00
Peter Zijlstra
83f54960c1 sched: for_each_domain() vs RCU
for_each_domain() uses RCU to serialize the sched_domains, except
it doesn't actually use rcu_read_lock() and instead relies on
disabling preemption -> FAIL.

XXX: audit other sched_domain code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:01:06 +02:00
Peter Zijlstra
ae154be1f3 sched: Weaken SD_POWERSAVINGS_BALANCE
One of the problems of power-saving balancing is that under certain
scenarios it is too slow and allows tons of real work to pile up.

Avoid this by ignoring the powersave stuff when there's real work
to be done.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:01:06 +02:00
Peter Zijlstra
c88d591089 sched: Merge select_task_rq_fair() and sched_balance_self()
The problem with wake_idle() is that is doesn't respect things like
cpu_power, which means it doesn't deal well with SMT nor the recent
RT interaction.

To cure this, it needs to do what sched_balance_self() does, which
leads to the possibility of merging select_task_rq_fair() and
sched_balance_self().

Modify sched_balance_self() to:

  - update_shares() when walking up the domain tree,
    (it only called it for the top domain, but it should
     have done this anyway), which allows us to remove
    this ugly bit from try_to_wake_up().

  - do wake_affine() on the smallest domain that contains
    both this (the waking) and the prev (the wakee) cpu for
    WAKE invocations.

Then use the top-down balance steps it had to replace wake_idle().

This leads to the dissapearance of SD_WAKE_BALANCE and
SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE.

SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective.

Touch all topology bits to replace the old with new SD flags --
platforms might need re-tuning, enabling SD_BALANCE_WAKE
conditionally on a NUMA distance seems like a good additional
feature, magny-core and small nehalem systems would want this
enabled, systems with slow interconnects would not.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:01:05 +02:00
Peter Zijlstra
e9c8431185 sched: Add TASK_WAKING
We're going to want to drop rq->lock in try_to_wake_up() for a
longer period of time, however we also want to deal with concurrent
waking of the same task, which is currently handled by holding
rq->lock.

So introduce a new TASK state, namely TASK_WAKING, which indicates
someone is already waking the task (other wakers will fail p->state
& state).

We also keep preemption disabled over the whole ttwu().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:01:05 +02:00
Peter Zijlstra
5f3edc1b1e sched: Hook sched_balance_self() into sched_class::select_task_rq()
Rather ugly patch to fully place the sched_balance_self() code
inside the fair class.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:01:04 +02:00
Peter Zijlstra
aaee1203ca sched: Move sched_balance_self() into sched_fair.c
Move the sched_balance_self() code into sched_fair.c

This facilitates the merger of sched_balance_self() and
sched_fair::select_task_rq().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:01:04 +02:00
Peter Zijlstra
f5f08f39ee sched: Move code around
In preparation to other code movement, move weighted_cpuload(),
source_load() and target_load() before the class includes.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:01:03 +02:00
Peter Zijlstra
e26af0e8b2 sched: Add come comments to the sched features
Add text...

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:01:03 +02:00
Mike Galbraith
3cb63d527f sched: Complete buddy switches
Add a NEXT_BUDDY feature flag to aid in debugging.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:01:02 +02:00
Peter Zijlstra
e6b1b2c9c0 sched: Split WAKEUP_OVERLAP
It consists of two conditions, split them out in separate toggles
so we can test them independently.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:01:02 +02:00
Peter Zijlstra
b78bb868c5 sched: Fix double_rq_lock() compile warning
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 16:01:01 +02:00
Thomas Gleixner
12e09337fe time: Prevent 32 bit overflow with set_normalized_timespec()
set_normalized_timespec() nsec argument is of type long. The recent
timekeeping changes of ktime_get_ts() feed 

	ts->tv_nsec + tomono.tv_nsec + nsecs

to set_normalized_timespec(). On 32 bit machines that sum can be
larger than (1 << 31) and therefor result in a negative value which
screws up the result completely.

Make the nsec argument of set_normalized_timespec() s64 to fix the
problem at hand. This also prevents similar problems for future users
of set_normalized_timespec().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Carsten Emde <carsten.emde@osadl.org>
LKML-Reference: <new-submission>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: John Stultz <johnstul@us.ibm.com>
2009-09-15 10:17:30 +02:00
Xiao Guangrong
b3e62e3505 perf_counter: Fix buffer overflow in perf_copy_attr()
If we pass a big size data over perf_counter_open() syscall,
the kernel will copy this data to a small buffer, it will
cause kernel crash.

This bug makes the kernel unsafe and non-root local user can
trigger it.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Paul Mackerras <paulus@samba.org>
Cc: <stable@kernel.org>
LKML-Reference: <4AAF37D4.5010706@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-15 09:53:31 +02:00
Linus Torvalds
1824090496 Merge branch 'for-linus3' of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/security-testing-2.6
* 'for-linus3' of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/security-testing-2.6:
  SELinux: inline selinux_is_enabled in !CONFIG_SECURITY_SELINUX
  KEYS: Fix garbage collector
  KEYS: Unlock tasklist when exiting early from keyctl_session_to_parent
  CRED: Allow put_cred() to cope with a NULL groups list
  SELinux: flush the avc before disabling SELinux
  SELinux: seperate avc_cache flushing
  Creds: creds->security can be NULL is selinux is disabled
2009-09-14 20:07:31 -07:00
Linus Torvalds
f86054c245 Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/suspend-2.6
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/suspend-2.6: (23 commits)
  at_hdmac: Rework suspend_late()/resume_early()
  PM: Reset transition_started at dpm_resume_noirq
  PM: Update kerneldoc comments in drivers/base/power/main.c
  PM: Add convenience macro to make switching to dev_pm_ops less error-prone
  hp-wmi: Switch driver to dev_pm_ops
  floppy: Switch driver to dev_pm_ops
  PM: Trivial fixes
  PM / Hibernate / Memory hotplug: Always use for_each_populated_zone()
  PM/Hibernate: Do not try to allocate too much memory too hard (rev. 2)
  PM/Hibernate: Do not release preallocated memory unnecessarily (rev. 2)
  PM/Hibernate: Rework shrinking of memory
  PM: Fix typo in label name s/Platofrm_finish/Platform_finish/
  PM: Run-time PM platform device bus support
  PM: Introduce core framework for run-time PM of I/O devices (rev. 17)
  Driver Core: Make PM operations a const pointer
  PM: Remove platform device suspend_late()/resume_early() V2
  USB: Rework musb suspend()/resume_early()
  I2C: Rework i2c-s3c2410 suspend_late()/resume() V2
  I2C: Rework i2c-pxa suspend_late()/resume_early()
  DMA: Rework txx9dmac suspend_late()/resume_early()
  ...

Fix trivial conflict in drivers/base/platform.c (due to same
constification patch being merged in both sides, along with some other
PM work in the PM branch)
2009-09-14 20:03:54 -07:00
Linus Torvalds
c91d7d54ea Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-kconfig
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-kconfig:
  kconfig: add missing dependency of conf to localyesconfig
  kconfig: test if a .config already exists
  kconfig: make local .config default for streamline_config
  kconfig: test for /boot/config-uname after /proc/config.gz in localconfig
  kconfig: unset IKCONFIG_PROC and clean up nesting
  kconfig: search for a config to base the local(mod|yes)config on
  kconfig: keep config.gz around even if CONFIG_IKCONFIG_PROC is not set
  kconfig: have extract-ikconfig read ELF files
  kconfig: add check if end exists in extract-ikconfig
  kconfig: enable CONFIG_IKCONFIG from streamline_config.pl
  kconfig: do not warn about modules built in
  kconfig: streamline_config.pl do not stop with no depends
  kconfig: add make localyesconfig option
  kconfig: make localmodconfig to run streamline_config.pl
  kconfig: add streamline_config.pl to scripts
2009-09-14 19:59:37 -07:00
Mike Frysinger
555f386c98 ftrace: document function and function graph implementation
While implementing function tracer and function tracer graph support,
I found the exact arch implementation details to be a bit lacking
(and my x86 foo ain't great).  So after pounding out support for
the Blackfin arch, start documenting the requirements/details.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
LKML-Reference: <1252973415-21264-1-git-send-email-vapier@gentoo.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-14 21:43:15 -04:00
Linus Torvalds
355bbd8cb8 Merge branch 'for-2.6.32' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.32' of git://git.kernel.dk/linux-2.6-block: (29 commits)
  block: use blkdev_issue_discard in blk_ioctl_discard
  Make DISCARD_BARRIER and DISCARD_NOBARRIER writes instead of reads
  block: don't assume device has a request list backing in nr_requests store
  block: Optimal I/O limit wrapper
  cfq: choose a new next_req when a request is dispatched
  Seperate read and write statistics of in_flight requests
  aoe: end barrier bios with EOPNOTSUPP
  block: trace bio queueing trial only when it occurs
  block: enable rq CPU completion affinity by default
  cfq: fix the log message after dispatched a request
  block: use printk_once
  cciss: memory leak in cciss_init_one()
  splice: update mtime and atime on files
  block: make blk_iopoll_prep_sched() follow normal 0/1 return convention
  cfq-iosched: get rid of must_alloc flag
  block: use interrupts disabled version of raise_softirq_irqoff()
  block: fix comment in blk-iopoll.c
  block: adjust default budget for blk-iopoll
  block: fix long lines in block/blk-iopoll.c
  block: add blk-iopoll, a NAPI like approach for block devices
  ...
2009-09-14 17:55:15 -07:00
Anirban Sinha
353f6dd2de cleanup console_print()
console_print() is an old legacy interface mostly unused in the entire
kernel tree. It's best to clean up its existing use and let developers
use their own implementation of it as they feel fit.

Signed-off-by: Anirban Sinha <asinha@zeugmasystems.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-14 17:41:42 -07:00
David Howells
4a5d6ba191 CRED: Allow put_cred() to cope with a NULL groups list
put_cred() will oops if given a NULL groups list, but that is now possible with
the existence of cred_alloc_blank(), as used in keyctl_session_to_parent().

Added in commit:

	commit ee18d64c1f
	Author: David Howells <dhowells@redhat.com>
	Date:   Wed Sep 2 09:14:21 2009 +0100
	KEYS: Add a keyctl to install a process's session keyring on its parent [try #6]

Reported-by: Marc Dionne <marc.c.dionne@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
2009-09-15 09:10:57 +10:00
Thomas Gleixner
54a6bc0b07 clocksource: Delay clocksource down rating to late boot
The down rating of clock sources in the early boot process via the
clock source watchdog mechanism can happen way before the per cpu
event queues are initialized. This leads to a boot crash on x86 when
the TSC is marked unstable in the SMP bring up.

The selection of a clock source for time keeping happens in the late
boot process so we can safely delay the list manipulation until
clocksource_done_booting() is called.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
2009-09-14 21:59:32 +02:00
Thomas Gleixner
e6c733050f clocksource: clocksource_select must be called with mutex locked
The callers of clocksource_select must hold clocksource_mutex to
protect the clocksource_list.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
2009-09-14 21:59:32 +02:00
Wu Fengguang
8de0307326 PM: Trivial fixes
Fix the definition of BM_BITS_PER_BLOCK and kerneldoc
description of create_bm_block_list().

[rjw: Added changelog.]

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
2009-09-14 20:26:59 +02:00
Gerald Schaefer
98e73dc5d2 PM / Hibernate / Memory hotplug: Always use for_each_populated_zone()
Use for_each_populated_zone() instead of for_each_zone() in hibernation
code. This fixes a bug on s390, where we allow both config options
HIBERNATION and MEMORY_HOTPLUG, so that we also have a ZONE_MOVABLE
here. We only allow hibernation if no memory hotplug operation was
performed, so in fact both features can only be used exclusively, but
this way we don't need 2 differently configured (distribution) kernels.

If we have an unpopulated ZONE_MOVABLE, we allow hibernation but run
into a BUG_ON() in memory_bm_test/set/clear_bit() because hibernation
code iterates through all zones, not only the populated zones, in
several places. For example, swsusp_free() does for_each_zone() and
then checks for pfn_valid(), which is true even if the zone is not
populated, resulting in a BUG_ON() later because the pfn cannot be
found in the memory bitmap.

Replacing all occurences of for_each_zone() in hibernation code with
for_each_populated_zone() would fix this issue.

[rjw: Rebased on top of linux-next hibernation patches.]

Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
2009-09-14 20:26:59 +02:00
Rafael J. Wysocki
ef4aede3f1 PM/Hibernate: Do not try to allocate too much memory too hard (rev. 2)
We want to avoid attempting to free too much memory too hard during
hibernation, so estimate the minimum size of the image to use as the
lower limit for preallocating memory.

The approach here is based on the (experimental) observation that we
can't free more page frames than the sum of:

* global_page_state(NR_SLAB_RECLAIMABLE)
* global_page_state(NR_ACTIVE_ANON)
* global_page_state(NR_INACTIVE_ANON)
* global_page_state(NR_ACTIVE_FILE)
* global_page_state(NR_INACTIVE_FILE)

minus

* global_page_state(NR_FILE_MAPPED)

Namely, if this number is subtracted from the number of saveable
pages in the system, we get a good estimate of the minimum reasonable
size of a hibernation image.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
2009-09-14 20:26:59 +02:00
Rafael J. Wysocki
64a473cb74 PM/Hibernate: Do not release preallocated memory unnecessarily (rev. 2)
Since the hibernation code is now going to use allocations of memory
to make enough room for the image, it can also use the page frames
allocated at this stage as image page frames.  The low-level
hibernation code needs to be rearranged for this purpose, but it
allows us to avoid freeing a great number of pages and allocating
these same pages once again later, so it generally is worth doing.

[rev. 2: Take highmem into account correctly.]

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
2009-09-14 20:26:58 +02:00
Rafael J. Wysocki
4bb334353e PM/Hibernate: Rework shrinking of memory
Rework swsusp_shrink_memory() so that it calls shrink_all_memory()
just once to make some room for the image and then allocates memory
to apply more pressure to the memory management subsystem, if
necessary.

Unfortunately, we don't seem to be able to drop shrink_all_memory()
entirely just yet, because that would lead to huge performance
regressions in some test cases.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
2009-09-14 20:26:58 +02:00
Thadeu Lima de Souza Cascardo
e681c9dd62 PM: Fix typo in label name s/Platofrm_finish/Platform_finish/
Although the same label name is used somewhere else in the file, this
particular label was consistently typoed in all of its uses.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
2009-09-14 20:26:58 +02:00
Rafael J. Wysocki
ac8d513a68 Merge branch 'master' into for-linus 2009-09-14 20:26:05 +02:00
Linus Torvalds
d7e9660ad9 Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next-2.6: (1623 commits)
  netxen: update copyright
  netxen: fix tx timeout recovery
  netxen: fix file firmware leak
  netxen: improve pci memory access
  netxen: change firmware write size
  tg3: Fix return ring size breakage
  netxen: build fix for INET=n
  cdc-phonet: autoconfigure Phonet address
  Phonet: back-end for autoconfigured addresses
  Phonet: fix netlink address dump error handling
  ipv6: Add IFA_F_DADFAILED flag
  net: Add DEVTYPE support for Ethernet based devices
  mv643xx_eth.c: remove unused txq_set_wrr()
  ucc_geth: Fix hangs after switching from full to half duplex
  ucc_geth: Rearrange some code to avoid forward declarations
  phy/marvell: Make non-aneg speed/duplex forcing work for 88E1111 PHYs
  drivers/net/phy: introduce missing kfree
  drivers/net/wan: introduce missing kfree
  net: force bridge module(s) to be GPL
  Subject: [PATCH] appletalk: Fix skb leak when ipddp interface is not loaded
  ...

Fixed up trivial conflicts:

 - arch/x86/include/asm/socket.h

   converted to <asm-generic/socket.h> in the x86 tree.  The generic
   header has the same new #define's, so that works out fine.

 - drivers/net/tun.c

   fix conflict between 89f56d1e9 ("tun: reuse struct sock fields") that
   switched over to using 'tun->socket.sk' instead of the redundantly
   available (and thus removed) 'tun->sk', and 2b980dbd ("lsm: Add hooks
   to the TUN driver") which added a new 'tun->sk' use.

   Noted in 'next' by Stephen Rothwell.
2009-09-14 10:37:28 -07:00
Steven Rostedt
1f5a6b4541 tracing: make testing syscall events a separate configuration
Parag noticed that the number of event tests has increased tremendously:

grep "Testing event" dmesg.31rc9 |wc -l
100

grep "Testing event" dmesg.31git |wc -l
1172

This is due to the testing of every syscall event when ftrace self
test is enabled. This adds a bit more time to kernel boot up and can
affect development by slowing down the time it takes between reboots.

This option makes the testing of the syscall events into a separate
config, to still be able to test most of ftrace internals at boot up
but not have to wait for all the syscall events to be tested.

The syscall event testing only tests the enabling and disabling of
the trace point, since the syscalls are not executed. What really needs
to be done is to somehow have a userspace tool test the syscall tracepoints
as well.

Reported-by: Parag Warudkar <parag.lkml@gmail.com>
LKML-Reference: <f7848160909130815l3e768a30n3b28808bbe5c254b@mail.gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-14 11:58:24 -04:00
Li Zefan
20a58a7723 tracing: remove some unused macros
- remove FTRACE_ENTRY_STRUCT_ONLY()
- remove TRACE_XXX() macros

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AADF6E6.3080606@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-14 11:43:24 -04:00
Li Zefan
05ffa2d020 ftrace: add compile-time check on F_printk()
Make sure F_printk() has corrent format and args, and make sure
changes in F_STRUCT() won't break F_printk().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AADF6CC.1060809@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-14 11:42:10 -04:00
Li Zefan
c16de8fd7a tracing: fix F_printk() typos
I found some typos in F_printk(), so I wrote compile-time check
for it, and triggered some compile errors and warnings.

I've fixed them on x86_32, but I have no x86_64 in my hand, so there
may still be some compile warnings on 64bits.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AADF60B.5070407@cn.fujitsu.com>

[ tested on x86_64, and works fine ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-14 11:40:59 -04:00
Steven Rostedt
ec827c7ece tracing: add static to generated TRACE_EVENT functions
Some of the generated functions used in the TRACE_EVENT macros are
not declared static, but they are not global.

Discovered by sparse.

Reported-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-14 10:50:23 -04:00
Steven Rostedt
08a4081617 ring-buffer: typecast cmpxchg to fix PowerPC warning
The cmpxchg used by PowerPC does the following:

  ({									 \
     __typeof__(*(ptr)) _o_ = (o);					 \
     __typeof__(*(ptr)) _n_ = (n);					 \
     (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,		 \
				    (unsigned long)_n_, sizeof(*(ptr))); \
  })

This does a type check of *ptr to both o and n.

Unfortunately, the code in ring-buffer.c assigns longs to pointers
and pointers to longs and causes a warning on PowerPC:

ring_buffer.c: In function 'rb_head_page_set':
ring_buffer.c:704: warning: initialization makes pointer from integer without a cast
ring_buffer.c:704: warning: initialization makes pointer from integer without a cast
ring_buffer.c: In function 'rb_head_page_replace':
ring_buffer.c:797: warning: initialization makes integer from pointer without a cast

This patch adds the typecasts inside cmpxchg to annotate that a long is
being cast to a pointer and a pointer is being casted to a long and this
removes the PowerPC warnings.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-14 09:41:57 -04:00
Ingo Molnar
f977bb4937 perf_counter, sched: Add sched_stat_runtime tracepoint
This allows more precise tracking of how the scheduler accounts
(and acts upon) a task having spent N nanoseconds of CPU time.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-13 18:17:28 +02:00
Ingo Molnar
459ec28ab4 perf_counter: Allow mmap if paranoid checks are turned off
Before:

  $ perf sched record -f sleep 1
  Error: failed to mmap with 1 (Operation not permitted)

After:

  $ perf sched record -f sleep 1
  [ perf record: Captured and wrote 0.095 MB perf.data (~4161 samples) ]

Note, this is only allowed if perfcounter_paranoid is set to
the most permissive (non-default) value of -1.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-13 17:33:44 +02:00
Steven Rostedt
60ba770227 tracing: add filter event logic to special, mmiotrace and boot tracers
Now that the pluging tracers use macros to create the structures and
automate the exporting of their formats to the format files, they also
automatically get a filter file.

This patch adds the code to implement the filter logic in the trace
recordings.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-12 23:34:04 -04:00
Steven Rostedt
51df5fcbc1 tracing: remove trace_event_types.h
The macros in trace_entries.h have made the code in trace_event_types.h
obsolete. The file is no longer used, so this patch removes it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-12 23:08:11 -04:00
Steven Rostedt
4e5292ea1a tracing: use the new trace_entries.h to create format files
This patch changes the way the format files in

  debugfs/tracing/events/ftrace/*/format

are created. It uses the new trace_entries.h file to automate the
creation of the format files to ensure that they are always in sync
with the actual structures. This is the same methodology used to
create the format files for the TRACE_EVENT macro.

This also updates the filter creation that was built on the creation
of the format files.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-12 23:08:10 -04:00
Steven Rostedt
d73150943c tracing: show details of structures within the ftrace structures
Some of the internal ftrace structures use structures within. The
output of a field saying it is just a structure is useless for a format
file. A binary reader of the ring buffer needs to know more about
how the fields are broken up.

This patch adds to the ftrace structure macros new fields to
describe the structures inside a structure.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-12 23:08:07 -04:00
Steven Rostedt
0a1c49db8d tracing: use macros to create internal ftrace entry ring buffer structures
The entries used by ftrace internal code (plugins) currently have their
formats manually exported to userspace. That is, the format files in
debugfs/tracing/events/ftrace/*/format are currently created by hand.
This is a maintenance nightmare, and can easily become out of sync
with what is actually shown.

This patch uses the methodology of the TRACE_EVENT macros to build
the structures so that their formats can be automated and this
will keep the structures in sync with what users can see.

This patch only changes the way the structures are created. Further
patches will build off of this to automate the format files.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-12 23:08:06 -04:00
Li Zefan
558e6547e4 tracing/profile: fix profile_disable vs module_unload
If the correspoding module is unloaded before ftrace_profile_disable()
is called, event->profile_disable() won't be called, which can
cause oops:

  # insmod trace-events-sample.ko
  # perf record -f -a -e sample:foo_bar sleep 3 &
  # sleep 1
  # rmmod trace_events_sample
  # insmod trace-events-sample.ko
  OOPS!

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A9214E3.2070807@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-12 22:28:38 -04:00
Carsten Emde
b5130b1e7d tracing: do not update tracing_max_latency when tracer is stopped
The state of the function pair tracing_stop()/tracing_start() is
correctly considered when tracer data are updated. However, the global
and externally accessible variable tracing_max_latency is always updated
- even when tracing is stopped.

The update should only occur, if tracing was not stopped.

Signed-off-by: Carsten Emde <C.Emde@osadl.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-12 21:45:17 -04:00
Carsten Emde
41dfba4367 tracing: remove unused local variables in tracer probe functions
When the nsecs_to_usecs() conversion in probe_wakeup_sched_switch() and
check_critical_timing() was moved to a later stage in order to avoid
unnecessary computing, it was overlooked to remove the original
variables, assignments and comments..

Signed-off-by: Carsten Emde <C.Emde@osadl.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-12 21:44:13 -04:00
Linus Torvalds
483e3cd6a3 Merge branch 'tracing-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'tracing-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (105 commits)
  ring-buffer: only enable ring_buffer_swap_cpu when needed
  ring-buffer: check for swapped buffers in start of committing
  tracing: report error in trace if we fail to swap latency buffer
  tracing: add trace_array_printk for internal tracers to use
  tracing: pass around ring buffer instead of tracer
  tracing: make tracing_reset safe for external use
  tracing: use timestamp to determine start of latency traces
  tracing: Remove mentioning of legacy latency_trace file from documentation
  tracing/filters: Defer pred allocation, fix memory leak
  tracing: remove users of tracing_reset
  tracing: disable buffers and synchronize_sched before resetting
  tracing: disable update max tracer while reading trace
  tracing: print out start and stop in latency traces
  ring-buffer: disable all cpu buffers when one finds a problem
  ring-buffer: do not count discarded events
  ring-buffer: remove ring_buffer_event_discard
  ring-buffer: fix ring_buffer_read crossing pages
  ring-buffer: remove unnecessary cpu_relax
  ring-buffer: do not swap buffers during a commit
  ring-buffer: do not reset while in a commit
  ...
2009-09-11 13:24:03 -07:00
Linus Torvalds
774a694f8c Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (64 commits)
  sched: Fix sched::sched_stat_wait tracepoint field
  sched: Disable NEW_FAIR_SLEEPERS for now
  sched: Keep kthreads at default priority
  sched: Re-tune the scheduler latency defaults to decrease worst-case latencies
  sched: Turn off child_runs_first
  sched: Ensure that a child can't gain time over it's parent after fork()
  sched: enable SD_WAKE_IDLE
  sched: Deal with low-load in wake_affine()
  sched: Remove short cut from select_task_rq_fair()
  sched: Turn on SD_BALANCE_NEWIDLE
  sched: Clean up topology.h
  sched: Fix dynamic power-balancing crash
  sched: Remove reciprocal for cpu_power
  sched: Try to deal with low capacity, fix update_sd_power_savings_stats()
  sched: Try to deal with low capacity
  sched: Scale down cpu_power due to RT tasks
  sched: Implement dynamic cpu_power
  sched: Add smt_gain
  sched: Update the cpu_power sum during load-balance
  sched: Add SD_PREFER_SIBLING
  ...
2009-09-11 13:23:18 -07:00
Linus Torvalds
4f0ac85416 Merge branch 'perfcounters-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perfcounters-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (60 commits)
  perf tools: Avoid unnecessary work in directory lookups
  perf stat: Clean up statistics calculations a bit more
  perf stat: More advanced variance computation
  perf stat: Use stddev_mean in stead of stddev
  perf stat: Remove the limit on repeat
  perf stat: Change noise calculation to use stddev
  x86, perf_counter, bts: Do not allow kernel BTS tracing for now
  x86, perf_counter, bts: Correct pointer-to-u64 casts
  x86, perf_counter, bts: Fail if BTS is not available
  perf_counter: Fix output-sharing error path
  perf trace: Fix read_string()
  perf trace: Print out in nanoseconds
  perf tools: Seek to the end of the header area
  perf trace: Fix parsing of perf.data
  perf trace: Sample timestamps as well
  perf_counter: Introduce new (non-)paranoia level to allow raw tracepoint access
  perf trace: Sample the CPU too
  perf tools: Work around strict aliasing related warnings
  perf tools: Clean up warnings list in the Makefile
  perf tools: Complete support for dynamic strings
  ...
2009-09-11 13:22:43 -07:00
Linus Torvalds
d90a7e8640 Merge branch 'irq-threaded-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'irq-threaded-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  genirq: Do not mask oneshot edge type interrupts
  genirq: Support nested threaded irq handling
  genirq: Add buslock support
  genirq: Add oneshot support
2009-09-11 13:21:31 -07:00
Linus Torvalds
12a499612e Merge branch 'irq-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'irq-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  pci/intr_remapping: Allocate irq_iommu on node
  irq: Add irq_node() primitive
  irq: Make sure irq_desc for legacy irq get correct node setting
  genirq: Add prototype for handle_nested_irq()
  irq: Remove superfluous NULL pointer check in check_irq_resend()
  irq: Clean up by removing irqfixup MODULE_PARM_DESC()
  genirq: Fix comment describing suspend_device_irqs()
  genirq: Remove obsolete defines and typedefs
2009-09-11 13:20:42 -07:00
Linus Torvalds
eee2775d99 Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (28 commits)
  rcu: Move end of special early-boot RCU operation earlier
  rcu: Changes from reviews: avoid casts, fix/add warnings, improve comments
  rcu: Create rcutree plugins to handle hotplug CPU for multi-level trees
  rcu: Remove lockdep annotations from RCU's _notrace() API members
  rcu: Add #ifdef to suppress __rcu_offline_cpu() warning in !HOTPLUG_CPU builds
  rcu: Add CPU-offline processing for single-node configurations
  rcu: Add "notrace" to RCU function headers used by ftrace
  rcu: Remove CONFIG_PREEMPT_RCU
  rcu: Merge preemptable-RCU functionality into hierarchical RCU
  rcu: Simplify rcu_pending()/rcu_check_callbacks() API
  rcu: Use debugfs_remove_recursive() simplify code.
  rcu: Merge per-RCU-flavor initialization into pre-existing macro
  rcu: Fix online/offline indication for rcudata.csv trace file
  rcu: Consolidate sparse and lockdep declarations in include/linux/rcupdate.h
  rcu: Renamings to increase RCU clarity
  rcu: Move private definitions from include/linux/rcutree.h to kernel/rcutree.h
  rcu: Expunge lingering references to CONFIG_CLASSIC_RCU, optimize on !SMP
  rcu: Delay rcu_barrier() wait until beginning of next CPU-hotunplug operation.
  rcu: Fix typo in rcu_irq_exit() comment header
  rcu: Make rcupreempt_trace.c look at offline CPUs
  ...
2009-09-11 13:20:18 -07:00
Linus Torvalds
53e16fbd30 Merge branch 'core-printk-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'core-printk-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  printk: Fix "printk: Enable the use of more than one CON_BOOT (early console)"
  printk: Restore previous console_loglevel when re-enabling logging
  printk: Ensure that "console enabled" messages are printed on the console
  printk: Enable the use of more than one CON_BOOT (early console)
2009-09-11 13:19:40 -07:00
Linus Torvalds
4e3408d9f7 Merge branch 'core-locking-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'core-locking-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (32 commits)
  locking, m68k/asm-offsets: Rename signal defines
  locking: Inline spinlock code for all locking variants on s390
  locking: Simplify spinlock inlining
  locking: Allow arch-inlined spinlocks
  locking: Move spinlock function bodies to header file
  locking, m68k: Calculate thread_info offset with asm offset
  locking, m68k/asm-offsets: Rename pt_regs offset defines
  locking, sparc: Rename __spin_try_lock() and friends
  locking, powerpc: Rename __spin_try_lock() and friends
  lockdep: Remove recursion stattistics
  lockdep: Simplify lock_stat seqfile code
  lockdep: Simplify lockdep_chains seqfile code
  lockdep: Simplify lockdep seqfile code
  lockdep: Fix missing entries in /proc/lock_chains
  lockdep: Fix missing entry in /proc/lock_stat
  lockdep: Fix memory usage info of BFS
  lockdep: Reintroduce generation count to make BFS faster
  lockdep: Deal with many similar locks
  lockdep: Introduce lockdep_assert_held()
  lockdep: Fix style nits
  ...
2009-09-11 13:17:24 -07:00
Linus Torvalds
7193bea53f Merge branch 'core-futexes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'core-futexes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  futex: Detect mismatched requeue targets
  futex: Correct futex_wait_requeue_pi() commentary
2009-09-11 13:16:22 -07:00
Linus Torvalds
989aa44a5f Merge branch 'core-debug-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'core-debug-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  debug lockups: Improve lockup detection, fix generic arch fallback
  debug lockups: Improve lockup detection
2009-09-11 13:15:55 -07:00
Linus Torvalds
4004f02d7a Merge branch 'core-cleanups-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'core-cleanups-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  workqueues: Improve schedule_work() documentation
2009-09-11 13:13:32 -07:00
jolsa@redhat.com
689fd8b65d tracing: trace parser support for function and graph
Convert the writing to 'set_graph_function', 'set_ftrace_filter'
and 'set_ftrace_notrace' to use the generic trace_parser
'trace_get_user' function.

Removed FTRACE_ITER_CONT flag, since it's not needed after this change.

Minor fix in set_graph_function display - g_show function.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1252682969-3366-4-git-send-email-jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-11 15:20:18 -04:00
jolsa@redhat.com
489663644c tracing: trace parser support for set_event
Convert the parsing of the file 'set_event' to use the generic
trace_praser 'trace_get_user' function.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1252682969-3366-3-git-send-email-jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-11 14:47:11 -04:00
jolsa@redhat.com
b63f39ea50 tracing: create generic trace parser
Create a "trace_parser" that can parse the user space input for
separate words.

struct trace_parser is the descriptor.

Generic "trace_get_user" function that can be a helper to read multiple
words passed in by user space.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1252682969-3366-2-git-send-email-jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-11 14:46:55 -04:00
Steven Rostedt
f81c972d27 tracing: consolidate code between trace_output.c and trace_function_graph.c
Both trace_output.c and trace_function_graph.c do basically the same
thing to handle the printing of the latency-format. This patch moves
the code into one function that both can use.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-11 14:24:13 -04:00
Martin Schwidefsky
f79e0258ea clocksource: Resolve cpu hotplug dead lock with TSC unstable, fix crash
The watchdog timer is started after the watchdog clocksource
and at least one watched clocksource have been registered. The
clocksource work element watchdog_work is initialized just
before the clocksource timer is started. This is too late for
the clocksource_mark_unstable call from native_cpu_up. To fix
this use a static initializer for watchdog_work.

This resolves a boot crash reported by multiple people.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <20090911153305.3fe9a361@skybase>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-11 20:17:18 +02:00
Steven Rostedt
637e7e8641 tracing: add lock depth to entries
This patch adds the lock depth of the big kernel lock to the generic
entry header. This way we can see the depth of the lock and help
in removing the BKL.

Example:

 #                  _------=> CPU#
 #                 / _-----=> irqs-off
 #                | / _----=> need-resched
 #                || / _---=> hardirq/softirq
 #                ||| / _--=> preempt-depth
 #                |||| /_--=> lock-depth
 #                |||||/     delay
 #  cmd     pid   |||||| time  |   caller
 #     \   /      ||||||   \   |   /
   <idle>-0       2.N..3 5902255250us+: lock_acquire: read rcu_read_lock
   <idle>-0       2.N..3 5902255253us+: lock_release: rcu_read_lock
   <idle>-0       2dN..3 5902255257us+: lock_acquire: xtime_lock
   <idle>-0       2dN..4 5902255259us : lock_acquire: clocksource_lock
   <idle>-0       2dN..4 5902255261us+: lock_release: clocksource_lock

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-11 13:55:35 -04:00
Linus Torvalds
a12e4d304c Merge branch 'writeback' of git://git.kernel.dk/linux-2.6-block
* 'writeback' of git://git.kernel.dk/linux-2.6-block:
  writeback: check for registered bdi in flusher add and inode dirty
  writeback: add name to backing_dev_info
  writeback: add some debug inode list counters to bdi stats
  writeback: get rid of pdflush completely
  writeback: switch to per-bdi threads for flushing data
  writeback: move dirty inodes from super_block to backing_dev_info
  writeback: get rid of generic_sync_sb_inodes() export
2009-09-11 09:17:05 -07:00
Steven Rostedt
48659d3119 tracing: move tgid out of generic entry and into userstack
The userstack trace required the recording of the tgid entry.
Unfortunately, it was added to the generic entry where it wasted
4 bytes of every entry and was only used by one entry.

This patch moves it out of the generic field and moves it into the
only user (userstack_entry).

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-11 11:36:23 -04:00
Steven Rostedt
49ff590390 tracing: add latency format to function_graph tracer
While debugging something with the function_graph tracer, I found the
need to see the preempt count of the traces. Unfortunately, since
the function graph tracer has its own output formatting, it does not
honor the latency-format option.

This patch makes the function_graph tracer honor the latency-format
option, but still keeps control of the output. But now we have the
same details that the latency-format supplies.

 # tracer: function_graph
 #
 #      _-----=> irqs-off
 #     / _----=> need-resched
 #    | / _---=> hardirq/softirq
 #    || / _--=> preempt-depth
 #    ||| /
 #    ||||
 # CPU||||  DURATION                  FUNCTION CALLS
 # |  ||||   |   |                     |   |   |   |
  3)  d..1  1.333 us    |        idle_cpu();
  3)  d.h1              |        tick_check_idle() {
  3)  d.h1  0.550 us    |          tick_check_oneshot_broadcast();
  3)  d.h1              |          tick_nohz_stop_idle() {
  3)  d.h1              |            ktime_get() {
  3)  d.h1              |              ktime_get_ts() {

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-11 10:59:49 -04:00
Jens Axboe
5e605b64a1 block: add blk-iopoll, a NAPI like approach for block devices
This borrows some code from NAPI and implements a polled completion
mode for block devices. The idea is the same as NAPI - instead of
doing the command completion when the irq occurs, schedule a dedicated
softirq in the hopes that we will complete more IO when the iopoll
handler is invoked. Devices have a budget of commands assigned, and will
stay in polled mode as long as they continue to consume their budget
from the iopoll softirq handler. If they do not, the device is set back
to interrupt completion mode.

This patch holds the core bits for blk-iopoll, device driver support
sold separately.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2009-09-11 14:33:31 +02:00
Jens Axboe
d993831fa7 writeback: add name to backing_dev_info
This enables us to track who does what and print info. Its main use
is catching dirty inodes on the default_backing_dev_info, so we can
fix that up.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2009-09-11 09:20:26 +02:00
James Morris
a3c8b97396 Merge branch 'next' into for-linus 2009-09-11 08:04:49 +10:00
Ingo Molnar
e1f8450854 sched: Fix sched::sched_stat_wait tracepoint field
This weird perf trace output:

  cc1-9943  [001]  2802.059479616: sched_stat_wait: task: as:9944 wait: 2801938766276 [ns]

Is caused by setting one component field of the delta to zero
a bit too early. Move it to later.

( Note, this does not affect the NEW_FAIR_SLEEPERS interactivity bug,
  it's just a reporting bug in essence. )

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nikos Chantziaras <realnc@arcor.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <4AA93D34.8040500@arcor.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-10 20:52:54 +02:00
Ingo Molnar
3f2aa307c4 sched: Disable NEW_FAIR_SLEEPERS for now
Nikos Chantziaras and Jens Axboe reported that turning off
NEW_FAIR_SLEEPERS improves desktop interactivity visibly.

Nikos described his experiences the following way:

  " With this setting, I can do "nice -n 19 make -j20" and
    still have a very smooth desktop and watch a movie at
    the same time.  Various other annoyances (like the
    "logout/shutdown/restart" dialog of KDE not appearing
    at all until the background fade-out effect has finished)
    are also gone.  So this seems to be the single most
    important setting that vastly improves desktop behavior,
    at least here. "

Jens described it the following way, referring to a 10-seconds
xmodmap scheduling delay he was trying to debug:

  " Then I tried switching NO_NEW_FAIR_SLEEPERS on, and then
    I get:

    Performance counter stats for 'xmodmap .xmodmap-carl':

         9.009137  task-clock-msecs         #      0.447 CPUs
               18  context-switches         #      0.002 M/sec
                1  CPU-migrations           #      0.000 M/sec
              315  page-faults              #      0.035 M/sec

    0.020167093  seconds time elapsed

    Woot! "

So disable it for now. In perf trace output i can see weird
delta timestamps:

  cc1-9943  [001]  2802.059479616: sched_stat_wait: task: as:9944 wait: 2801938766276 [ns]

That nsec field is not supposed to be that large. More digging
is needed - but lets turn it off while the real bug is found.

Reported-by: Nikos Chantziaras <realnc@arcor.de>
Tested-by: Nikos Chantziaras <realnc@arcor.de>
Reported-by: Jens Axboe <jens.axboe@oracle.com>
Tested-by: Jens Axboe <jens.axboe@oracle.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <4AA93D34.8040500@arcor.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-10 20:34:48 +02:00
Li Zefan
197e2eabc9 tracing: move PRED macros to trace_events_filter.c
Move DEFINE_COMPARISON_PRED() and DEFINE_EQUALITY_PRED()
  to kernel/trace/trace_events_filter.c

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AA8579B.4020706@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-09 23:54:11 -04:00
Li Zefan
a5921c6c37 tracing: remove stats from struct tracer
Remove unused field @stats from struct tracer.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AA8579B.4020706@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-09 23:54:09 -04:00
Li Zefan
bd9cfca9cb tracing: format clean ups
Fix white-space formatting.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AA8579B.4020706@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-09 23:54:07 -04:00
Li Zefan
e0ab5f2dae tracing: remove dead code
Removes unreachable code.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AA8579B.4020706@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-09-09 23:54:06 -04:00