From 1d93e52eb48df986a3c4d5ad8a520bf1f6837367 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Feb 2009 08:47:19 -0700 Subject: [PATCH 001/580] dmaengine: update kerneldoc Some of the kerneldoc comments in the dmaengine header describe already removed structure members. Remove them. Also add a short description for dma_device->device_is_tx_complete. Signed-off-by: Johannes Weiner Signed-off-by: Dan Williams --- include/linux/dmaengine.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 3e68469c1885..087e79acf8c7 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -97,7 +97,6 @@ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; /** * struct dma_chan_percpu - the per-CPU part of struct dma_chan - * @refcount: local_t used for open-coded "bigref" counting * @memcpy_count: transaction counter * @bytes_transferred: byte counter */ @@ -114,9 +113,6 @@ struct dma_chan_percpu { * @cookie: last cookie value returned to client * @chan_id: channel ID for sysfs * @dev: class device for sysfs - * @refcount: kref, used in "bigref" slow-mode - * @slow_ref: indicates that the DMA channel is free - * @rcu: the DMA channel's RCU head * @device_node: used to add this to the device chan list * @local: per-cpu pointer to a struct dma_chan_percpu * @client-count: how many clients are using this channel @@ -211,8 +207,6 @@ struct dma_async_tx_descriptor { * @global_node: list_head for global dma_device_list * @cap_mask: one or more dma_capability flags * @max_xor: maximum number of xor sources, 0 if no capability - * @refcount: reference count - * @done: IO completion struct * @dev_id: unique device ID * @dev: struct device reference for dma mapping api * @device_alloc_chan_resources: allocate resources and return the @@ -225,6 +219,7 @@ struct dma_async_tx_descriptor { * @device_prep_dma_interrupt: prepares an end of chain interrupt operation * @device_prep_slave_sg: prepares a slave dma operation * @device_terminate_all: terminate all pending operations + * @device_is_tx_complete: poll for transaction completion * @device_issue_pending: push pending transactions to hardware */ struct dma_device { From 973a2dd1d50a11d380086601f14e59116f93e8c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:32 +0100 Subject: [PATCH 002/580] x86, mce: disable machine checks on suspend Impact: Bug fix During suspend it is not reliable to process machine check exceptions, because CPUs disappear but can still get machine check broadcasts. Also the system is slightly more likely to machine check them, but the handler is typically not a position to handle them in a meaningfull way. So disable them during suspend and enable them during resume. Also make sure they are always disabled on hot-unplugged CPUs. This new code assumes that suspend always hotunplugs all non BP CPUs. v2: Remove the WARN_ONs Thomas objected to. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 25cf624eccb7..5ed80991ab9e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -728,6 +728,29 @@ __setup("mce=", mcheck_enable); * Sysfs support */ +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static int mce_disable(void) +{ + int i; + + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + return 0; +} + +static int mce_suspend(struct sys_device *dev, pm_message_t state) +{ + return mce_disable(); +} + +static int mce_shutdown(struct sys_device *dev) +{ + return mce_disable(); +} + /* On resume clear all MCE state. Don't want to see leftovers from the BIOS. Only one CPU is active at this time, the others get readded later using CPU hotplug. */ @@ -752,6 +775,8 @@ static void mce_restart(void) } static struct sysdev_class mce_sysclass = { + .suspend = mce_suspend, + .shutdown = mce_shutdown, .resume = mce_resume, .name = "machinecheck", }; From 123aa76ec0cab5d4881cd8509faed43231e68801 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:27 +0100 Subject: [PATCH 003/580] x86, mce: don't disable machine checks during code patching Impact: low priority bug fix This removes part of a a patch I added myself some time ago. After some consideration the patch was a bad idea. In particular it stopped machine check exceptions during code patching. To quote the comment: * MCEs only happen when something got corrupted and in this * case we must do something about the corruption. * Ignoring it is worse than a unlikely patching race. * Also machine checks tend to be broadcast and if one CPU * goes into machine check the others follow quickly, so we don't * expect a machine check to cause undue problems during to code * patching. So undo the machine check related parts of 8f4e956b313dcccbc7be6f10808952345e3b638c NMIs are still disabled. This only removes code, the only additions are a new comment. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 -- arch/x86/kernel/alternative.c | 17 +++++++++++------ arch/x86/kernel/cpu/mcheck/mce_32.c | 14 -------------- arch/x86/kernel/cpu/mcheck/mce_64.c | 14 -------------- 4 files changed, 11 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 32c6e17b960b..5522273a3ad8 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -120,8 +120,6 @@ extern void mcheck_init(struct cpuinfo_x86 *c); #else #define mcheck_init(c) do { } while (0) #endif -extern void stop_mce(void); -extern void restart_mce(void); #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a84ac7b570e6..5b8394a3a6b2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -414,9 +414,17 @@ void __init alternative_instructions(void) that might execute the to be patched code. Other CPUs are not running. */ stop_nmi(); -#ifdef CONFIG_X86_MCE - stop_mce(); -#endif + + /* + * Don't stop machine check exceptions while patching. + * MCEs only happen when something got corrupted and in this + * case we must do something about the corruption. + * Ignoring it is worse than a unlikely patching race. + * Also machine checks tend to be broadcast and if one CPU + * goes into machine check the others follow quickly, so we don't + * expect a machine check to cause undue problems during to code + * patching. + */ apply_alternatives(__alt_instructions, __alt_instructions_end); @@ -456,9 +464,6 @@ void __init alternative_instructions(void) (unsigned long)__smp_locks_end); restart_nmi(); -#ifdef CONFIG_X86_MCE - restart_mce(); -#endif } /** diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index dfaebce3633e..3552119b091d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c) } } -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ - old_cr4 = read_cr4(); - clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ - if (old_cr4 & X86_CR4_MCE) - set_in_cr4(X86_CR4_MCE); -} - static int __init mcheck_disable(char *str) { mce_disabled = 1; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 5ed80991ab9e..25ccdbec86e2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -680,20 +680,6 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; -static unsigned long old_cr4 __initdata; - -void __init stop_mce(void) -{ - old_cr4 = read_cr4(); - clear_in_cr4(X86_CR4_MCE); -} - -void __init restart_mce(void) -{ - if (old_cr4 & X86_CR4_MCE) - set_in_cr4(X86_CR4_MCE); -} - /* * Old style boot options parsing. Only for compatibility. */ From 9bd984058088d6ef7af6946591a207e51a2f4890 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:28 +0100 Subject: [PATCH 004/580] x86, mce: always use separate work queue to run trigger Impact: Needed for bug fix in next patch This relaxes the requirement that mce_notify_user has to run in process context. Useful for future changes, but also leads to cleaner behaviour now. Now instead mce_notify_user can be called directly from interrupt (but not NMI) context. The work queue only uses a single global work struct, which can be done safely because it is always free to reuse before the trigger function is executed. This way no events can be lost. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 25ccdbec86e2..18b379cf0610 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -380,11 +380,17 @@ static void mcheck_timer(struct work_struct *work) schedule_delayed_work(&mcheck_work, next_interval); } +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + /* - * This is only called from process context. This is where we do - * anything we need to alert userspace about new MCEs. This is called - * directly from the poller and also from entry.S and idle, thanks to - * TIF_MCE_NOTIFY. + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. */ int mce_notify_user(void) { @@ -394,9 +400,14 @@ int mce_notify_user(void) unsigned long now = jiffies; wake_up_interruptible(&mce_wait); - if (trigger[0]) - call_usermodehelper(trigger, trigger_argv, NULL, - UMH_NO_WAIT); + + /* + * There is no risk of missing notifications because + * work_pending is always cleared before the function is + * executed. + */ + if (trigger[0] && !work_pending(&mce_trigger_work)) + schedule_work(&mce_trigger_work); if (time_after_eq(now, last_print + (check_interval*HZ))) { last_print = now; From 52d168e28bc11dd026b620fe1767cadde5a747cd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:29 +0100 Subject: [PATCH 005/580] x86, mce: switch machine check polling to per CPU timer Impact: Higher priority bug fix The machine check poller runs a single timer and then broadcasted an IPI to all CPUs to check them. This leads to unnecessary synchronization between CPUs. The original CPU running the timer has to wait potentially a long time for all other CPUs answering. This is also real time unfriendly and in general inefficient. This was especially a problem on systems with a lot of events where the poller run with a higher frequency after processing some events. There could be more and more CPU time wasted with this, to the point of significantly slowing down machines. The machine check polling is actually fully independent per CPU, so there's no reason to not just do this all with per CPU timers. This patch implements that. Also switch the poller also to use standard timers instead of work queues. It was using work queues to be able to execute a user program on a event, but mce_notify_user() handles this case now with a separate callback. So instead always run the poll code in in a standard per CPU timer, which means that in the common case of not having to execute a trigger there will be less overhead. This allows to clean up the initialization significantly, because standard timers are already up when machine checks get init'ed. No multiple initialization functions. Thanks to Thomas Gleixner for some help. Cc: thockin@google.com v2: Use del_timer_sync() on cpu shutdown and don't try to handle migrated timers. v3: Add WARN_ON for timer running on unexpected CPU Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 68 +++++++++++++++++++---------- 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 18b379cf0610..3f0550d16f3c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -353,18 +353,17 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status) static int check_interval = 5 * 60; /* 5 minutes */ static int next_interval; /* in jiffies */ -static void mcheck_timer(struct work_struct *work); -static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); +static void mcheck_timer(unsigned long); +static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_check_cpu(void *info) +static void mcheck_timer(unsigned long data) { + struct timer_list *t = &per_cpu(mce_timer, data); + + WARN_ON(smp_processor_id() != data); + if (mce_available(¤t_cpu_data)) do_machine_check(NULL, 0); -} - -static void mcheck_timer(struct work_struct *work) -{ - on_each_cpu(mcheck_check_cpu, NULL, 1); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -377,7 +376,8 @@ static void mcheck_timer(struct work_struct *work) (int)round_jiffies_relative(check_interval*HZ)); } - schedule_delayed_work(&mcheck_work, next_interval); + t->expires = jiffies + next_interval; + add_timer(t); } static void mce_do_trigger(struct work_struct *work) @@ -436,16 +436,11 @@ static struct notifier_block mce_idle_notifier = { static __init int periodic_mcheck_init(void) { - next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); - idle_notifier_register(&mce_idle_notifier); - return 0; + idle_notifier_register(&mce_idle_notifier); + return 0; } __initcall(periodic_mcheck_init); - /* * Initialize Machine Checks for a CPU. */ @@ -515,6 +510,20 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) } } +static void mce_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + + /* data race harmless because everyone sets to the same value */ + if (!next_interval) + next_interval = check_interval * HZ; + if (!next_interval) + return; + setup_timer(t, mcheck_timer, smp_processor_id()); + t->expires = round_jiffies_relative(jiffies + next_interval); + add_timer(t); +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off. @@ -529,6 +538,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) mce_init(NULL); mce_cpu_features(c); + mce_init_timer(); } /* @@ -758,17 +768,19 @@ static int mce_resume(struct sys_device *dev) return 0; } +static void mce_cpu_restart(void *data) +{ + del_timer_sync(&__get_cpu_var(mce_timer)); + if (mce_available(¤t_cpu_data)) + mce_init(NULL); + mce_init_timer(); +} + /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { - if (next_interval) - cancel_delayed_work(&mcheck_work); - /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1); next_interval = check_interval * HZ; - if (next_interval) - schedule_delayed_work(&mcheck_work, - round_jiffies_relative(next_interval)); + on_each_cpu(mce_cpu_restart, NULL, 1); } static struct sysdev_class mce_sysclass = { @@ -899,6 +911,7 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; + struct timer_list *t = &per_cpu(mce_timer, cpu); switch (action) { case CPU_ONLINE: @@ -913,6 +926,15 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, threshold_cpu_callback(action, cpu); mce_remove_device(cpu); break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + del_timer_sync(t); + break; + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + t->expires = round_jiffies_relative(jiffies + next_interval); + add_timer_on(t, cpu); + break; } return NOTIFY_OK; } From 5b4408fdaa62474dd9485cddb9126370d90d4b82 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:30 +0100 Subject: [PATCH 006/580] x86, mce: don't set up mce sysdev devices with mce=off Impact: bug fix, in this case the resume handler shouldn't run which avoids incorrectly reenabling machine checks on resume When MCEs are completely disabled on the command line don't set up the sysdev devices for them either. Includes a comment fix from Thomas Gleixner. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 3f0550d16f3c..4e2b1bc5131c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -151,6 +151,8 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) static int mce_available(struct cpuinfo_x86 *c) { + if (mce_dont_init) + return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -532,8 +534,7 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { mce_cpu_quirks(c); - if (mce_dont_init || - !mce_available(c)) + if (!mce_available(c)) return; mce_init(NULL); @@ -710,8 +711,7 @@ static int __init mcheck_disable(char *str) return 1; } -/* mce=off disables machine check. Note you can re-enable it later - using sysfs. +/* mce=off disables machine check. mce=TOLERANCELEVEL (number, see above) mce=bootlog Log MCEs from before booting. Disabled by default on AMD. mce=nobootlog Don't log MCEs from before booting. */ From d6b75584a3eaab8cb2ab3e8cf90c5e57c1928a85 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:39:31 +0100 Subject: [PATCH 007/580] x86, mce: disable machine checks on offlined CPUs Impact: Lower priority bug fix Offlined CPUs could still get machine checks, but the machine check handler cannot handle them properly, leading to an unconditional crash. Disable machine checks on CPUs that are going down. Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 4e2b1bc5131c..1db94c0d5aaf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -906,6 +906,27 @@ static __cpuinit void mce_remove_device(unsigned int cpu) cpu_clear(cpu, mce_device_initialized); } +/* Make sure there are no machine checks on offlined CPUs. */ +static void __cpuexit mce_disable_cpu(void *h) +{ + int i; + + if (!mce_available(¤t_cpu_data)) + return; + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); +} + +static void __cpuexit mce_reenable_cpu(void *h) +{ + int i; + + if (!mce_available(¤t_cpu_data)) + return; + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); +} + /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -929,11 +950,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: del_timer_sync(t); + smp_call_function_single(cpu, mce_disable_cpu, NULL, 1); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies_relative(jiffies + next_interval); add_timer_on(t, cpu); + smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1); break; } return NOTIFY_OK; From ef41df4344ff952c79746d44a6126bd2cf7ed2bc Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 12 Feb 2009 13:39:34 +0100 Subject: [PATCH 008/580] x86, mce: fix a race condition in mce_read() Impact: bugfix Considering the situation as follow: before: mcelog.next == 1, mcelog.entry[0].finished = 1 +-------------------------------------------------------------------------- R W1 W2 W3 read mcelog.next (1) mcelog.next++ (2) (working on entry 1, finished == 0) mcelog.next = 0 mcelog.next++ (1) (working on entry 0) mcelog.next++ (2) (working on entry 1) <----------------- race ----------------> (done on entry 1, finished = 1) (done on entry 1, finished = 1) To fix the race condition, a cmpxchg loop is added to mce_read() to ensure no new MCE record can be added between mcelog.next reading and mcelog.next = 0. Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 39 +++++++++++++++++------------ 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 1db94c0d5aaf..870d08deccf7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -595,7 +595,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, { unsigned long *cpu_tsc; static DEFINE_MUTEX(mce_read_mutex); - unsigned next; + unsigned prev, next; char __user *buf = ubuf; int i, err; @@ -614,25 +614,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, } err = 0; - for (i = 0; i < next; i++) { - unsigned long start = jiffies; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; - while (!mcelog.entry[i].finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i,0, sizeof(struct mce)); - goto timeout; + while (!mcelog.entry[i].finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(mcelog.entry + i, 0, + sizeof(struct mce)); + goto timeout; + } + cpu_relax(); } - cpu_relax(); + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, + sizeof(struct mce)); + buf += sizeof(struct mce); +timeout: + ; } - smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); - buf += sizeof(struct mce); - timeout: - ; - } - memset(mcelog.entry, 0, next * sizeof(struct mce)); - mcelog.next = 0; + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); synchronize_sched(); From e35849e910a6543d37c0d13648ef166678d03565 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:43:20 +0100 Subject: [PATCH 009/580] x86, mce: enable machine checks in 64-bit defconfig Impact: Low priority fix The 32-bit defconfig already had it enabled. And it's a pretty fundamental feature, so better enable it on 64 bits too. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/configs/x86_64_defconfig | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 322dd2748fc9..786ce5ac2acc 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -247,7 +247,10 @@ CONFIG_PREEMPT_VOLUNTARY=y # CONFIG_PREEMPT is not set CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y -# CONFIG_X86_MCE is not set +CONFIG_X86_MCE=y +CONFIG_X86_NEW_MCE=y +CONFIG_X86_MCE_INTEL=y +CONFIG_X86_MCE_AMD=y # CONFIG_I8K is not set CONFIG_MICROCODE=y CONFIG_MICROCODE_OLD_INTERFACE=y From 0d7482e3d76522157c9d741d79fce22c401fa0c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 17 Feb 2009 23:07:13 +0100 Subject: [PATCH 010/580] x86, mce: implement dynamic machine check banks support Impact: cleanup; making code future proof; memory saving on small systems This patch replaces the hardcoded max number of machine check banks with dynamic allocation depending on what the CPU reports. The sysfs data structures and the banks array are dynamically allocated. There is still a hard bank limit (128) because the mcelog protocol uses banks >= 128 as pseudo banks to escape other events. But we expect that 128 banks is beyond any reasonable CPU for now. This supersedes an earlier patch by Venki, but it solves the problem more completely by making the limit fully dynamic (up to the 128 boundary). This saves some memory on machines with less than 6 banks because they won't need sysdevs for unused ones and also allows to use sysfs to control these banks on possible future CPUs with more than 6 banks. This is an updated patch addressing Venki's comments. I also added in another patch from Thomas which fixed the error allocation path (that patch was previously separated) Cc: Venki Pallipadi Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 155 +++++++++++++++++++++------- 1 file changed, 119 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 870d08deccf7..2297730bb514 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include #include @@ -32,7 +34,12 @@ #include #define MISC_MCELOG_MINOR 227 -#define NR_SYSFS_BANKS 6 + +/* + * To support more than 128 would need to escape the predefined + * Linux defined extended banks first. + */ +#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) atomic_t mce_entry; @@ -47,7 +54,7 @@ static int mce_dont_init; */ static int tolerant = 1; static int banks; -static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL }; +static u64 *bank; static unsigned long notify_user; static int rip_msr; static int mce_bootlog = -1; @@ -212,7 +219,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { - if (i < NR_SYSFS_BANKS && !bank[i]) + if (!bank[i]) continue; m.misc = 0; @@ -446,37 +453,54 @@ __initcall(periodic_mcheck_init); /* * Initialize Machine Checks for a CPU. */ +static int mce_cap_init(void) +{ + u64 cap; + unsigned b; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + b = cap & 0xff; + if (b > MAX_NR_BANKS) { + printk(KERN_WARNING + "MCE: Using only %u machine check banks out of %u\n", + MAX_NR_BANKS, b); + b = MAX_NR_BANKS; + } + + /* Don't support asymmetric configurations today */ + WARN_ON(banks != 0 && b != banks); + banks = b; + if (!bank) { + bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); + if (!bank) + return -ENOMEM; + memset(bank, 0xff, banks * sizeof(u64)); + } + + /* Use accurate RIP reporting if available. */ + if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) + rip_msr = MSR_IA32_MCG_EIP; + + return 0; +} + static void mce_init(void *dummy) { u64 cap; int i; - rdmsrl(MSR_IA32_MCG_CAP, cap); - banks = cap & 0xff; - if (banks > MCE_EXTENDED_BANK) { - banks = MCE_EXTENDED_BANK; - printk(KERN_INFO "MCE: warning: using only %d banks\n", - MCE_EXTENDED_BANK); - } - /* Use accurate RIP reporting if available. */ - if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9) - rip_msr = MSR_IA32_MCG_EIP; - /* Log the machine checks left over from the previous reset. This also clears all registers */ do_machine_check(NULL, mce_bootlog ? -1 : -2); set_in_cr4(X86_CR4_MCE); + rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (i < NR_SYSFS_BANKS) - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); - else - wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL); - + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } @@ -486,10 +510,10 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if(c->x86 == 15) + if (c->x86 == 15 && banks > 4) /* disable GART TBL walk error reporting, which trips off incorrectly with the IOMMU & 3ware & Cerberus. */ - clear_bit(10, &bank[4]); + clear_bit(10, (unsigned long *)&bank[4]); if(c->x86 <= 17 && mce_bootlog < 0) /* Lots of broken BIOS around that don't clear them by default and leave crap in there. Don't log. */ @@ -532,11 +556,15 @@ static void mce_init_timer(void) */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - mce_cpu_quirks(c); - if (!mce_available(c)) return; + if (mce_cap_init() < 0) { + mce_dont_init = 1; + return; + } + mce_cpu_quirks(c); + mce_init(NULL); mce_cpu_features(c); mce_init_timer(); @@ -819,16 +847,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); -/* - * TBD should generate these dynamically based on number of available banks. - * Have only 6 contol banks in /sysfs until then. - */ -ACCESSOR(bank0ctl,bank[0],mce_restart()) -ACCESSOR(bank1ctl,bank[1],mce_restart()) -ACCESSOR(bank2ctl,bank[2],mce_restart()) -ACCESSOR(bank3ctl,bank[3],mce_restart()) -ACCESSOR(bank4ctl,bank[4],mce_restart()) -ACCESSOR(bank5ctl,bank[5],mce_restart()) +static struct sysdev_attribute *bank_attrs; + +static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) +{ + u64 b = bank[attr - bank_attrs]; + return sprintf(buf, "%Lx\n", b); +} + +static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf, size_t siz) +{ + char *end; + u64 new = simple_strtoull(buf, &end, 0); + if (end == buf) + return -EINVAL; + bank[attr - bank_attrs] = new; + mce_restart(); + return end-buf; +} static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) @@ -855,8 +893,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval,check_interval,mce_restart()) static struct sysdev_attribute *mce_attributes[] = { - &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, - &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; @@ -886,11 +922,22 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (err) goto error; } + for (i = 0; i < banks; i++) { + err = sysdev_create_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + if (err) + goto error2; + } cpu_set(cpu, mce_device_initialized); return 0; +error2: + while (--i >= 0) { + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); + } error: - while (i--) { + while (--i >= 0) { sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); } @@ -909,6 +956,9 @@ static __cpuinit void mce_remove_device(unsigned int cpu) for (i = 0; mce_attributes[i]; i++) sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); + for (i = 0; i < banks; i++) + sysdev_remove_file(&per_cpu(device_mce, cpu), + &bank_attrs[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); cpu_clear(cpu, mce_device_initialized); } @@ -973,6 +1023,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; +static __init int mce_init_banks(void) +{ + int i; + + bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, + GFP_KERNEL); + if (!bank_attrs) + return -ENOMEM; + + for (i = 0; i < banks; i++) { + struct sysdev_attribute *a = &bank_attrs[i]; + a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); + if (!a->attr.name) + goto nomem; + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; + } + return 0; + +nomem: + while (--i >= 0) + kfree(bank_attrs[i].attr.name); + kfree(bank_attrs); + bank_attrs = NULL; + return -ENOMEM; +} + static __init int mce_init_device(void) { int err; @@ -980,6 +1058,11 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; + + err = mce_init_banks(); + if (err) + return err; + err = sysdev_class_register(&mce_sysclass); if (err) return err; From b5f2fa4ea00a179ac1c2ff342ceeee261dd75e53 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:43:22 +0100 Subject: [PATCH 011/580] x86, mce: factor out duplicated struct mce setup into one function Impact: cleanup This merely factors out duplicated code to set up the initial struct mce state into a single function. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 3 ++- arch/x86/kernel/cpu/mcheck/mce_64.c | 23 ++++++++++++++--------- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 4 +--- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5522273a3ad8..048b71d9387a 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -90,6 +90,7 @@ extern int mce_disabled; #include +void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, device_mce); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -106,7 +107,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif -void mce_log_therm_throt_event(unsigned int cpu, __u64 status); +void mce_log_therm_throt_event(__u64 status); extern atomic_t mce_entry; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 2297730bb514..fed875742b1a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -65,6 +65,14 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = smp_processor_id(); + rdtscll(m->tsc); +} + /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -208,8 +216,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) || !banks) goto out2; - memset(&m, 0, sizeof(struct mce)); - m.cpu = smp_processor_id(); + mce_setup(&m); + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) @@ -225,7 +233,6 @@ void do_machine_check(struct pt_regs * regs, long error_code) m.misc = 0; m.addr = 0; m.bank = i; - m.tsc = 0; rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); if ((m.status & MCI_STATUS_VAL) == 0) @@ -252,8 +259,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - if (error_code >= 0) - rdtscll(m.tsc); + if (error_code < 0) + m.tsc = 0; if (error_code != -2) mce_log(&m); @@ -341,15 +348,13 @@ void do_machine_check(struct pt_regs * regs, long error_code) * and historically has been the register value of the * MSR_IA32_THERMAL_STATUS (Intel) msr. */ -void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +void mce_log_therm_throt_event(__u64 status) { struct mce m; - memset(&m, 0, sizeof(m)); - m.cpu = cpu; + mce_setup(&m); m.bank = MCE_THERMAL_BANK; m.status = status; - rdtscll(m.tsc); mce_log(&m); } #endif /* CONFIG_X86_MCE_INTEL */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 8ae8c4ff094d..75d9dd25e3dc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -197,9 +197,7 @@ asmlinkage void mce_threshold_interrupt(void) exit_idle(); irq_enter(); - memset(&m, 0, sizeof(m)); - rdtscll(m.tsc); - m.cpu = smp_processor_id(); + mce_setup(&m); /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 4b48f251fd39..7f7f1015ef19 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -24,7 +24,7 @@ asmlinkage void smp_thermal_interrupt(void) rdmsrl(MSR_IA32_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & 1)) - mce_log_therm_throt_event(smp_processor_id(), msr_val); + mce_log_therm_throt_event(msr_val); inc_irq_stat(irq_thermal_count); irq_exit(); From b79109c3bbcf52cac5103979b283b9e5df4e796c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:43:23 +0100 Subject: [PATCH 012/580] x86, mce: separate correct machine check poller and fatal exception handler Impact: cleanup, performance enhancement The machine check poller is diverging more and more from the fatal exception handler. Instead of adding more special cases separate the code paths completely. The corrected poll path is actually quite simple, and this doesn't result in much code duplication. This makes both handlers much easier to read and results in cleaner code flow. The exception handler now only needs to care about uncorrected errors, which also simplifies the handling of multiple errors. The corrected poller also now always runs in standard interrupt context and does not need to do anything special to handle NMI context. Minor behaviour changes: - MCG status is now not cleared on polling. - Only the banks which had corrected errors get cleared on polling - The exception handler only clears banks with errors now v2: Forward port to new patch order. Add "uc" argument. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 7 ++ arch/x86/kernel/cpu/mcheck/mce_64.c | 129 ++++++++++++++++++++---- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 2 +- 3 files changed, 116 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 048b71d9387a..225cdb5d2bfc 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -112,6 +112,13 @@ void mce_log_therm_throt_event(__u64 status); extern atomic_t mce_entry; extern void do_machine_check(struct pt_regs *, long); + +enum mcp_flags { + MCP_TIMESTAMP = (1 << 0), /* log time stamp */ + MCP_UC = (1 << 1), /* log uncorrected errors */ +}; +extern void machine_check_poll(enum mcp_flags flags); + extern int mce_notify_user(void); #endif /* !CONFIG_X86_32 */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index fed875742b1a..268b05edade7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -3,6 +3,8 @@ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. * Rest from unknown author(s). * 2004 Andi Kleen. Rewrote most of it. + * Copyright 2008 Intel Corporation + * Author: Andi Kleen */ #include @@ -189,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } /* - * The actual machine check handler + * Poll for corrected events or events that happened before reset. + * Those are just logged through /dev/mcelog. + * + * This is executed in standard interrupt context. + */ +void machine_check_poll(enum mcp_flags flags) +{ + struct mce m; + int i; + + mce_setup(&m); + + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + for (i = 0; i < banks; i++) { + if (!bank[i]) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + barrier(); + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if (!(m.status & MCI_STATUS_VAL)) + continue; + + /* + * Uncorrected events are handled by the exception handler + * when it is enabled. But when the exception is disabled log + * everything. + * + * TBD do the same check for MCI_STATUS_EN here? + */ + if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC)) + continue; + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + if (!(flags & MCP_TIMESTAMP)) + m.tsc = 0; + /* + * Don't get the IP here because it's unlikely to + * have anything to do with the actual error location. + */ + + mce_log(&m); + add_taint(TAINT_MACHINE_CHECK); + + /* + * Clear state for this bank. + */ + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } + + /* + * Don't clear MCG_STATUS here because it's only defined for + * exceptions. + */ +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! */ void do_machine_check(struct pt_regs * regs, long error_code) { @@ -207,13 +279,14 @@ void do_machine_check(struct pt_regs * regs, long error_code) * error. */ int kill_it = 0; + DECLARE_BITMAP(toclear, MAX_NR_BANKS); atomic_inc(&mce_entry); - if ((regs - && notify_die(DIE_NMI, "machine check", regs, error_code, + if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) - || !banks) + goto out2; + if (!banks) goto out2; mce_setup(&m); @@ -227,6 +300,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) barrier(); for (i = 0; i < banks; i++) { + __clear_bit(i, toclear); if (!bank[i]) continue; @@ -238,6 +312,20 @@ void do_machine_check(struct pt_regs * regs, long error_code) if ((m.status & MCI_STATUS_VAL) == 0) continue; + /* + * Non uncorrected errors are handled by machine_check_poll + * Leave them alone. + */ + if ((m.status & MCI_STATUS_UC) == 0) + continue; + + /* + * Set taint even when machine check was not enabled. + */ + add_taint(TAINT_MACHINE_CHECK); + + __set_bit(i, toclear); + if (m.status & MCI_STATUS_EN) { /* if PCC was set, there's no way out */ no_way_out |= !!(m.status & MCI_STATUS_PCC); @@ -251,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) no_way_out = 1; kill_it = 1; } + } else { + /* + * Machine check event was not enabled. Clear, but + * ignore. + */ + continue; } if (m.status & MCI_STATUS_MISCV) @@ -259,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); mce_get_rip(&m, regs); - if (error_code < 0) - m.tsc = 0; - if (error_code != -2) - mce_log(&m); + mce_log(&m); /* Did this bank cause the exception? */ /* Assume that the bank with uncorrectable errors did it, @@ -271,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) panicm = m; panicm_found = 1; } - - add_taint(TAINT_MACHINE_CHECK); } - /* Never do anything final in the polling timer */ - if (!regs) - goto out; - /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). */ if (!panicm_found) @@ -325,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code) /* notify userspace ASAP */ set_thread_flag(TIF_MCE_NOTIFY); - out: /* the last thing we do is clear state */ - for (i = 0; i < banks; i++) - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + for (i = 0; i < banks; i++) { + if (test_bit(i, toclear)) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); @@ -377,7 +463,7 @@ static void mcheck_timer(unsigned long data) WARN_ON(smp_processor_id() != data); if (mce_available(¤t_cpu_data)) - do_machine_check(NULL, 0); + machine_check_poll(MCP_TIMESTAMP); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -494,9 +580,10 @@ static void mce_init(void *dummy) u64 cap; int i; - /* Log the machine checks left over from the previous reset. - This also clears all registers */ - do_machine_check(NULL, mce_bootlog ? -1 : -2); + /* + * Log the machine checks left over from the previous reset. + */ + machine_check_poll(MCP_UC); set_in_cr4(X86_CR4_MCE); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 75d9dd25e3dc..0069c653f4ed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -231,7 +231,7 @@ asmlinkage void mce_threshold_interrupt(void) /* Log the machine check that caused the threshold event. */ - do_machine_check(NULL, 0); + machine_check_poll(MCP_TIMESTAMP); if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); From f6d1826dfad0d15fd14a455facc80b91f2ee642f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 19 Feb 2009 15:44:58 -0800 Subject: [PATCH 013/580] x86, mce: use %ll instead of %L for 64-bit numbers Impact: Cleanup The standard spelling of a printf pattern for long long is "ll", not "L", which is for long double. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 268b05edade7..60a114ca14f6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -136,11 +136,11 @@ static void print_mce(struct mce *m) print_symbol("{%s}", m->ip); printk("\n"); } - printk(KERN_EMERG "TSC %Lx ", m->tsc); + printk(KERN_EMERG "TSC %llx ", m->tsc); if (m->addr) - printk("ADDR %Lx ", m->addr); + printk("ADDR %llx ", m->addr); if (m->misc) - printk("MISC %Lx ", m->misc); + printk("MISC %llx ", m->misc); printk("\n"); printk(KERN_EMERG "This is not a software problem!\n"); printk(KERN_EMERG "Run through mcelog --ascii to decode " @@ -945,7 +945,7 @@ static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { u64 b = bank[attr - bank_attrs]; - return sprintf(buf, "%Lx\n", b); + return sprintf(buf, "%llx\n", b); } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, From 42f8faecf7a88371de0f30aebb052d1ae51762c0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 17 Feb 2009 11:46:42 +0800 Subject: [PATCH 014/580] x86: use percpu data for 4k hardirq and softirq stacks Impact: economize memory for large NR_CPUS percpu data is setup earlier than irq, we can use percpu data to economize memory. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- arch/x86/kernel/irq_32.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index e0f29be8ab0b..90cd94aba69a 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { } union irq_ctx { struct thread_info tinfo; u32 stack[THREAD_SIZE/sizeof(u32)]; -}; +} __attribute__((aligned(PAGE_SIZE))); -static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; -static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; +static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); +static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); -static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; -static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); +static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); static void call_on_stack(void *func, void *stack) { @@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) u32 *isp, arg1, arg2; curctx = (union irq_ctx *) current_thread_info(); - irqctx = hardirq_ctx[smp_processor_id()]; + irqctx = __get_cpu_var(hardirq_ctx); /* * this is where we switch to the IRQ stack. However, if we are @@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu) { union irq_ctx *irqctx; - if (hardirq_ctx[cpu]) + if (per_cpu(hardirq_ctx, cpu)) return; - irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; + irqctx = &per_cpu(hardirq_stack, cpu); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - hardirq_ctx[cpu] = irqctx; + per_cpu(hardirq_ctx, cpu) = irqctx; - irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE]; + irqctx = &per_cpu(softirq_stack, cpu); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = 0; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); - softirq_ctx[cpu] = irqctx; + per_cpu(softirq_ctx, cpu) = irqctx; printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, hardirq_ctx[cpu], softirq_ctx[cpu]); + cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } void irq_ctx_exit(int cpu) { - hardirq_ctx[cpu] = NULL; + per_cpu(hardirq_ctx, cpu) = NULL; } asmlinkage void do_softirq(void) @@ -169,7 +170,7 @@ asmlinkage void do_softirq(void) if (local_softirq_pending()) { curctx = current_thread_info(); - irqctx = softirq_ctx[smp_processor_id()]; + irqctx = __get_cpu_var(softirq_ctx); irqctx->tinfo.task = curctx->task; irqctx->tinfo.previous_esp = current_stack_pointer; From 734269521e320ad14ed39ae9b64d482b9028dcd2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:07 +0900 Subject: [PATCH 015/580] vmalloc: call flush_cache_vunmap() from unmap_kernel_range() Impact: proper vcache flush on unmap_kernel_range() flush_cache_vunmap() should be called before pages are unmapped. Add a call to it in unmap_kernel_range(). Signed-off-by: Tejun Heo --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 75f49d312e8c..c37924a2ee36 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1012,6 +1012,8 @@ void __init vmalloc_init(void) void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; + + flush_cache_vunmap(addr, end); vunmap_page_range(addr, end); flush_tlb_kernel_range(addr, end); } From 6b588c18f8dacfa6d7957c33c5ff832096e752d3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:07 +0900 Subject: [PATCH 016/580] module: reorder module pcpu related functions Impact: cleanup Move percpu_modinit() upwards. This is to ease further changes. Signed-off-by: Tejun Heo --- kernel/module.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index ba22484a987e..52b3497b8748 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -480,21 +480,6 @@ static void percpu_modfree(void *freeme) } } -static unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) -{ - return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); -} - -static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) -{ - int cpu; - - for_each_possible_cpu(cpu) - memcpy(pcpudest + per_cpu_offset(cpu), from, size); -} - static int percpu_modinit(void) { pcpu_num_used = 2; @@ -513,7 +498,24 @@ static int percpu_modinit(void) return 0; } __initcall(percpu_modinit); + +static unsigned int find_pcpusec(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + const char *secstrings) +{ + return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); +} + +static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +{ + int cpu; + + for_each_possible_cpu(cpu) + memcpy(pcpudest + per_cpu_offset(cpu), from, size); +} + #else /* ... !CONFIG_SMP */ + static inline void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) { @@ -535,6 +537,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src, /* pcpusec should be 0, and size of that section should be 0. */ BUG_ON(size != 0); } + #endif /* CONFIG_SMP */ #define MODINFO_ATTR(field) \ From b36128c830a8f5bd7d4981f5b0b69950f5928ee6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: [PATCH 017/580] alloc_percpu: change percpu_ptr to per_cpu_ptr Impact: cleanup There are two allocated per-cpu accessor macros with almost identical spelling. The original and far more popular is per_cpu_ptr (44 files), so change over the other 4 files. tj: kill percpu_ptr() and update UP too Signed-off-by: Rusty Russell Cc: mingo@redhat.com Cc: lenb@kernel.org Cc: cpufreq@vger.kernel.org Signed-off-by: Tejun Heo --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- drivers/acpi/processor_perflib.c | 4 ++-- include/linux/percpu.h | 23 +++++++++++----------- kernel/sched.c | 6 +++--- kernel/stop_machine.c | 2 +- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c3..22590cf688ae 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!data) return -ENOMEM; - data->acpi_data = percpu_ptr(acpi_perf_data, cpu); + data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); per_cpu(drv_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 9cc769b587ff..68fd3d292799 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -516,12 +516,12 @@ int acpi_processor_preregister_performance( continue; } - if (!performance || !percpu_ptr(performance, i)) { + if (!performance || !per_cpu_ptr(performance, i)) { retval = -EINVAL; continue; } - pr->performance = percpu_ptr(performance, i); + pr->performance = per_cpu_ptr(performance, i); cpumask_set_cpu(i, pr->performance->shared_cpu_map); if (acpi_processor_get_psd(pr)) { retval = -EINVAL; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 3577ffd90d45..c80cfe1260ec 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -81,23 +81,13 @@ struct percpu_data { }; #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) -/* - * Use this to get to a cpu's version of the per-cpu object dynamically - * allocated. Non-atomic access to the current CPU's version should - * probably be combined with get_cpu()/put_cpu(). - */ -#define percpu_ptr(ptr, cpu) \ -({ \ - struct percpu_data *__p = __percpu_disguise(ptr); \ - (__typeof__(ptr))__p->ptrs[(cpu)]; \ -}) extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); extern void percpu_free(void *__pdata); #else /* CONFIG_SMP */ -#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) +#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) { @@ -122,6 +112,15 @@ static inline void percpu_free(void *__pdata) cpu_possible_map) #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type)) #define free_percpu(ptr) percpu_free((ptr)) -#define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu)) +/* + * Use this to get to a cpu's version of the per-cpu object dynamically + * allocated. Non-atomic access to the current CPU's version should + * probably be combined with get_cpu()/put_cpu(). + */ +#define per_cpu_ptr(ptr, cpu) \ +({ \ + struct percpu_data *__p = __percpu_disguise(ptr); \ + (__typeof__(ptr))__p->ptrs[(cpu)]; \ +}) #endif /* __LINUX_PERCPU_H */ diff --git a/kernel/sched.c b/kernel/sched.c index fc17fd91ab57..9d30ac956328 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9472,7 +9472,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; #ifndef CONFIG_64BIT @@ -9491,7 +9491,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); #ifndef CONFIG_64BIT /* @@ -9587,7 +9587,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); for (; ca; ca = ca->parent) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 0cd415ee62a2..74541ca49536 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) * doesn't hit this CPU until we're ready. */ get_cpu(); for_each_online_cpu(i) { - sm_work = percpu_ptr(stop_machine_work, i); + sm_work = per_cpu_ptr(stop_machine_work, i); INIT_WORK(sm_work, stop_cpu); queue_work_on(i, stop_machine_wq, sm_work); } From 313e458f81ec3852106c5a83830fe0d4f405a71a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: [PATCH 018/580] alloc_percpu: add align argument to __alloc_percpu. This prepares for a real __alloc_percpu, by adding an alignment argument. Only one place uses __alloc_percpu directly, and that's for a string. tj: af_inet also uses __alloc_percpu(), update it. Signed-off-by: Rusty Russell Cc: Christoph Lameter Cc: Jens Axboe --- block/blktrace.c | 2 +- include/linux/percpu.h | 5 +++-- net/ipv4/af_inet.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index 39cc3bfe56e4..487766237d28 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -363,7 +363,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->sequence) goto err; - bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); if (!bt->msg_data) goto err; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index c80cfe1260ec..1fdaee93c04d 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -108,9 +108,10 @@ static inline void percpu_free(void *__pdata) /* (legacy) interface for use without CPU hotplug handling */ -#define __alloc_percpu(size) percpu_alloc_mask((size), GFP_KERNEL, \ +#define __alloc_percpu(size, align) percpu_alloc_mask((size), GFP_KERNEL, \ cpu_possible_map) -#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type)) +#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ + __alignof__(type)) #define free_percpu(ptr) percpu_free((ptr)) /* * Use this to get to a cpu's version of the per-cpu object dynamically diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 743f5542d65a..3a3dad801354 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field); int snmp_mib_init(void *ptr[2], size_t mibsize) { BUG_ON(ptr == NULL); - ptr[0] = __alloc_percpu(mibsize); + ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); if (!ptr[0]) goto err0; - ptr[1] = __alloc_percpu(mibsize); + ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); if (!ptr[1]) goto err1; return 0; From f2a8205c4ef1af917d175c36a4097ae5587791c8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: [PATCH 019/580] percpu: kill percpu_alloc() and friends Impact: kill unused functions percpu_alloc() and its friends never saw much action. It was supposed to replace the cpu-mask unaware __alloc_percpu() but it never happened and in fact __percpu_alloc_mask() itself never really grew proper up/down handling interface either (no exported interface for populate/depopulate). percpu allocation is about to go through major reimplementation and there's no reason to carry this unused interface around. Replace it with __alloc_percpu() and free_percpu(). Signed-off-by: Tejun Heo --- include/linux/percpu.h | 65 ++++++++++++++++++++---------------------- mm/allocpercpu.c | 32 ++++++++++++--------- 2 files changed, 50 insertions(+), 47 deletions(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 1fdaee93c04d..d99e24ae1811 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -82,41 +82,10 @@ struct percpu_data { #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) -extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); -extern void percpu_free(void *__pdata); - -#else /* CONFIG_SMP */ - -#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) - -static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) -{ - return kzalloc(size, gfp); -} - -static inline void percpu_free(void *__pdata) -{ - kfree(__pdata); -} - -#endif /* CONFIG_SMP */ - -#define percpu_alloc_mask(size, gfp, mask) \ - __percpu_alloc_mask((size), (gfp), &(mask)) - -#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map) - -/* (legacy) interface for use without CPU hotplug handling */ - -#define __alloc_percpu(size, align) percpu_alloc_mask((size), GFP_KERNEL, \ - cpu_possible_map) -#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ - __alignof__(type)) -#define free_percpu(ptr) percpu_free((ptr)) /* - * Use this to get to a cpu's version of the per-cpu object dynamically - * allocated. Non-atomic access to the current CPU's version should - * probably be combined with get_cpu()/put_cpu(). + * Use this to get to a cpu's version of the per-cpu object + * dynamically allocated. Non-atomic access to the current CPU's + * version should probably be combined with get_cpu()/put_cpu(). */ #define per_cpu_ptr(ptr, cpu) \ ({ \ @@ -124,4 +93,32 @@ static inline void percpu_free(void *__pdata) (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) +extern void *__alloc_percpu(size_t size, size_t align); +extern void free_percpu(void *__pdata); + +#else /* CONFIG_SMP */ + +#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) + +static inline void *__alloc_percpu(size_t size, size_t align) +{ + /* + * Can't easily make larger alignment work with kmalloc. WARN + * on it. Larger alignment should only be used for module + * percpu sections on SMP for which this path isn't used. + */ + WARN_ON_ONCE(align > __alignof__(unsigned long long)); + return kzalloc(size, gfp); +} + +static inline void free_percpu(void *p) +{ + kfree(p); +} + +#endif /* CONFIG_SMP */ + +#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ + __alignof__(type)) + #endif /* __LINUX_PERCPU_H */ diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 4297bc41bfd2..3653c570232b 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) /** - * percpu_alloc_mask - initial setup of per-cpu data + * alloc_percpu - initial setup of per-cpu data * @size: size of per-cpu object - * @gfp: may sleep or not etc. - * @mask: populate per-data for cpu's selected through mask bits + * @align: alignment * - * Populating per-cpu data for all online cpu's would be a typical use case, - * which is simplified by the percpu_alloc() wrapper. - * Per-cpu objects are populated with zeroed buffers. + * Allocate dynamic percpu area. Percpu objects are populated with + * zeroed buffers. */ -void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) +void *__alloc_percpu(size_t size, size_t align) { /* * We allocate whole cache lines to avoid false sharing */ size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); - void *pdata = kzalloc(sz, gfp); + void *pdata = kzalloc(sz, GFP_KERNEL); void *__pdata = __percpu_disguise(pdata); + /* + * Can't easily make larger alignment work with kmalloc. WARN + * on it. Larger alignment should only be used for module + * percpu sections on SMP for which this path isn't used. + */ + WARN_ON_ONCE(align > __alignof__(unsigned long long)); + if (unlikely(!pdata)) return NULL; - if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) + if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, + &cpu_possible_map))) return __pdata; kfree(pdata); return NULL; } -EXPORT_SYMBOL_GPL(__percpu_alloc_mask); +EXPORT_SYMBOL_GPL(__alloc_percpu); /** - * percpu_free - final cleanup of per-cpu data + * free_percpu - final cleanup of per-cpu data * @__pdata: object to clean up * * We simply clean up any per-cpu object left. No need for the client to * track and specify through a bis mask which per-cpu objects are to free. */ -void percpu_free(void *__pdata) +void free_percpu(void *__pdata) { if (unlikely(!__pdata)) return; __percpu_depopulate_mask(__pdata, &cpu_possible_map); kfree(__percpu_disguise(__pdata)); } -EXPORT_SYMBOL_GPL(percpu_free); +EXPORT_SYMBOL_GPL(free_percpu); From f0aa6617903648077dffe5cfcf7c4458f4610fa7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: [PATCH 020/580] vmalloc: implement vm_area_register_early() Impact: allow multiple early vm areas There are places where kernel VM area needs to be allocated before vmalloc is initialized. This is done by allocating static vm_struct, initializing several fields and linking it to vmlist and later vmalloc initialization picking up these from vmlist. This is currently done manually and if there's more than one such areas, there's no defined way to arbitrate who gets which address. This patch implements vm_area_register_early(), which takes vm_area struct with flags and size initialized, assigns address to it and puts it on the vmlist. This way, multiple early vm areas can determine which addresses they should use. The only current user - alpha mm init - is converted to use it. Signed-off-by: Tejun Heo --- arch/alpha/mm/init.c | 20 +++++++++++++------- include/linux/vmalloc.h | 1 + mm/vmalloc.c | 24 ++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 5d7a16eab312..df6df025ded4 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -189,9 +189,21 @@ callback_init(void * kernel_end) if (alpha_using_srm) { static struct vm_struct console_remap_vm; - unsigned long vaddr = VMALLOC_START; + unsigned long nr_pages = 0; + unsigned long vaddr; unsigned long i, j; + /* calculate needed size */ + for (i = 0; i < crb->map_entries; ++i) + nr_pages += crb->map[i].count; + + /* register the vm area */ + console_remap_vm.flags = VM_ALLOC; + console_remap_vm.size = nr_pages << PAGE_SHIFT; + vm_area_register_early(&console_remap_vm); + + vaddr = (unsigned long)consle_remap_vm.addr; + /* Set up the third level PTEs and update the virtual addresses of the CRB entries. */ for (i = 0; i < crb->map_entries; ++i) { @@ -213,12 +225,6 @@ callback_init(void * kernel_end) vaddr += PAGE_SIZE; } } - - /* Let vmalloc know that we've allocated some space. */ - console_remap_vm.flags = VM_ALLOC; - console_remap_vm.addr = (void *) VMALLOC_START; - console_remap_vm.size = vaddr - VMALLOC_START; - vmlist = &console_remap_vm; } callback_init_done = 1; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 506e7620a986..bbc051392298 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -106,5 +106,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count); */ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; +extern __init void vm_area_register_early(struct vm_struct *vm); #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c37924a2ee36..d206261ad9ef 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -982,6 +983,29 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro } EXPORT_SYMBOL(vm_map_ram); +/** + * vm_area_register_early - register vmap area early during boot + * @vm: vm_struct to register + * @size: size of area to register + * + * This function is used to register kernel vm area before + * vmalloc_init() is called. @vm->size and @vm->flags should contain + * proper values on entry and other fields should be zero. On return, + * vm->addr contains the allocated address. + * + * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. + */ +void __init vm_area_register_early(struct vm_struct *vm) +{ + static size_t vm_init_off __initdata; + + vm->addr = (void *)VMALLOC_START + vm_init_off; + vm_init_off = PFN_ALIGN(vm_init_off + vm->size); + + vm->next = vmlist; + vmlist = vm; +} + void __init vmalloc_init(void) { struct vmap_area *va; From 8fc48985006da4ceba24508db64ec77fc0dfe3bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: [PATCH 021/580] vmalloc: add un/map_kernel_range_noflush() Impact: two more public map/unmap functions Implement map_kernel_range_noflush() and unmap_kernel_range_noflush(). These functions respectively map and unmap address range in kernel VM area but doesn't do any vcache or tlb flushing. These will be used by new percpu allocator. Signed-off-by: Tejun Heo Cc: Nick Piggin --- include/linux/vmalloc.h | 3 ++ mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index bbc051392298..599ba7984310 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -91,6 +91,9 @@ extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages); +extern int map_kernel_range_noflush(unsigned long start, unsigned long size, + pgprot_t prot, struct page **pages); +extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); /* Allocate/destroy a 'vmalloc' VM area. */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d206261ad9ef..224eca9650a8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -153,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, * * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] */ -static int vmap_page_range(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +static int vmap_page_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; @@ -170,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end, if (err) break; } while (pgd++, addr = next, addr != end); - flush_cache_vmap(start, end); if (unlikely(err)) return err; return nr; } +static int vmap_page_range(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages) +{ + int ret; + + ret = vmap_page_range_noflush(start, end, prot, pages); + flush_cache_vmap(start, end); + return ret; +} + static inline int is_vmalloc_or_module_addr(const void *x) { /* @@ -1033,6 +1042,58 @@ void __init vmalloc_init(void) vmap_initialized = true; } +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map + * + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vmap() on to-be-mapped areas + * before calling this function. + * + * RETURNS: + * The number of pages mapped on success, -errno on failure. + */ +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) +{ + return vmap_page_range_noflush(addr, addr + size, prot, pages); +} + +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size + * specify should have been allocated using get_vm_area() and its + * friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is + * responsible for calling flush_cache_vunmap() on to-be-mapped areas + * before calling this function and flush_tlb_kernel_range() after. + */ +void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) +{ + vunmap_page_range(addr, addr + size); +} + +/** + * unmap_kernel_range - unmap kernel VM area and flush cache and TLB + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Similar to unmap_kernel_range_noflush() but flushes vcache before + * the unmapping and tlb after. + */ void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; From fbf59bc9d74d1fb30b8e0630743aff2806eafcea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: [PATCH 022/580] percpu: implement new dynamic percpu allocator Impact: new scalable dynamic percpu allocator which allows dynamic percpu areas to be accessed the same way as static ones Implement scalable dynamic percpu allocator which can be used for both static and dynamic percpu areas. This will allow static and dynamic areas to share faster direct access methods. This feature is optional and enabled only when CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is defined by arch. Please read comment on top of mm/percpu.c for details. Signed-off-by: Tejun Heo Cc: Andrew Morton --- include/linux/percpu.h | 24 +- kernel/module.c | 31 ++ mm/Makefile | 4 + mm/percpu.c | 890 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 944 insertions(+), 5 deletions(-) create mode 100644 mm/percpu.c diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d99e24ae1811..18080995ff3e 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -76,23 +76,37 @@ #ifdef CONFIG_SMP +#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + +extern void *pcpu_base_addr; + +typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); + +extern size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, + struct page **pages, size_t cpu_size); +/* + * Use this to get to a cpu's version of the per-cpu object + * dynamically allocated. Non-atomic access to the current CPU's + * version should probably be combined with get_cpu()/put_cpu(). + */ +#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) + +#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + struct percpu_data { void *ptrs[1]; }; #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) -/* - * Use this to get to a cpu's version of the per-cpu object - * dynamically allocated. Non-atomic access to the current CPU's - * version should probably be combined with get_cpu()/put_cpu(). - */ #define per_cpu_ptr(ptr, cpu) \ ({ \ struct percpu_data *__p = __percpu_disguise(ptr); \ (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) +#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + extern void *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void *__pdata); diff --git a/kernel/module.c b/kernel/module.c index 52b3497b8748..1f0657ae555b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -51,6 +51,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -366,6 +367,34 @@ static struct module *find_module(const char *name) } #ifdef CONFIG_SMP + +#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + +static void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) +{ + void *ptr; + + if (align > PAGE_SIZE) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + name, align, PAGE_SIZE); + align = PAGE_SIZE; + } + + ptr = __alloc_percpu(size, align); + if (!ptr) + printk(KERN_WARNING + "Could not allocate %lu bytes percpu data\n", size); + return ptr; +} + +static void percpu_modfree(void *freeme) +{ + free_percpu(freeme); +} + +#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; /* Size of each block. -ve means used. */ @@ -499,6 +528,8 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); +#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) diff --git a/mm/Makefile b/mm/Makefile index 72255be57f89..818569b68f46 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o +ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +obj-$(CONFIG_SMP) += percpu.o +else obj-$(CONFIG_SMP) += allocpercpu.o +endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o diff --git a/mm/percpu.c b/mm/percpu.c new file mode 100644 index 000000000000..4617d97e877c --- /dev/null +++ b/mm/percpu.c @@ -0,0 +1,890 @@ +/* + * linux/mm/percpu.c - percpu memory allocator + * + * Copyright (C) 2009 SUSE Linux Products GmbH + * Copyright (C) 2009 Tejun Heo + * + * This file is released under the GPLv2. + * + * This is percpu allocator which can handle both static and dynamic + * areas. Percpu areas are allocated in chunks in vmalloc area. Each + * chunk is consisted of num_possible_cpus() units and the first chunk + * is used for static percpu variables in the kernel image (special + * boot time alloc/init handling necessary as these areas need to be + * brought up before allocation services are running). Unit grows as + * necessary and all units grow or shrink in unison. When a chunk is + * filled up, another chunk is allocated. ie. in vmalloc area + * + * c0 c1 c2 + * ------------------- ------------------- ------------ + * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u + * ------------------- ...... ------------------- .... ------------ + * + * Allocation is done in offset-size areas of single unit space. Ie, + * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, + * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring + * percpu base registers UNIT_SIZE apart. + * + * There are usually many small percpu allocations many of them as + * small as 4 bytes. The allocator organizes chunks into lists + * according to free size and tries to allocate from the fullest one. + * Each chunk keeps the maximum contiguous area size hint which is + * guaranteed to be eqaul to or larger than the maximum contiguous + * area in the chunk. This helps the allocator not to iterate the + * chunk maps unnecessarily. + * + * Allocation state in each chunk is kept using an array of integers + * on chunk->map. A positive value in the map represents a free + * region and negative allocated. Allocation inside a chunk is done + * by scanning this map sequentially and serving the first matching + * entry. This is mostly copied from the percpu_modalloc() allocator. + * Chunks are also linked into a rb tree to ease address to chunk + * mapping during free. + * + * To use this allocator, arch code should do the followings. + * + * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * + * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate + * regular address to percpu pointer and back + * + * - use pcpu_setup_static() during percpu area initialization to + * setup kernel static percpu area + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PCPU_MIN_UNIT_PAGES_SHIFT 4 /* also max alloc size */ +#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ +#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ + +struct pcpu_chunk { + struct list_head list; /* linked to pcpu_slot lists */ + struct rb_node rb_node; /* key is chunk->vm->addr */ + int free_size; /* free bytes in the chunk */ + int contig_hint; /* max contiguous size hint */ + struct vm_struct *vm; /* mapped vmalloc region */ + int map_used; /* # of map entries used */ + int map_alloc; /* # of map entries allocated */ + int *map; /* allocation map */ + struct page *page[]; /* #cpus * UNIT_PAGES */ +}; + +static int pcpu_unit_pages_shift; +static int pcpu_unit_pages; +static int pcpu_unit_shift; +static int pcpu_unit_size; +static int pcpu_chunk_size; +static int pcpu_nr_slots; +static size_t pcpu_chunk_struct_size; + +/* the address of the first chunk which starts with the kernel static area */ +void *pcpu_base_addr; +EXPORT_SYMBOL_GPL(pcpu_base_addr); + +/* the size of kernel static area */ +static int pcpu_static_size; + +/* + * One mutex to rule them all. + * + * The following mutex is grabbed in the outermost public alloc/free + * interface functions and released only when the operation is + * complete. As such, every function in this file other than the + * outermost functions are called under pcpu_mutex. + * + * It can easily be switched to use spinlock such that only the area + * allocation and page population commit are protected with it doing + * actual [de]allocation without holding any lock. However, given + * what this allocator does, I think it's better to let them run + * sequentially. + */ +static DEFINE_MUTEX(pcpu_mutex); + +static struct list_head *pcpu_slot; /* chunk list slots */ +static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ + +static int pcpu_size_to_slot(int size) +{ + int highbit = fls(size); + return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); +} + +static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) +{ + if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) + return 0; + + return pcpu_size_to_slot(chunk->free_size); +} + +static int pcpu_page_idx(unsigned int cpu, int page_idx) +{ + return (cpu << pcpu_unit_pages_shift) + page_idx; +} + +static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return &chunk->page[pcpu_page_idx(cpu, page_idx)]; +} + +static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return (unsigned long)chunk->vm->addr + + (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); +} + +static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, + int page_idx) +{ + return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; +} + +/** + * pcpu_realloc - versatile realloc + * @p: the current pointer (can be NULL for new allocations) + * @size: the current size (can be 0 for new allocations) + * @new_size: the wanted new size (can be 0 for free) + * + * More robust realloc which can be used to allocate, resize or free a + * memory area of arbitrary size. If the needed size goes over + * PAGE_SIZE, kernel VM is used. + * + * RETURNS: + * The new pointer on success, NULL on failure. + */ +static void *pcpu_realloc(void *p, size_t size, size_t new_size) +{ + void *new; + + if (new_size <= PAGE_SIZE) + new = kmalloc(new_size, GFP_KERNEL); + else + new = vmalloc(new_size); + if (new_size && !new) + return NULL; + + memcpy(new, p, min(size, new_size)); + if (new_size > size) + memset(new + size, 0, new_size - size); + + if (size <= PAGE_SIZE) + kfree(p); + else + vfree(p); + + return new; +} + +/** + * pcpu_chunk_relocate - put chunk in the appropriate chunk slot + * @chunk: chunk of interest + * @oslot: the previous slot it was on + * + * This function is called after an allocation or free changed @chunk. + * New slot according to the changed state is determined and @chunk is + * moved to the slot. + */ +static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) +{ + int nslot = pcpu_chunk_slot(chunk); + + if (oslot != nslot) { + if (oslot < nslot) + list_move(&chunk->list, &pcpu_slot[nslot]); + else + list_move_tail(&chunk->list, &pcpu_slot[nslot]); + } +} + +static struct rb_node **pcpu_chunk_rb_search(void *addr, + struct rb_node **parentp) +{ + struct rb_node **p = &pcpu_addr_root.rb_node; + struct rb_node *parent = NULL; + struct pcpu_chunk *chunk; + + while (*p) { + parent = *p; + chunk = rb_entry(parent, struct pcpu_chunk, rb_node); + + if (addr < chunk->vm->addr) + p = &(*p)->rb_left; + else if (addr > chunk->vm->addr) + p = &(*p)->rb_right; + else + break; + } + + if (parentp) + *parentp = parent; + return p; +} + +/** + * pcpu_chunk_addr_search - search for chunk containing specified address + * @addr: address to search for + * + * Look for chunk which might contain @addr. More specifically, it + * searchs for the chunk with the highest start address which isn't + * beyond @addr. + * + * RETURNS: + * The address of the found chunk. + */ +static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) +{ + struct rb_node *n, *parent; + struct pcpu_chunk *chunk; + + n = *pcpu_chunk_rb_search(addr, &parent); + if (!n) { + /* no exactly matching chunk, the parent is the closest */ + n = parent; + BUG_ON(!n); + } + chunk = rb_entry(n, struct pcpu_chunk, rb_node); + + if (addr < chunk->vm->addr) { + /* the parent was the next one, look for the previous one */ + n = rb_prev(n); + BUG_ON(!n); + chunk = rb_entry(n, struct pcpu_chunk, rb_node); + } + + return chunk; +} + +/** + * pcpu_chunk_addr_insert - insert chunk into address rb tree + * @new: chunk to insert + * + * Insert @new into address rb tree. + */ +static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) +{ + struct rb_node **p, *parent; + + p = pcpu_chunk_rb_search(new->vm->addr, &parent); + BUG_ON(*p); + rb_link_node(&new->rb_node, parent, p); + rb_insert_color(&new->rb_node, &pcpu_addr_root); +} + +/** + * pcpu_split_block - split a map block + * @chunk: chunk of interest + * @i: index of map block to split + * @head: head size (can be 0) + * @tail: tail size (can be 0) + * + * Split the @i'th map block into two or three blocks. If @head is + * non-zero, @head bytes block is inserted before block @i moving it + * to @i+1 and reducing its size by @head bytes. + * + * If @tail is non-zero, the target block, which can be @i or @i+1 + * depending on @head, is reduced by @tail bytes and @tail byte block + * is inserted after the target block. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) +{ + int nr_extra = !!head + !!tail; + int target = chunk->map_used + nr_extra; + + /* reallocation required? */ + if (chunk->map_alloc < target) { + int new_alloc = chunk->map_alloc; + int *new; + + while (new_alloc < target) + new_alloc *= 2; + + new = pcpu_realloc(chunk->map, + chunk->map_alloc * sizeof(new[0]), + new_alloc * sizeof(new[0])); + if (!new) + return -ENOMEM; + + chunk->map_alloc = new_alloc; + chunk->map = new; + } + + /* insert a new subblock */ + memmove(&chunk->map[i + nr_extra], &chunk->map[i], + sizeof(chunk->map[0]) * (chunk->map_used - i)); + chunk->map_used += nr_extra; + + if (head) { + chunk->map[i + 1] = chunk->map[i] - head; + chunk->map[i++] = head; + } + if (tail) { + chunk->map[i++] -= tail; + chunk->map[i] = tail; + } + return 0; +} + +/** + * pcpu_alloc_area - allocate area from a pcpu_chunk + * @chunk: chunk of interest + * @size: wanted size + * @align: wanted align + * + * Try to allocate @size bytes area aligned at @align from @chunk. + * Note that this function only allocates the offset. It doesn't + * populate or map the area. + * + * RETURNS: + * Allocated offset in @chunk on success, -errno on failure. + */ +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +{ + int oslot = pcpu_chunk_slot(chunk); + int max_contig = 0; + int i, off; + + /* + * The static chunk initially doesn't have map attached + * because kmalloc wasn't available during init. Give it one. + */ + if (unlikely(!chunk->map)) { + chunk->map = pcpu_realloc(NULL, 0, + PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + if (!chunk->map) + return -ENOMEM; + + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = -pcpu_static_size; + if (chunk->free_size) + chunk->map[chunk->map_used++] = chunk->free_size; + } + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { + bool is_last = i + 1 == chunk->map_used; + int head, tail; + + /* extra for alignment requirement */ + head = ALIGN(off, align) - off; + BUG_ON(i == 0 && head != 0); + + if (chunk->map[i] < 0) + continue; + if (chunk->map[i] < head + size) { + max_contig = max(chunk->map[i], max_contig); + continue; + } + + /* + * If head is small or the previous block is free, + * merge'em. Note that 'small' is defined as smaller + * than sizeof(int), which is very small but isn't too + * uncommon for percpu allocations. + */ + if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { + if (chunk->map[i - 1] > 0) + chunk->map[i - 1] += head; + else { + chunk->map[i - 1] -= head; + chunk->free_size -= head; + } + chunk->map[i] -= head; + off += head; + head = 0; + } + + /* if tail is small, just keep it around */ + tail = chunk->map[i] - head - size; + if (tail < sizeof(int)) + tail = 0; + + /* split if warranted */ + if (head || tail) { + if (pcpu_split_block(chunk, i, head, tail)) + return -ENOMEM; + if (head) { + i++; + off += head; + max_contig = max(chunk->map[i - 1], max_contig); + } + if (tail) + max_contig = max(chunk->map[i + 1], max_contig); + } + + /* update hint and mark allocated */ + if (is_last) + chunk->contig_hint = max_contig; /* fully scanned */ + else + chunk->contig_hint = max(chunk->contig_hint, + max_contig); + + chunk->free_size -= chunk->map[i]; + chunk->map[i] = -chunk->map[i]; + + pcpu_chunk_relocate(chunk, oslot); + return off; + } + + chunk->contig_hint = max_contig; /* fully scanned */ + pcpu_chunk_relocate(chunk, oslot); + + /* + * Tell the upper layer that this chunk has no area left. + * Note that this is not an error condition but a notification + * to upper layer that it needs to look at other chunks. + * -ENOSPC is chosen as it isn't used in memory subsystem and + * matches the meaning in a way. + */ + return -ENOSPC; +} + +/** + * pcpu_free_area - free area to a pcpu_chunk + * @chunk: chunk of interest + * @freeme: offset of area to free + * + * Free area starting from @freeme to @chunk. Note that this function + * only modifies the allocation map. It doesn't depopulate or unmap + * the area. + */ +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +{ + int oslot = pcpu_chunk_slot(chunk); + int i, off; + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) + if (off == freeme) + break; + BUG_ON(off != freeme); + BUG_ON(chunk->map[i] > 0); + + chunk->map[i] = -chunk->map[i]; + chunk->free_size += chunk->map[i]; + + /* merge with previous? */ + if (i > 0 && chunk->map[i - 1] >= 0) { + chunk->map[i - 1] += chunk->map[i]; + chunk->map_used--; + memmove(&chunk->map[i], &chunk->map[i + 1], + (chunk->map_used - i) * sizeof(chunk->map[0])); + i--; + } + /* merge with next? */ + if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { + chunk->map[i] += chunk->map[i + 1]; + chunk->map_used--; + memmove(&chunk->map[i + 1], &chunk->map[i + 2], + (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); + } + + chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); + pcpu_chunk_relocate(chunk, oslot); +} + +/** + * pcpu_unmap - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * @flush: whether to flush cache and tlb or not + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * If @flush is true, vcache is flushed before unmapping and tlb + * after. + */ +static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, + bool flush) +{ + unsigned int last = num_possible_cpus() - 1; + unsigned int cpu; + + /* + * Each flushing trial can be very expensive, issue flush on + * the whole region at once rather than doing it for each cpu. + * This could be an overkill but is more scalable. + */ + if (flush) + flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + + for_each_possible_cpu(cpu) + unmap_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT); + + /* ditto as flush_cache_vunmap() */ + if (flush) + flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); +} + +/** + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk + * @chunk: chunk to depopulate + * @off: offset to the area to depopulate + * @size: size of the area to depopulate + * @flush: whether to flush cache and tlb or not + * + * For each cpu, depopulate and unmap pages [@page_start,@page_end) + * from @chunk. If @flush is true, vcache is flushed before unmapping + * and tlb after. + */ +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, size_t off, + size_t size, bool flush) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int unmap_start = -1; + int uninitialized_var(unmap_end); + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + if (!*pagep) + continue; + + __free_page(*pagep); + + /* + * If it's partial depopulation, it might get + * populated or depopulated again. Mark the + * page gone. + */ + *pagep = NULL; + + unmap_start = unmap_start < 0 ? i : unmap_start; + unmap_end = i + 1; + } + } + + if (unmap_start >= 0) + pcpu_unmap(chunk, unmap_start, unmap_end, flush); +} + +/** + * pcpu_map - map pages into a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to map + * @page_end: page index of the last page to map + 1 + * + * For each cpu, map pages [@page_start,@page_end) into @chunk. + * vcache is flushed afterwards. + */ +static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + unsigned int cpu; + int err; + + for_each_possible_cpu(cpu) { + err = map_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT, + PAGE_KERNEL, + pcpu_chunk_pagep(chunk, cpu, page_start)); + if (err < 0) + return err; + } + + /* flush at once, please read comments in pcpu_unmap() */ + flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + return 0; +} + +/** + * pcpu_populate_chunk - populate and map an area of a pcpu_chunk + * @chunk: chunk of interest + * @off: offset to the area to populate + * @size: size of the area to populate + * + * For each cpu, populate and map pages [@page_start,@page_end) into + * @chunk. The area is cleared on return. + */ +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int map_start = -1; + int map_end; + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + if (pcpu_chunk_page_occupied(chunk, i)) { + if (map_start >= 0) { + if (pcpu_map(chunk, map_start, map_end)) + goto err; + map_start = -1; + } + continue; + } + + map_start = map_start < 0 ? i : map_start; + map_end = i + 1; + + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + *pagep = alloc_pages_node(cpu_to_node(cpu), + alloc_mask, 0); + if (!*pagep) + goto err; + } + } + + if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) + goto err; + + for_each_possible_cpu(cpu) + memset(chunk->vm->addr + (cpu << pcpu_unit_shift) + off, 0, + size); + + return 0; +err: + /* likely under heavy memory pressure, give memory back */ + pcpu_depopulate_chunk(chunk, off, size, true); + return -ENOMEM; +} + +static void free_pcpu_chunk(struct pcpu_chunk *chunk) +{ + if (!chunk) + return; + if (chunk->vm) + free_vm_area(chunk->vm); + pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0); + kfree(chunk); +} + +static struct pcpu_chunk *alloc_pcpu_chunk(void) +{ + struct pcpu_chunk *chunk; + + chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); + if (!chunk) + return NULL; + + chunk->map = pcpu_realloc(NULL, 0, + PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = pcpu_unit_size; + + chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); + if (!chunk->vm) { + free_pcpu_chunk(chunk); + return NULL; + } + + INIT_LIST_HEAD(&chunk->list); + chunk->free_size = pcpu_unit_size; + chunk->contig_hint = pcpu_unit_size; + + return chunk; +} + +/** + * __alloc_percpu - allocate percpu area + * @size: size of area to allocate + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align. Might + * sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void *__alloc_percpu(size_t size, size_t align) +{ + void *ptr = NULL; + struct pcpu_chunk *chunk; + int slot, off; + + if (unlikely(!size || size > PAGE_SIZE << PCPU_MIN_UNIT_PAGES_SHIFT || + align > PAGE_SIZE)) { + WARN(true, "illegal size (%zu) or align (%zu) for " + "percpu allocation\n", size, align); + return NULL; + } + + mutex_lock(&pcpu_mutex); + + /* allocate area */ + for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + if (size > chunk->contig_hint) + continue; + off = pcpu_alloc_area(chunk, size, align); + if (off >= 0) + goto area_found; + if (off != -ENOSPC) + goto out_unlock; + } + } + + /* hmmm... no space left, create a new chunk */ + chunk = alloc_pcpu_chunk(); + if (!chunk) + goto out_unlock; + pcpu_chunk_relocate(chunk, -1); + pcpu_chunk_addr_insert(chunk); + + off = pcpu_alloc_area(chunk, size, align); + if (off < 0) + goto out_unlock; + +area_found: + /* populate, map and clear the area */ + if (pcpu_populate_chunk(chunk, off, size)) { + pcpu_free_area(chunk, off); + goto out_unlock; + } + + ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off); +out_unlock: + mutex_unlock(&pcpu_mutex); + return ptr; +} +EXPORT_SYMBOL_GPL(__alloc_percpu); + +static void pcpu_kill_chunk(struct pcpu_chunk *chunk) +{ + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + list_del(&chunk->list); + rb_erase(&chunk->rb_node, &pcpu_addr_root); + free_pcpu_chunk(chunk); +} + +/** + * free_percpu - free percpu area + * @ptr: pointer to area to free + * + * Free percpu area @ptr. Might sleep. + */ +void free_percpu(void *ptr) +{ + void *addr = __pcpu_ptr_to_addr(ptr); + struct pcpu_chunk *chunk; + int off; + + if (!ptr) + return; + + mutex_lock(&pcpu_mutex); + + chunk = pcpu_chunk_addr_search(addr); + off = addr - chunk->vm->addr; + + pcpu_free_area(chunk, off); + + /* the chunk became fully free, kill one if there are other free ones */ + if (chunk->free_size == pcpu_unit_size) { + struct pcpu_chunk *pos; + + list_for_each_entry(pos, + &pcpu_slot[pcpu_chunk_slot(chunk)], list) + if (pos != chunk) { + pcpu_kill_chunk(pos); + break; + } + } + + mutex_unlock(&pcpu_mutex); +} +EXPORT_SYMBOL_GPL(free_percpu); + +/** + * pcpu_setup_static - initialize kernel static percpu area + * @populate_pte_fn: callback to allocate pagetable + * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages + * + * Initialize kernel static percpu area. The caller should allocate + * all the necessary pages and pass them in @pages. + * @populate_pte_fn() is called on each page to be used for percpu + * mapping and is responsible for making sure all the necessary page + * tables for the page is allocated. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access. + */ +size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, + struct page **pages, size_t cpu_size) +{ + static struct vm_struct static_vm; + struct pcpu_chunk *static_chunk; + int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE); + unsigned int cpu; + int err, i; + + pcpu_unit_pages_shift = max_t(int, PCPU_MIN_UNIT_PAGES_SHIFT, + order_base_2(cpu_size) - PAGE_SHIFT); + + pcpu_static_size = cpu_size; + pcpu_unit_pages = 1 << pcpu_unit_pages_shift; + pcpu_unit_shift = PAGE_SHIFT + pcpu_unit_pages_shift; + pcpu_unit_size = 1 << pcpu_unit_shift; + pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1; + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + (1 << pcpu_unit_pages_shift) * sizeof(struct page *); + + /* allocate chunk slots */ + pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); + for (i = 0; i < pcpu_nr_slots; i++) + INIT_LIST_HEAD(&pcpu_slot[i]); + + /* init and register vm area */ + static_vm.flags = VM_ALLOC; + static_vm.size = pcpu_chunk_size; + vm_area_register_early(&static_vm); + + /* init static_chunk */ + static_chunk = alloc_bootmem(pcpu_chunk_struct_size); + INIT_LIST_HEAD(&static_chunk->list); + static_chunk->vm = &static_vm; + static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + static_chunk->contig_hint = static_chunk->free_size; + + /* assign pages and map them */ + for_each_possible_cpu(cpu) { + for (i = 0; i < nr_cpu_pages; i++) { + *pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++; + populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i)); + } + } + + err = pcpu_map(static_chunk, 0, nr_cpu_pages); + if (err) + panic("failed to setup static percpu area, err=%d\n", err); + + /* link static_chunk in */ + pcpu_chunk_relocate(static_chunk, -1); + pcpu_chunk_addr_insert(static_chunk); + + /* we're done */ + pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0); + return pcpu_unit_size; +} From 11124411aa95827404d6bfdfc14c908e1b54513c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:09 +0900 Subject: [PATCH 023/580] x86: convert to the new dynamic percpu allocator Impact: use new dynamic allocator, unified access to static/dynamic percpu memory Convert to the new dynamic percpu allocator. * implement populate_extra_pte() for both 32 and 64 * update setup_per_cpu_areas() to use pcpu_setup_static() * define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() * define config HAVE_DYNAMIC_PER_CPU_AREA Signed-off-by: Tejun Heo --- arch/x86/Kconfig | 3 ++ arch/x86/include/asm/percpu.h | 8 ++++ arch/x86/include/asm/pgtable.h | 1 + arch/x86/kernel/setup_percpu.c | 68 +++++++++++++++++++++------------- arch/x86/mm/init_32.c | 10 +++++ arch/x86/mm/init_64.c | 19 ++++++++++ 6 files changed, 84 insertions(+), 25 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f760a22f95dc..d3f6eadfd4ba 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -135,6 +135,9 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y +config HAVE_DYNAMIC_PER_CPU_AREA + def_bool y + config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index aee103b26d01..8f1d2fbec1d4 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -43,6 +43,14 @@ #else /* ...!ASSEMBLY */ #include +#include + +#define __addr_to_pcpu_ptr(addr) \ + (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ + + (unsigned long)__per_cpu_start) +#define __pcpu_ptr_to_addr(ptr) \ + (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ + - (unsigned long)__per_cpu_start) #ifdef CONFIG_SMP #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6f7c102018bf..dd91c2515c64 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -402,6 +402,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); +void populate_extra_pte(unsigned long vaddr); #ifdef CONFIG_X86_32 extern void native_pagetable_setup_start(pgd_t *base); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d992e6cff730..2dce43558217 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -61,38 +61,56 @@ static inline void setup_percpu_segment(int cpu) */ void __init setup_per_cpu_areas(void) { - ssize_t size; - char *ptr; - int cpu; - - /* Copy section for each CPU (we discard the original) */ - size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ssize_t size = __per_cpu_end - __per_cpu_start; + unsigned int nr_cpu_pages = DIV_ROUND_UP(size, PAGE_SIZE); + static struct page **pages; + size_t pages_size; + unsigned int cpu, i, j; + unsigned long delta; + size_t pcpu_unit_size; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); + pr_info("PERCPU: Allocating %zd bytes for static per cpu data\n", size); - pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); + pages_size = nr_cpu_pages * num_possible_cpus() * sizeof(pages[0]); + pages = alloc_bootmem(pages_size); + j = 0; for_each_possible_cpu(cpu) { -#ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); -#else - int node = early_cpu_to_node(cpu); - if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(size); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d at %016lx\n", - cpu, __pa(ptr)); - } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); - pr_debug("per cpu data for cpu%d on node%d at %016lx\n", - cpu, node, __pa(ptr)); - } -#endif + void *ptr; - memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); - per_cpu_offset(cpu) = ptr - __per_cpu_start; + for (i = 0; i < nr_cpu_pages; i++) { +#ifndef CONFIG_NEED_MULTIPLE_NODES + ptr = alloc_bootmem_pages(PAGE_SIZE); +#else + int node = early_cpu_to_node(cpu); + + if (!node_online(node) || !NODE_DATA(node)) { + ptr = alloc_bootmem_pages(PAGE_SIZE); + pr_info("cpu %d has no node %d or node-local " + "memory\n", cpu, node); + pr_debug("per cpu data for cpu%d at %016lx\n", + cpu, __pa(ptr)); + } else { + ptr = alloc_bootmem_pages_node(NODE_DATA(node), + PAGE_SIZE); + pr_debug("per cpu data for cpu%d on node%d " + "at %016lx\n", cpu, node, __pa(ptr)); + } +#endif + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pages[j++] = virt_to_page(ptr); + } + } + + pcpu_unit_size = pcpu_setup_static(populate_extra_pte, pages, size); + + free_bootmem(__pa(pages), pages_size); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) { + per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 00263bf07a88..8b1a0ef7f874 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -137,6 +137,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } +void __init populate_extra_pte(unsigned long vaddr) +{ + int pgd_idx = pgd_index(vaddr); + int pmd_idx = pmd_index(vaddr); + pmd_t *pmd; + + pmd = one_md_table_init(swapper_pg_dir + pgd_idx); + one_page_table_init(pmd + pmd_idx); +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, unsigned long vaddr, pte_t *lastpte) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e6d36b490250..7f91e2cdc4ce 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -223,6 +223,25 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } +void __init populate_extra_pte(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd)) { + pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) { + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + return; + } + } + + set_pte_vaddr_pud((pud_t *)pgd_page_vaddr(*pgd), vaddr, __pte(0)); +} + /* * Create large page table mappings for a range of physical addresses. */ From cae3aeb83fef5a7c9c8ac40e653e59dd9a35469c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 21 Feb 2009 16:56:23 +0900 Subject: [PATCH 024/580] percpu: clean up size usage Andrew was concerned about the unit of variables named or have suffix size. Every usage in percpu allocator is in bytes but make it super clear by adding comments. While at it, make pcpu_depopulate_chunk() take int @off and @size like everyone else. Signed-off-by: Tejun Heo Cc: Andrew Morton --- mm/percpu.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 4617d97e877c..997724c2ea24 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -119,7 +119,7 @@ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ static int pcpu_size_to_slot(int size) { - int highbit = fls(size); + int highbit = fls(size); /* size is in bytes */ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } @@ -158,8 +158,8 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, /** * pcpu_realloc - versatile realloc * @p: the current pointer (can be NULL for new allocations) - * @size: the current size (can be 0 for new allocations) - * @new_size: the wanted new size (can be 0 for free) + * @size: the current size in bytes (can be 0 for new allocations) + * @new_size: the wanted new size in bytes (can be 0 for free) * * More robust realloc which can be used to allocate, resize or free a * memory area of arbitrary size. If the needed size goes over @@ -290,8 +290,8 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) * pcpu_split_block - split a map block * @chunk: chunk of interest * @i: index of map block to split - * @head: head size (can be 0) - * @tail: tail size (can be 0) + * @head: head size in bytes (can be 0) + * @tail: tail size in bytes (can be 0) * * Split the @i'th map block into two or three blocks. If @head is * non-zero, @head bytes block is inserted before block @i moving it @@ -346,7 +346,7 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) /** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest - * @size: wanted size + * @size: wanted size in bytes * @align: wanted align * * Try to allocate @size bytes area aligned at @align from @chunk. @@ -540,15 +540,15 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate * @off: offset to the area to depopulate - * @size: size of the area to depopulate + * @size: size of the area to depopulate in bytes * @flush: whether to flush cache and tlb or not * * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. If @flush is true, vcache is flushed before unmapping * and tlb after. */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, size_t off, - size_t size, bool flush) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, + bool flush) { int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); @@ -617,7 +617,7 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest * @off: offset to the area to populate - * @size: size of the area to populate + * @size: size of the area to populate in bytes * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. The area is cleared on return. @@ -707,7 +707,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) /** * __alloc_percpu - allocate percpu area - * @size: size of area to allocate + * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * * Allocate percpu area of @size bytes aligned at @align. Might @@ -819,6 +819,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_static - initialize kernel static percpu area * @populate_pte_fn: callback to allocate pagetable * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages + * @cpu_size: the size of static percpu area in bytes * * Initialize kernel static percpu area. The caller should allocate * all the necessary pages and pass them in @pages. From 8cfd9e923be54ef66ce174a93f4592b444b96407 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 22 Feb 2009 12:36:55 +0000 Subject: [PATCH 025/580] [ARM] RiscPC: Fix etherh oops The 8390 driver was structured by Al Viro to allow the flexibility required by platforms. lib8390.c contains the core code which drivers explicitly include: - 8390.c includes lib8390.c to provide the standard ISA based driver. - etherh.c includes it with the accessors defined for RiscPC platforms, where it is addressed via the MMIO accessors with a device dependent register spacing. Other platform drivers do something similar. However, b9a9b4b caused the kernel to contain not only the etherh private build of lib8390 (included in etherh.c) but also lib8390.c itself, and referred the new net_device_ops methods to the ISA version. The result of this is is not pretty: Unable to handle kernel paging request at virtual address 12032030 pgd = c8330000 [12032030] *pgd=00000000 Internal error: Oops: 18331805 [#1] Modules linked in: ipv6 CPU: 0 Not tainted (2.6.29-rc3 #167) PC is at do_set_multicast_list+0xd0/0x190 LR is at bitrev32+0x28/0x34 pc : [] lr : [] psr: a0000093 sp : c8321d9c ip : c8321d84 fp : c8321dbc r10: c80c6800 r9 : 00000000 r8 : c80c6b60 r7 : c80c6b80 r6 : cc80c800 r5 : c80c6800 r4 : 00000000 r3 : cc80c80c r2 : 00000004 r1 : 00000007 r0 : e0000000 Flags: NzCv IRQs off FIQs on Mode SVC_32 ISA ARM Segment user ... Fix up b9a9b4b by making etherh's net_device_ops refer to the internal lib8390 functions, and remove the build of the ISA 8390.c driver. Acked-by: David S. Miller Signed-off-by: Russell King --- drivers/net/arm/Makefile | 2 +- drivers/net/arm/etherh.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/arm/Makefile b/drivers/net/arm/Makefile index c69c0cdba4a2..811a3ccd14c1 100644 --- a/drivers/net/arm/Makefile +++ b/drivers/net/arm/Makefile @@ -4,7 +4,7 @@ # obj-$(CONFIG_ARM_AM79C961A) += am79c961a.o -obj-$(CONFIG_ARM_ETHERH) += etherh.o ../8390.o +obj-$(CONFIG_ARM_ETHERH) += etherh.o obj-$(CONFIG_ARM_ETHER3) += ether3.o obj-$(CONFIG_ARM_ETHER1) += ether1.o obj-$(CONFIG_ARM_AT91_ETHER) += at91_ether.o diff --git a/drivers/net/arm/etherh.c b/drivers/net/arm/etherh.c index 54b52e5b1821..f52f668c49bf 100644 --- a/drivers/net/arm/etherh.c +++ b/drivers/net/arm/etherh.c @@ -641,15 +641,15 @@ static const struct net_device_ops etherh_netdev_ops = { .ndo_open = etherh_open, .ndo_stop = etherh_close, .ndo_set_config = etherh_set_config, - .ndo_start_xmit = ei_start_xmit, - .ndo_tx_timeout = ei_tx_timeout, - .ndo_get_stats = ei_get_stats, - .ndo_set_multicast_list = ei_set_multicast_list, + .ndo_start_xmit = __ei_start_xmit, + .ndo_tx_timeout = __ei_tx_timeout, + .ndo_get_stats = __ei_get_stats, + .ndo_set_multicast_list = __ei_set_multicast_list, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = eth_change_mtu, #ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ei_poll, + .ndo_poll_controller = __ei_poll, #endif }; From d82ad6d6833ee8248e6c34e1b9fc3c4e61e3f035 Mon Sep 17 00:00:00 2001 From: Andrei Birjukov Date: Sun, 22 Feb 2009 22:37:21 +0000 Subject: [PATCH 026/580] [ARM] at91: fix for Atmel AT91 powersaving We've discovered that our AT91SAM9260 board consumed too much power when returning from a slowclock low-power mode. RAM self-refresh is enabled in a bootloader in our case, this is how we saw a difference. Estimated ca. 30mA more on 4V battery than the same state before powersaving. After a small research we found that there seems to be a bogus sdram_selfrefresh_disable() call at the end of at91_pm_enter() call, which overwrites the LPR register with uninitialized value. Please find the suggested patch attached. This patch fixes correct restoring of LPR register of the Atmel AT91 SDRAM controller when returning from a power saving mode. Signed-off-by: Andrei Birjukov Acked-by: Andrew Victor Signed-off-by: Andrew Morton Signed-off-by: Russell King --- arch/arm/mach-at91/pm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index 9bb4f043aa22..7ac812dc055a 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -332,7 +332,6 @@ static int at91_pm_enter(suspend_state_t state) at91_sys_read(AT91_AIC_IPR) & at91_sys_read(AT91_AIC_IMR)); error: - sdram_selfrefresh_disable(); target_state = PM_SUSPEND_ON; at91_irq_resume(); at91_gpio_resume(); From ec5b3d32437571b8a742069a4cfd04edb6b6eda5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 23 Feb 2009 14:01:04 -0800 Subject: [PATCH 027/580] x86, mce: remove invalid __cpuinit/__cpuexit annotations Impact: Bug fix when CPU hotplug is disabled Correct the following broken __cpuinit/__cpuexit annotations: - mce_cpu_features() is called from mce_resume(), and so cannot be __cpuinit. - mce_disable_cpu() and mce_reenable_cpu() are called from mce_cpu_callback(), and so cannot be __cpuexit(). Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 60a114ca14f6..0625993bf955 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -598,7 +598,7 @@ static void mce_init(void *dummy) } /* Add per CPU specific workarounds here */ -static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +static void mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { @@ -1056,7 +1056,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. */ -static void __cpuexit mce_disable_cpu(void *h) +static void mce_disable_cpu(void *h) { int i; @@ -1066,7 +1066,7 @@ static void __cpuexit mce_disable_cpu(void *h) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } -static void __cpuexit mce_reenable_cpu(void *h) +static void mce_reenable_cpu(void *h) { int i; From cb83b42e23bd6c4bf91793a320fbe83787c13596 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:20 +0900 Subject: [PATCH 028/580] percpu: fix pcpu_chunk_struct_size Impact: fix short allocation leading to memory corruption While dropping rvalue wrapping macros around global parameters, pcpu_chunk_struct_size was set incorrectly resulting in shorter page pointer array. Fix it. Signed-off-by: Tejun Heo --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/percpu.c b/mm/percpu.c index 997724c2ea24..ed92caa2aa3b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -850,7 +850,7 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + (1 << pcpu_unit_pages_shift) * sizeof(struct page *); + + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); /* allocate chunk slots */ pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); From c132937556f56ee4b831ef4b23f1846e05fde102 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:20 +0900 Subject: [PATCH 029/580] bootmem: clean up arch-specific bootmem wrapping Impact: cleaner and consistent bootmem wrapping By setting CONFIG_HAVE_ARCH_BOOTMEM_NODE, archs can define arch-specific wrappers for bootmem allocation. However, this is done a bit strangely in that only the high level convenience macros can be changed while lower level, but still exported, interface functions can't be wrapped. This not only is messy but also leads to strange situation where alloc_bootmem() does what the arch wants it to do but the equivalent __alloc_bootmem() call doesn't although they should be able to be used interchangeably. This patch updates bootmem such that archs can override / wrap the backend function - alloc_bootmem_core() instead of the highlevel interface functions to allow simpler and consistent wrapping. Also, HAVE_ARCH_BOOTMEM_NODE is renamed to HAVE_ARCH_BOOTMEM. Signed-off-by: Tejun Heo Cc: Johannes Weiner --- arch/avr32/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/x86/include/asm/mmzone_32.h | 43 ++++---------------------------- include/linux/bootmem.h | 10 +++----- mm/bootmem.c | 14 ++++++++--- 5 files changed, 22 insertions(+), 49 deletions(-) diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index b189680d18b0..05fe3053dcae 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt" config QUICKLIST def_bool y -config HAVE_ARCH_BOOTMEM_NODE +config HAVE_ARCH_BOOTMEM def_bool n config ARCH_HAVE_MEMORY_PRESENT diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d3f6eadfd4ba..6fd3b2302ed9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1111,7 +1111,7 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accomodate various tables. -config HAVE_ARCH_BOOTMEM_NODE +config HAVE_ARCH_BOOTMEM def_bool y depends on X86_32 && NUMA diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 07f1af494ca5..1e0fa9e63afa 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -93,45 +93,12 @@ static inline int pfn_valid(int pfn) #endif /* CONFIG_DISCONTIGMEM */ #ifdef CONFIG_NEED_MULTIPLE_NODES - -/* - * Following are macros that are specific to this numa platform. - */ -#define reserve_bootmem(addr, size, flags) \ - reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags)) -#define alloc_bootmem(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_nopanic(x) \ - __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ - __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0) -#define alloc_bootmem_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_pages_nopanic(x) \ - __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \ - __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#define alloc_bootmem_node(pgdat, x) \ +/* always use node 0 for bootmem on this numa platform */ +#define alloc_bootmem_core(__bdata, size, align, goal, limit) \ ({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \ - __pa(MAX_DMA_ADDRESS)); \ -}) -#define alloc_bootmem_pages_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \ - __pa(MAX_DMA_ADDRESS)); \ -}) -#define alloc_bootmem_low_pages_node(pgdat, x) \ -({ \ - struct pglist_data __maybe_unused \ - *__alloc_bootmem_node__pgdat = (pgdat); \ - __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \ + bootmem_data_t __maybe_unused * __abm_bdata_dummy = (__bdata); \ + __alloc_bootmem_core(NODE_DATA(0)->bdata, \ + (size), (align), (goal), (limit)); \ }) #endif /* CONFIG_NEED_MULTIPLE_NODES */ diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 95837bfb5256..3a87f93081ed 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -69,10 +69,9 @@ extern int reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags); -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE -extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags); -#endif - +extern int reserve_bootmem(unsigned long addr, + unsigned long size, + int flags); extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal); @@ -94,7 +93,7 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE + #define alloc_bootmem(x) \ __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_nopanic(x) \ @@ -113,7 +112,6 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_low_pages_node(pgdat, x) \ __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) -#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ extern int reserve_bootmem_generic(unsigned long addr, unsigned long size, int flags); diff --git a/mm/bootmem.c b/mm/bootmem.c index 51a0ccf61e0e..d7140c008ba8 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -37,6 +37,16 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); static int bootmem_debug; +/* + * If an arch needs to apply workarounds to bootmem allocation, it can + * set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around + * __alloc_bootmem_core(). + */ +#ifndef CONFIG_HAVE_ARCH_BOOTMEM +#define alloc_bootmem_core(bdata, size, align, goal, limit) \ + __alloc_bootmem_core((bdata), (size), (align), (goal), (limit)) +#endif + static int __init bootmem_debug_setup(char *buf) { bootmem_debug = 1; @@ -382,7 +392,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); } -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE /** * reserve_bootmem - mark a page range as usable * @addr: starting address of the range @@ -403,7 +412,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size, return mark_bootmem(start, end, 1, flags); } -#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) @@ -428,7 +436,7 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, return ALIGN(base + off, align) - base; } -static void * __init alloc_bootmem_core(struct bootmem_data *bdata, +static void * __init __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { From 2d0aae41695257603fc281b519677131ab5a752b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: [PATCH 030/580] bootmem: reorder interface functions and add a missing one Impact: cleanup and addition of missing interface wrapper The interface functions in bootmem.h was ordered in not so orderly manner. Reorder them such that * functions allocating the same area group together - ie. alloc_bootmem group and alloc_bootmem_low group. * functions w/o node parameter come before the ones w/ node parameter. * nopanic variants are immediately below their panicky counterparts. While at it, add alloc_bootmem_pages_node_nopanic() which was missing. Signed-off-by: Tejun Heo Cc: Johannes Weiner --- include/linux/bootmem.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 3a87f93081ed..455d83219fae 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -65,22 +65,20 @@ extern void free_bootmem(unsigned long addr, unsigned long size); #define BOOTMEM_DEFAULT 0 #define BOOTMEM_EXCLUSIVE (1<<0) -extern int reserve_bootmem_node(pg_data_t *pgdat, - unsigned long physaddr, - unsigned long size, - int flags); extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags); -extern void *__alloc_bootmem_nopanic(unsigned long size, +extern int reserve_bootmem_node(pg_data_t *pgdat, + unsigned long physaddr, + unsigned long size, + int flags); + +extern void *__alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal); -extern void *__alloc_bootmem(unsigned long size, +extern void *__alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal); -extern void *__alloc_bootmem_low(unsigned long size, - unsigned long align, - unsigned long goal); extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, @@ -89,6 +87,9 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); +extern void *__alloc_bootmem_low(unsigned long size, + unsigned long align, + unsigned long goal); extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, @@ -98,18 +99,21 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_nopanic(x) \ __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low(x) \ - __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) #define alloc_bootmem_pages(x) \ __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_nopanic(x) \ __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) -#define alloc_bootmem_low_pages(x) \ - __alloc_bootmem_low(x, PAGE_SIZE, 0) #define alloc_bootmem_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages_node(pgdat, x) \ __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_pages_node_nopanic(pgdat, x) \ + __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) + +#define alloc_bootmem_low(x) \ + __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) +#define alloc_bootmem_low_pages(x) \ + __alloc_bootmem_low(x, PAGE_SIZE, 0) #define alloc_bootmem_low_pages_node(pgdat, x) \ __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) From c0c0a29379b5848aec2e8f1c58d853d3cb7118b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: [PATCH 031/580] vmalloc: add @align to vm_area_register_early() Impact: allow larger alignment for early vmalloc area allocation Some early vmalloc users might want larger alignment, for example, for custom large page mapping. Add @align to vm_area_register_early(). While at it, drop docbook comment on non-existent @size. Signed-off-by: Tejun Heo Cc: Nick Piggin Cc: Ivan Kokshaysky --- arch/alpha/mm/init.c | 2 +- include/linux/vmalloc.h | 2 +- mm/percpu.c | 2 +- mm/vmalloc.c | 11 +++++++---- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index df6df025ded4..91eddd8505df 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -200,7 +200,7 @@ callback_init(void * kernel_end) /* register the vm area */ console_remap_vm.flags = VM_ALLOC; console_remap_vm.size = nr_pages << PAGE_SHIFT; - vm_area_register_early(&console_remap_vm); + vm_area_register_early(&console_remap_vm, PAGE_SIZE); vaddr = (unsigned long)consle_remap_vm.addr; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 599ba7984310..2f6994fdf0e0 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -109,6 +109,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count); */ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; -extern __init void vm_area_register_early(struct vm_struct *vm); +extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/percpu.c b/mm/percpu.c index ed92caa2aa3b..41e7a5f5ab1b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -860,7 +860,7 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, /* init and register vm area */ static_vm.flags = VM_ALLOC; static_vm.size = pcpu_chunk_size; - vm_area_register_early(&static_vm); + vm_area_register_early(&static_vm, PAGE_SIZE); /* init static_chunk */ static_chunk = alloc_bootmem(pcpu_chunk_struct_size); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 224eca9650a8..366ae9ea6af2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -995,7 +995,7 @@ EXPORT_SYMBOL(vm_map_ram); /** * vm_area_register_early - register vmap area early during boot * @vm: vm_struct to register - * @size: size of area to register + * @align: requested alignment * * This function is used to register kernel vm area before * vmalloc_init() is called. @vm->size and @vm->flags should contain @@ -1004,12 +1004,15 @@ EXPORT_SYMBOL(vm_map_ram); * * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. */ -void __init vm_area_register_early(struct vm_struct *vm) +void __init vm_area_register_early(struct vm_struct *vm, size_t align) { static size_t vm_init_off __initdata; + unsigned long addr; - vm->addr = (void *)VMALLOC_START + vm_init_off; - vm_init_off = PFN_ALIGN(vm_init_off + vm->size); + addr = ALIGN(VMALLOC_START + vm_init_off, align); + vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; + + vm->addr = (void *)addr; vm->next = vmlist; vmlist = vm; From 458a3e644c3327be529393982e24277eda8f1ac7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: [PATCH 032/580] x86: update populate_extra_pte() and add populate_extra_pmd() Impact: minor change to populate_extra_pte() and addition of pmd flavor Update populate_extra_pte() to return pointer to the pte_t for the specified address and add populate_extra_pmd() which only populates till the pmd and returns pointer to the pmd entry for the address. For 64bit, pud/pmd/pte fill functions are separated out from set_pte_vaddr[_pud]() and used for set_pte_vaddr[_pud]() and populate_extra_{pte|pmd}(). Signed-off-by: Tejun Heo --- arch/x86/include/asm/pgtable.h | 3 +- arch/x86/kernel/setup_percpu.c | 7 ++- arch/x86/mm/init_32.c | 13 ++++-- arch/x86/mm/init_64.c | 83 ++++++++++++++++++++-------------- 4 files changed, 67 insertions(+), 39 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index dd91c2515c64..46312eb0d682 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -402,7 +402,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); -void populate_extra_pte(unsigned long vaddr); +pmd_t *populate_extra_pmd(unsigned long vaddr); +pte_t *populate_extra_pte(unsigned long vaddr); #ifdef CONFIG_X86_32 extern void native_pagetable_setup_start(pgd_t *base); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2dce43558217..671e6528a82d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -41,6 +41,11 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +static void __init pcpu4k_populate_pte(unsigned long addr) +{ + populate_extra_pte(addr); +} + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -104,7 +109,7 @@ void __init setup_per_cpu_areas(void) } } - pcpu_unit_size = pcpu_setup_static(populate_extra_pte, pages, size); + pcpu_unit_size = pcpu_setup_static(pcpu4k_populate_pte, pages, size); free_bootmem(__pa(pages), pages_size); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8b1a0ef7f874..84a26883ab44 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -137,14 +137,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } -void __init populate_extra_pte(unsigned long vaddr) +pmd_t * __init populate_extra_pmd(unsigned long vaddr) { int pgd_idx = pgd_index(vaddr); int pmd_idx = pmd_index(vaddr); + + return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + int pte_idx = pte_index(vaddr); pmd_t *pmd; - pmd = one_md_table_init(swapper_pg_dir + pgd_idx); - one_page_table_init(pmd + pmd_idx); + pmd = populate_extra_pmd(vaddr); + return one_page_table_init(pmd) + pte_idx; } static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7f91e2cdc4ce..7d4e76da3368 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -168,34 +168,51 @@ static __ref void *spp_getpage(void) return ptr; } -void -set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr) +{ + if (pgd_none(*pgd)) { + pud_t *pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + } + return pud_offset(pgd, vaddr); +} + +static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr) +{ + if (pud_none(*pud)) { + pmd_t *pmd = (pmd_t *) spp_getpage(); + pud_populate(&init_mm, pud, pmd); + if (pmd != pmd_offset(pud, 0)) + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + pmd, pmd_offset(pud, 0)); + } + return pmd_offset(pud, vaddr); +} + +static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr) +{ + if (pmd_none(*pmd)) { + pte_t *pte = (pte_t *) spp_getpage(); + pmd_populate_kernel(&init_mm, pmd, pte); + if (pte != pte_offset_kernel(pmd, 0)) + printk(KERN_ERR "PAGETABLE BUG #02!\n"); + } + return pte_offset_kernel(pmd, vaddr); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) { pud_t *pud; pmd_t *pmd; pte_t *pte; pud = pud_page + pud_index(vaddr); - if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); - pud_populate(&init_mm, pud, pmd); - if (pmd != pmd_offset(pud, 0)) { - printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", - pmd, pmd_offset(pud, 0)); - return; - } - } - pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); - pmd_populate_kernel(&init_mm, pmd, pte); - if (pte != pte_offset_kernel(pmd, 0)) { - printk(KERN_ERR "PAGETABLE BUG #02!\n"); - return; - } - } + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); set_pte(pte, new_pte); /* @@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } -void -set_pte_vaddr(unsigned long vaddr, pte_t pteval) +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; pud_t *pud_page; @@ -223,23 +239,22 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } -void __init populate_extra_pte(unsigned long vaddr) +pmd_t * __init populate_extra_pmd(unsigned long vaddr) { pgd_t *pgd; pud_t *pud; pgd = pgd_offset_k(vaddr); - if (pgd_none(*pgd)) { - pud = (pud_t *)spp_getpage(); - pgd_populate(&init_mm, pgd, pud); - if (pud != pud_offset(pgd, 0)) { - printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", - pud, pud_offset(pgd, 0)); - return; - } - } + pud = fill_pud(pgd, vaddr); + return fill_pmd(pud, vaddr); +} - set_pte_vaddr_pud((pud_t *)pgd_page_vaddr(*pgd), vaddr, __pte(0)); +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return fill_pte(pmd, vaddr); } /* From d9b55eeb1d55ef2dc5a4fdbff9604c2c68cb5649 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: [PATCH 033/580] percpu: remove unit_size power-of-2 restriction Impact: allow unit_size to be arbitrary multiple of PAGE_SIZE In dynamic percpu allocator, there is no reason the unit size should be power of two. Remove the restriction. As non-power-of-two unit size means that empty chunks fall into the same slot index as lightly occupied chunks which is bad for reclaming. Reserve an extra slot for empty chunks. Signed-off-by: Tejun Heo --- mm/percpu.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 41e7a5f5ab1b..d9e6e5d1dbd4 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -67,7 +67,7 @@ #include #include -#define PCPU_MIN_UNIT_PAGES_SHIFT 4 /* also max alloc size */ +#define PCPU_MIN_UNIT_PAGES 16 /* max alloc size in pages */ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ @@ -83,9 +83,7 @@ struct pcpu_chunk { struct page *page[]; /* #cpus * UNIT_PAGES */ }; -static int pcpu_unit_pages_shift; static int pcpu_unit_pages; -static int pcpu_unit_shift; static int pcpu_unit_size; static int pcpu_chunk_size; static int pcpu_nr_slots; @@ -117,12 +115,19 @@ static DEFINE_MUTEX(pcpu_mutex); static struct list_head *pcpu_slot; /* chunk list slots */ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ -static int pcpu_size_to_slot(int size) +static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } +static int pcpu_size_to_slot(int size) +{ + if (size == pcpu_unit_size) + return pcpu_nr_slots - 1; + return __pcpu_size_to_slot(size); +} + static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) @@ -133,7 +138,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) static int pcpu_page_idx(unsigned int cpu, int page_idx) { - return (cpu << pcpu_unit_pages_shift) + page_idx; + return cpu * pcpu_unit_pages + page_idx; } static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, @@ -659,7 +664,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) goto err; for_each_possible_cpu(cpu) - memset(chunk->vm->addr + (cpu << pcpu_unit_shift) + off, 0, + memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, size); return 0; @@ -722,7 +727,7 @@ void *__alloc_percpu(size_t size, size_t align) struct pcpu_chunk *chunk; int slot, off; - if (unlikely(!size || size > PAGE_SIZE << PCPU_MIN_UNIT_PAGES_SHIFT || + if (unlikely(!size || size > PCPU_MIN_UNIT_PAGES * PAGE_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " "percpu allocation\n", size, align); @@ -840,19 +845,19 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, unsigned int cpu; int err, i; - pcpu_unit_pages_shift = max_t(int, PCPU_MIN_UNIT_PAGES_SHIFT, - order_base_2(cpu_size) - PAGE_SHIFT); + pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_PAGES, PFN_UP(cpu_size)); pcpu_static_size = cpu_size; - pcpu_unit_pages = 1 << pcpu_unit_pages_shift; - pcpu_unit_shift = PAGE_SHIFT + pcpu_unit_pages_shift; - pcpu_unit_size = 1 << pcpu_unit_shift; + pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; - pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); - /* allocate chunk slots */ + /* + * Allocate chunk slots. The additional last slot is for + * empty chunks. + */ + pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); From 8d408b4be37bc49c9086531f2ebe411cf5731746 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: [PATCH 034/580] percpu: give more latitude to arch specific first chunk initialization Impact: more latitude for first percpu chunk allocation The first percpu chunk serves the kernel static percpu area and may or may not contain extra room for further dynamic allocation. Initialization of the first chunk needs to be done before normal memory allocation service is up, so it has its own init path - pcpu_setup_static(). It seems archs need more latitude while initializing the first chunk for example to take advantage of large page mapping. This patch makes the following changes to allow this. * Define PERCPU_DYNAMIC_RESERVE to give arch hint about how much space to reserve in the first chunk for further dynamic allocation. * Rename pcpu_setup_static() to pcpu_setup_first_chunk(). * Make pcpu_setup_first_chunk() much more flexible by fetching page pointer by callback and adding optional @unit_size, @free_size and @base_addr arguments which allow archs to selectively part of chunk initialization to their likings. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 15 +++- include/linux/percpu.h | 39 ++++++++- mm/percpu.c | 151 +++++++++++++++++++++++++-------- 3 files changed, 168 insertions(+), 37 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 671e6528a82d..d928e8887201 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -41,6 +41,16 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + static void __init pcpu4k_populate_pte(unsigned long addr) { populate_extra_pte(addr); @@ -109,7 +119,10 @@ void __init setup_per_cpu_areas(void) } } - pcpu_unit_size = pcpu_setup_static(pcpu4k_populate_pte, pages, size); + pcpu4k_pages = pages; + pcpu4k_nr_static_pages = nr_cpu_pages; + pcpu_unit_size = pcpu_setup_first_chunk(pcpu4k_get_page, size, 0, 0, + NULL, pcpu4k_populate_pte); free_bootmem(__pa(pages), pages_size); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 18080995ff3e..910beb0abea2 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -78,12 +78,47 @@ #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +/* minimum unit size, also is the maximum supported allocation size */ +#define PCPU_MIN_UNIT_SIZE (16UL << PAGE_SHIFT) + +/* + * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy + * back on the first chunk if arch is manually allocating and mapping + * it for faster access (as a part of large page mapping for example). + * Note that dynamic percpu allocator covers both static and dynamic + * areas, so these values are bigger than PERCPU_MODULE_RESERVE. + * + * On typical configuration with modules, the following values leave + * about 8k of free space on the first chunk after boot on both x86_32 + * and 64 when module support is enabled. When module support is + * disabled, it's much tighter. + */ +#ifndef PERCPU_DYNAMIC_RESERVE +# if BITS_PER_LONG > 32 +# ifdef CONFIG_MODULES +# define PERCPU_DYNAMIC_RESERVE (6 << PAGE_SHIFT) +# else +# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) +# endif +# else +# ifdef CONFIG_MODULES +# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) +# else +# define PERCPU_DYNAMIC_RESERVE (2 << PAGE_SHIFT) +# endif +# endif +#endif /* PERCPU_DYNAMIC_RESERVE */ + extern void *pcpu_base_addr; +typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); -extern size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, - struct page **pages, size_t cpu_size); +extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + size_t static_size, size_t unit_size, + size_t free_size, void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn); + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index d9e6e5d1dbd4..9ac01980cce0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -48,8 +48,8 @@ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back * - * - use pcpu_setup_static() during percpu area initialization to - * setup kernel static percpu area + * - use pcpu_setup_first_chunk() during percpu area initialization to + * setup the first chunk containing the kernel static percpu area */ #include @@ -67,7 +67,6 @@ #include #include -#define PCPU_MIN_UNIT_PAGES 16 /* max alloc size in pages */ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ @@ -80,6 +79,7 @@ struct pcpu_chunk { int map_used; /* # of map entries used */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + bool immutable; /* no [de]population allowed */ struct page *page[]; /* #cpus * UNIT_PAGES */ }; @@ -521,6 +521,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, unsigned int last = num_possible_cpus() - 1; unsigned int cpu; + /* unmap must not be done on immutable chunk */ + WARN_ON(chunk->immutable); + /* * Each flushing trial can be very expensive, issue flush on * the whole region at once rather than doing it for each cpu. @@ -602,6 +605,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) unsigned int cpu; int err; + /* map must not be done on immutable chunk */ + WARN_ON(chunk->immutable); + for_each_possible_cpu(cpu) { err = map_kernel_range_noflush( pcpu_chunk_addr(chunk, cpu, page_start), @@ -727,8 +733,7 @@ void *__alloc_percpu(size_t size, size_t align) struct pcpu_chunk *chunk; int slot, off; - if (unlikely(!size || size > PCPU_MIN_UNIT_PAGES * PAGE_SIZE || - align > PAGE_SIZE)) { + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " "percpu allocation\n", size, align); return NULL; @@ -776,6 +781,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); static void pcpu_kill_chunk(struct pcpu_chunk *chunk) { + WARN_ON(chunk->immutable); pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); list_del(&chunk->list); rb_erase(&chunk->rb_node, &pcpu_addr_root); @@ -821,33 +827,73 @@ void free_percpu(void *ptr) EXPORT_SYMBOL_GPL(free_percpu); /** - * pcpu_setup_static - initialize kernel static percpu area - * @populate_pte_fn: callback to allocate pagetable - * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages - * @cpu_size: the size of static percpu area in bytes + * pcpu_setup_first_chunk - initialize the first percpu chunk + * @get_page_fn: callback to fetch page pointer + * @static_size: the size of static percpu area in bytes + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto + * @free_size: free size in bytes, 0 for auto + * @base_addr: mapped address, NULL for auto + * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary * - * Initialize kernel static percpu area. The caller should allocate - * all the necessary pages and pass them in @pages. - * @populate_pte_fn() is called on each page to be used for percpu - * mapping and is responsible for making sure all the necessary page - * tables for the page is allocated. + * Initialize the first percpu chunk which contains the kernel static + * perpcu area. This function is to be called from arch percpu area + * setup path. The first two parameters are mandatory. The rest are + * optional. + * + * @get_page_fn() should return pointer to percpu page given cpu + * number and page number. It should at least return enough pages to + * cover the static area. The returned pages for static area should + * have been initialized with valid data. If @unit_size is specified, + * it can also return pages after the static area. NULL return + * indicates end of pages for the cpu. Note that @get_page_fn() must + * return the same number of pages for all cpus. + * + * @unit_size, if non-zero, determines unit size and must be aligned + * to PAGE_SIZE and equal to or larger than @static_size + @free_size. + * + * @free_size determines the number of free bytes after the static + * area in the first chunk. If zero, whatever left is available. + * Specifying non-zero value make percpu leave the area after + * @static_size + @free_size alone. + * + * Non-null @base_addr means that the caller already allocated virtual + * region for the first chunk and mapped it. percpu must not mess + * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL + * @populate_pte_fn doesn't make any sense. + * + * @populate_pte_fn is used to populate the pagetable. NULL means the + * caller already populated the pagetable. * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access. */ -size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, - struct page **pages, size_t cpu_size) +size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + size_t static_size, size_t unit_size, + size_t free_size, void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct static_vm; struct pcpu_chunk *static_chunk; - int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE); unsigned int cpu; + int nr_pages; int err, i; - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_PAGES, PFN_UP(cpu_size)); + /* santiy checks */ + BUG_ON(!static_size); + BUG_ON(!unit_size && free_size); + BUG_ON(unit_size && unit_size < static_size + free_size); + BUG_ON(unit_size & ~PAGE_MASK); + BUG_ON(base_addr && !unit_size); + BUG_ON(base_addr && populate_pte_fn); - pcpu_static_size = cpu_size; + if (unit_size) + pcpu_unit_pages = unit_size >> PAGE_SHIFT; + else + pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, + PFN_UP(static_size)); + + pcpu_static_size = static_size; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) @@ -862,29 +908,66 @@ size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); - /* init and register vm area */ - static_vm.flags = VM_ALLOC; - static_vm.size = pcpu_chunk_size; - vm_area_register_early(&static_vm, PAGE_SIZE); - /* init static_chunk */ static_chunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&static_chunk->list); static_chunk->vm = &static_vm; - static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + + if (free_size) + static_chunk->free_size = free_size; + else + static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + static_chunk->contig_hint = static_chunk->free_size; - /* assign pages and map them */ - for_each_possible_cpu(cpu) { - for (i = 0; i < nr_cpu_pages; i++) { - *pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++; - populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i)); - } + /* allocate vm address */ + static_vm.flags = VM_ALLOC; + static_vm.size = pcpu_chunk_size; + + if (!base_addr) + vm_area_register_early(&static_vm, PAGE_SIZE); + else { + /* + * Pages already mapped. No need to remap into + * vmalloc area. In this case the static chunk can't + * be mapped or unmapped by percpu and is marked + * immutable. + */ + static_vm.addr = base_addr; + static_chunk->immutable = true; } - err = pcpu_map(static_chunk, 0, nr_cpu_pages); - if (err) - panic("failed to setup static percpu area, err=%d\n", err); + /* assign pages */ + nr_pages = -1; + for_each_possible_cpu(cpu) { + for (i = 0; i < pcpu_unit_pages; i++) { + struct page *page = get_page_fn(cpu, i); + + if (!page) + break; + *pcpu_chunk_pagep(static_chunk, cpu, i) = page; + } + + BUG_ON(i < PFN_UP(pcpu_static_size)); + + if (nr_pages < 0) + nr_pages = i; + else + BUG_ON(nr_pages != i); + } + + /* map them */ + if (populate_pte_fn) { + for_each_possible_cpu(cpu) + for (i = 0; i < nr_pages; i++) + populate_pte_fn(pcpu_chunk_addr(static_chunk, + cpu, i)); + + err = pcpu_map(static_chunk, 0, nr_pages); + if (err) + panic("failed to setup static percpu area, err=%d\n", + err); + } /* link static_chunk in */ pcpu_chunk_relocate(static_chunk, -1); From 5f5d8405d1c50f5cf7e1dbfe9c9b44e2f015c8fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: [PATCH 035/580] x86: separate out setup_pcpu_4k() from setup_per_cpu_areas() Impact: modularize percpu first chunk allocation x86 is gonna have a few different strategies for the first chunk allocation. Modularize it by separating out the current allocation mechanism into pcpu_alloc_bootmem() and setup_pcpu_4k(). Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 144 +++++++++++++++++++++++---------- 1 file changed, 102 insertions(+), 42 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d928e8887201..4a17c96f4f6c 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,52 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +/** + * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu + * @cpu: cpu to allocate for + * @size: size allocation in bytes + * @align: alignment + * + * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper + * does the right thing for NUMA regardless of the current + * configuration. + * + * RETURNS: + * Pointer to the allocated area on success, NULL on failure. + */ +static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, + unsigned long align) +{ + const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NEED_MULTIPLE_NODES + int node = early_cpu_to_node(cpu); + void *ptr; + + if (!node_online(node) || !NODE_DATA(node)) { + ptr = __alloc_bootmem_nopanic(size, align, goal); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", + cpu, size, __pa(ptr)); + } else { + ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), + size, align, goal); + pr_debug("per cpu data for cpu%d %lu bytes on node%d at " + "%016lx\n", cpu, size, node, __pa(ptr)); + } + return ptr; +#else + return __alloc_bootmem_nopanic(size, align, goal); +#endif +} + +/* + * 4k page allocator + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page and most of initialization is done by the generic + * setup function. + */ static struct page **pcpu4k_pages __initdata; static int pcpu4k_nr_static_pages __initdata; @@ -56,6 +103,51 @@ static void __init pcpu4k_populate_pte(unsigned long addr) populate_extra_pte(addr); } +static ssize_t __init setup_pcpu_4k(size_t static_size) +{ + size_t pages_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + pcpu4k_nr_static_pages = PFN_UP(static_size); + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() + * sizeof(pcpu4k_pages[0])); + pcpu4k_pages = alloc_bootmem(pages_size); + + /* allocate and copy */ + j = 0; + for_each_possible_cpu(cpu) + for (i = 0; i < pcpu4k_nr_static_pages; i++) { + void *ptr; + + ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); + if (!ptr) + goto enomem; + + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pcpu4k_pages[j++] = virt_to_page(ptr); + } + + /* we're ready, commit */ + pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", + pcpu4k_nr_static_pages, static_size); + + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL, + pcpu4k_populate_pte); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpu4k_pages), pages_size); + return ret; +} + static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -76,56 +168,24 @@ static inline void setup_percpu_segment(int cpu) */ void __init setup_per_cpu_areas(void) { - ssize_t size = __per_cpu_end - __per_cpu_start; - unsigned int nr_cpu_pages = DIV_ROUND_UP(size, PAGE_SIZE); - static struct page **pages; - size_t pages_size; - unsigned int cpu, i, j; + size_t static_size = __per_cpu_end - __per_cpu_start; + unsigned int cpu; unsigned long delta; size_t pcpu_unit_size; + ssize_t ret; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); - pr_info("PERCPU: Allocating %zd bytes for static per cpu data\n", size); - pages_size = nr_cpu_pages * num_possible_cpus() * sizeof(pages[0]); - pages = alloc_bootmem(pages_size); + /* allocate percpu area */ + ret = setup_pcpu_4k(static_size); + if (ret < 0) + panic("cannot allocate static percpu area (%zu bytes, err=%zd)", + static_size, ret); - j = 0; - for_each_possible_cpu(cpu) { - void *ptr; - - for (i = 0; i < nr_cpu_pages; i++) { -#ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(PAGE_SIZE); -#else - int node = early_cpu_to_node(cpu); - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(PAGE_SIZE); - pr_info("cpu %d has no node %d or node-local " - "memory\n", cpu, node); - pr_debug("per cpu data for cpu%d at %016lx\n", - cpu, __pa(ptr)); - } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(node), - PAGE_SIZE); - pr_debug("per cpu data for cpu%d on node%d " - "at %016lx\n", cpu, node, __pa(ptr)); - } -#endif - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pages[j++] = virt_to_page(ptr); - } - } - - pcpu4k_pages = pages; - pcpu4k_nr_static_pages = nr_cpu_pages; - pcpu_unit_size = pcpu_setup_first_chunk(pcpu4k_get_page, size, 0, 0, - NULL, pcpu4k_populate_pte); - - free_bootmem(__pa(pages), pages_size); + pcpu_unit_size = ret; + /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; From 89c9215165ca609096e845926d9a18f1306176a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: [PATCH 036/580] x86: add embedding percpu first chunk allocator Impact: add better first percpu allocation for !NUMA On !NUMA, we can simply allocate contiguous memory and use it for the first chunk without mapping it into vmalloc area. As the memory area is covered by the large page physical memory mapping, it allows the dynamic perpcu allocator to not add any TLB overhead for the static percpu area and whatever falls into the first chunk and the implementation is very simple too. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 86 +++++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4a17c96f4f6c..fd4c399675df 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -42,6 +42,35 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +/** + * pcpu_need_numa - determine percpu allocation needs to consider NUMA + * + * If NUMA is not configured or there is only one NUMA node available, + * there is no reason to consider NUMA. This function determines + * whether percpu allocation should consider NUMA or not. + * + * RETURNS: + * true if NUMA should be considered; otherwise, false. + */ +static bool __init pcpu_need_numa(void) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + pg_data_t *last = NULL; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + return true; + + last = NODE_DATA(node); + } +#endif + return false; +} + /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu * @cpu: cpu to allocate for @@ -81,6 +110,59 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } +/* + * Embedding allocator + * + * The first chunk is sized to just contain the static area plus + * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using + * bootmem allocator and used as-is without being mapped into vmalloc + * area. This enables the first chunk to piggy back on the linear + * physical PMD mapping and doesn't add any additional pressure to + * TLB. + */ +static void *pcpue_ptr __initdata; +static size_t pcpue_unit_size __initdata; + +static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) +{ + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + + ((size_t)pageno << PAGE_SHIFT)); +} + +static ssize_t __init setup_pcpu_embed(size_t static_size) +{ + unsigned int cpu; + + /* + * If large page isn't supported, there's no benefit in doing + * this. Also, embedding allocation doesn't play well with + * NUMA. + */ + if (!cpu_has_pse || pcpu_need_numa()) + return -EINVAL; + + /* allocate and copy */ + pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpue_unit_size = max(pcpue_unit_size, PCPU_MIN_UNIT_SIZE); + pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, + PAGE_SIZE); + if (!pcpue_ptr) + return -ENOMEM; + + for_each_possible_cpu(cpu) + memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load, + static_size); + + /* we're ready, commit */ + pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", + pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size); + + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + pcpue_unit_size, + pcpue_unit_size - static_size, pcpue_ptr, + NULL); +} + /* * 4k page allocator * @@ -178,7 +260,9 @@ void __init setup_per_cpu_areas(void) NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* allocate percpu area */ - ret = setup_pcpu_4k(static_size); + ret = setup_pcpu_embed(static_size); + if (ret < 0) + ret = setup_pcpu_4k(static_size); if (ret < 0) panic("cannot allocate static percpu area (%zu bytes, err=%zd)", static_size, ret); From 8ac837571491e239e64bd87863c1679d8002e8a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:22 +0900 Subject: [PATCH 037/580] x86: add remapping percpu first chunk allocator Impact: add better first percpu allocation for NUMA On NUMA, embedding allocator can't be used as different units can't be made to fall in the correct NUMA nodes. To use large page mapping, each unit needs to be remapped. However, percpu areas are usually much smaller than large page size and unused space hurts a lot as the number of cpus grow. This allocator remaps large pages for each chunk but gives back unused part to the bootmem allocator making the large pages mapped twice. This adds slightly to the TLB pressure but is much better than using 4k mappings while still being NUMA-friendly. Ingo suggested that this would be the correct approach for NUMA. Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 137 ++++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index fd4c399675df..2d946a8f78b9 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -110,6 +110,133 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } +/* + * Remap allocator + * + * This allocator uses PMD page as unit. A PMD page is allocated for + * each cpu and each is remapped into vmalloc area using PMD mapping. + * As PMD page is quite large, only part of it is used for the first + * chunk. Unused part is returned to the bootmem allocator. + * + * So, the PMD pages are mapped twice - once to the physical mapping + * and to the vmalloc area for the first percpu chunk. The double + * mapping does add one more PMD TLB entry pressure but still is much + * better than only using 4k mappings while still being NUMA friendly. + */ +#ifdef CONFIG_NEED_MULTIPLE_NODES +static size_t pcpur_size __initdata; +static void **pcpur_ptrs __initdata; + +static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpur_size) + return NULL; + + return virt_to_page(pcpur_ptrs[cpu] + off); +} + +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ + static struct vm_struct vm; + pg_data_t *last; + size_t ptrs_size; + unsigned int cpu; + ssize_t ret; + + /* + * If large page isn't supported, there's no benefit in doing + * this. Also, on non-NUMA, embedding is better. + */ + if (!cpu_has_pse || pcpu_need_numa()) + return -EINVAL; + + last = NULL; + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + goto proceed; + + last = NODE_DATA(node); + } + return -EINVAL; + +proceed: + /* + * Currently supports only single page. Supporting multiple + * pages won't be too difficult if it ever becomes necessary. + */ + pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + if (pcpur_size > PMD_SIZE) { + pr_warning("PERCPU: static data is larger than large page, " + "can't use large page\n"); + return -EINVAL; + } + + /* allocate pointer array and alloc large pages */ + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); + pcpur_ptrs = alloc_bootmem(ptrs_size); + + for_each_possible_cpu(cpu) { + pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE); + if (!pcpur_ptrs[cpu]) + goto enomem; + + /* + * Only use pcpur_size bytes and give back the rest. + * + * Ingo: The 2MB up-rounding bootmem is needed to make + * sure the partial 2MB page is still fully RAM - it's + * not well-specified to have a PAT-incompatible area + * (unmapped RAM, device memory, etc.) in that hole. + */ + free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), + PMD_SIZE - pcpur_size); + + memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + } + + /* allocate address and map */ + vm.flags = VM_ALLOC; + vm.size = num_possible_cpus() * PMD_SIZE; + vm_area_register_early(&vm, PMD_SIZE); + + for_each_possible_cpu(cpu) { + pmd_t *pmd; + + pmd = populate_extra_pmd((unsigned long)vm.addr + + cpu * PMD_SIZE); + set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])), + PAGE_KERNEL_LARGE)); + } + + /* we're ready, commit */ + pr_info("PERCPU: Remapped at %p with large pages, static data " + "%zu bytes\n", vm.addr, static_size); + + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, + pcpur_size - static_size, vm.addr, NULL); + goto out_free_ar; + +enomem: + for_each_possible_cpu(cpu) + if (pcpur_ptrs[cpu]) + free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpur_ptrs), ptrs_size); + return ret; +} +#else +static ssize_t __init setup_pcpu_remap(size_t static_size) +{ + return -EINVAL; +} +#endif + /* * Embedding allocator * @@ -259,8 +386,14 @@ void __init setup_per_cpu_areas(void) pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); - /* allocate percpu area */ - ret = setup_pcpu_embed(static_size); + /* + * Allocate percpu area. If PSE is supported, try to make use + * of large page mappings. Please read comments on top of + * each allocator for details. + */ + ret = setup_pcpu_remap(static_size); + if (ret < 0) + ret = setup_pcpu_embed(static_size); if (ret < 0) ret = setup_pcpu_4k(static_size); if (ret < 0) From 40150d37be7f7949b2ec07d511244da856647d84 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 12:32:28 +0900 Subject: [PATCH 038/580] percpu: add __read_mostly to variables which are mostly read only Most global variables in percpu allocator are initialized during boot and read only from that point on. Add __read_mostly as per Rusty's suggestion. Signed-off-by: Tejun Heo Cc: Rusty Russell --- mm/percpu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 9ac01980cce0..5954e7a9eb1e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -83,18 +83,18 @@ struct pcpu_chunk { struct page *page[]; /* #cpus * UNIT_PAGES */ }; -static int pcpu_unit_pages; -static int pcpu_unit_size; -static int pcpu_chunk_size; -static int pcpu_nr_slots; -static size_t pcpu_chunk_struct_size; +static int pcpu_unit_pages __read_mostly; +static int pcpu_unit_size __read_mostly; +static int pcpu_chunk_size __read_mostly; +static int pcpu_nr_slots __read_mostly; +static size_t pcpu_chunk_struct_size __read_mostly; /* the address of the first chunk which starts with the kernel static area */ -void *pcpu_base_addr; +void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); /* the size of kernel static area */ -static int pcpu_static_size; +static int pcpu_static_size __read_mostly; /* * One mutex to rule them all. @@ -112,7 +112,7 @@ static int pcpu_static_size; */ static DEFINE_MUTEX(pcpu_mutex); -static struct list_head *pcpu_slot; /* chunk list slots */ +static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ static int __pcpu_size_to_slot(int size) From 0af98d37e85e6958eb84987b1f60da3b54008317 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Sun, 8 Feb 2009 16:44:42 +0100 Subject: [PATCH 039/580] [WATCHDOG] rc32434_wdt: fix watchdog driver The existing driver code wasn't working. Neither the timeout was set correctly, nor system reset was being triggered, as the driver seemed to keep the WDT alive himself. There was also some unnecessary code. Signed-off-by: Phil Sutter Signed-off-by: Wim Van Sebroeck Cc: stable --- drivers/watchdog/rc32434_wdt.c | 160 ++++++++++++++------------------- 1 file changed, 65 insertions(+), 95 deletions(-) diff --git a/drivers/watchdog/rc32434_wdt.c b/drivers/watchdog/rc32434_wdt.c index 57027f4653ce..f1f98cd757c6 100644 --- a/drivers/watchdog/rc32434_wdt.c +++ b/drivers/watchdog/rc32434_wdt.c @@ -34,104 +34,89 @@ #include #include -#define MAX_TIMEOUT 20 -#define RC32434_WDT_INTERVAL (15 * HZ) - -#define VERSION "0.2" +#define VERSION "0.3" static struct { - struct completion stop; - int running; - struct timer_list timer; - int queue; - int default_ticks; unsigned long inuse; } rc32434_wdt_device; static struct integ __iomem *wdt_reg; -static int ticks = 100 * HZ; static int expect_close; -static int timeout; + +/* Board internal clock speed in Hz, + * the watchdog timer ticks at. */ +extern unsigned int idt_cpu_freq; + +/* translate wtcompare value to seconds and vice versa */ +#define WTCOMP2SEC(x) (x / idt_cpu_freq) +#define SEC2WTCOMP(x) (x * idt_cpu_freq) + +/* Use a default timeout of 20s. This should be + * safe for CPU clock speeds up to 400MHz, as + * ((2 ^ 32) - 1) / (400MHz / 2) = 21s. */ +#define WATCHDOG_TIMEOUT 20 + +static int timeout = WATCHDOG_TIMEOUT; static int nowayout = WATCHDOG_NOWAYOUT; module_param(nowayout, int, 0); MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); +/* apply or and nand masks to data read from addr and write back */ +#define SET_BITS(addr, or, nand) \ + writel((readl(&addr) | or) & ~nand, &addr) static void rc32434_wdt_start(void) { - u32 val; + u32 or, nand; - if (!rc32434_wdt_device.inuse) { - writel(0, &wdt_reg->wtcount); + /* zero the counter before enabling */ + writel(0, &wdt_reg->wtcount); - val = RC32434_ERR_WRE; - writel(readl(&wdt_reg->errcs) | val, &wdt_reg->errcs); + /* don't generate a non-maskable interrupt, + * do a warm reset instead */ + nand = 1 << RC32434_ERR_WNE; + or = 1 << RC32434_ERR_WRE; - val = RC32434_WTC_EN; - writel(readl(&wdt_reg->wtc) | val, &wdt_reg->wtc); - } - rc32434_wdt_device.running++; + /* reset the ERRCS timeout bit in case it's set */ + nand |= 1 << RC32434_ERR_WTO; + + SET_BITS(wdt_reg->errcs, or, nand); + + /* reset WTC timeout bit and enable WDT */ + nand = 1 << RC32434_WTC_TO; + or = 1 << RC32434_WTC_EN; + + SET_BITS(wdt_reg->wtc, or, nand); } static void rc32434_wdt_stop(void) { - u32 val; + /* Disable WDT */ + SET_BITS(wdt_reg->wtc, 0, 1 << RC32434_WTC_EN); +} - if (rc32434_wdt_device.running) { +static int rc32434_wdt_set(int new_timeout) +{ + int max_to = WTCOMP2SEC((u32)-1); - val = ~RC32434_WTC_EN; - writel(readl(&wdt_reg->wtc) & val, &wdt_reg->wtc); - - val = ~RC32434_ERR_WRE; - writel(readl(&wdt_reg->errcs) & val, &wdt_reg->errcs); - - rc32434_wdt_device.running = 0; + if (new_timeout < 0 || new_timeout > max_to) { + printk(KERN_ERR KBUILD_MODNAME + ": timeout value must be between 0 and %d", + max_to); + return -EINVAL; } -} - -static void rc32434_wdt_set(int new_timeout) -{ - u32 cmp = new_timeout * HZ; - u32 state, val; - timeout = new_timeout; - /* - * store and disable WTC - */ - state = (u32)(readl(&wdt_reg->wtc) & RC32434_WTC_EN); - val = ~RC32434_WTC_EN; - writel(readl(&wdt_reg->wtc) & val, &wdt_reg->wtc); + writel(SEC2WTCOMP(timeout), &wdt_reg->wtcompare); - writel(0, &wdt_reg->wtcount); - writel(cmp, &wdt_reg->wtcompare); - - /* - * restore WTC - */ - - writel(readl(&wdt_reg->wtc) | state, &wdt_reg); + return 0; } -static void rc32434_wdt_reset(void) +static void rc32434_wdt_ping(void) { - ticks = rc32434_wdt_device.default_ticks; -} - -static void rc32434_wdt_update(unsigned long unused) -{ - if (rc32434_wdt_device.running) - ticks--; - writel(0, &wdt_reg->wtcount); - - if (rc32434_wdt_device.queue && ticks) - mod_timer(&rc32434_wdt_device.timer, - jiffies + RC32434_WDT_INTERVAL); - else - complete(&rc32434_wdt_device.stop); } static int rc32434_wdt_open(struct inode *inode, struct file *file) @@ -142,19 +127,23 @@ static int rc32434_wdt_open(struct inode *inode, struct file *file) if (nowayout) __module_get(THIS_MODULE); + rc32434_wdt_start(); + rc32434_wdt_ping(); + return nonseekable_open(inode, file); } static int rc32434_wdt_release(struct inode *inode, struct file *file) { - if (expect_close && nowayout == 0) { + if (expect_close == 42) { rc32434_wdt_stop(); printk(KERN_INFO KBUILD_MODNAME ": disabling watchdog timer\n"); module_put(THIS_MODULE); - } else + } else { printk(KERN_CRIT KBUILD_MODNAME ": device closed unexpectedly. WDT will not stop !\n"); - + rc32434_wdt_ping(); + } clear_bit(0, &rc32434_wdt_device.inuse); return 0; } @@ -174,10 +163,10 @@ static ssize_t rc32434_wdt_write(struct file *file, const char *data, if (get_user(c, data + i)) return -EFAULT; if (c == 'V') - expect_close = 1; + expect_close = 42; } } - rc32434_wdt_update(0); + rc32434_wdt_ping(); return len; } return 0; @@ -197,11 +186,11 @@ static long rc32434_wdt_ioctl(struct file *file, unsigned int cmd, }; switch (cmd) { case WDIOC_KEEPALIVE: - rc32434_wdt_reset(); + rc32434_wdt_ping(); break; case WDIOC_GETSTATUS: case WDIOC_GETBOOTSTATUS: - value = readl(&wdt_reg->wtcount); + value = 0; if (copy_to_user(argp, &value, sizeof(int))) return -EFAULT; break; @@ -218,6 +207,7 @@ static long rc32434_wdt_ioctl(struct file *file, unsigned int cmd, break; case WDIOS_DISABLECARD: rc32434_wdt_stop(); + break; default: return -EINVAL; } @@ -225,11 +215,9 @@ static long rc32434_wdt_ioctl(struct file *file, unsigned int cmd, case WDIOC_SETTIMEOUT: if (copy_from_user(&new_timeout, argp, sizeof(int))) return -EFAULT; - if (new_timeout < 1) + if (rc32434_wdt_set(new_timeout)) return -EINVAL; - if (new_timeout > MAX_TIMEOUT) - return -EINVAL; - rc32434_wdt_set(new_timeout); + /* Fall through */ case WDIOC_GETTIMEOUT: return copy_to_user(argp, &timeout, sizeof(int)); default: @@ -262,7 +250,7 @@ static int rc32434_wdt_probe(struct platform_device *pdev) int ret; struct resource *r; - r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rb500_wdt_res"); + r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rb532_wdt_res"); if (!r) { printk(KERN_ERR KBUILD_MODNAME "failed to retrieve resources\n"); @@ -277,24 +265,12 @@ static int rc32434_wdt_probe(struct platform_device *pdev) } ret = misc_register(&rc32434_wdt_miscdev); - if (ret < 0) { printk(KERN_ERR KBUILD_MODNAME "failed to register watchdog device\n"); goto unmap; } - init_completion(&rc32434_wdt_device.stop); - rc32434_wdt_device.queue = 0; - - clear_bit(0, &rc32434_wdt_device.inuse); - - setup_timer(&rc32434_wdt_device.timer, rc32434_wdt_update, 0L); - - rc32434_wdt_device.default_ticks = ticks; - - rc32434_wdt_start(); - printk(banner, timeout); return 0; @@ -306,14 +282,8 @@ static int rc32434_wdt_probe(struct platform_device *pdev) static int rc32434_wdt_remove(struct platform_device *pdev) { - if (rc32434_wdt_device.queue) { - rc32434_wdt_device.queue = 0; - wait_for_completion(&rc32434_wdt_device.stop); - } misc_deregister(&rc32434_wdt_miscdev); - iounmap(wdt_reg); - return 0; } From d9a8798c4bab5ccd40e45e011f668099cfb3eb83 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Sun, 8 Feb 2009 16:44:42 +0100 Subject: [PATCH 040/580] [WATCHDOG] rc32434_wdt: fix sections Fix init and exit sections. Signed-off-by: Phil Sutter Signed-off-by: Wim Van Sebroeck Cc: stable --- drivers/watchdog/rc32434_wdt.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/watchdog/rc32434_wdt.c b/drivers/watchdog/rc32434_wdt.c index f1f98cd757c6..f3553fa40b17 100644 --- a/drivers/watchdog/rc32434_wdt.c +++ b/drivers/watchdog/rc32434_wdt.c @@ -34,7 +34,7 @@ #include #include -#define VERSION "0.3" +#define VERSION "0.4" static struct { unsigned long inuse; @@ -242,10 +242,10 @@ static struct miscdevice rc32434_wdt_miscdev = { .fops = &rc32434_wdt_fops, }; -static char banner[] = KERN_INFO KBUILD_MODNAME +static char banner[] __devinitdata = KERN_INFO KBUILD_MODNAME ": Watchdog Timer version " VERSION ", timer margin: %d sec\n"; -static int rc32434_wdt_probe(struct platform_device *pdev) +static int __devinit rc32434_wdt_probe(struct platform_device *pdev) { int ret; struct resource *r; @@ -280,7 +280,7 @@ static int rc32434_wdt_probe(struct platform_device *pdev) return ret; } -static int rc32434_wdt_remove(struct platform_device *pdev) +static int __devexit rc32434_wdt_remove(struct platform_device *pdev) { misc_deregister(&rc32434_wdt_miscdev); iounmap(wdt_reg); @@ -289,8 +289,8 @@ static int rc32434_wdt_remove(struct platform_device *pdev) static struct platform_driver rc32434_wdt = { .probe = rc32434_wdt_probe, - .remove = rc32434_wdt_remove, - .driver = { + .remove = __devexit_p(rc32434_wdt_remove), + .driver = { .name = "rc32434_wdt", } }; From c8532db7f2661b63f658b9a08cf4053a3e6abb78 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 24 Feb 2009 15:55:48 +0100 Subject: [PATCH 041/580] [ARM] 5411/1: S3C64XX: Fix EINT unmask Currently the unmask function for EINT interrupts was setting the mask bit rather than clearing it. This was also previously reported and fixed by Kyungmin Park and others. Acked-By: Ben Dooks Signed-off-by: Mark Brown Signed-off-by: Russell King --- arch/arm/plat-s3c64xx/irq-eint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/plat-s3c64xx/irq-eint.c b/arch/arm/plat-s3c64xx/irq-eint.c index 1f7cc0067f5c..ebb305ce7689 100644 --- a/arch/arm/plat-s3c64xx/irq-eint.c +++ b/arch/arm/plat-s3c64xx/irq-eint.c @@ -55,7 +55,7 @@ static void s3c_irq_eint_unmask(unsigned int irq) u32 mask; mask = __raw_readl(S3C64XX_EINT0MASK); - mask |= eint_irq_to_bit(irq); + mask &= ~eint_irq_to_bit(irq); __raw_writel(mask, S3C64XX_EINT0MASK); } From 41fdff322e26c4a86fe65cf577f2556a650cb7bc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:30 +0100 Subject: [PATCH 042/580] x86, mce, cmci: export MAX_NR_BANKS Impact: Cleanup (code movement) Move MAX_NR_BANKS into mce.h because it's needed there for followup patches. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 6 ++++++ arch/x86/kernel/cpu/mcheck/mce_64.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 225cdb5d2bfc..39136c497c5e 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -95,6 +95,12 @@ void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, device_mce); extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); +/* + * To support more than 128 would need to escape the predefined + * Linux defined extended banks first. + */ +#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) + #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index a4a7c686ce90..39f8bb525a74 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -37,12 +37,6 @@ #define MISC_MCELOG_MINOR 227 -/* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. - */ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) - atomic_t mce_entry; static int mce_dont_init; From b276268631af3a1b0df871e10d19d492f0513d4b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:31 +0100 Subject: [PATCH 043/580] x86, mce, cmci: factor out threshold interrupt handler Impact: cleanup; preparation for feature The mce_amd_64 code has an own private MC threshold vector with an own interrupt handler. Since Intel needs a similar handler it makes sense to share the vector because both can not be active at the same time. I factored the common APIC handler code into a separate file which can be used by both the Intel or AMD MC code. This is needed for the next patch which adds an Intel specific CMCI handler. This patch should be a nop for AMD, it just moves some code around. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 5 +++++ arch/x86/include/asm/mce.h | 2 ++ arch/x86/kernel/cpu/mcheck/Makefile | 1 + arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 15 ++++++--------- arch/x86/kernel/cpu/mcheck/threshold.c | 24 ++++++++++++++++++++++++ 5 files changed, 38 insertions(+), 9 deletions(-) create mode 100644 arch/x86/kernel/cpu/mcheck/threshold.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9c39095b33fc..52d7013785fe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -751,6 +751,11 @@ config X86_MCE_AMD Additional support for AMD specific MCE features such as the DRAM Error Threshold. +config X86_MCE_THRESHOLD + depends on X86_MCE_AMD || X86_MCE_INTEL + bool + default y + config X86_MCE_NONFATAL tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" depends on X86_32 && X86_MCE diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 39136c497c5e..125cd8714622 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -135,5 +135,7 @@ extern void mcheck_init(struct cpuinfo_x86 *c); #define mcheck_init(c) do { } while (0) #endif +extern void (*mce_threshold_vector)(void); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index d7d2323bbb69..b2f89829bbe8 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd_64.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o +obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index e82c8208b81e..49705be98209 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = { static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ +static void amd_threshold_interrupt(void); + /* * CPU Initialization */ @@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) tr.reset = 0; tr.old_limit = 0; threshold_restart_bank(&tr); + + mce_threshold_vector = amd_threshold_interrupt; } } } @@ -187,16 +191,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) * the interrupt goes off when error_count reaches threshold_limit. * the handler will simply log mcelog w/ software defined bank number. */ -asmlinkage void mce_threshold_interrupt(void) +static void amd_threshold_interrupt(void) { unsigned int bank, block; struct mce m; u32 low = 0, high = 0, address = 0; - ack_APIC_irq(); - exit_idle(); - irq_enter(); - mce_setup(&m); /* assume first bank caused it */ @@ -241,13 +241,10 @@ asmlinkage void mce_threshold_interrupt(void) + bank * NR_BLOCKS + block; mce_log(&m); - goto out; + return; } } } -out: - inc_irq_stat(irq_threshold_count); - irq_exit(); } /* diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c new file mode 100644 index 000000000000..4319142413d7 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -0,0 +1,24 @@ +/* Common corrected MCE threshold handler code */ +#include +#include +#include +#include +#include + +static void default_threshold_interrupt(void) +{ + printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); +} + +void (*mce_threshold_vector)(void) = default_threshold_interrupt; + +asmlinkage void mce_threshold_interrupt(void) +{ + ack_APIC_irq(); + exit_idle(); + irq_enter(); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); + irq_exit(); +} From f9695df42cdbca78530b4458c38ecfdd0bb90079 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:32 +0100 Subject: [PATCH 044/580] x86, mce, cmci: avoid potential reentry of threshold interrupt Impact: minor bugfix The threshold handler on AMD (and soon on Intel) could be theoretically reentered by the hardware. This could lead to corrupted events because the machine check poll code assumes it is not reentered. Move the APIC ACK to the end of the interrupt handler to let the hardware avoid that. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/threshold.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 4319142413d7..e4b8a3833fc5 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -15,10 +15,11 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt; asmlinkage void mce_threshold_interrupt(void) { - ack_APIC_irq(); exit_idle(); irq_enter(); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); irq_exit(); + /* Ack only at the end to avoid potential reentry */ + ack_APIC_irq(); } From 8457c84d68678cbfd4167a9073b89da58e48c037 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:33 +0100 Subject: [PATCH 045/580] x86, mce: replace machine check events logged interval with ratelimit Impact: behavior change, use common code Use a standard leaky bucket ratelimit for the machine check warning print interval instead of waiting every check_interval. Also decrease the limit to twice per minute. This interacts better with threshold interrupts because they can happen more often than check_interval. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_64.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 39f8bb525a74..9017609cadd9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -488,11 +489,11 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger); */ int mce_notify_user(void) { + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + clear_thread_flag(TIF_MCE_NOTIFY); if (test_and_clear_bit(0, ¬ify_user)) { - static unsigned long last_print; - unsigned long now = jiffies; - wake_up_interruptible(&mce_wait); /* @@ -503,10 +504,8 @@ int mce_notify_user(void) if (trigger[0] && !work_pending(&mce_trigger_work)) schedule_work(&mce_trigger_work); - if (time_after_eq(now, last_print + (check_interval*HZ))) { - last_print = now; + if (__ratelimit(&ratelimit)) printk(KERN_INFO "Machine check events logged\n"); - } return 1; } From ee031c31d6381d004bfd386c2e45821211507499 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:34 +0100 Subject: [PATCH 046/580] x86, mce, cmci: use polled banks bitmap in machine check poller Define a per cpu bitmap that contains the banks polled by the machine check poller. This is needed for the CMCI code in the next patches to be able to disable polling on specific banks. The bank by default contains all banks, so there is no behaviour change. Only future code will remove some banks from the polling set. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 5 ++++- arch/x86/kernel/cpu/mcheck/mce_64.c | 16 ++++++++++++---- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 3 ++- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 125cd8714622..9b9523699dbc 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -119,11 +119,14 @@ extern atomic_t mce_entry; extern void do_machine_check(struct pt_regs *, long); +typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); +DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); + enum mcp_flags { MCP_TIMESTAMP = (1 << 0), /* log time stamp */ MCP_UC = (1 << 1), /* log uncorrected errors */ }; -extern void machine_check_poll(enum mcp_flags flags); +extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); extern int mce_notify_user(void); diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 9017609cadd9..a8ff38bfa6ed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -62,6 +62,11 @@ static char *trigger_argv[2] = { trigger, NULL }; static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +/* MCA banks polled by the period polling timer for corrected events */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -191,7 +196,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) * * This is executed in standard interrupt context. */ -void machine_check_poll(enum mcp_flags flags) +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce m; int i; @@ -200,7 +205,7 @@ void machine_check_poll(enum mcp_flags flags) rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); for (i = 0; i < banks; i++) { - if (!bank[i]) + if (!bank[i] || !test_bit(i, *b)) continue; m.misc = 0; @@ -458,7 +463,8 @@ static void mcheck_timer(unsigned long data) WARN_ON(smp_processor_id() != data); if (mce_available(¤t_cpu_data)) - machine_check_poll(MCP_TIMESTAMP); + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -572,11 +578,13 @@ static void mce_init(void *dummy) { u64 cap; int i; + mce_banks_t all_banks; /* * Log the machine checks left over from the previous reset. */ - machine_check_poll(MCP_UC); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC, &all_banks); set_in_cr4(X86_CR4_MCE); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 49705be98209..ee8bfcd3aa32 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -231,7 +231,8 @@ static void amd_threshold_interrupt(void) /* Log the machine check that caused the threshold event. */ - machine_check_poll(MCP_TIMESTAMP); + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); From 03195c6b40f2b4db92545921daa7c3a19b4e4c32 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:35 +0100 Subject: [PATCH 047/580] x86, mce, cmci: define MSR names and fields for new CMCI registers Impact: New register definitions only CMCI means support for raising an interrupt on a corrected machine check event instead of having to poll for it. It's a new feature in Intel Nehalem CPUs available on some machine check banks. For details see the IA32 SDM Vol3a 14.5 Define the registers for it as a preparation for further patches. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/apicdef.h | 1 + arch/x86/include/asm/mce.h | 2 ++ arch/x86/include/asm/msr-index.h | 5 +++++ 3 files changed, 8 insertions(+) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 63134e31e8b9..bc9514fb3b13 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -53,6 +53,7 @@ #define APIC_ESR_SENDILL 0x00020 #define APIC_ESR_RECVILL 0x00040 #define APIC_ESR_ILLREGA 0x00080 +#define APIC_LVTCMCI 0x2f0 #define APIC_ICR 0x300 #define APIC_DEST_SELF 0x40000 #define APIC_DEST_ALLINC 0x80000 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9b9523699dbc..6fc5e07eca4f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -11,6 +11,8 @@ */ #define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ +#define MCG_EXT_P (1ULL<<9) /* Extended registers available */ +#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 358acc59ae04..2dbd2314139e 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -77,6 +77,11 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +/* These are consecutive and not in the normal 4er MCE bank block */ +#define MSR_IA32_MC0_CTL2 0x00000280 +#define CMCI_EN (1ULL << 30) +#define CMCI_THRESHOLD_MASK 0xffffULL + #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 From 88ccbedd9ca85d1aca6a6f99df48dce87b7c02d4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:36 +0100 Subject: [PATCH 048/580] x86, mce, cmci: add CMCI support Impact: Major new feature Intel CMCI (Corrected Machine Check Interrupt) is a new feature on Nehalem CPUs. It allows the CPU to trigger interrupts on corrected events, which allows faster reaction to them instead of with the traditional polling timer. Also use CMCI to discover shared banks. Machine check banks can be shared by CPU threads or even cores. Using the CMCI enable bit it is possible to detect the fact that another CPU already saw a specific bank. Use this to assign shared banks only to one CPU to avoid reporting duplicated events. On CPU hot unplug bank sharing is re discovered. This is done using a thread that cycles through all the CPUs. To avoid races between the poller and CMCI we only poll for banks that are not CMCI capable and only check CMCI owned banks on a interrupt. The shared banks ownership information is currently only used for CMCI interrupts, not polled banks. The sharing discovery code follows the algorithm recommended in the IA32 SDM Vol3a 14.5.2.1 The CMCI interrupt handler just calls the machine check poller to pick up the machine check event that caused the interrupt. I decided not to implement a separate threshold event like the AMD version has, because the threshold is always one currently and adding another event didn't seem to add any value. Some code inspired by Yunhong Jiang's Xen implementation, which was in term inspired by a earlier CMCI implementation by me. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 10 ++ arch/x86/kernel/cpu/mcheck/mce_64.c | 16 +- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 205 ++++++++++++++++++++++ 3 files changed, 228 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6fc5e07eca4f..563933e06a35 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -105,8 +105,16 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); +void cmci_clear(void); +void cmci_reenable(void); +void cmci_rediscover(int dying); +void cmci_recheck(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } +static inline void cmci_clear(void) {} +static inline void cmci_reenable(void) {} +static inline void cmci_rediscover(int dying) {} +static inline void cmci_recheck(void) {} #endif #ifdef CONFIG_X86_MCE_AMD @@ -115,6 +123,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c); static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif +extern int mce_available(struct cpuinfo_x86 *c); + void mce_log_therm_throt_event(__u64 status); extern atomic_t mce_entry; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index a8ff38bfa6ed..bfbd5323a635 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -166,7 +166,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) panic(msg); } -static int mce_available(struct cpuinfo_x86 *c) +int mce_available(struct cpuinfo_x86 *c) { if (mce_dont_init) return 0; @@ -1060,9 +1060,12 @@ static __cpuinit void mce_remove_device(unsigned int cpu) static void mce_disable_cpu(void *h) { int i; + unsigned long action = *(unsigned long *)h; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_clear(); for (i = 0; i < banks; i++) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } @@ -1070,9 +1073,12 @@ static void mce_disable_cpu(void *h) static void mce_reenable_cpu(void *h) { int i; + unsigned long action = *(unsigned long *)h; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) + cmci_reenable(); for (i = 0; i < banks; i++) wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); } @@ -1100,13 +1106,17 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: del_timer_sync(t); - smp_call_function_single(cpu, mce_disable_cpu, NULL, 1); + smp_call_function_single(cpu, mce_disable_cpu, &action, 1); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies_relative(jiffies + next_interval); add_timer_on(t, cpu); - smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1); + smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + break; + case CPU_POST_DEAD: + /* intentionally ignoring frozen here */ + cmci_rediscover(cpu); break; } return NOTIFY_OK; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 1b1491a76b55..a518ec8c6f89 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -1,6 +1,8 @@ /* * Intel specific MCE features. * Copyright 2004 Zwane Mwaikambo + * Copyright (C) 2008, 2009 Intel Corporation + * Author: Andi Kleen */ #include @@ -12,6 +14,7 @@ #include #include #include +#include asmlinkage void smp_thermal_interrupt(void) { @@ -84,7 +87,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) return; } +/* + * Support for Intel Correct Machine Check Interrupts. This allows + * the CPU to raise an interrupt when a corrected machine check happened. + * Normally we pick those up using a regular polling timer. + * Also supports reliable discovery of shared banks. + */ + +static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); + +/* + * cmci_discover_lock protects against parallel discovery attempts + * which could race against each other. + */ +static DEFINE_SPINLOCK(cmci_discover_lock); + +#define CMCI_THRESHOLD 1 + +static __cpuinit int cmci_supported(int *banks) +{ + u64 cap; + + /* + * Vendor check is not strictly needed, but the initial + * initialization is vendor keyed and this + * makes sure none of the backdoors are entered otherwise. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + if (!cpu_has_apic || lapic_get_maxlvt() < 6) + return 0; + rdmsrl(MSR_IA32_MCG_CAP, cap); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + return !!(cap & MCG_CMCI_P); +} + +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + mce_notify_user(); +} + +static void print_update(char *type, int *hdr, int num) +{ + if (*hdr == 0) + printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); + *hdr = 1; + printk(KERN_CONT " %s:%d", type, num); +} + +/* + * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks + * on this CPU. Use the algorithm recommended in the SDM to discover shared + * banks. + */ +static __cpuinit void cmci_discover(int banks, int boot) +{ + unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); + int hdr = 0; + int i; + + spin_lock(&cmci_discover_lock); + for (i = 0; i < banks; i++) { + u64 val; + + if (test_bit(i, owned)) + continue; + + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Already owned by someone else? */ + if (val & CMCI_EN) { + if (test_and_clear_bit(i, owned) || boot) + print_update("SHD", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + continue; + } + + val |= CMCI_EN | CMCI_THRESHOLD; + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + + /* Did the enable bit stick? -- the bank supports CMCI */ + if (val & CMCI_EN) { + if (!test_and_set_bit(i, owned) || boot) + print_update("CMCI", &hdr, i); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + } else { + WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); + } + } + spin_unlock(&cmci_discover_lock); + if (hdr) + printk(KERN_CONT "\n"); +} + +/* + * Just in case we missed an event during initialization check + * all the CMCI owned banks. + */ +__cpuinit void cmci_recheck(void) +{ + unsigned long flags; + int banks; + + if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) + return; + local_irq_save(flags); + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + local_irq_restore(flags); +} + +/* + * Disable CMCI on this CPU for all banks it owns when it goes down. + * This allows other CPUs to claim the banks on rediscovery. + */ +void __cpuexit cmci_clear(void) +{ + int i; + int banks; + u64 val; + + if (!cmci_supported(&banks)) + return; + spin_lock(&cmci_discover_lock); + for (i = 0; i < banks; i++) { + if (!test_bit(i, __get_cpu_var(mce_banks_owned))) + continue; + /* Disable CMCI */ + rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + __clear_bit(i, __get_cpu_var(mce_banks_owned)); + } + spin_unlock(&cmci_discover_lock); +} + +/* + * After a CPU went down cycle through all the others and rediscover + * Must run in process context. + */ +void __cpuexit cmci_rediscover(int dying) +{ + int banks; + int cpu; + cpumask_var_t old; + + if (!cmci_supported(&banks)) + return; + if (!alloc_cpumask_var(&old, GFP_KERNEL)) + return; + cpumask_copy(old, ¤t->cpus_allowed); + + for_each_online_cpu (cpu) { + if (cpu == dying) + continue; + if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu))) + continue; + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks, 0); + } + + set_cpus_allowed_ptr(current, old); + free_cpumask_var(old); +} + +/* + * Reenable CMCI on this CPU in case a CPU down failed. + */ +void cmci_reenable(void) +{ + int banks; + if (cmci_supported(&banks)) + cmci_discover(banks, 0); +} + +static __cpuinit void intel_init_cmci(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + mce_threshold_vector = intel_threshold_interrupt; + cmci_discover(banks, 1); + /* + * For CPU #0 this runs with still disabled APIC, but that's + * ok because only the vector is set up. We still do another + * check for the banks later for CPU #0 just to make sure + * to not miss any events. + */ + apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); + cmci_recheck(); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); + intel_init_cmci(); } From df20e2eb3e59b8625021a1bc8b1b53a4edc6008b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 24 Feb 2009 13:19:02 -0800 Subject: [PATCH 049/580] x86, mce, cmci: remove incorrect __cpuinit/__cpuexit annotations Impact: Bug fix on UP The MCE code is reinitialized from resume, so we can't use __cpuinit/__cpuexit for most of the code. Remove those annotations for anything downstream of mce_init(). Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index a518ec8c6f89..7a2e10fcfa34 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -104,7 +104,7 @@ static DEFINE_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 -static __cpuinit int cmci_supported(int *banks) +static int cmci_supported(int *banks) { u64 cap; @@ -147,7 +147,7 @@ static void print_update(char *type, int *hdr, int num) * on this CPU. Use the algorithm recommended in the SDM to discover shared * banks. */ -static __cpuinit void cmci_discover(int banks, int boot) +static void cmci_discover(int banks, int boot) { unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); int hdr = 0; @@ -192,7 +192,7 @@ static __cpuinit void cmci_discover(int banks, int boot) * Just in case we missed an event during initialization check * all the CMCI owned banks. */ -__cpuinit void cmci_recheck(void) +void cmci_recheck(void) { unsigned long flags; int banks; @@ -208,7 +208,7 @@ __cpuinit void cmci_recheck(void) * Disable CMCI on this CPU for all banks it owns when it goes down. * This allows other CPUs to claim the banks on rediscovery. */ -void __cpuexit cmci_clear(void) +void cmci_clear(void) { int i; int banks; @@ -233,7 +233,7 @@ void __cpuexit cmci_clear(void) * After a CPU went down cycle through all the others and rediscover * Must run in process context. */ -void __cpuexit cmci_rediscover(int dying) +void cmci_rediscover(int dying) { int banks; int cpu; From 5ca8681ca10f671427710f4954644359856581a3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:37 +0100 Subject: [PATCH 050/580] x86, mce, cmci: disable CMCI on rebooting Impact: Avoids confusing other OSes. Disable the CMCI vector on reboot to avoid confusing other OS. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 570f36e44e59..648676f0b50a 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -868,6 +868,14 @@ void clear_local_APIC(void) apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) { + v = apic_read(APIC_LVTCMCI); + if (!(v & APIC_LVT_MASKED)) + apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); + } +#endif + /* * Clean APIC state for other OSs: */ From be71b8553d0522aba535a815baaebb1f0bb9a9ec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Feb 2009 13:49:38 +0100 Subject: [PATCH 051/580] x86, mce, cmci: recheck CMCI banks after APIC has been enabled on CPU #0 Impact: Fix marginal race condition One the first CPU the machine checks are enabled early before the local APIC is enabled. This could in theory lead to some lost CMCI events very early during boot because CMCIs cannot be delivered with disabled LAPIC. The poller also doesn't recover from this because it doesn't check CMCI banks. Add an explicit CMCI banks check after the LAPIC is enabled. This is only done for CPU #0, the other CPUs only initialize machine checks after the LAPIC is on. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 648676f0b50a..57b53774d986 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -1270,6 +1271,12 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_LVT1, value); preempt_enable(); + +#ifdef CONFIG_X86_MCE_INTEL + /* Recheck CMCI information after local APIC is up on CPU #0 */ + if (smp_processor_id() == 0) + cmci_recheck(); +#endif } void __cpuinit end_local_APIC_setup(void) From 24ff954233ecfd45801383f831626f88937ebe6f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Feb 2009 10:38:10 +0900 Subject: [PATCH 052/580] x86, percpu: fix minor bugs in setup_percpu.c Recent changes in setup_percpu.c made a now meaningless DBG() statement fail to compile and introduced a comparison-of-different-types warning. Fix them. Compile failure is reported by Ingo Molnar. Signed-off-by: Tejun Heo Reported-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2d946a8f78b9..c29f301d3885 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -270,7 +270,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) /* allocate and copy */ pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); - pcpue_unit_size = max(pcpue_unit_size, PCPU_MIN_UNIT_SIZE); + pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE); pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, PAGE_SIZE); if (!pcpue_ptr) @@ -438,8 +438,6 @@ void __init setup_per_cpu_areas(void) */ if (cpu == boot_cpu_id) switch_to_new_gdt(cpu); - - DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } /* indicate the early static arrays will soon be gone */ From d325100504f1d0c296a1fbfef558deaa655e2240 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Feb 2009 11:01:40 +0900 Subject: [PATCH 053/580] x86: convert cacheflush macros inline functions Impact: cleanup Unused macro parameters cause spurious unused variable warnings. Convert all cacheflush macros to inline functions to avoid the warnings and achieve better type checking. Signed-off-by: Tejun Heo --- arch/x86/include/asm/cacheflush.h | 53 +++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 2f8466540fb5..5b301b7ff5f4 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -5,24 +5,43 @@ #include /* Caches aren't brain-dead on the intel. */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_range(start, end) do { } while (0) -#define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) +static inline void flush_cache_all(void) { } +static inline void flush_cache_mm(struct mm_struct *mm) { } +static inline void flush_cache_dup_mm(struct mm_struct *mm) { } +static inline void flush_cache_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { } +static inline void flush_cache_page(struct vm_area_struct *vma, + unsigned long vmaddr, unsigned long pfn) { } +static inline void flush_dcache_page(struct page *page) { } +static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } +static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } +static inline void flush_icache_range(unsigned long start, + unsigned long end) { } +static inline void flush_icache_page(struct vm_area_struct *vma, + struct page *page) { } +static inline void flush_icache_user_range(struct vm_area_struct *vma, + struct page *page, + unsigned long addr, + unsigned long len) { } +static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } +static inline void flush_cache_vunmap(unsigned long start, + unsigned long end) { } -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - memcpy((dst), (src), (len)) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy((dst), (src), (len)) +static inline void copy_to_user_page(struct vm_area_struct *vma, + struct page *page, unsigned long vaddr, + void *dst, const void *src, + unsigned long len) +{ + memcpy(dst, src, len); +} + +static inline void copy_from_user_page(struct vm_area_struct *vma, + struct page *page, unsigned long vaddr, + void *dst, const void *src, + unsigned long len) +{ + memcpy(dst, src, len); +} #define PG_non_WB PG_arch_1 PAGEFLAG(NonWB, non_WB) From 0dcec8c27ba44cd11c6e68c46d5fd553818a3837 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Feb 2009 14:07:33 +0100 Subject: [PATCH 054/580] alloc_percpu: add align argument to __alloc_percpu, fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: build fix API was changed, but not all usage sites were converted: net/ipv4/route.c: In function ‘ip_rt_init’: net/ipv4/route.c:3379: error: too few arguments to function ‘__alloc_percpu’ Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Ingo Molnar --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 97f71153584f..bf895401218f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3376,7 +3376,7 @@ int __init ip_rt_init(void) int rc = 0; #ifdef CONFIG_NET_CLS_ROUTE - ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); + ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); #endif From d2b0261506602bd969164879206027b30358ffdf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Feb 2009 14:36:45 +0100 Subject: [PATCH 055/580] alloc_percpu: fix UP build Impact: build fix the !SMP branch had a 'gfp' leftover: include/linux/percpu.h: In function '__alloc_percpu': include/linux/percpu.h:160: error: 'gfp' undeclared (first use in this function) include/linux/percpu.h:160: error: (Each undeclared identifier is reported only once include/linux/percpu.h:160: error: for each function it appears in.) Use GFP_KERNEL like the SMP version does. Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Ingo Molnar --- include/linux/percpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 910beb0abea2..d8e5a9abbce0 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -157,7 +157,7 @@ static inline void *__alloc_percpu(size_t size, size_t align) * percpu sections on SMP for which this path isn't used. */ WARN_ON_ONCE(align > __alignof__(unsigned long long)); - return kzalloc(size, gfp); + return kzalloc(size, GFP_KERNEL); } static inline void free_percpu(void *p) From e317603694bfd17b28a40de9d65e1a4ec12f816e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Feb 2009 10:54:17 +0900 Subject: [PATCH 056/580] percpu: fix too low alignment restriction on UP UP __alloc_percpu() triggered WARN_ON_ONCE() if the requested alignment is larger than that of unsigned long long, which is too small for all the cacheline aligned allocations. Bump it up to SMP_CACHE_BYTES which kmalloc allocations generally guarantee. Signed-off-by: Tejun Heo Reported-by: Ingo Molnar --- include/linux/percpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d8e5a9abbce0..545b068bcb70 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -156,7 +156,7 @@ static inline void *__alloc_percpu(size_t size, size_t align) * on it. Larger alignment should only be used for module * percpu sections on SMP for which this path isn't used. */ - WARN_ON_ONCE(align > __alignof__(unsigned long long)); + WARN_ON_ONCE(align > SMP_CACHE_BYTES); return kzalloc(size, GFP_KERNEL); } From a682604838763981613e42015cd0e39f2989d6bb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Feb 2009 18:03:42 -0800 Subject: [PATCH 057/580] rcu: Teach RCU that idle task is not quiscent state at boot This patch fixes a bug located by Vegard Nossum with the aid of kmemcheck, updated based on review comments from Nick Piggin, Ingo Molnar, and Andrew Morton. And cleans up the variable-name and function-name language. ;-) The boot CPU runs in the context of its idle thread during boot-up. During this time, idle_cpu(0) will always return nonzero, which will fool Classic and Hierarchical RCU into deciding that a large chunk of the boot-up sequence is a big long quiescent state. This in turn causes RCU to prematurely end grace periods during this time. This patch changes the rcutree.c and rcuclassic.c rcu_check_callbacks() function to ignore the idle task as a quiescent state until the system has started up the scheduler in rest_init(), introducing a new non-API function rcu_idle_now_means_idle() to inform RCU of this transition. RCU maintains an internal rcu_idle_cpu_truthful variable to track this state, which is then used by rcu_check_callback() to determine if it should believe idle_cpu(). Because this patch has the effect of disallowing RCU grace periods during long stretches of the boot-up sequence, this patch also introduces Josh Triplett's UP-only optimization that makes synchronize_rcu() be a no-op if num_online_cpus() returns 1. This allows boot-time code that calls synchronize_rcu() to proceed normally. Note, however, that RCU callbacks registered by call_rcu() will likely queue up until later in the boot sequence. Although rcuclassic and rcutree can also use this same optimization after boot completes, rcupreempt must restrict its use of this optimization to the portion of the boot sequence before the scheduler starts up, given that an rcupreempt RCU read-side critical section may be preeempted. In addition, this patch takes Nick Piggin's suggestion to make the system_state global variable be __read_mostly. Changes since v4: o Changes the name of the introduced function and variable to be less emotional. ;-) Changes since v3: o WARN_ON(nr_context_switches() > 0) to verify that RCU switches out of boot-time mode before the first context switch, as suggested by Nick Piggin. Changes since v2: o Created rcu_blocking_is_gp() internal-to-RCU API that determines whether a call to synchronize_rcu() is itself a grace period. o The definition of rcu_blocking_is_gp() for rcuclassic and rcutree checks to see if but a single CPU is online. o The definition of rcu_blocking_is_gp() for rcupreempt checks to see both if but a single CPU is online and if the system is still in early boot. This allows rcupreempt to again work correctly if running on a single CPU after booting is complete. o Added check to rcupreempt's synchronize_sched() for there being but one online CPU. Tested all three variants both SMP and !SMP, booted fine, passed a short rcutorture test on both x86 and Power. Located-by: Vegard Nossum Tested-by: Vegard Nossum Tested-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/rcuclassic.h | 6 ++++++ include/linux/rcupdate.h | 4 ++++ include/linux/rcupreempt.h | 15 +++++++++++++++ include/linux/rcutree.h | 6 ++++++ init/main.c | 3 ++- kernel/rcuclassic.c | 4 ++-- kernel/rcupdate.c | 12 ++++++++++++ kernel/rcupreempt.c | 3 +++ kernel/rcutree.c | 4 ++-- 9 files changed, 52 insertions(+), 5 deletions(-) diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index f3f697df1d71..80044a4f3ab9 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -181,4 +181,10 @@ extern long rcu_batches_completed_bh(void); #define rcu_enter_nohz() do { } while (0) #define rcu_exit_nohz() do { } while (0) +/* A context switch is a grace period for rcuclassic. */ +static inline int rcu_blocking_is_gp(void) +{ + return num_online_cpus() == 1; +} + #endif /* __LINUX_RCUCLASSIC_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 921340a7b71c..528343e6da51 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -52,6 +52,9 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; +/* Internal to kernel, but needed by rcupreempt.h. */ +extern int rcu_scheduler_active; + #if defined(CONFIG_CLASSIC_RCU) #include #elif defined(CONFIG_TREE_RCU) @@ -265,6 +268,7 @@ extern void rcu_barrier_sched(void); /* Internal to kernel */ extern void rcu_init(void); +extern void rcu_scheduler_starting(void); extern int rcu_needs_cpu(int cpu); #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index 3e05c09b54a2..74304b4538d8 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -142,4 +142,19 @@ static inline void rcu_exit_nohz(void) #define rcu_exit_nohz() do { } while (0) #endif /* CONFIG_NO_HZ */ +/* + * A context switch is a grace period for rcupreempt synchronize_rcu() + * only during early boot, before the scheduler has been initialized. + * So, how the heck do we get a context switch? Well, if the caller + * invokes synchronize_rcu(), they are willing to accept a context + * switch, so we simply pretend that one happened. + * + * After boot, there might be a blocked or preempted task in an RCU + * read-side critical section, so we cannot then take the fastpath. + */ +static inline int rcu_blocking_is_gp(void) +{ + return num_online_cpus() == 1 && !rcu_scheduler_active; +} + #endif /* __LINUX_RCUPREEMPT_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d4368b7975c3..a722fb67bb2d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -326,4 +326,10 @@ static inline void rcu_exit_nohz(void) } #endif /* CONFIG_NO_HZ */ +/* A context switch is a grace period for rcutree. */ +static inline int rcu_blocking_is_gp(void) +{ + return num_online_cpus() == 1; +} + #endif /* __LINUX_RCUTREE_H */ diff --git a/init/main.c b/init/main.c index 844209453c02..83697e160b3a 100644 --- a/init/main.c +++ b/init/main.c @@ -97,7 +97,7 @@ static inline void mark_rodata_ro(void) { } extern void tc_init(void); #endif -enum system_states system_state; +enum system_states system_state __read_mostly; EXPORT_SYMBOL(system_state); /* @@ -463,6 +463,7 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); + rcu_scheduler_starting(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index bd5a9003497c..654c640a6b9c 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && rcu_scheduler_active && + !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index d92a76a881aa..cae8a059cf47 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -44,6 +44,7 @@ #include #include #include +#include enum rcu_barrier { RCU_BARRIER_STD, @@ -55,6 +56,7 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; +int rcu_scheduler_active __read_mostly; /* * Awaken the corresponding synchronize_rcu() instance now that a @@ -80,6 +82,10 @@ void wakeme_after_rcu(struct rcu_head *head) void synchronize_rcu(void) { struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu(&rcu.head, wakeme_after_rcu); @@ -175,3 +181,9 @@ void __init rcu_init(void) __rcu_init(); } +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 33cfc50781f9..5d59e850fb71 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1181,6 +1181,9 @@ void __synchronize_sched(void) { struct rcu_synchronize rcu; + if (num_online_cpus() == 1) + return; /* blocking is gp if only one CPU! */ + init_completion(&rcu.completion); /* Will wake me after RCU finished. */ call_rcu_sched(&rcu.head, wakeme_after_rcu); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b2fd602a6f6f..97ce31579ec0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp) void rcu_check_callbacks(int cpu, int user) { if (user || - (idle_cpu(cpu) && !in_softirq() && - hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + (idle_cpu(cpu) && rcu_scheduler_active && + !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { /* * Get here if this CPU took its interrupt from user From a760a6656e6f00bb0144a42a048cf0266646e22c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 26 Feb 2009 14:06:31 +0800 Subject: [PATCH 058/580] crypto: api - Fix module load deadlock with fallback algorithms With the mandatory algorithm testing at registration, we have now created a deadlock with algorithms requiring fallbacks. This can happen if the module containing the algorithm requiring fallback is loaded first, without the fallback module being loaded first. The system will then try to test the new algorithm, find that it needs to load a fallback, and then try to load that. As both algorithms share the same module alias, it can attempt to load the original algorithm again and block indefinitely. As algorithms requiring fallbacks are a special case, we can fix this by giving them a different module alias than the rest. Then it's just a matter of using the right aliases according to what algorithms we're trying to find. Signed-off-by: Herbert Xu --- arch/s390/crypto/aes_s390.c | 2 +- crypto/api.c | 15 +++++++++++++-- drivers/crypto/padlock-aes.c | 2 +- drivers/crypto/padlock-sha.c | 4 ++-- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index c42cd898f68b..6118890c946d 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -556,7 +556,7 @@ static void __exit aes_s390_fini(void) module_init(aes_s390_init); module_exit(aes_s390_fini); -MODULE_ALIAS("aes"); +MODULE_ALIAS("aes-all"); MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm"); MODULE_LICENSE("GPL"); diff --git a/crypto/api.c b/crypto/api.c index efe77df6863f..38a2bc02a98c 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -215,8 +215,19 @@ struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask) mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD); type &= mask; - alg = try_then_request_module(crypto_alg_lookup(name, type, mask), - name); + alg = crypto_alg_lookup(name, type, mask); + if (!alg) { + char tmp[CRYPTO_MAX_ALG_NAME]; + + request_module(name); + + if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask) && + snprintf(tmp, sizeof(tmp), "%s-all", name) < sizeof(tmp)) + request_module(tmp); + + alg = crypto_alg_lookup(name, type, mask); + } + if (alg) return crypto_is_larval(alg) ? crypto_larval_wait(alg) : alg; diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c index 856b3cc25583..3f0fdd18255d 100644 --- a/drivers/crypto/padlock-aes.c +++ b/drivers/crypto/padlock-aes.c @@ -489,4 +489,4 @@ MODULE_DESCRIPTION("VIA PadLock AES algorithm support"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Ludvig"); -MODULE_ALIAS("aes"); +MODULE_ALIAS("aes-all"); diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c index a7fbadebf623..a2c8e8514b63 100644 --- a/drivers/crypto/padlock-sha.c +++ b/drivers/crypto/padlock-sha.c @@ -304,7 +304,7 @@ MODULE_DESCRIPTION("VIA PadLock SHA1/SHA256 algorithms support."); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michal Ludvig"); -MODULE_ALIAS("sha1"); -MODULE_ALIAS("sha256"); +MODULE_ALIAS("sha1-all"); +MODULE_ALIAS("sha256-all"); MODULE_ALIAS("sha1-padlock"); MODULE_ALIAS("sha256-padlock"); From cac64d00c256e65776d575e82aaf540632b66178 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 25 Feb 2009 09:59:26 -0800 Subject: [PATCH 059/580] sched_rt: don't start timer when rt bandwidth disabled Impact: fix incorrect condition check No need to start rt bandwidth timer when rt bandwidth is disabled. If this timer starts, it may stop at sched_rt_period_timer() on the first time. Signed-off-by: Hiroshi Shimamoto Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 410eec404133..c3baa9653d1d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) From 28fd2d397bab5c7fb0eed0c20b6766c99ae34a8f Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 12 Dec 2008 00:24:33 +0000 Subject: [PATCH 060/580] [ARM] S3C64XX: Set GPIO pin when select IRQ_EINT type Set the GPIO pin mode to external interrupt when configuring an IRQ_EINT's IRQ type. Signed-off-by: Ben Dooks --- arch/arm/plat-s3c64xx/irq-eint.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm/plat-s3c64xx/irq-eint.c b/arch/arm/plat-s3c64xx/irq-eint.c index 1f7cc0067f5c..5d62f2799ea9 100644 --- a/arch/arm/plat-s3c64xx/irq-eint.c +++ b/arch/arm/plat-s3c64xx/irq-eint.c @@ -14,12 +14,15 @@ #include #include +#include #include #include #include #include +#include +#include #include #include @@ -74,6 +77,7 @@ static void s3c_irq_eint_maskack(unsigned int irq) static int s3c_irq_eint_set_type(unsigned int irq, unsigned int type) { int offs = eint_offset(irq); + int pin; int shift; u32 ctrl, mask; u32 newvalue = 0; @@ -125,6 +129,15 @@ static int s3c_irq_eint_set_type(unsigned int irq, unsigned int type) ctrl |= newvalue << shift; __raw_writel(ctrl, reg); + /* set the GPIO pin appropriately */ + + if (offs < 23) + pin = S3C64XX_GPN(offs); + else + pin = S3C64XX_GPM(offs - 23); + + s3c_gpio_cfgpin(pin, S3C_GPIO_SFN(2)); + return 0; } From 789b4ad36c7037758309df98f1efa65d0c26527d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 26 Jan 2009 19:12:01 +0000 Subject: [PATCH 061/580] [ARM] S3C64XX: Fix section mismatch for s3c64xx_register_clocks() Signed-off-by: Mark Brown Signed-off-by: Ben Dooks --- arch/arm/plat-s3c64xx/clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/plat-s3c64xx/clock.c b/arch/arm/plat-s3c64xx/clock.c index 136c982c68e1..ad1b9682c9c3 100644 --- a/arch/arm/plat-s3c64xx/clock.c +++ b/arch/arm/plat-s3c64xx/clock.c @@ -248,7 +248,7 @@ static struct clk *clks[] __initdata = { &clk_48m, }; -void s3c64xx_register_clocks(void) +void __init s3c64xx_register_clocks(void) { struct clk *clkp; int ret; From 778974797793e7dcd42e3701e09c686b6419cb35 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 23 Jan 2009 16:29:41 +0000 Subject: [PATCH 062/580] [ARM] SMDK6410: Correct I2C device name for WM8580 The WM8580 driver registers itself as "wm8580" rather than "WM8580". Signed-off-by: Mark Brown Signed-off-by: Ben Dooks --- arch/arm/mach-s3c6410/mach-smdk6410.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-s3c6410/mach-smdk6410.c b/arch/arm/mach-s3c6410/mach-smdk6410.c index 3c4d47145c83..0436783c87d4 100644 --- a/arch/arm/mach-s3c6410/mach-smdk6410.c +++ b/arch/arm/mach-s3c6410/mach-smdk6410.c @@ -146,7 +146,7 @@ static struct platform_device *smdk6410_devices[] __initdata = { static struct i2c_board_info i2c_devs0[] __initdata = { { I2C_BOARD_INFO("24c08", 0x50), }, - { I2C_BOARD_INFO("WM8580", 0X1b), }, + { I2C_BOARD_INFO("wm8580", 0x1b), }, }; static struct i2c_board_info i2c_devs1[] __initdata = { From 027191a8c6602d3fb6dde3820517339e1daf191d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 23 Jan 2009 16:29:43 +0000 Subject: [PATCH 063/580] [ARM] SMDK6410: Declare iodesc table static Shuts up a warning. Signed-off-by: Mark Brown Signed-off-by: Ben Dooks --- arch/arm/mach-s3c6410/mach-smdk6410.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-s3c6410/mach-smdk6410.c b/arch/arm/mach-s3c6410/mach-smdk6410.c index 0436783c87d4..25f7935576f8 100644 --- a/arch/arm/mach-s3c6410/mach-smdk6410.c +++ b/arch/arm/mach-s3c6410/mach-smdk6410.c @@ -129,7 +129,7 @@ static struct s3c_fb_platdata smdk6410_lcd_pdata __initdata = { .vidcon1 = VIDCON1_INV_HSYNC | VIDCON1_INV_VSYNC, }; -struct map_desc smdk6410_iodesc[] = {}; +static struct map_desc smdk6410_iodesc[] = {}; static struct platform_device *smdk6410_devices[] __initdata = { #ifdef CONFIG_SMDK6410_SD_CH0 From 8bd8dbdf3725ce569467bd704840249869f626d6 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 23 Jan 2009 16:29:44 +0000 Subject: [PATCH 064/580] [ARM] S3C64XX: Staticise s3c64xx_init_irq_eint() It's an initcall and does not need to be exported. Signed-off-by: Mark Brown Signed-off-by: Ben Dooks --- arch/arm/plat-s3c64xx/irq-eint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/plat-s3c64xx/irq-eint.c b/arch/arm/plat-s3c64xx/irq-eint.c index 5d62f2799ea9..8d1b7290aa13 100644 --- a/arch/arm/plat-s3c64xx/irq-eint.c +++ b/arch/arm/plat-s3c64xx/irq-eint.c @@ -194,7 +194,7 @@ static void s3c_irq_demux_eint20_27(unsigned int irq, struct irq_desc *desc) s3c_irq_demux_eint(20, 27); } -int __init s3c64xx_init_irq_eint(void) +static int __init s3c64xx_init_irq_eint(void) { int irq; From 24d4076734b4ecf083a6be611040fe0743e59989 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 23 Jan 2009 17:06:23 +0000 Subject: [PATCH 065/580] [ARM] S3C64XX: Do gpiolib configuration earlier arch_initcall() runs after the machine init function which means that any configuration of GPIO pins must currently be done later on, for example in callbacks from drivers. Move the initialisation earlier in order to allow machines to configure GPIOs directly in their init functions rather than having to have a callback invoked later on. Some other ARM platforms use this method. Other solutions for this include providing a special interface for setting up GPIOs en masse, adding callbacks to do the GPIO configuration from devices and doing the GPIO configuration implicitly. Signed-off-by: Mark Brown Signed-off-by: Ben Dooks --- arch/arm/plat-s3c64xx/gpiolib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/plat-s3c64xx/gpiolib.c b/arch/arm/plat-s3c64xx/gpiolib.c index cc62941d7b5c..ee9188add8fb 100644 --- a/arch/arm/plat-s3c64xx/gpiolib.c +++ b/arch/arm/plat-s3c64xx/gpiolib.c @@ -417,4 +417,4 @@ static __init int s3c64xx_gpiolib_init(void) return 0; } -arch_initcall(s3c64xx_gpiolib_init); +core_initcall(s3c64xx_gpiolib_init); From 4271c3bd46a0f0448118cf618c7210237a98e6bf Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 26 Feb 2009 23:00:27 +0000 Subject: [PATCH 066/580] [ARM] S3C64XX: Rename IRQ_UHOST to IRQ_USBH The USB OHCI host device expects the IRQ definition to be named IRQ_USBH, so rename the S3C64XX IRQ header to match. Signed-off-by: Ben Dooks Signed-off-by: Ben Dooks --- arch/arm/plat-s3c64xx/include/plat/irqs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/plat-s3c64xx/include/plat/irqs.h b/arch/arm/plat-s3c64xx/include/plat/irqs.h index 2846f550b727..f865bf4d709e 100644 --- a/arch/arm/plat-s3c64xx/include/plat/irqs.h +++ b/arch/arm/plat-s3c64xx/include/plat/irqs.h @@ -117,7 +117,7 @@ #define IRQ_ONENAND1 S3C64XX_IRQ_VIC1(12) #define IRQ_NFC S3C64XX_IRQ_VIC1(13) #define IRQ_CFCON S3C64XX_IRQ_VIC1(14) -#define IRQ_UHOST S3C64XX_IRQ_VIC1(15) +#define IRQ_USBH S3C64XX_IRQ_VIC1(15) #define IRQ_SPI0 S3C64XX_IRQ_VIC1(16) #define IRQ_SPI1 S3C64XX_IRQ_VIC1(17) #define IRQ_IIC S3C64XX_IRQ_VIC1(18) From 19c5957081a6ffbf014e6c944e6aafee3c1632c3 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 26 Feb 2009 23:00:33 +0000 Subject: [PATCH 067/580] [ARM] S3C64XX: Fix name of USB host clock. The usb-host-bus clock should be named usb-bus-host. Signed-off-by: Ben Dooks Signed-off-by: Ben Dooks --- arch/arm/plat-s3c64xx/s3c6400-clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/plat-s3c64xx/s3c6400-clock.c b/arch/arm/plat-s3c64xx/s3c6400-clock.c index 8d9a0cada668..a2f526bcb389 100644 --- a/arch/arm/plat-s3c64xx/s3c6400-clock.c +++ b/arch/arm/plat-s3c64xx/s3c6400-clock.c @@ -351,7 +351,7 @@ static struct clksrc_clk clk_mmc2 = { static struct clksrc_clk clk_usbhost = { .clk = { - .name = "usb-host-bus", + .name = "usb-bus-host", .id = -1, .ctrlbit = S3C_CLKCON_SCLK_UHOST, .enable = s3c64xx_sclk_ctrl, From 41ba41d7c7e1d2a3c9cdfe16c1ee9f7af4693ae2 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 26 Feb 2009 23:00:34 +0000 Subject: [PATCH 068/580] [ARM] S3C64XX: Fix USB host clock mux list The clock list for the USB host bus clock was in the wrong order, move clk_48m to position 0. Signed-off-by: Ben Dooks Signed-off-by: Ben Dooks --- arch/arm/plat-s3c64xx/s3c6400-clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/plat-s3c64xx/s3c6400-clock.c b/arch/arm/plat-s3c64xx/s3c6400-clock.c index a2f526bcb389..2984270b40a7 100644 --- a/arch/arm/plat-s3c64xx/s3c6400-clock.c +++ b/arch/arm/plat-s3c64xx/s3c6400-clock.c @@ -189,10 +189,10 @@ static struct clk_sources clkset_uart = { }; static struct clk *clkset_uhost_list[] = { + &clk_48m, &clk_mout_epll.clk, &clk_dout_mpll, &clk_fin_epll, - &clk_48m, }; static struct clk_sources clkset_uhost = { From 54e991242850edc8c53f71fa5aa3ba7a93ce38f5 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Fri, 27 Feb 2009 15:13:54 +0530 Subject: [PATCH 069/580] sched: don't allow setuid to succeed if the user does not have rt bandwidth Impact: fix hung task with certain (non-default) rt-limit settings Corey Hickey reported that on using setuid to change the uid of a rt process, the process would be unkillable and not be running. This is because there was no rt runtime for that user group. Add in a check to see if a user can attach an rt task to its task group. On failure, return EINVAL, which is also returned in CONFIG_CGROUP_SCHED. Reported-by: Corey Hickey Signed-off-by: Dhaval Giani Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/sched.c | 13 +++++++++++-- kernel/sys.c | 31 ++++++++++++++++++++----------- kernel/user.c | 18 ++++++++++++++++++ 4 files changed, 53 insertions(+), 13 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 8981e52c714f..8c216e057c94 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,9 +2291,13 @@ extern long sched_group_rt_runtime(struct task_group *tg); extern int sched_group_set_rt_period(struct task_group *tg, long rt_period_us); extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); #endif #endif +extern int task_can_switch_user(struct user_struct *up, + struct task_struct *tsk); + #ifdef CONFIG_TASK_XACCT static inline void add_rchar(struct task_struct *tsk, ssize_t amt) { diff --git a/kernel/sched.c b/kernel/sched.c index c3baa9653d1d..8e2558c2ba67 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9224,6 +9224,16 @@ static int sched_rt_global_constraints(void) return ret; } + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + #else /* !CONFIG_RT_GROUP_SCHED */ static int sched_rt_global_constraints(void) { @@ -9317,8 +9327,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) + if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ diff --git a/kernel/sys.c b/kernel/sys.c index f145c415bc16..37f458e6882a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -559,7 +559,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) abort_creds(new); return retval; } - + /* * change the user struct in a credentials set to match the new UID */ @@ -571,6 +571,11 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; + if (!task_can_switch_user(new_user, current)) { + free_uid(new_user); + return -EINVAL; + } + if (atomic_read(&new_user->processes) >= current->signal->rlim[RLIMIT_NPROC].rlim_cur && new_user != INIT_USER) { @@ -631,10 +636,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) goto error; } - retval = -EAGAIN; - if (new->uid != old->uid && set_user(new) < 0) - goto error; - + if (new->uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } if (ruid != (uid_t) -1 || (euid != (uid_t) -1 && euid != old->uid)) new->suid = new->euid; @@ -680,9 +686,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) retval = -EPERM; if (capable(CAP_SETUID)) { new->suid = new->uid = uid; - if (uid != old->uid && set_user(new) < 0) { - retval = -EAGAIN; - goto error; + if (uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; } } else if (uid != old->uid && uid != new->suid) { goto error; @@ -734,11 +741,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) goto error; } - retval = -EAGAIN; if (ruid != (uid_t) -1) { new->uid = ruid; - if (ruid != old->uid && set_user(new) < 0) - goto error; + if (ruid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } } if (euid != (uid_t) -1) new->euid = euid; diff --git a/kernel/user.c b/kernel/user.c index 3551ac742395..6a9b696128c8 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -362,6 +362,24 @@ static void free_user(struct user_struct *up, unsigned long flags) #endif +#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED) +/* + * We need to check if a setuid can take place. This function should be called + * before successfully completing the setuid. + */ +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ + + return sched_rt_can_attach(up->tg, tsk); + +} +#else +int task_can_switch_user(struct user_struct *up, struct task_struct *tsk) +{ + return 1; +} +#endif + /* * Locate the user_struct for the passed UID. If found, take a ref on it. The * caller must undo that ref with free_uid(). From 3782d3605522b836f53c6d11f76abf5404425c1b Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 27 Feb 2009 11:25:37 +0000 Subject: [PATCH 070/580] [ARM] S3C64XX: sparse warnings in arch/arm/plat-s3c64xx/s3c6400-clock.c Fix the following sparse warnings in s3c6400-clock.c: 39:12: warning: symbol 'clk_ext_xtal_mux' was not declared. Should it be static? 66:12: warning: symbol 'clk_fout_apll' was not declared. Should it be static? 81:19: warning: symbol 'clk_mout_apll' was not declared. Should it be static? 91:12: warning: symbol 'clk_fout_epll' was not declared. Should it be static? 106:19: warning: symbol 'clk_mout_epll' was not declared. Should it be static? 126:19: warning: symbol 'clk_mout_mpll' was not declared. Should it be static? 148:12: warning: symbol 'clk_dout_mpll' was not declared. Should it be static? Signed-off-by: Ben Dooks --- arch/arm/plat-s3c64xx/s3c6400-clock.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm/plat-s3c64xx/s3c6400-clock.c b/arch/arm/plat-s3c64xx/s3c6400-clock.c index 2984270b40a7..6edbeef6aa9d 100644 --- a/arch/arm/plat-s3c64xx/s3c6400-clock.c +++ b/arch/arm/plat-s3c64xx/s3c6400-clock.c @@ -36,7 +36,7 @@ * ext_xtal_mux for want of an actual name from the manual. */ -struct clk clk_ext_xtal_mux = { +static struct clk clk_ext_xtal_mux = { .name = "ext_xtal", .id = -1, }; @@ -63,7 +63,7 @@ struct clksrc_clk { void __iomem *reg_divider; }; -struct clk clk_fout_apll = { +static struct clk clk_fout_apll = { .name = "fout_apll", .id = -1, }; @@ -78,7 +78,7 @@ static struct clk_sources clk_src_apll = { .nr_sources = ARRAY_SIZE(clk_src_apll_list), }; -struct clksrc_clk clk_mout_apll = { +static struct clksrc_clk clk_mout_apll = { .clk = { .name = "mout_apll", .id = -1, @@ -88,7 +88,7 @@ struct clksrc_clk clk_mout_apll = { .sources = &clk_src_apll, }; -struct clk clk_fout_epll = { +static struct clk clk_fout_epll = { .name = "fout_epll", .id = -1, }; @@ -103,7 +103,7 @@ static struct clk_sources clk_src_epll = { .nr_sources = ARRAY_SIZE(clk_src_epll_list), }; -struct clksrc_clk clk_mout_epll = { +static struct clksrc_clk clk_mout_epll = { .clk = { .name = "mout_epll", .id = -1, @@ -123,7 +123,7 @@ static struct clk_sources clk_src_mpll = { .nr_sources = ARRAY_SIZE(clk_src_mpll_list), }; -struct clksrc_clk clk_mout_mpll = { +static struct clksrc_clk clk_mout_mpll = { .clk = { .name = "mout_mpll", .id = -1, @@ -145,7 +145,7 @@ static unsigned long s3c64xx_clk_doutmpll_get_rate(struct clk *clk) return rate; } -struct clk clk_dout_mpll = { +static struct clk clk_dout_mpll = { .name = "dout_mpll", .id = -1, .parent = &clk_mout_mpll.clk, From fdca9bf2dae14218704ddd7dc60ad1b198c1d787 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 27 Feb 2009 11:29:23 +0000 Subject: [PATCH 071/580] [ARM] S3C64XX: sparse warnings in arch/arm/plat-s3c64xx/irq.c Fix the following sparse warnings in arch/arm/plat-s3c64xx/irq.c arch/arm/plat-s3c64xx/irq.c:210:23: warning: incorrect type in initializer (different address spaces) arch/arm/plat-s3c64xx/irq.c:210:23: expected void *reg_base arch/arm/plat-s3c64xx/irq.c:210:23: got void [noderef] *regs arch/arm/plat-s3c64xx/irq.c:215:2: warning: incorrect type in argument 1 (different address spaces) arch/arm/plat-s3c64xx/irq.c:215:2: expected void const volatile [noderef] * arch/arm/plat-s3c64xx/irq.c:215:2: got void * Signed-off-by: Ben Dooks --- arch/arm/plat-s3c64xx/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/plat-s3c64xx/irq.c b/arch/arm/plat-s3c64xx/irq.c index a94f1d5e819d..f22edf7c2d2d 100644 --- a/arch/arm/plat-s3c64xx/irq.c +++ b/arch/arm/plat-s3c64xx/irq.c @@ -207,7 +207,7 @@ static struct irq_chip s3c_irq_uart = { static void __init s3c64xx_uart_irq(struct uart_irq *uirq) { - void *reg_base = uirq->regs; + void __iomem *reg_base = uirq->regs; unsigned int irq; int offs; From efeff568677aa325f84d3ce37c219019887a79eb Mon Sep 17 00:00:00 2001 From: Werner Almesberger Date: Fri, 27 Feb 2009 08:03:07 -0300 Subject: [PATCH 072/580] [ARM] S3C64XX: Fix s3c64xx_setrate_clksrc Some of the rate selection logic in s3c64xx_setrate_clksrc uses what appears to be parent clock selection logic. This patch corrects it. I also added a check for overly large dividers to prevent them from changing unrelated clocks. Signed-off-by: Werner Almesberger Signed-off-by: Ben Dooks --- arch/arm/plat-s3c64xx/s3c6400-clock.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm/plat-s3c64xx/s3c6400-clock.c b/arch/arm/plat-s3c64xx/s3c6400-clock.c index 6edbeef6aa9d..05b17528041e 100644 --- a/arch/arm/plat-s3c64xx/s3c6400-clock.c +++ b/arch/arm/plat-s3c64xx/s3c6400-clock.c @@ -239,10 +239,12 @@ static int s3c64xx_setrate_clksrc(struct clk *clk, unsigned long rate) rate = clk_round_rate(clk, rate); div = clk_get_rate(clk->parent) / rate; + if (div > 16) + return -EINVAL; val = __raw_readl(reg); - val &= ~sclk->mask; - val |= (rate - 1) << sclk->shift; + val &= ~(0xf << sclk->shift); + val |= (div - 1) << sclk->shift; __raw_writel(val, reg); return 0; From 6b8036a877fe7a85d4474ddb89993339303959e1 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Sat, 28 Feb 2009 21:30:38 -0700 Subject: [PATCH 073/580] powerpc/4xx: Enable SERIAL_OF support by default for Virtex platforms Virtex FPGA designs have two serial port logic cores to choose from; the simple uartlite, and the full featured uart16550. Both cores are in common use so the defconfig should support both of them. Currently only console on uartlite is supported in the defconfig. This patch adds console support for the 16550 core. The Virtex reference designs do not work without this patch. Signed-off-by: Grant Likely --- arch/powerpc/configs/40x/virtex_defconfig | 2 +- arch/powerpc/configs/44x/virtex5_defconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/configs/40x/virtex_defconfig b/arch/powerpc/configs/40x/virtex_defconfig index b6888384dd74..f5698f962e58 100644 --- a/arch/powerpc/configs/40x/virtex_defconfig +++ b/arch/powerpc/configs/40x/virtex_defconfig @@ -686,7 +686,7 @@ CONFIG_SERIAL_UARTLITE_CONSOLE=y CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y # CONFIG_SERIAL_JSM is not set -# CONFIG_SERIAL_OF_PLATFORM is not set +CONFIG_SERIAL_OF_PLATFORM=y # CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL is not set CONFIG_UNIX98_PTYS=y # CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set diff --git a/arch/powerpc/configs/44x/virtex5_defconfig b/arch/powerpc/configs/44x/virtex5_defconfig index 15aab1ca6384..1bf0a63614b1 100644 --- a/arch/powerpc/configs/44x/virtex5_defconfig +++ b/arch/powerpc/configs/44x/virtex5_defconfig @@ -691,7 +691,7 @@ CONFIG_SERIAL_UARTLITE_CONSOLE=y CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y # CONFIG_SERIAL_JSM is not set -# CONFIG_SERIAL_OF_PLATFORM is not set +CONFIG_SERIAL_OF_PLATFORM=y # CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL is not set CONFIG_UNIX98_PTYS=y # CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set From 02d51fdfb2bfcf6bbd776f983177f55868aa0a79 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 1 Mar 2009 15:42:36 +0900 Subject: [PATCH 074/580] percpu: kill compile warning in pcpu_populate_chunk() Impact: remove compile warning Mark local variable map_end in pcpu_populate_chunk() with uninitialized_var(). The variable is always used in tandem with map_start and guaranteed to be initialized before use but gcc doesn't understand that. Signed-off-by: Tejun Heo Reported-by: Ingo Molnar --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/percpu.c b/mm/percpu.c index 5954e7a9eb1e..3d0f5456827c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -639,7 +639,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); int map_start = -1; - int map_end; + int uninitialized_var(map_end); unsigned int cpu; int i; From af6326d72c95d6e0bbc88c92185c654f57acef3b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 1 Mar 2009 16:03:16 +0900 Subject: [PATCH 075/580] alpha: fix typo in recent early vmalloc change Impact: fix build Add missing 'o' in variable name. Compile tested. Signed-off-by: Tejun Heo Reported-by: Ingo Molnar Cc: Ivan Kokshaysky --- arch/alpha/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 91eddd8505df..af71d38c8e41 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -202,7 +202,7 @@ callback_init(void * kernel_end) console_remap_vm.size = nr_pages << PAGE_SHIFT; vm_area_register_early(&console_remap_vm, PAGE_SIZE); - vaddr = (unsigned long)consle_remap_vm.addr; + vaddr = (unsigned long)console_remap_vm.addr; /* Set up the third level PTEs and update the virtual addresses of the CRB entries. */ From d0c4f570276cb4d2dc4215b90eb7cb6e2bdd4a15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 1 Mar 2009 16:06:56 +0900 Subject: [PATCH 076/580] bootmem, x86: further fixes for arch-specific bootmem wrapping Impact: fix new breakages introduced by previous fix Commit c132937556f56ee4b831ef4b23f1846e05fde102 tried to clean up bootmem arch wrapper but it wasn't quite correct. Before the commit, the followings were broken. * Low level interface functions prefixed with __ ignored arch preference. * reserve_bootmem(...) can't be mapped into reserve_bootmem_node(NODE_DATA(0)->bdata, ...) because the node is not preference here. The region specified MUST fall into the specified region; otherwise, it will panic. After the commit, * If allocation fails for the arch preferred node, it should fallback to whatever is available. Instead, it simply failed allocation. There are too many internal details to allow generic wrapping and still keep things simple for archs. Plus, all that arch wants is a way to prefer certain node over another. This patch drops the generic wrapping around alloc_bootmem_core() and add alloc_bootmem_core() instead. If necessary, arch can define bootmem_arch_referred_node() macro or function which takes all allocation information and returns the preferred node. bootmem generic code will always try the preferred node first and then fallback to other nodes as usual. Breakages noted and changes reviewed by Johannes Weiner. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner --- arch/x86/include/asm/mmzone_32.h | 8 ++---- mm/bootmem.c | 45 +++++++++++++++++++++----------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index eeacf67de49e..ede6998bd92c 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -92,12 +92,8 @@ static inline int pfn_valid(int pfn) #ifdef CONFIG_NEED_MULTIPLE_NODES /* always use node 0 for bootmem on this numa platform */ -#define alloc_bootmem_core(__bdata, size, align, goal, limit) \ -({ \ - bootmem_data_t __maybe_unused * __abm_bdata_dummy = (__bdata); \ - __alloc_bootmem_core(NODE_DATA(0)->bdata, \ - (size), (align), (goal), (limit)); \ -}) +#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \ + (NODE_DATA(0)->bdata) #endif /* CONFIG_NEED_MULTIPLE_NODES */ #endif /* _ASM_X86_MMZONE_32_H */ diff --git a/mm/bootmem.c b/mm/bootmem.c index d7140c008ba8..daf92713f7de 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -37,16 +37,6 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); static int bootmem_debug; -/* - * If an arch needs to apply workarounds to bootmem allocation, it can - * set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around - * __alloc_bootmem_core(). - */ -#ifndef CONFIG_HAVE_ARCH_BOOTMEM -#define alloc_bootmem_core(bdata, size, align, goal, limit) \ - __alloc_bootmem_core((bdata), (size), (align), (goal), (limit)) -#endif - static int __init bootmem_debug_setup(char *buf) { bootmem_debug = 1; @@ -436,9 +426,9 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, return ALIGN(base + off, align) - base; } -static void * __init __alloc_bootmem_core(struct bootmem_data *bdata, - unsigned long size, unsigned long align, - unsigned long goal, unsigned long limit) +static void * __init alloc_bootmem_core(struct bootmem_data *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) { unsigned long fallback = 0; unsigned long min, max, start, sidx, midx, step; @@ -538,17 +528,34 @@ static void * __init __alloc_bootmem_core(struct bootmem_data *bdata, return NULL; } +static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ +#ifdef CONFIG_HAVE_ARCH_BOOTMEM + bootmem_data_t *p_bdata; + + p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); + if (p_bdata) + return alloc_bootmem_core(p_bdata, size, align, goal, limit); +#endif + return NULL; +} + static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { bootmem_data_t *bdata; + void *region; restart: - list_for_each_entry(bdata, &bdata_list, list) { - void *region; + region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); + if (region) + return region; + list_for_each_entry(bdata, &bdata_list, list) { if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) continue; if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) @@ -626,6 +633,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, { void *ptr; + ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); + if (ptr) + return ptr; + ptr = alloc_bootmem_core(bdata, size, align, goal, limit); if (ptr) return ptr; @@ -682,6 +693,10 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, { void *ptr; + ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); + if (ptr) + return ptr; + ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; From fab852aaf761a00cfe16330429b7cac15cceaeb9 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sun, 1 Mar 2009 16:09:14 +0200 Subject: [PATCH 077/580] x86: count errors in testmmiotrace.ko Check the read values against the written values in the MMIO read/write test. This test shows if the given MMIO test area really works as memory, which is a prerequisite for a successful mmiotrace test. Signed-off-by: Pekka Paalanen Cc: Stuart Bennett Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/mm/testmmiotrace.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index ab50a8d7402c..4b29f3b0ee01 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -1,5 +1,5 @@ /* - * Written by Pekka Paalanen, 2008 + * Written by Pekka Paalanen, 2008-2009 */ #include #include @@ -11,28 +11,51 @@ static unsigned long mmio_address; module_param(mmio_address, ulong, 0); MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); +static unsigned v16(unsigned i) +{ + return i * 12 + 7; +} + +static unsigned v32(unsigned i) +{ + return i * 212371 + 13; +} + static void do_write_test(void __iomem *p) { unsigned int i; mmiotrace_printk("Write test.\n"); + for (i = 0; i < 256; i++) iowrite8(i, p + i); + for (i = 1024; i < (5 * 1024); i += 2) - iowrite16(i * 12 + 7, p + i); + iowrite16(v16(i), p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) - iowrite32(i * 212371 + 13, p + i); + iowrite32(v32(i), p + i); } static void do_read_test(void __iomem *p) { unsigned int i; + unsigned errs[3] = { 0 }; mmiotrace_printk("Read test.\n"); + for (i = 0; i < 256; i++) - ioread8(p + i); + if (ioread8(p + i) != i) + ++errs[0]; + for (i = 1024; i < (5 * 1024); i += 2) - ioread16(p + i); + if (ioread16(p + i) != v16(i)) + ++errs[1]; + for (i = (5 * 1024); i < (16 * 1024); i += 4) - ioread32(p + i); + if (ioread32(p + i) != v32(i)) + ++errs[2]; + + mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n", + errs[0], errs[1], errs[2]); } static void do_test(void) From 5ff93697fcfe1e2b9e61db82961d8f50d1ad5d57 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sun, 1 Mar 2009 16:10:08 +0200 Subject: [PATCH 078/580] x86: add far read test to testmmiotrace Apparently pages far into an ioremapped region might not actually be mapped during ioremap(). Add an optional read test to try to trigger a multiply faulting MMIO access. Also add more messages to the kernel log to help debugging. This patch is based on a patch suggested by Stuart Bennett who discovered bugs in mmiotrace related to normal kernel space faults. Signed-off-by: Pekka Paalanen Cc: Stuart Bennett Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/mm/testmmiotrace.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 4b29f3b0ee01..427fd1b56df5 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -9,7 +9,13 @@ static unsigned long mmio_address; module_param(mmio_address, ulong, 0); -MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); +MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " + "(or 8 MB if read_far is non-zero)."); + +static unsigned long read_far = 0x400100; +module_param(read_far, ulong, 0); +MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB " + "(default: 0x400100)."); static unsigned v16(unsigned i) { @@ -24,6 +30,7 @@ static unsigned v32(unsigned i) static void do_write_test(void __iomem *p) { unsigned int i; + pr_info(MODULE_NAME ": write test.\n"); mmiotrace_printk("Write test.\n"); for (i = 0; i < 256; i++) @@ -40,6 +47,7 @@ static void do_read_test(void __iomem *p) { unsigned int i; unsigned errs[3] = { 0 }; + pr_info(MODULE_NAME ": read test.\n"); mmiotrace_printk("Read test.\n"); for (i = 0; i < 256; i++) @@ -58,9 +66,17 @@ static void do_read_test(void __iomem *p) errs[0], errs[1], errs[2]); } -static void do_test(void) +static void do_read_far_test(void __iomem *p) { - void __iomem *p = ioremap_nocache(mmio_address, 0x4000); + pr_info(MODULE_NAME ": read far test.\n"); + mmiotrace_printk("Read far test.\n"); + + ioread32(p + read_far); +} + +static void do_test(unsigned long size) +{ + void __iomem *p = ioremap_nocache(mmio_address, size); if (!p) { pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; @@ -68,11 +84,15 @@ static void do_test(void) mmiotrace_printk("ioremap returned %p.\n", p); do_write_test(p); do_read_test(p); + if (read_far && read_far < size - 4) + do_read_far_test(p); iounmap(p); } static int __init init(void) { + unsigned long size = (read_far) ? (8 << 20) : (16 << 10); + if (mmio_address == 0) { pr_err(MODULE_NAME ": you have to use the module argument " "mmio_address.\n"); @@ -81,10 +101,11 @@ static int __init init(void) return -ENXIO; } - pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " - "in PCI address space, and writing " - "rubbish in there.\n", mmio_address); - do_test(); + pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " + "address space, and writing 16 kB of rubbish in there.\n", + size >> 10, mmio_address); + do_test(size); + pr_info(MODULE_NAME ": All done.\n"); return 0; } From e9d54cae8f03e7f963a12f44bd50d68f49b9ea36 Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Fri, 30 Jan 2009 17:38:59 +0000 Subject: [PATCH 079/580] x86 mmiotrace: WARN_ONCE if dis/arming a page fails Print a full warning once, if arming or disarming a page fails. Also, if initial arming fails, do not handle the page further. This avoids the possibility of a page failing to arm and then later claiming to have handled any fault on that page. WARN_ONCE added by Pekka Paalanen. Signed-off-by: Stuart Bennett Signed-off-by: Pekka Paalanen Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 93d82038af4b..fb1f11546fcd 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -105,7 +105,7 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -static void set_page_present(unsigned long addr, bool present, +static int set_page_present(unsigned long addr, bool present, unsigned int *pglevel) { pteval_t pteval; @@ -116,7 +116,7 @@ static void set_page_present(unsigned long addr, bool present, if (!pte) { pr_err("kmmio: no pte for page 0x%08lx\n", addr); - return; + return -1; } if (pglevel) @@ -140,22 +140,27 @@ static void set_page_present(unsigned long addr, bool present, default: pr_err("kmmio: unexpected page level 0x%x.\n", level); - return; + return -1; } __flush_tlb_one(addr); + + return 0; } /** Mark the given page as not present. Access to it will trigger a fault. */ -static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) +static int arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) { - set_page_present(page & PAGE_MASK, false, pglevel); + int ret = set_page_present(page & PAGE_MASK, false, pglevel); + WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", page); + return ret; } /** Mark the given page as present. */ static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) { - set_page_present(page & PAGE_MASK, true, pglevel); + int ret = set_page_present(page & PAGE_MASK, true, pglevel); + WARN_ONCE(ret < 0, KERN_ERR "kmmio disarming 0x%08lx failed.\n", page); } /* @@ -326,9 +331,13 @@ static int add_kmmio_fault_page(unsigned long page) f->count = 1; f->page = page; - list_add_rcu(&f->list, kmmio_page_list(f->page)); - arm_kmmio_fault_page(f->page, NULL); + if (arm_kmmio_fault_page(f->page, NULL)) { + kfree(f); + return -1; + } + + list_add_rcu(&f->list, kmmio_page_list(f->page)); return 0; } From 5359b585fb5edb3db34d6cd491e1475b098c61d3 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sun, 1 Mar 2009 16:11:58 +0200 Subject: [PATCH 080/580] x86 mmiotrace: fix save/restore page table state From baa99e2b32449ec7bf147c234adfa444caecac8a Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sun, 22 Feb 2009 20:02:43 +0200 Blindly setting _PAGE_PRESENT in disarm_kmmio_fault_page() overlooks the possibility, that the page was not present when it was armed. Make arm_kmmio_fault_page() store the previous page presence in struct kmmio_fault_page and use it on disarm. This patch was originally written by Stuart Bennett, but Pekka Paalanen rewrote it a little different. Signed-off-by: Pekka Paalanen Cc: Stuart Bennett Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 66 ++++++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index fb1f11546fcd..be361eb828c8 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -32,6 +32,8 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; unsigned long page; /* location of the fault page */ + bool old_presence; /* page presence prior to arming */ + bool armed; /* * Number of times this page has been registered as a part @@ -105,8 +107,7 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -static int set_page_present(unsigned long addr, bool present, - unsigned int *pglevel) +static int set_page_presence(unsigned long addr, bool present, bool *old) { pteval_t pteval; pmdval_t pmdval; @@ -119,20 +120,21 @@ static int set_page_present(unsigned long addr, bool present, return -1; } - if (pglevel) - *pglevel = level; - switch (level) { case PG_LEVEL_2M: pmd = (pmd_t *)pte; - pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; + pmdval = pmd_val(*pmd); + *old = !!(pmdval & _PAGE_PRESENT); + pmdval &= ~_PAGE_PRESENT; if (present) pmdval |= _PAGE_PRESENT; set_pmd(pmd, __pmd(pmdval)); break; case PG_LEVEL_4K: - pteval = pte_val(*pte) & ~_PAGE_PRESENT; + pteval = pte_val(*pte); + *old = !!(pteval & _PAGE_PRESENT); + pteval &= ~_PAGE_PRESENT; if (present) pteval |= _PAGE_PRESENT; set_pte_atomic(pte, __pte(pteval)); @@ -148,19 +150,39 @@ static int set_page_present(unsigned long addr, bool present, return 0; } -/** Mark the given page as not present. Access to it will trigger a fault. */ -static int arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) +/* + * Mark the given page as not present. Access to it will trigger a fault. + * + * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the + * protection is ignored here. RCU read lock is assumed held, so the struct + * will not disappear unexpectedly. Furthermore, the caller must guarantee, + * that double arming the same virtual address (page) cannot occur. + * + * Double disarming on the other hand is allowed, and may occur when a fault + * and mmiotrace shutdown happen simultaneously. + */ +static int arm_kmmio_fault_page(struct kmmio_fault_page *f) { - int ret = set_page_present(page & PAGE_MASK, false, pglevel); - WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", page); + int ret; + WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); + if (f->armed) { + pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", + f->page, f->count, f->old_presence); + } + ret = set_page_presence(f->page, false, &f->old_presence); + WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); + f->armed = true; return ret; } -/** Mark the given page as present. */ -static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) +/** Restore the given page to saved presence state. */ +static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { - int ret = set_page_present(page & PAGE_MASK, true, pglevel); - WARN_ONCE(ret < 0, KERN_ERR "kmmio disarming 0x%08lx failed.\n", page); + bool tmp; + int ret = set_page_presence(f->page, f->old_presence, &tmp); + WARN_ONCE(ret < 0, + KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); + f->armed = false; } /* @@ -207,7 +229,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { - disarm_kmmio_fault_page(faultpage->page, NULL); + disarm_kmmio_fault_page(faultpage); if (addr == ctx->addr) { /* * On SMP we sometimes get recursive probe hits on the @@ -249,7 +271,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) regs->flags &= ~X86_EFLAGS_IF; /* Now we set present bit in PTE and single step. */ - disarm_kmmio_fault_page(ctx->fpage->page, NULL); + disarm_kmmio_fault_page(ctx->fpage); /* * If another cpu accesses the same page while we are stepping, @@ -288,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - arm_kmmio_fault_page(ctx->fpage->page, NULL); + arm_kmmio_fault_page(ctx->fpage); regs->flags &= ~X86_EFLAGS_TF; regs->flags |= ctx->saved_flags; @@ -320,19 +342,19 @@ static int add_kmmio_fault_page(unsigned long page) f = get_kmmio_fault_page(page); if (f) { if (!f->count) - arm_kmmio_fault_page(f->page, NULL); + arm_kmmio_fault_page(f); f->count++; return 0; } - f = kmalloc(sizeof(*f), GFP_ATOMIC); + f = kzalloc(sizeof(*f), GFP_ATOMIC); if (!f) return -1; f->count = 1; f->page = page; - if (arm_kmmio_fault_page(f->page, NULL)) { + if (arm_kmmio_fault_page(f)) { kfree(f); return -1; } @@ -356,7 +378,7 @@ static void release_kmmio_fault_page(unsigned long page, f->count--; BUG_ON(f->count < 0); if (!f->count) { - disarm_kmmio_fault_page(f->page, NULL); + disarm_kmmio_fault_page(f); f->release_next = *release_list; *release_list = f; } From 0b700a6a253b6a3b3059bb9a9247a73490ee33fb Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sun, 1 Mar 2009 16:12:48 +0200 Subject: [PATCH 081/580] x86 mmiotrace: split set_page_presence() From 36772dcb6ffbbb68254cbfc379a103acd2fbfefc Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sat, 28 Feb 2009 21:34:59 +0200 Split set_page_presence() in kmmio.c into two more functions set_pmd_presence() and set_pte_presence(). Purely code reorganization, no functional changes. Signed-off-by: Pekka Paalanen Cc: Stuart Bennett Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index be361eb828c8..d29777520af3 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -107,12 +107,29 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } +static void set_pmd_presence(pmd_t *pmd, bool present, bool *old) +{ + pmdval_t v = pmd_val(*pmd); + *old = !!(v & _PAGE_PRESENT); + v &= ~_PAGE_PRESENT; + if (present) + v |= _PAGE_PRESENT; + set_pmd(pmd, __pmd(v)); +} + +static void set_pte_presence(pte_t *pte, bool present, bool *old) +{ + pteval_t v = pte_val(*pte); + *old = !!(v & _PAGE_PRESENT); + v &= ~_PAGE_PRESENT; + if (present) + v |= _PAGE_PRESENT; + set_pte_atomic(pte, __pte(v)); +} + static int set_page_presence(unsigned long addr, bool present, bool *old) { - pteval_t pteval; - pmdval_t pmdval; unsigned int level; - pmd_t *pmd; pte_t *pte = lookup_address(addr, &level); if (!pte) { @@ -122,31 +139,17 @@ static int set_page_presence(unsigned long addr, bool present, bool *old) switch (level) { case PG_LEVEL_2M: - pmd = (pmd_t *)pte; - pmdval = pmd_val(*pmd); - *old = !!(pmdval & _PAGE_PRESENT); - pmdval &= ~_PAGE_PRESENT; - if (present) - pmdval |= _PAGE_PRESENT; - set_pmd(pmd, __pmd(pmdval)); + set_pmd_presence((pmd_t *)pte, present, old); break; - case PG_LEVEL_4K: - pteval = pte_val(*pte); - *old = !!(pteval & _PAGE_PRESENT); - pteval &= ~_PAGE_PRESENT; - if (present) - pteval |= _PAGE_PRESENT; - set_pte_atomic(pte, __pte(pteval)); + set_pte_presence(pte, present, old); break; - default: pr_err("kmmio: unexpected page level 0x%x.\n", level); return -1; } __flush_tlb_one(addr); - return 0; } From 3e39aa156a24ce386da378784edd0f748c770087 Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Thu, 5 Feb 2009 11:02:02 +0000 Subject: [PATCH 082/580] x86 mmiotrace: improve handling of secondary faults Upgrade some kmmio.c debug messages to warnings. Allow secondary faults on probed pages to fall through, and only log secondary faults that are not due to non-present pages. Patch edited by Pekka Paalanen. Signed-off-by: Stuart Bennett Signed-off-by: Pekka Paalanen Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index d29777520af3..4c66bd3a240d 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -232,28 +232,32 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { - disarm_kmmio_fault_page(faultpage); if (addr == ctx->addr) { /* - * On SMP we sometimes get recursive probe hits on the - * same address. Context is already saved, fall out. + * A second fault on the same page means some other + * condition needs handling by do_page_fault(), the + * page really not being present is the most common. */ - pr_debug("kmmio: duplicate probe hit on CPU %d, for " - "address 0x%08lx.\n", - smp_processor_id(), addr); - ret = 1; - goto no_kmmio_ctx; - } - /* - * Prevent overwriting already in-flight context. - * This should not happen, let's hope disarming at least - * prevents a panic. - */ - pr_emerg("kmmio: recursive probe hit on CPU %d, " + pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", + addr, smp_processor_id()); + + if (!faultpage->old_presence) + pr_info("kmmio: unexpected secondary hit for " + "address 0x%08lx on CPU %d.\n", addr, + smp_processor_id()); + } else { + /* + * Prevent overwriting already in-flight context. + * This should not happen, let's hope disarming at + * least prevents a panic. + */ + pr_emerg("kmmio: recursive probe hit on CPU %d, " "for address 0x%08lx. Ignoring.\n", smp_processor_id(), addr); - pr_emerg("kmmio: previous hit was at 0x%08lx.\n", - ctx->addr); + pr_emerg("kmmio: previous hit was at 0x%08lx.\n", + ctx->addr); + disarm_kmmio_fault_page(faultpage); + } goto no_kmmio_ctx; } ctx->active++; @@ -305,7 +309,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) { - pr_debug("kmmio: spurious debug trap on CPU %d.\n", + pr_warning("kmmio: spurious debug trap on CPU %d.\n", smp_processor_id()); goto out; } From 340430c572f7b2b275d39965e88bafa71693cb23 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 24 Feb 2009 21:44:15 +0200 Subject: [PATCH 083/580] x86 mmiotrace: fix race with release_kmmio_fault_page() There was a theoretical possibility to a race between arming a page in post_kmmio_handler() and disarming the page in release_kmmio_fault_page(): cpu0 cpu1 ------------------------------------------------------------------ mmiotrace shutdown enter release_kmmio_fault_page fault on the page disarm the page disarm the page handle the MMIO access re-arm the page put the page on release list remove_kmmio_fault_pages() fault on the page page not known to mmiotrace fall back to do_page_fault() *KABOOM* (This scenario also shows the double disarm case which is allowed.) Fixed by acquiring kmmio_lock in post_kmmio_handler() and checking if the page is being released from mmiotrace. Signed-off-by: Pekka Paalanen Cc: Stuart Bennett Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 4c66bd3a240d..9f205030d9aa 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -38,7 +38,8 @@ struct kmmio_fault_page { /* * Number of times this page has been registered as a part * of a probe. If zero, page is disarmed and this may be freed. - * Used only by writers (RCU). + * Used only by writers (RCU) and post_kmmio_handler(). + * Protected by kmmio_lock, when linked into kmmio_page_table. */ int count; }; @@ -317,7 +318,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - arm_kmmio_fault_page(ctx->fpage); + /* Prevent racing against release_kmmio_fault_page(). */ + spin_lock(&kmmio_lock); + if (ctx->fpage->count) + arm_kmmio_fault_page(ctx->fpage); + spin_unlock(&kmmio_lock); regs->flags &= ~X86_EFLAGS_TF; regs->flags |= ctx->saved_flags; From a572e217c6f09fb02853d3196458b8bf30a9a321 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 2 Mar 2009 17:22:36 +0800 Subject: [PATCH 084/580] Blackfin arch: drop untested and useless "generic" board file Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu --- arch/blackfin/mach-bf533/boards/Kconfig | 5 - arch/blackfin/mach-bf533/boards/Makefile | 1 - .../mach-bf533/boards/generic_board.c | 126 --- arch/blackfin/mach-bf537/boards/Kconfig | 5 - arch/blackfin/mach-bf537/boards/Makefile | 1 - .../mach-bf537/boards/generic_board.c | 745 ------------------ arch/blackfin/mach-bf561/boards/Kconfig | 5 - arch/blackfin/mach-bf561/boards/Makefile | 1 - .../mach-bf561/boards/generic_board.c | 113 --- 9 files changed, 1002 deletions(-) delete mode 100644 arch/blackfin/mach-bf533/boards/generic_board.c delete mode 100644 arch/blackfin/mach-bf537/boards/generic_board.c delete mode 100644 arch/blackfin/mach-bf561/boards/generic_board.c diff --git a/arch/blackfin/mach-bf533/boards/Kconfig b/arch/blackfin/mach-bf533/boards/Kconfig index 308c98dc5aba..8d8b3e7321e6 100644 --- a/arch/blackfin/mach-bf533/boards/Kconfig +++ b/arch/blackfin/mach-bf533/boards/Kconfig @@ -38,9 +38,4 @@ config BFIN532_IP0X help Core support for IP04/IP04 open hardware IP-PBX. -config GENERIC_BF533_BOARD - bool "Generic" - help - Generic or Custom board support. - endchoice diff --git a/arch/blackfin/mach-bf533/boards/Makefile b/arch/blackfin/mach-bf533/boards/Makefile index 9afbe72b484f..ff1e832f80d2 100644 --- a/arch/blackfin/mach-bf533/boards/Makefile +++ b/arch/blackfin/mach-bf533/boards/Makefile @@ -2,7 +2,6 @@ # arch/blackfin/mach-bf533/boards/Makefile # -obj-$(CONFIG_GENERIC_BF533_BOARD) += generic_board.o obj-$(CONFIG_BFIN533_STAMP) += stamp.o obj-$(CONFIG_BFIN532_IP0X) += ip0x.o obj-$(CONFIG_BFIN533_EZKIT) += ezkit.o diff --git a/arch/blackfin/mach-bf533/boards/generic_board.c b/arch/blackfin/mach-bf533/boards/generic_board.c deleted file mode 100644 index 986eeec53b1f..000000000000 --- a/arch/blackfin/mach-bf533/boards/generic_board.c +++ /dev/null @@ -1,126 +0,0 @@ -/* - * File: arch/blackfin/mach-bf533/generic_board.c - * Based on: arch/blackfin/mach-bf533/ezkit.c - * Author: Aidan Williams - * - * Created: 2005 - * Description: - * - * Modified: - * Copyright 2005 National ICT Australia (NICTA) - * Copyright 2004-2006 Analog Devices Inc. - * - * Bugs: Enter bugs at http://blackfin.uclinux.org/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see the file COPYING, or write - * to the Free Software Foundation, Inc., - * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include -#include -#include - -/* - * Name the Board for the /proc/cpuinfo - */ -const char bfin_board_name[] = "UNKNOWN BOARD"; - -#if defined(CONFIG_RTC_DRV_BFIN) || defined(CONFIG_RTC_DRV_BFIN_MODULE) -static struct platform_device rtc_device = { - .name = "rtc-bfin", - .id = -1, -}; -#endif - -/* - * Driver needs to know address, irq and flag pin. - */ -#if defined(CONFIG_SMC91X) || defined(CONFIG_SMC91X_MODULE) -static struct resource smc91x_resources[] = { - { - .start = 0x20300300, - .end = 0x20300300 + 16, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_PROG_INTB, - .end = IRQ_PROG_INTB, - .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, - }, { - .start = IRQ_PF7, - .end = IRQ_PF7, - .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, - }, -}; - -static struct platform_device smc91x_device = { - .name = "smc91x", - .id = 0, - .num_resources = ARRAY_SIZE(smc91x_resources), - .resource = smc91x_resources, -}; -#endif - -#if defined(CONFIG_BFIN_SIR) || defined(CONFIG_BFIN_SIR_MODULE) -#ifdef CONFIG_BFIN_SIR0 -static struct resource bfin_sir0_resources[] = { - { - .start = 0xFFC00400, - .end = 0xFFC004FF, - .flags = IORESOURCE_MEM, - }, - { - .start = IRQ_UART0_RX, - .end = IRQ_UART0_RX+1, - .flags = IORESOURCE_IRQ, - }, - { - .start = CH_UART0_RX, - .end = CH_UART0_RX+1, - .flags = IORESOURCE_DMA, - }, -}; - -static struct platform_device bfin_sir0_device = { - .name = "bfin_sir", - .id = 0, - .num_resources = ARRAY_SIZE(bfin_sir0_resources), - .resource = bfin_sir0_resources, -}; -#endif -#endif - -static struct platform_device *generic_board_devices[] __initdata = { -#if defined(CONFIG_RTC_DRV_BFIN) || defined(CONFIG_RTC_DRV_BFIN_MODULE) - &rtc_device, -#endif - -#if defined(CONFIG_SMC91X) || defined(CONFIG_SMC91X_MODULE) - &smc91x_device, -#endif - -#if defined(CONFIG_BFIN_SIR) || defined(CONFIG_BFIN_SIR_MODULE) -#ifdef CONFIG_BFIN_SIR0 - &bfin_sir0_device, -#endif -#endif -}; - -static int __init generic_board_init(void) -{ - printk(KERN_INFO "%s(): registering device resources\n", __func__); - return platform_add_devices(generic_board_devices, ARRAY_SIZE(generic_board_devices)); -} - -arch_initcall(generic_board_init); diff --git a/arch/blackfin/mach-bf537/boards/Kconfig b/arch/blackfin/mach-bf537/boards/Kconfig index 42a57b0acb29..77c59da87e85 100644 --- a/arch/blackfin/mach-bf537/boards/Kconfig +++ b/arch/blackfin/mach-bf537/boards/Kconfig @@ -33,9 +33,4 @@ config CAMSIG_MINOTAUR help Board supply package for CSP Minotaur -config GENERIC_BF537_BOARD - bool "Generic" - help - Generic or Custom board support. - endchoice diff --git a/arch/blackfin/mach-bf537/boards/Makefile b/arch/blackfin/mach-bf537/boards/Makefile index 7168cc14afd8..68b98a7af6a6 100644 --- a/arch/blackfin/mach-bf537/boards/Makefile +++ b/arch/blackfin/mach-bf537/boards/Makefile @@ -2,7 +2,6 @@ # arch/blackfin/mach-bf537/boards/Makefile # -obj-$(CONFIG_GENERIC_BF537_BOARD) += generic_board.o obj-$(CONFIG_BFIN537_STAMP) += stamp.o obj-$(CONFIG_BFIN537_BLUETECHNIX_CM) += cm_bf537.o obj-$(CONFIG_BFIN537_BLUETECHNIX_TCM) += tcm_bf537.o diff --git a/arch/blackfin/mach-bf537/boards/generic_board.c b/arch/blackfin/mach-bf537/boards/generic_board.c deleted file mode 100644 index da710fdc4569..000000000000 --- a/arch/blackfin/mach-bf537/boards/generic_board.c +++ /dev/null @@ -1,745 +0,0 @@ -/* - * File: arch/blackfin/mach-bf537/boards/generic_board.c - * Based on: arch/blackfin/mach-bf533/boards/ezkit.c - * Author: Aidan Williams - * - * Created: - * Description: - * - * Modified: - * Copyright 2005 National ICT Australia (NICTA) - * Copyright 2004-2008 Analog Devices Inc. - * - * Bugs: Enter bugs at http://blackfin.uclinux.org/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see the file COPYING, or write - * to the Free Software Foundation, Inc., - * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#if defined(CONFIG_USB_ISP1362_HCD) || defined(CONFIG_USB_ISP1362_HCD_MODULE) -#include -#endif -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Name the Board for the /proc/cpuinfo - */ -const char bfin_board_name[] = "UNKNOWN BOARD"; - -/* - * Driver needs to know address, irq and flag pin. - */ - -#if defined(CONFIG_USB_ISP1760_HCD) || defined(CONFIG_USB_ISP1760_HCD_MODULE) -#include -static struct resource bfin_isp1760_resources[] = { - [0] = { - .start = 0x203C0000, - .end = 0x203C0000 + 0x000fffff, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_PF7, - .end = IRQ_PF7, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct isp1760_platform_data isp1760_priv = { - .is_isp1761 = 0, - .port1_disable = 0, - .bus_width_16 = 1, - .port1_otg = 0, - .analog_oc = 0, - .dack_polarity_high = 0, - .dreq_polarity_high = 0, -}; - -static struct platform_device bfin_isp1760_device = { - .name = "isp1760-hcd", - .id = 0, - .dev = { - .platform_data = &isp1760_priv, - }, - .num_resources = ARRAY_SIZE(bfin_isp1760_resources), - .resource = bfin_isp1760_resources, -}; -#endif - -#if defined(CONFIG_BFIN_CFPCMCIA) || defined(CONFIG_BFIN_CFPCMCIA_MODULE) -static struct resource bfin_pcmcia_cf_resources[] = { - { - .start = 0x20310000, /* IO PORT */ - .end = 0x20312000, - .flags = IORESOURCE_MEM, - }, { - .start = 0x20311000, /* Attribute Memory */ - .end = 0x20311FFF, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_PF4, - .end = IRQ_PF4, - .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_LOWLEVEL, - }, { - .start = 6, /* Card Detect PF6 */ - .end = 6, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device bfin_pcmcia_cf_device = { - .name = "bfin_cf_pcmcia", - .id = -1, - .num_resources = ARRAY_SIZE(bfin_pcmcia_cf_resources), - .resource = bfin_pcmcia_cf_resources, -}; -#endif - -#if defined(CONFIG_RTC_DRV_BFIN) || defined(CONFIG_RTC_DRV_BFIN_MODULE) -static struct platform_device rtc_device = { - .name = "rtc-bfin", - .id = -1, -}; -#endif - -#if defined(CONFIG_SMC91X) || defined(CONFIG_SMC91X_MODULE) -static struct resource smc91x_resources[] = { - { - .name = "smc91x-regs", - .start = 0x20300300, - .end = 0x20300300 + 16, - .flags = IORESOURCE_MEM, - }, { - - .start = IRQ_PF7, - .end = IRQ_PF7, - .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, - }, -}; -static struct platform_device smc91x_device = { - .name = "smc91x", - .id = 0, - .num_resources = ARRAY_SIZE(smc91x_resources), - .resource = smc91x_resources, -}; -#endif - -#if defined(CONFIG_DM9000) || defined(CONFIG_DM9000_MODULE) -static struct resource dm9000_resources[] = { - [0] = { - .start = 0x203FB800, - .end = 0x203FB800 + 1, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = 0x203FB800 + 4, - .end = 0x203FB800 + 5, - .flags = IORESOURCE_MEM, - }, - [2] = { - .start = IRQ_PF9, - .end = IRQ_PF9, - .flags = (IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHEDGE), - }, -}; - -static struct platform_device dm9000_device = { - .name = "dm9000", - .id = -1, - .num_resources = ARRAY_SIZE(dm9000_resources), - .resource = dm9000_resources, -}; -#endif - -#if defined(CONFIG_USB_SL811_HCD) || defined(CONFIG_USB_SL811_HCD_MODULE) -static struct resource sl811_hcd_resources[] = { - { - .start = 0x20340000, - .end = 0x20340000, - .flags = IORESOURCE_MEM, - }, { - .start = 0x20340004, - .end = 0x20340004, - .flags = IORESOURCE_MEM, - }, { - .start = CONFIG_USB_SL811_BFIN_IRQ, - .end = CONFIG_USB_SL811_BFIN_IRQ, - .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, - }, -}; - -#if defined(CONFIG_USB_SL811_BFIN_USE_VBUS) -void sl811_port_power(struct device *dev, int is_on) -{ - gpio_request(CONFIG_USB_SL811_BFIN_GPIO_VBUS, "usb:SL811_VBUS"); - gpio_direction_output(CONFIG_USB_SL811_BFIN_GPIO_VBUS, is_on); - -} -#endif - -static struct sl811_platform_data sl811_priv = { - .potpg = 10, - .power = 250, /* == 500mA */ -#if defined(CONFIG_USB_SL811_BFIN_USE_VBUS) - .port_power = &sl811_port_power, -#endif -}; - -static struct platform_device sl811_hcd_device = { - .name = "sl811-hcd", - .id = 0, - .dev = { - .platform_data = &sl811_priv, - }, - .num_resources = ARRAY_SIZE(sl811_hcd_resources), - .resource = sl811_hcd_resources, -}; -#endif - -#if defined(CONFIG_USB_ISP1362_HCD) || defined(CONFIG_USB_ISP1362_HCD_MODULE) -static struct resource isp1362_hcd_resources[] = { - { - .start = 0x20360000, - .end = 0x20360000, - .flags = IORESOURCE_MEM, - }, { - .start = 0x20360004, - .end = 0x20360004, - .flags = IORESOURCE_MEM, - }, { - .start = CONFIG_USB_ISP1362_BFIN_GPIO_IRQ, - .end = CONFIG_USB_ISP1362_BFIN_GPIO_IRQ, - .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, - }, -}; - -static struct isp1362_platform_data isp1362_priv = { - .sel15Kres = 1, - .clknotstop = 0, - .oc_enable = 0, - .int_act_high = 0, - .int_edge_triggered = 0, - .remote_wakeup_connected = 0, - .no_power_switching = 1, - .power_switching_mode = 0, -}; - -static struct platform_device isp1362_hcd_device = { - .name = "isp1362-hcd", - .id = 0, - .dev = { - .platform_data = &isp1362_priv, - }, - .num_resources = ARRAY_SIZE(isp1362_hcd_resources), - .resource = isp1362_hcd_resources, -}; -#endif - -#if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE) -static struct platform_device bfin_mii_bus = { - .name = "bfin_mii_bus", -}; - -static struct platform_device bfin_mac_device = { - .name = "bfin_mac", - .dev.platform_data = &bfin_mii_bus, -}; -#endif - -#if defined(CONFIG_USB_NET2272) || defined(CONFIG_USB_NET2272_MODULE) -static struct resource net2272_bfin_resources[] = { - { - .start = 0x20300000, - .end = 0x20300000 + 0x100, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_PF7, - .end = IRQ_PF7, - .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, - }, -}; - -static struct platform_device net2272_bfin_device = { - .name = "net2272", - .id = -1, - .num_resources = ARRAY_SIZE(net2272_bfin_resources), - .resource = net2272_bfin_resources, -}; -#endif - -#if defined(CONFIG_SPI_BFIN) || defined(CONFIG_SPI_BFIN_MODULE) -/* all SPI peripherals info goes here */ - -#if defined(CONFIG_MTD_M25P80) \ - || defined(CONFIG_MTD_M25P80_MODULE) -static struct mtd_partition bfin_spi_flash_partitions[] = { - { - .name = "bootloader(spi)", - .size = 0x00020000, - .offset = 0, - .mask_flags = MTD_CAP_ROM - }, { - .name = "linux kernel(spi)", - .size = 0xe0000, - .offset = 0x20000 - }, { - .name = "file system(spi)", - .size = 0x700000, - .offset = 0x00100000, - } -}; - -static struct flash_platform_data bfin_spi_flash_data = { - .name = "m25p80", - .parts = bfin_spi_flash_partitions, - .nr_parts = ARRAY_SIZE(bfin_spi_flash_partitions), - .type = "m25p64", -}; - -/* SPI flash chip (m25p64) */ -static struct bfin5xx_spi_chip spi_flash_chip_info = { - .enable_dma = 0, /* use dma transfer with this chip*/ - .bits_per_word = 8, -}; -#endif - -#if defined(CONFIG_SPI_ADC_BF533) \ - || defined(CONFIG_SPI_ADC_BF533_MODULE) -/* SPI ADC chip */ -static struct bfin5xx_spi_chip spi_adc_chip_info = { - .enable_dma = 1, /* use dma transfer with this chip*/ - .bits_per_word = 16, -}; -#endif - -#if defined(CONFIG_SND_BLACKFIN_AD1836) \ - || defined(CONFIG_SND_BLACKFIN_AD1836_MODULE) -static struct bfin5xx_spi_chip ad1836_spi_chip_info = { - .enable_dma = 0, - .bits_per_word = 16, -}; -#endif - -#if defined(CONFIG_AD9960) || defined(CONFIG_AD9960_MODULE) -static struct bfin5xx_spi_chip ad9960_spi_chip_info = { - .enable_dma = 0, - .bits_per_word = 16, -}; -#endif - -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) -static struct bfin5xx_spi_chip spi_mmc_chip_info = { - .enable_dma = 1, - .bits_per_word = 8, -}; -#endif - -#if defined(CONFIG_PBX) -static struct bfin5xx_spi_chip spi_si3xxx_chip_info = { - .ctl_reg = 0x4, /* send zero */ - .enable_dma = 0, - .bits_per_word = 8, - .cs_change_per_word = 1, -}; -#endif - -#if defined(CONFIG_TOUCHSCREEN_AD7877) || defined(CONFIG_TOUCHSCREEN_AD7877_MODULE) -static struct bfin5xx_spi_chip spi_ad7877_chip_info = { - .enable_dma = 0, - .bits_per_word = 16, -}; - -static const struct ad7877_platform_data bfin_ad7877_ts_info = { - .model = 7877, - .vref_delay_usecs = 50, /* internal, no capacitor */ - .x_plate_ohms = 419, - .y_plate_ohms = 486, - .pressure_max = 1000, - .pressure_min = 0, - .stopacq_polarity = 1, - .first_conversion_delay = 3, - .acquisition_time = 1, - .averaging = 1, - .pen_down_acc_interval = 1, -}; -#endif - -static struct spi_board_info bfin_spi_board_info[] __initdata = { -#if defined(CONFIG_MTD_M25P80) \ - || defined(CONFIG_MTD_M25P80_MODULE) - { - /* the modalias must be the same as spi device driver name */ - .modalias = "m25p80", /* Name of spi_driver for this device */ - .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, /* Framework bus number */ - .chip_select = 1, /* Framework chip select. On STAMP537 it is SPISSEL1*/ - .platform_data = &bfin_spi_flash_data, - .controller_data = &spi_flash_chip_info, - .mode = SPI_MODE_3, - }, -#endif - -#if defined(CONFIG_SPI_ADC_BF533) \ - || defined(CONFIG_SPI_ADC_BF533_MODULE) - { - .modalias = "bfin_spi_adc", /* Name of spi_driver for this device */ - .max_speed_hz = 6250000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, /* Framework bus number */ - .chip_select = 1, /* Framework chip select. */ - .platform_data = NULL, /* No spi_driver specific config */ - .controller_data = &spi_adc_chip_info, - }, -#endif - -#if defined(CONFIG_SND_BLACKFIN_AD1836) \ - || defined(CONFIG_SND_BLACKFIN_AD1836_MODULE) - { - .modalias = "ad1836-spi", - .max_speed_hz = 3125000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = CONFIG_SND_BLACKFIN_SPI_PFBIT, - .controller_data = &ad1836_spi_chip_info, - }, -#endif -#if defined(CONFIG_AD9960) || defined(CONFIG_AD9960_MODULE) - { - .modalias = "ad9960-spi", - .max_speed_hz = 10000000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = 1, - .controller_data = &ad9960_spi_chip_info, - }, -#endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) - { - .modalias = "spi_mmc_dummy", - .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = 0, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, - .mode = SPI_MODE_3, - }, - { - .modalias = "spi_mmc", - .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = CONFIG_SPI_MMC_CS_CHAN, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, - .mode = SPI_MODE_3, - }, -#endif -#if defined(CONFIG_PBX) - { - .modalias = "fxs-spi", - .max_speed_hz = 12500000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = 8 - CONFIG_J11_JUMPER, - .controller_data = &spi_si3xxx_chip_info, - .mode = SPI_MODE_3, - }, - { - .modalias = "fxo-spi", - .max_speed_hz = 12500000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = 8 - CONFIG_J19_JUMPER, - .controller_data = &spi_si3xxx_chip_info, - .mode = SPI_MODE_3, - }, -#endif -#if defined(CONFIG_TOUCHSCREEN_AD7877) || defined(CONFIG_TOUCHSCREEN_AD7877_MODULE) - { - .modalias = "ad7877", - .platform_data = &bfin_ad7877_ts_info, - .irq = IRQ_PF6, - .max_speed_hz = 12500000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = 1, - .controller_data = &spi_ad7877_chip_info, - }, -#endif -}; - -/* SPI controller data */ -static struct bfin5xx_spi_master bfin_spi0_info = { - .num_chipselect = 8, - .enable_dma = 1, /* master has the ability to do dma transfer */ - .pin_req = {P_SPI0_SCK, P_SPI0_MISO, P_SPI0_MOSI, 0}, -}; - -/* SPI (0) */ -static struct resource bfin_spi0_resource[] = { - [0] = { - .start = SPI0_REGBASE, - .end = SPI0_REGBASE + 0xFF, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = CH_SPI, - .end = CH_SPI, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device bfin_spi0_device = { - .name = "bfin-spi", - .id = 0, /* Bus number */ - .num_resources = ARRAY_SIZE(bfin_spi0_resource), - .resource = bfin_spi0_resource, - .dev = { - .platform_data = &bfin_spi0_info, /* Passed to driver */ - }, -}; -#endif /* spi master and devices */ - -#if defined(CONFIG_FB_BF537_LQ035) || defined(CONFIG_FB_BF537_LQ035_MODULE) -static struct platform_device bfin_fb_device = { - .name = "bf537-lq035", -}; -#endif - -#if defined(CONFIG_FB_BFIN_7393) || defined(CONFIG_FB_BFIN_7393_MODULE) -static struct platform_device bfin_fb_adv7393_device = { - .name = "bfin-adv7393", -}; -#endif - -#if defined(CONFIG_SERIAL_BFIN) || defined(CONFIG_SERIAL_BFIN_MODULE) -static struct resource bfin_uart_resources[] = { - { - .start = 0xFFC00400, - .end = 0xFFC004FF, - .flags = IORESOURCE_MEM, - }, { - .start = 0xFFC02000, - .end = 0xFFC020FF, - .flags = IORESOURCE_MEM, - }, -}; - -static struct platform_device bfin_uart_device = { - .name = "bfin-uart", - .id = 1, - .num_resources = ARRAY_SIZE(bfin_uart_resources), - .resource = bfin_uart_resources, -}; -#endif - -#if defined(CONFIG_BFIN_SIR) || defined(CONFIG_BFIN_SIR_MODULE) -#ifdef CONFIG_BFIN_SIR0 -static struct resource bfin_sir0_resources[] = { - { - .start = 0xFFC00400, - .end = 0xFFC004FF, - .flags = IORESOURCE_MEM, - }, - { - .start = IRQ_UART0_RX, - .end = IRQ_UART0_RX+1, - .flags = IORESOURCE_IRQ, - }, - { - .start = CH_UART0_RX, - .end = CH_UART0_RX+1, - .flags = IORESOURCE_DMA, - }, -}; - -static struct platform_device bfin_sir0_device = { - .name = "bfin_sir", - .id = 0, - .num_resources = ARRAY_SIZE(bfin_sir0_resources), - .resource = bfin_sir0_resources, -}; -#endif -#ifdef CONFIG_BFIN_SIR1 -static struct resource bfin_sir1_resources[] = { - { - .start = 0xFFC02000, - .end = 0xFFC020FF, - .flags = IORESOURCE_MEM, - }, - { - .start = IRQ_UART1_RX, - .end = IRQ_UART1_RX+1, - .flags = IORESOURCE_IRQ, - }, - { - .start = CH_UART1_RX, - .end = CH_UART1_RX+1, - .flags = IORESOURCE_DMA, - }, -}; - -static struct platform_device bfin_sir1_device = { - .name = "bfin_sir", - .id = 1, - .num_resources = ARRAY_SIZE(bfin_sir1_resources), - .resource = bfin_sir1_resources, -}; -#endif -#endif - -#if defined(CONFIG_I2C_BLACKFIN_TWI) || defined(CONFIG_I2C_BLACKFIN_TWI_MODULE) -static struct resource bfin_twi0_resource[] = { - [0] = { - .start = TWI0_REGBASE, - .end = TWI0_REGBASE + 0xFF, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = IRQ_TWI, - .end = IRQ_TWI, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device i2c_bfin_twi_device = { - .name = "i2c-bfin-twi", - .id = 0, - .num_resources = ARRAY_SIZE(bfin_twi0_resource), - .resource = bfin_twi0_resource, -}; -#endif - -#if defined(CONFIG_SERIAL_BFIN_SPORT) || defined(CONFIG_SERIAL_BFIN_SPORT_MODULE) -static struct platform_device bfin_sport0_uart_device = { - .name = "bfin-sport-uart", - .id = 0, -}; - -static struct platform_device bfin_sport1_uart_device = { - .name = "bfin-sport-uart", - .id = 1, -}; -#endif - -static struct platform_device *stamp_devices[] __initdata = { -#if defined(CONFIG_BFIN_CFPCMCIA) || defined(CONFIG_BFIN_CFPCMCIA_MODULE) - &bfin_pcmcia_cf_device, -#endif - -#if defined(CONFIG_RTC_DRV_BFIN) || defined(CONFIG_RTC_DRV_BFIN_MODULE) - &rtc_device, -#endif - -#if defined(CONFIG_USB_SL811_HCD) || defined(CONFIG_USB_SL811_HCD_MODULE) - &sl811_hcd_device, -#endif - -#if defined(CONFIG_USB_ISP1362_HCD) || defined(CONFIG_USB_ISP1362_HCD_MODULE) - &isp1362_hcd_device, -#endif - -#if defined(CONFIG_SMC91X) || defined(CONFIG_SMC91X_MODULE) - &smc91x_device, -#endif - -#if defined(CONFIG_DM9000) || defined(CONFIG_DM9000_MODULE) - &dm9000_device, -#endif - -#if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE) - &bfin_mii_bus, - &bfin_mac_device, -#endif - -#if defined(CONFIG_USB_NET2272) || defined(CONFIG_USB_NET2272_MODULE) - &net2272_bfin_device, -#endif - -#if defined(CONFIG_USB_ISP1760_HCD) || defined(CONFIG_USB_ISP1760_HCD_MODULE) - &bfin_isp1760_device, -#endif - -#if defined(CONFIG_SPI_BFIN) || defined(CONFIG_SPI_BFIN_MODULE) - &bfin_spi0_device, -#endif - -#if defined(CONFIG_FB_BF537_LQ035) || defined(CONFIG_FB_BF537_LQ035_MODULE) - &bfin_fb_device, -#endif - -#if defined(CONFIG_FB_BFIN_7393) || defined(CONFIG_FB_BFIN_7393_MODULE) - &bfin_fb_adv7393_device, -#endif - -#if defined(CONFIG_SERIAL_BFIN) || defined(CONFIG_SERIAL_BFIN_MODULE) - &bfin_uart_device, -#endif - -#if defined(CONFIG_BFIN_SIR) || defined(CONFIG_BFIN_SIR_MODULE) -#ifdef CONFIG_BFIN_SIR0 - &bfin_sir0_device, -#endif -#ifdef CONFIG_BFIN_SIR1 - &bfin_sir1_device, -#endif -#endif - -#if defined(CONFIG_I2C_BLACKFIN_TWI) || defined(CONFIG_I2C_BLACKFIN_TWI_MODULE) - &i2c_bfin_twi_device, -#endif - -#if defined(CONFIG_SERIAL_BFIN_SPORT) || defined(CONFIG_SERIAL_BFIN_SPORT_MODULE) - &bfin_sport0_uart_device, - &bfin_sport1_uart_device, -#endif -}; - -static int __init generic_init(void) -{ - printk(KERN_INFO "%s(): registering device resources\n", __func__); - platform_add_devices(stamp_devices, ARRAY_SIZE(stamp_devices)); -#if defined(CONFIG_SPI_BFIN) || defined(CONFIG_SPI_BFIN_MODULE) - spi_register_board_info(bfin_spi_board_info, - ARRAY_SIZE(bfin_spi_board_info)); -#endif - - return 0; -} - -arch_initcall(generic_init); - -void native_machine_restart(char *cmd) -{ - /* workaround reboot hang when booting from SPI */ - if ((bfin_read_SYSCR() & 0x7) == 0x3) - bfin_reset_boot_spi_cs(P_DEFAULT_BOOT_SPI_CS); -} - -#if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE) -void bfin_get_ether_addr(char *addr) -{ - random_ether_addr(addr); - printk(KERN_WARNING "%s:%s: Setting Ethernet MAC to a random one\n", __FILE__, __func__); -} -EXPORT_SYMBOL(bfin_get_ether_addr); -#endif diff --git a/arch/blackfin/mach-bf561/boards/Kconfig b/arch/blackfin/mach-bf561/boards/Kconfig index e41a67b1fb53..e4bc6d7c5a6a 100644 --- a/arch/blackfin/mach-bf561/boards/Kconfig +++ b/arch/blackfin/mach-bf561/boards/Kconfig @@ -19,9 +19,4 @@ config BFIN561_BLUETECHNIX_CM help CM-BF561 support for EVAL- and DEV-Board. -config GENERIC_BF561_BOARD - bool "Generic" - help - Generic or Custom board support. - endchoice diff --git a/arch/blackfin/mach-bf561/boards/Makefile b/arch/blackfin/mach-bf561/boards/Makefile index 04add010b568..3a152559e957 100644 --- a/arch/blackfin/mach-bf561/boards/Makefile +++ b/arch/blackfin/mach-bf561/boards/Makefile @@ -2,7 +2,6 @@ # arch/blackfin/mach-bf561/boards/Makefile # -obj-$(CONFIG_GENERIC_BF561_BOARD) += generic_board.o obj-$(CONFIG_BFIN561_BLUETECHNIX_CM) += cm_bf561.o obj-$(CONFIG_BFIN561_EZKIT) += ezkit.o obj-$(CONFIG_BFIN561_TEPLA) += tepla.o diff --git a/arch/blackfin/mach-bf561/boards/generic_board.c b/arch/blackfin/mach-bf561/boards/generic_board.c deleted file mode 100644 index 0ba366a0e696..000000000000 --- a/arch/blackfin/mach-bf561/boards/generic_board.c +++ /dev/null @@ -1,113 +0,0 @@ -/* - * File: arch/blackfin/mach-bf561/generic_board.c - * Based on: arch/blackfin/mach-bf533/ezkit.c - * Author: Aidan Williams - * - * Created: - * Description: - * - * Modified: - * Copyright 2005 National ICT Australia (NICTA) - * Copyright 2004-2006 Analog Devices Inc. - * - * Bugs: Enter bugs at http://blackfin.uclinux.org/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see the file COPYING, or write - * to the Free Software Foundation, Inc., - * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include -#include -#include - -const char bfin_board_name[] = "UNKNOWN BOARD"; - -/* - * Driver needs to know address, irq and flag pin. - */ -#if defined(CONFIG_SMC91X) || defined(CONFIG_SMC91X_MODULE) -static struct resource smc91x_resources[] = { - { - .start = 0x2C010300, - .end = 0x2C010300 + 16, - .flags = IORESOURCE_MEM, - }, { - .start = IRQ_PROG_INTB, - .end = IRQ_PROG_INTB, - .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, - }, { - .start = IRQ_PF9, - .end = IRQ_PF9, - .flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL, - }, -}; - -static struct platform_device smc91x_device = { - .name = "smc91x", - .id = 0, - .num_resources = ARRAY_SIZE(smc91x_resources), - .resource = smc91x_resources, -}; -#endif - -#if defined(CONFIG_BFIN_SIR) || defined(CONFIG_BFIN_SIR_MODULE) -#ifdef CONFIG_BFIN_SIR0 -static struct resource bfin_sir0_resources[] = { - { - .start = 0xFFC00400, - .end = 0xFFC004FF, - .flags = IORESOURCE_MEM, - }, - { - .start = IRQ_UART0_RX, - .end = IRQ_UART0_RX+1, - .flags = IORESOURCE_IRQ, - }, - { - .start = CH_UART0_RX, - .end = CH_UART0_RX+1, - .flags = IORESOURCE_DMA, - }, -}; - -static struct platform_device bfin_sir0_device = { - .name = "bfin_sir", - .id = 0, - .num_resources = ARRAY_SIZE(bfin_sir0_resources), - .resource = bfin_sir0_resources, -}; -#endif -#endif - -static struct platform_device *generic_board_devices[] __initdata = { -#if defined(CONFIG_SMC91X) || defined(CONFIG_SMC91X_MODULE) - &smc91x_device, -#endif - -#if defined(CONFIG_BFIN_SIR) || defined(CONFIG_BFIN_SIR_MODULE) -#ifdef CONFIG_BFIN_SIR0 - &bfin_sir0_device, -#endif -#endif -}; - -static int __init generic_board_init(void) -{ - printk(KERN_INFO "%s(): registering device resources\n", __func__); - return platform_add_devices(generic_board_devices, - ARRAY_SIZE(generic_board_devices)); -} - -arch_initcall(generic_board_init); From 28e4cf22a3707d05eb697b9b8ae6a4ed39c99b45 Mon Sep 17 00:00:00 2001 From: Sonic Zhang Date: Mon, 2 Mar 2009 18:04:24 +0800 Subject: [PATCH 085/580] Blackfin arch: Disable NAND option by default Signed-off-by: Sonic Zhang Signed-off-by: Bryan Wu --- arch/blackfin/configs/BF537-STAMP_defconfig | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/blackfin/configs/BF537-STAMP_defconfig b/arch/blackfin/configs/BF537-STAMP_defconfig index 4fb4108d3103..c61b6000a9eb 100644 --- a/arch/blackfin/configs/BF537-STAMP_defconfig +++ b/arch/blackfin/configs/BF537-STAMP_defconfig @@ -568,15 +568,7 @@ CONFIG_MTD_PHYSMAP_BANKWIDTH=2 # CONFIG_MTD_DOC2000 is not set # CONFIG_MTD_DOC2001 is not set # CONFIG_MTD_DOC2001PLUS is not set -CONFIG_MTD_NAND=m -# CONFIG_MTD_NAND_VERIFY_WRITE is not set -# CONFIG_MTD_NAND_ECC_SMC is not set -# CONFIG_MTD_NAND_MUSEUM_IDS is not set -# CONFIG_MTD_NAND_BFIN is not set -CONFIG_MTD_NAND_IDS=m -# CONFIG_MTD_NAND_DISKONCHIP is not set -# CONFIG_MTD_NAND_NANDSIM is not set -CONFIG_MTD_NAND_PLATFORM=m +# CONFIG_MTD_NAND is not set # CONFIG_MTD_ONENAND is not set # From 0f29456a21ae55c43b4e2a64611f778b1fbe4bdf Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Mon, 2 Mar 2009 18:06:13 +0800 Subject: [PATCH 086/580] Blackfin arch: Make IRQ_EPPIx_ERROR naming consistent Signed-off-by: Michael Hennerich Signed-off-by: Bryan Wu --- arch/blackfin/mach-bf548/include/mach/irq.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/blackfin/mach-bf548/include/mach/irq.h b/arch/blackfin/mach-bf548/include/mach/irq.h index 60299a71e090..f194625f6821 100644 --- a/arch/blackfin/mach-bf548/include/mach/irq.h +++ b/arch/blackfin/mach-bf548/include/mach/irq.h @@ -123,8 +123,8 @@ Events (highest priority) EMU 0 #define IRQ_MXVR_ERROR BFIN_IRQ(51) /* MXVR Status (Error) Interrupt */ #define IRQ_MXVR_MSG BFIN_IRQ(52) /* MXVR Message Interrupt */ #define IRQ_MXVR_PKT BFIN_IRQ(53) /* MXVR Packet Interrupt */ -#define IRQ_EPP1_ERROR BFIN_IRQ(54) /* EPPI1 Error Interrupt */ -#define IRQ_EPP2_ERROR BFIN_IRQ(55) /* EPPI2 Error Interrupt */ +#define IRQ_EPPI1_ERROR BFIN_IRQ(54) /* EPPI1 Error Interrupt */ +#define IRQ_EPPI2_ERROR BFIN_IRQ(55) /* EPPI2 Error Interrupt */ #define IRQ_UART3_ERROR BFIN_IRQ(56) /* UART3 Status (Error) Interrupt */ #define IRQ_HOST_ERROR BFIN_IRQ(57) /* HOST Status (Error) Interrupt */ #define IRQ_PIXC_ERROR BFIN_IRQ(59) /* PIXC Status (Error) Interrupt */ @@ -361,8 +361,8 @@ Events (highest priority) EMU 0 #define IRQ_UART2_ERR IRQ_UART2_ERROR #define IRQ_CAN0_ERR IRQ_CAN0_ERROR #define IRQ_MXVR_ERR IRQ_MXVR_ERROR -#define IRQ_EPP1_ERR IRQ_EPP1_ERROR -#define IRQ_EPP2_ERR IRQ_EPP2_ERROR +#define IRQ_EPPI1_ERR IRQ_EPPI1_ERROR +#define IRQ_EPPI2_ERR IRQ_EPPI2_ERROR #define IRQ_UART3_ERR IRQ_UART3_ERROR #define IRQ_HOST_ERR IRQ_HOST_ERROR #define IRQ_PIXC_ERR IRQ_PIXC_ERROR From 34d464f8aa3e762ec812a131bfd53ccb4f886f69 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 2 Mar 2009 18:14:47 +0800 Subject: [PATCH 087/580] Blackfin arch: use common KGDB_TESTS rather than our own KGDB_TESTCASE Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu --- arch/blackfin/Kconfig.debug | 6 ------ arch/blackfin/kernel/Makefile | 8 +++++--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/blackfin/Kconfig.debug b/arch/blackfin/Kconfig.debug index 5f981d9ca625..79e7e63ab709 100644 --- a/arch/blackfin/Kconfig.debug +++ b/arch/blackfin/Kconfig.debug @@ -21,12 +21,6 @@ config DEBUG_STACK_USAGE config HAVE_ARCH_KGDB def_bool y -config KGDB_TESTCASE - tristate "KGDB: for test case in expect" - default n - help - This is a kgdb test case for automated testing. - config DEBUG_VERBOSE bool "Verbose fault messages" default y diff --git a/arch/blackfin/kernel/Makefile b/arch/blackfin/kernel/Makefile index 4a92a86824b7..fd4d4328a0f2 100644 --- a/arch/blackfin/kernel/Makefile +++ b/arch/blackfin/kernel/Makefile @@ -15,13 +15,15 @@ else obj-y += time.o endif -CFLAGS_kgdb_test.o := -mlong-calls -O0 - obj-$(CONFIG_IPIPE) += ipipe.o obj-$(CONFIG_IPIPE_TRACE_MCOUNT) += mcount.o obj-$(CONFIG_BFIN_GPTIMERS) += gptimers.o obj-$(CONFIG_CPLB_INFO) += cplbinfo.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KGDB) += kgdb.o -obj-$(CONFIG_KGDB_TESTCASE) += kgdb_test.o +obj-$(CONFIG_KGDB_TESTS) += kgdb_test.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o + +# the kgdb test puts code into L2 and without linker +# relaxation, we need to force long calls to/from it +CFLAGS_kgdb_test.o := -mlong-calls -O0 From e84dcaa18b2785d8ab20a7cb25612d89feb89fa7 Mon Sep 17 00:00:00 2001 From: Bernd Schmidt Date: Mon, 2 Mar 2009 18:37:48 +0800 Subject: [PATCH 088/580] Blackfin arch: fix bug - jump_to_zero test case failed on noMPU kernel The nompu code is now derived from the mpu code, and had the same problem - no null pointer detection on ICPLBs. Signed-off-by: Bernd Schmidt Cc: Mike Frysinger Signed-off-by: Bryan Wu --- arch/blackfin/kernel/cplb-nompu/cplbinit.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/blackfin/kernel/cplb-nompu/cplbinit.c b/arch/blackfin/kernel/cplb-nompu/cplbinit.c index 0e28f7595733..d6c067782e63 100644 --- a/arch/blackfin/kernel/cplb-nompu/cplbinit.c +++ b/arch/blackfin/kernel/cplb-nompu/cplbinit.c @@ -53,9 +53,13 @@ void __init generate_cplb_tables_cpu(unsigned int cpu) i_d = i_i = 0; +#ifdef CONFIG_DEBUG_HUNT_FOR_ZERO /* Set up the zero page. */ d_tbl[i_d].addr = 0; d_tbl[i_d++].data = SDRAM_OOPS | PAGE_SIZE_1KB; + i_tbl[i_i].addr = 0; + i_tbl[i_i++].data = SDRAM_OOPS | PAGE_SIZE_1KB; +#endif /* Cover kernel memory with 4M pages. */ addr = 0; From 9976b39b5031bbf76f715893cf080b6a17683881 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Feb 2009 09:19:26 -0800 Subject: [PATCH 089/580] xen: deal with virtually mapped percpu data The virtually mapped percpu space causes us two problems: - for hypercalls which take an mfn, we need to do a full pagetable walk to convert the percpu va into an mfn, and - when a hypercall requires a page to be mapped RO via all its aliases, we need to make sure its RO in both the percpu mapping and in the linear mapping This primarily affects the gdt and the vcpu info structure. Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Cc: Gerd Hoffmann Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/include/asm/xen/page.h | 1 + arch/x86/xen/enlighten.c | 10 ++++++---- arch/x86/xen/mmu.c | 7 +++++++ arch/x86/xen/smp.c | 8 ++++++-- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 4bd990ee43df..1a918dde46b5 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x) xmaddr_t arbitrary_virt_to_machine(void *address); +unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 86497d5f44cd..352ea6830659 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu) vcpup = &per_cpu(xen_vcpu_info, cpu); - info.mfn = virt_to_mfn(vcpup); + info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", @@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr) frames = mcs.args; for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - frames[f] = virt_to_mfn(va); + frames[f] = arbitrary_virt_to_mfn((void *)va); + make_lowmem_page_readonly((void *)va); + make_lowmem_page_readonly(mfn_to_virt(frames[f])); } MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); @@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); struct multicall_space mc = __xen_mc_entry(0); MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); @@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, break; default: { - xmaddr_t maddr = virt_to_machine(&dt[entry]); + xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); xen_mc_flush(); if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 319bd40a57c2..cb6afa4ec95c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) p2m_top[topidx][idx] = mfn; } +unsigned long arbitrary_virt_to_mfn(void *vaddr) +{ + xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); + + return PFN_DOWN(maddr.maddr); +} + xmaddr_t arbitrary_virt_to_machine(void *vaddr) { unsigned long address = (unsigned long)vaddr; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 035582ae815d..8d470562ffc9 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; struct desc_struct *gdt; + unsigned long gdt_mfn; if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; @@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->ldt_ents = 0; BUG_ON((unsigned long)gdt & ~PAGE_MASK); - make_lowmem_page_readonly(gdt); - ctxt->gdt_frames[0] = virt_to_mfn(gdt); + gdt_mfn = arbitrary_virt_to_mfn(gdt); + make_lowmem_page_readonly(gdt); + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); + + ctxt->gdt_frames[0] = gdt_mfn; ctxt->gdt_ents = GDT_ENTRIES; ctxt->user_regs.cs = __KERNEL_CS; From d1dd524785e30cf3d64d395d829b207376acb0aa Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Mon, 2 Mar 2009 06:46:50 +0000 Subject: [PATCH 090/580] sctp: fix crash during module unload An extra list_del() during the module load failure and unload resulted in a crash with a list corruption. Now sctp can be unloaded again. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/protocol.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index b78e3be69013..4e6638449639 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1322,9 +1322,8 @@ SCTP_STATIC __init int sctp_init(void) out: return status; err_v6_add_protocol: - sctp_v6_del_protocol(); -err_add_protocol: sctp_v4_del_protocol(); +err_add_protocol: inet_ctl_sock_destroy(sctp_ctl_sock); err_ctl_sock_init: sctp_v6_protosw_exit(); @@ -1335,7 +1334,6 @@ SCTP_STATIC __init int sctp_init(void) sctp_v4_pf_exit(); sctp_v6_pf_exit(); sctp_sysctl_unregister(); - list_del(&sctp_af_inet.list); free_pages((unsigned long)sctp_port_hashtable, get_order(sctp_port_hashsize * sizeof(struct sctp_bind_hashbucket))); @@ -1383,7 +1381,6 @@ SCTP_STATIC __exit void sctp_exit(void) sctp_v4_pf_exit(); sctp_sysctl_unregister(); - list_del(&sctp_af_inet.list); free_pages((unsigned long)sctp_assoc_hashtable, get_order(sctp_assoc_hashsize * From 3df2678737974accf437dad11e584c1871a3ede3 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 2 Mar 2009 06:46:51 +0000 Subject: [PATCH 091/580] sctp: fix kernel panic with ERROR chunk containing too many error causes If ERROR chunk is received with too many error causes in ESTABLISHED state, the kernel get panic. This is because sctp limit the max length of cmds to 14, but while ERROR chunk is received, one error cause will add around 2 cmds by sctp_add_cmd_sf(). So many error causes will fill the limit of cmds and panic. This patch fixed the problem. This bug can be test by SCTP Conformance Test Suite . Signed-off-by: Wei Yongjun Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/sm_sideeffect.c | 54 ++++++++++++++++++++++++---------------- net/sctp/sm_statefuns.c | 16 ++---------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index e1d6076b4f59..b5495aecab60 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -787,36 +787,48 @@ static void sctp_cmd_process_operr(sctp_cmd_seq_t *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { - struct sctp_operr_chunk *operr_chunk; struct sctp_errhdr *err_hdr; + struct sctp_ulpevent *ev; - operr_chunk = (struct sctp_operr_chunk *)chunk->chunk_hdr; - err_hdr = &operr_chunk->err_hdr; + while (chunk->chunk_end > chunk->skb->data) { + err_hdr = (struct sctp_errhdr *)(chunk->skb->data); - switch (err_hdr->cause) { - case SCTP_ERROR_UNKNOWN_CHUNK: - { - struct sctp_chunkhdr *unk_chunk_hdr; + ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0, + GFP_ATOMIC); + if (!ev) + return; - unk_chunk_hdr = (struct sctp_chunkhdr *)err_hdr->variable; - switch (unk_chunk_hdr->type) { - /* ADDIP 4.1 A9) If the peer responds to an ASCONF with an - * ERROR chunk reporting that it did not recognized the ASCONF - * chunk type, the sender of the ASCONF MUST NOT send any - * further ASCONF chunks and MUST stop its T-4 timer. - */ - case SCTP_CID_ASCONF: - asoc->peer.asconf_capable = 0; - sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP, + sctp_ulpq_tail_event(&asoc->ulpq, ev); + + switch (err_hdr->cause) { + case SCTP_ERROR_UNKNOWN_CHUNK: + { + sctp_chunkhdr_t *unk_chunk_hdr; + + unk_chunk_hdr = (sctp_chunkhdr_t *)err_hdr->variable; + switch (unk_chunk_hdr->type) { + /* ADDIP 4.1 A9) If the peer responds to an ASCONF with + * an ERROR chunk reporting that it did not recognized + * the ASCONF chunk type, the sender of the ASCONF MUST + * NOT send any further ASCONF chunks and MUST stop its + * T-4 timer. + */ + case SCTP_CID_ASCONF: + if (asoc->peer.asconf_capable == 0) + break; + + asoc->peer.asconf_capable = 0; + sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + break; + default: + break; + } break; + } default: break; } - break; - } - default: - break; } } diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 3a0cd075914f..f88dfded0e3a 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3163,7 +3163,6 @@ sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; - struct sctp_ulpevent *ev; if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(ep, asoc, type, arg, commands); @@ -3173,21 +3172,10 @@ sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep, return sctp_sf_violation_chunklen(ep, asoc, type, arg, commands); - while (chunk->chunk_end > chunk->skb->data) { - ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0, - GFP_ATOMIC); - if (!ev) - goto nomem; + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR, + SCTP_CHUNK(chunk)); - sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, - SCTP_ULPEVENT(ev)); - sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR, - SCTP_CHUNK(chunk)); - } return SCTP_DISPOSITION_CONSUME; - -nomem: - return SCTP_DISPOSITION_NOMEM; } /* From 07555c9880da3e2e96e5eae00a03b44cc076deaf Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 2 Mar 2009 22:29:37 -0800 Subject: [PATCH 092/580] OMAP: enable smc911x support for LDP platform The following patch enables SMC911x support to work on the OMAP LDP board. Although the SMC911x driver will eventually be obsoleted, the smsc911x patches are rather invasive for the -rc kernels. Rather than risk destablising smsc911x, this simpler patch is preferred to allow the network interface to work. Signed-off-by: Russell King Acked-by: Tony Lindgren Signed-off-by: David S. Miller --- arch/arm/mach-omap2/board-ldp.c | 2 +- drivers/net/smc911x.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/arm/mach-omap2/board-ldp.c b/arch/arm/mach-omap2/board-ldp.c index f6a13451d1fd..6031e179926b 100644 --- a/arch/arm/mach-omap2/board-ldp.c +++ b/arch/arm/mach-omap2/board-ldp.c @@ -81,7 +81,7 @@ static inline void __init ldp_init_smc911x(void) } ldp_smc911x_resources[0].start = cs_mem_base + 0x0; - ldp_smc911x_resources[0].end = cs_mem_base + 0xf; + ldp_smc911x_resources[0].end = cs_mem_base + 0xff; udelay(100); eth_gpio = LDP_SMC911X_GPIO; diff --git a/drivers/net/smc911x.h b/drivers/net/smc911x.h index 870b4c33f108..a45952e72018 100644 --- a/drivers/net/smc911x.h +++ b/drivers/net/smc911x.h @@ -42,6 +42,16 @@ #define SMC_USE_16BIT 0 #define SMC_USE_32BIT 1 #define SMC_IRQ_SENSE IRQF_TRIGGER_LOW +#elif defined(CONFIG_ARCH_OMAP34XX) + #define SMC_USE_16BIT 0 + #define SMC_USE_32BIT 1 + #define SMC_IRQ_SENSE IRQF_TRIGGER_LOW + #define SMC_MEM_RESERVED 1 +#elif defined(CONFIG_ARCH_OMAP24XX) + #define SMC_USE_16BIT 0 + #define SMC_USE_32BIT 1 + #define SMC_IRQ_SENSE IRQF_TRIGGER_LOW + #define SMC_MEM_RESERVED 1 #else /* * Default configuration @@ -675,6 +685,7 @@ smc_pxa_dma_outsl(struct smc911x_local *lp, u_long physaddr, #define CHIP_9116 0x0116 #define CHIP_9117 0x0117 #define CHIP_9118 0x0118 +#define CHIP_9211 0x9211 #define CHIP_9215 0x115A #define CHIP_9217 0x117A #define CHIP_9218 0x118A @@ -689,6 +700,7 @@ static const struct chip_id chip_ids[] = { { CHIP_9116, "LAN9116" }, { CHIP_9117, "LAN9117" }, { CHIP_9118, "LAN9118" }, + { CHIP_9211, "LAN9211" }, { CHIP_9215, "LAN9215" }, { CHIP_9217, "LAN9217" }, { CHIP_9218, "LAN9218" }, From 5a5990d3090b03745a9548a6f5edef02095675cf Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 26 Feb 2009 06:49:24 +0000 Subject: [PATCH 093/580] net: Avoid race between network down and sysfs Signed-off-by: Stephen Hemminger Acked-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 6ac29a46e23e..484f58750eba 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -77,7 +77,9 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, if (endp == buf) goto err; - rtnl_lock(); + if (!rtnl_trylock()) + return -ERESTARTSYS; + if (dev_isalive(net)) { if ((ret = (*set)(net, new)) == 0) ret = len; From b325fddb7f869e6c95a88dc6573220f162e5b89f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 26 Feb 2009 06:55:31 +0000 Subject: [PATCH 094/580] ipv6: Fix sysctl unregistration deadlock Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f9afb452249c..40fabdee42ae 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -493,15 +493,17 @@ static void addrconf_forward_change(struct net *net, __s32 newf) read_unlock(&dev_base_lock); } -static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) +static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) { struct net *net; net = (struct net *)table->extra2; if (p == &net->ipv6.devconf_dflt->forwarding) - return; + return 0; + + if (!rtnl_trylock()) + return -ERESTARTSYS; - rtnl_lock(); if (p == &net->ipv6.devconf_all->forwarding) { __s32 newf = net->ipv6.devconf_all->forwarding; net->ipv6.devconf_dflt->forwarding = newf; @@ -512,6 +514,7 @@ static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) if (*p) rt6_purge_dflt_routers(net); + return 1; } #endif @@ -3983,7 +3986,7 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp, ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (write) - addrconf_fixup_forwarding(ctl, valp, val); + ret = addrconf_fixup_forwarding(ctl, valp, val); return ret; } @@ -4019,8 +4022,7 @@ static int addrconf_sysctl_forward_strategy(ctl_table *table, } *valp = new; - addrconf_fixup_forwarding(table, valp, val); - return 1; + return addrconf_fixup_forwarding(table, valp, val); } static struct addrconf_sysctl_table From ee554be9ddcb666445b6765d8b5cdbfe80a1e9cf Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 3 Mar 2009 16:52:55 +0800 Subject: [PATCH 095/580] Blackfin arch: fix compile failure when missing the anomaly definition make sure ANOMALY_05000278/ANOMALY_05000380 is defined for all parts Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu --- arch/blackfin/mach-bf518/include/mach/anomaly.h | 2 ++ arch/blackfin/mach-bf527/include/mach/anomaly.h | 1 + arch/blackfin/mach-bf533/include/mach/anomaly.h | 1 + arch/blackfin/mach-bf537/include/mach/anomaly.h | 1 + arch/blackfin/mach-bf538/include/mach/anomaly.h | 1 + arch/blackfin/mach-bf548/include/mach/anomaly.h | 1 + arch/blackfin/mach-bf561/include/mach/anomaly.h | 1 + 7 files changed, 8 insertions(+) diff --git a/arch/blackfin/mach-bf518/include/mach/anomaly.h b/arch/blackfin/mach-bf518/include/mach/anomaly.h index e5b4bef0edae..6e1670072829 100644 --- a/arch/blackfin/mach-bf518/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf518/include/mach/anomaly.h @@ -65,6 +65,7 @@ #define ANOMALY_05000263 (0) #define ANOMALY_05000266 (0) #define ANOMALY_05000273 (0) +#define ANOMALY_05000278 (0) #define ANOMALY_05000285 (0) #define ANOMALY_05000307 (0) #define ANOMALY_05000311 (0) @@ -72,6 +73,7 @@ #define ANOMALY_05000323 (0) #define ANOMALY_05000353 (0) #define ANOMALY_05000363 (0) +#define ANOMALY_05000380 (0) #define ANOMALY_05000386 (0) #define ANOMALY_05000412 (0) #define ANOMALY_05000432 (0) diff --git a/arch/blackfin/mach-bf527/include/mach/anomaly.h b/arch/blackfin/mach-bf527/include/mach/anomaly.h index 035e8d835058..fdc87839af60 100644 --- a/arch/blackfin/mach-bf527/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf527/include/mach/anomaly.h @@ -167,6 +167,7 @@ #define ANOMALY_05000263 (0) #define ANOMALY_05000266 (0) #define ANOMALY_05000273 (0) +#define ANOMALY_05000278 (0) #define ANOMALY_05000285 (0) #define ANOMALY_05000307 (0) #define ANOMALY_05000311 (0) diff --git a/arch/blackfin/mach-bf533/include/mach/anomaly.h b/arch/blackfin/mach-bf533/include/mach/anomaly.h index 0d3a03429fb9..657dd1820a93 100644 --- a/arch/blackfin/mach-bf533/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf533/include/mach/anomaly.h @@ -278,6 +278,7 @@ #define ANOMALY_05000266 (0) #define ANOMALY_05000323 (0) #define ANOMALY_05000353 (1) +#define ANOMALY_05000380 (0) #define ANOMALY_05000386 (1) #define ANOMALY_05000412 (0) #define ANOMALY_05000432 (0) diff --git a/arch/blackfin/mach-bf537/include/mach/anomaly.h b/arch/blackfin/mach-bf537/include/mach/anomaly.h index 9cb39121d1cb..fe1ae4c75cd6 100644 --- a/arch/blackfin/mach-bf537/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf537/include/mach/anomaly.h @@ -168,6 +168,7 @@ #define ANOMALY_05000323 (0) #define ANOMALY_05000353 (1) #define ANOMALY_05000363 (0) +#define ANOMALY_05000380 (0) #define ANOMALY_05000386 (1) #define ANOMALY_05000412 (0) #define ANOMALY_05000432 (0) diff --git a/arch/blackfin/mach-bf538/include/mach/anomaly.h b/arch/blackfin/mach-bf538/include/mach/anomaly.h index e130b4f8a05d..56ea454709ca 100644 --- a/arch/blackfin/mach-bf538/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf538/include/mach/anomaly.h @@ -124,6 +124,7 @@ #define ANOMALY_05000323 (0) #define ANOMALY_05000353 (1) #define ANOMALY_05000363 (0) +#define ANOMALY_05000380 (0) #define ANOMALY_05000386 (1) #define ANOMALY_05000412 (0) #define ANOMALY_05000432 (0) diff --git a/arch/blackfin/mach-bf548/include/mach/anomaly.h b/arch/blackfin/mach-bf548/include/mach/anomaly.h index 23d03c52f4b4..d9d10a73c1dc 100644 --- a/arch/blackfin/mach-bf548/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf548/include/mach/anomaly.h @@ -171,6 +171,7 @@ #define ANOMALY_05000263 (0) #define ANOMALY_05000266 (0) #define ANOMALY_05000273 (0) +#define ANOMALY_05000278 (0) #define ANOMALY_05000307 (0) #define ANOMALY_05000311 (0) #define ANOMALY_05000323 (0) diff --git a/arch/blackfin/mach-bf561/include/mach/anomaly.h b/arch/blackfin/mach-bf561/include/mach/anomaly.h index 1a9e17562821..24d3a2264e44 100644 --- a/arch/blackfin/mach-bf561/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf561/include/mach/anomaly.h @@ -283,6 +283,7 @@ #define ANOMALY_05000273 (0) #define ANOMALY_05000311 (0) #define ANOMALY_05000353 (1) +#define ANOMALY_05000380 (0) #define ANOMALY_05000386 (1) #define ANOMALY_05000432 (0) #define ANOMALY_05000435 (0) From 176c39af29bc4edaf37f663553eeaacd47b5bc9c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 3 Mar 2009 01:06:45 -0800 Subject: [PATCH 096/580] netns: fix addrconf_ifdown kernel panic When a network namespace is destroyed the network interfaces are all unregistered, making addrconf_ifdown called by the netdevice notifier. In the other hand, the addrconf exit method does a loop on the network devices and does addrconf_ifdown on each of them. But the ordering of the netns subsystem is not right because it uses the register_pernet_device instead of register_pernet_subsys. If we handle the loopback as any network device, we can safely use register_pernet_subsys. But if we use register_pernet_subsys, the addrconf exit method will do exactly what was already done with the unregistering of the network devices. So in definitive, this code is pointless. I removed the netns addrconf exit method and moved the code to the addrconf cleanup function. Signed-off-by: Daniel Lezcano Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 39 +++++++++------------------------------ 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 40fabdee42ae..1220e2c7831e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2611,9 +2611,6 @@ static int addrconf_ifdown(struct net_device *dev, int how) ASSERT_RTNL(); - if ((dev->flags & IFF_LOOPBACK) && how == 1) - how = 0; - rt6_ifdown(net, dev); neigh_ifdown(&nd_tbl, dev); @@ -4448,25 +4445,6 @@ int unregister_inet6addr_notifier(struct notifier_block *nb) EXPORT_SYMBOL(unregister_inet6addr_notifier); -static void addrconf_net_exit(struct net *net) -{ - struct net_device *dev; - - rtnl_lock(); - /* clean dev list */ - for_each_netdev(net, dev) { - if (__in6_dev_get(dev) == NULL) - continue; - addrconf_ifdown(dev, 1); - } - addrconf_ifdown(net->loopback_dev, 2); - rtnl_unlock(); -} - -static struct pernet_operations addrconf_net_ops = { - .exit = addrconf_net_exit, -}; - /* * Init / cleanup code */ @@ -4508,10 +4486,6 @@ int __init addrconf_init(void) if (err) goto errlo; - err = register_pernet_device(&addrconf_net_ops); - if (err) - return err; - register_netdevice_notifier(&ipv6_dev_notf); addrconf_verify(0); @@ -4541,15 +4515,22 @@ int __init addrconf_init(void) void addrconf_cleanup(void) { struct inet6_ifaddr *ifa; + struct net_device *dev; int i; unregister_netdevice_notifier(&ipv6_dev_notf); - unregister_pernet_device(&addrconf_net_ops); - unregister_pernet_subsys(&addrconf_ops); rtnl_lock(); + /* clean dev list */ + for_each_netdev(&init_net, dev) { + if (__in6_dev_get(dev) == NULL) + continue; + addrconf_ifdown(dev, 1); + } + addrconf_ifdown(init_net.loopback_dev, 2); + /* * Check hash table. */ @@ -4570,6 +4551,4 @@ void addrconf_cleanup(void) del_timer(&addr_chk_timer); rtnl_unlock(); - - unregister_pernet_subsys(&addrconf_net_ops); } From 6eb0777228f31932fc941eafe8b08848466630a1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 22 Feb 2009 00:09:14 -0800 Subject: [PATCH 097/580] netns: Fix icmp shutdown. Recently I had a kernel panic in icmp_send during a network namespace cleanup. There were packets in the arp queue that failed to be sent and we attempted to generate an ICMP host unreachable message, but failed because icmp_sk_exit had already been called. The network devices are removed from a network namespace and their arp queues are flushed before we do attempt to shutdown subsystems so this error should have been impossible. It turns out icmp_init is using register_pernet_device instead of register_pernet_subsys. Which resulted in icmp being shut down while we still had the possibility of packets in flight, making a nasty NULL pointer deference in interrupt context possible. Changing this to register_pernet_subsys fixes the problem in my testing. Signed-off-by: Eric W. Biederman Acked-by: Denis V. Lunev Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 705b33b184a3..fc562d29cc46 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1205,7 +1205,7 @@ static struct pernet_operations __net_initdata icmp_sk_ops = { int __init icmp_init(void) { - return register_pernet_device(&icmp_sk_ops); + return register_pernet_subsys(&icmp_sk_ops); } EXPORT_SYMBOL(icmp_err_convert); From 2f20d2e667ab1ca44cde5fb361386dff5bb6081d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 22 Feb 2009 00:10:18 -0800 Subject: [PATCH 098/580] tcp: Like icmp use register_pernet_subsys To remove the possibility of packets flying around when network devices are being cleaned up use reisger_pernet_subsys instead of register_pernet_device. Signed-off-by: Eric W. Biederman Acked-by: Denis V. Lunev Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 19d7b429a262..cf74c416831a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2443,7 +2443,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { void __init tcp_v4_init(void) { inet_hashinfo_init(&tcp_hashinfo); - if (register_pernet_device(&tcp_sk_ops)) + if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); } From 17edde520927070a6bf14a6a75027c0b843443e5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 22 Feb 2009 00:11:09 -0800 Subject: [PATCH 099/580] netns: Remove net_alive It turns out that net_alive is unnecessary, and the original problem that led to it being added was simply that the icmp code thought it was a network device and wound up being unable to handle packets while there were still packets in the network namespace. Now that icmp and tcp have been fixed to properly register themselves this problem is no longer present and we have a stronger guarantee that packets will not arrive in a network namespace then that provided by net_alive in netif_receive_skb. So remove net_alive allowing packet reception run a little faster. Additionally document the strong reason why network namespace cleanup is safe so that if something happens again someone else will have a chance of figuring it out. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/net/net_namespace.h | 27 +++++++++++++++++---------- net/core/dev.c | 6 ------ net/core/net_namespace.c | 3 --- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 6fc13d905c5f..ded434b032a4 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -109,11 +109,6 @@ extern struct list_head net_namespace_list; #ifdef CONFIG_NET_NS extern void __put_net(struct net *net); -static inline int net_alive(struct net *net) -{ - return net && atomic_read(&net->count); -} - static inline struct net *get_net(struct net *net) { atomic_inc(&net->count); @@ -145,11 +140,6 @@ int net_eq(const struct net *net1, const struct net *net2) } #else -static inline int net_alive(struct net *net) -{ - return 1; -} - static inline struct net *get_net(struct net *net) { return net; @@ -234,6 +224,23 @@ struct pernet_operations { void (*exit)(struct net *net); }; +/* + * Use these carefully. If you implement a network device and it + * needs per network namespace operations use device pernet operations, + * otherwise use pernet subsys operations. + * + * This is critically important. Most of the network code cleanup + * runs with the assumption that dev_remove_pack has been called so no + * new packets will arrive during and after the cleanup functions have + * been called. dev_remove_pack is not per namespace so instead the + * guarantee of no more packets arriving in a network namespace is + * provided by ensuring that all network devices and all sockets have + * left the network namespace before the cleanup methods are called. + * + * For the longest time the ipv4 icmp code was registered as a pernet + * device which caused kernel oops, and panics during network + * namespace cleanup. So please don't get this wrong. + */ extern int register_pernet_subsys(struct pernet_operations *); extern void unregister_pernet_subsys(struct pernet_operations *); extern int register_pernet_gen_subsys(int *id, struct pernet_operations *); diff --git a/net/core/dev.c b/net/core/dev.c index 72b0d26fd46d..9e4afe650e7a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2267,12 +2267,6 @@ int netif_receive_skb(struct sk_buff *skb) rcu_read_lock(); - /* Don't receive packets in an exiting network namespace */ - if (!net_alive(dev_net(skb->dev))) { - kfree_skb(skb); - goto out; - } - #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 2adb1a7d361f..e3bebd36f053 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -157,9 +157,6 @@ static void cleanup_net(struct work_struct *work) struct pernet_operations *ops; struct net *net; - /* Be very certain incoming network packets will not find us */ - rcu_barrier(); - net = container_of(work, struct net, work); mutex_lock(&net_mutex); From 97d4b35fb44cd5a80bc10952e2b1de5d3a14117b Mon Sep 17 00:00:00 2001 From: Tom Parker Date: Tue, 3 Mar 2009 17:59:39 +0800 Subject: [PATCH 100/580] Blackfin arch: fix bug - Error if one serial has hardware flow control and the other doesn't I have a system where UART0 is configured with hardware flow control, but UART1 doesn't have it enabled. Attempting to access UART1 in this configuration results in the following error in dmesg: <3>bfin-gpio: GPIO 0 is already reserved as Peripheral by bfin-uart ! <5>Stack from 0082bc7c: <5> 0082bc88 00404dd6 00000003 00000000 0054051e 004079da 0082bcb4 00000000 <5> 00000003 00000000 0052686c 0113f2a0 005fa3f0 00000032 20515249 00003035 <5> 00427228 00526e50 0113f2e0 005fa3f0 00000032 0113f2e0 0054b748 0000ffff <5> 22222222 22222222 004e1628 00427304 00000000 00000032 00000023 0054b748 <5> 00487a94 0054b7e8 0054b748 0000000b 00487fb8 0054b748 0054b748 00000001 <5> 0000000a 005fa3f0 009d4fe8 0101e3c0 0054b748 005fa3f0 0050b134 0054b748 <5> <5>Call Trace: <4>[<00485c16>] _uart_startup+0x56/0x178 <4>[<004865c8>] _uart_open+0x40/0x3e0 <4>[<0048661c>] _uart_open+0x94/0x3e0 <4>[<0047f1ce>] _init_dev+0x1fa/0x450 <4>[<004e1628>] ___mutex_unlock_slowpath+0x30/0xe8 <4>[<004815da>] _tty_open+0xf6/0x21c <4>[<0043dab0>] ___path_lookup_intent_open+0x34/0x7c <4>[<004375e4>] _chrdev_open+0x7c/0x134 <4>[<0043dc2c>] _open_namei+0x60/0x568 <4>[<00433fa2>] ___dentry_open+0x9e/0x188 <4>[<00437568>] _chrdev_open+0x0/0x134 <4>[<0043410c>] _nameidata_to_filp+0x30/0x3c <4>[<00434152>] _do_filp_open+0x3a/0x44 <4>[<00408826>] _task_running_tick+0x102/0x278 <4>[<0043418e>] _do_sys_open+0x32/0xac <4>[<0043ede4>] _sys_ioctl+0x28/0x50 <4>[<0043edbc>] _sys_ioctl+0x0/0x50 <4>[<00434224>] _sys_open+0x18/0x20 <4>[<0043420c>] _sys_open+0x0/0x20 <4>[<00418174>] _sys_setuid+0x0/0xc8 This is because the #ifdef's in bfin_serial_5xx.h are messed up. More specifically, they add/remove the uart_{rts,cts}_pin fields in bfin_serial_resources based on whether the particular port has rts/cts enabled, as opposed to when either port has it enabled. This patch fixed this. Signed-off-by: Tom Parker Signed-off-by: Sonic Zhang Signed-off-by: Bryan Wu --- .../mach-bf518/include/mach/bfin_serial_5xx.h | 4 ++-- .../mach-bf527/include/mach/bfin_serial_5xx.h | 4 ++-- .../mach-bf533/include/mach/bfin_serial_5xx.h | 2 +- .../mach-bf537/include/mach/bfin_serial_5xx.h | 4 ++-- .../mach-bf538/include/mach/bfin_serial_5xx.h | 4 ++-- .../mach-bf548/include/mach/bfin_serial_5xx.h | 22 +++++++++++++------ .../mach-bf561/include/mach/bfin_serial_5xx.h | 2 +- 7 files changed, 25 insertions(+), 17 deletions(-) diff --git a/arch/blackfin/mach-bf518/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf518/include/mach/bfin_serial_5xx.h index b50a63b975a2..e21c1c3e4ec7 100644 --- a/arch/blackfin/mach-bf518/include/mach/bfin_serial_5xx.h +++ b/arch/blackfin/mach-bf518/include/mach/bfin_serial_5xx.h @@ -144,7 +144,7 @@ struct bfin_serial_res bfin_serial_resource[] = { CH_UART0_TX, CH_UART0_RX, #endif -#ifdef CONFIG_BFIN_UART0_CTSRTS +#ifdef CONFIG_SERIAL_BFIN_CTSRTS CONFIG_UART0_CTS_PIN, CONFIG_UART0_RTS_PIN, #endif @@ -158,7 +158,7 @@ struct bfin_serial_res bfin_serial_resource[] = { CH_UART1_TX, CH_UART1_RX, #endif -#ifdef CONFIG_BFIN_UART1_CTSRTS +#ifdef CONFIG_SERIAL_BFIN_CTSRTS CONFIG_UART1_CTS_PIN, CONFIG_UART1_RTS_PIN, #endif diff --git a/arch/blackfin/mach-bf527/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf527/include/mach/bfin_serial_5xx.h index 75722d6008b0..e8c41fd842b5 100644 --- a/arch/blackfin/mach-bf527/include/mach/bfin_serial_5xx.h +++ b/arch/blackfin/mach-bf527/include/mach/bfin_serial_5xx.h @@ -144,7 +144,7 @@ struct bfin_serial_res bfin_serial_resource[] = { CH_UART0_TX, CH_UART0_RX, #endif -#ifdef CONFIG_BFIN_UART0_CTSRTS +#ifdef CONFIG_SERIAL_BFIN_CTSRTS CONFIG_UART0_CTS_PIN, CONFIG_UART0_RTS_PIN, #endif @@ -158,7 +158,7 @@ struct bfin_serial_res bfin_serial_resource[] = { CH_UART1_TX, CH_UART1_RX, #endif -#ifdef CONFIG_BFIN_UART1_CTSRTS +#ifdef CONFIG_SERIAL_BFIN_CTSRTS CONFIG_UART1_CTS_PIN, CONFIG_UART1_RTS_PIN, #endif diff --git a/arch/blackfin/mach-bf533/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf533/include/mach/bfin_serial_5xx.h index f3d9e495230c..5f517f53b0fd 100644 --- a/arch/blackfin/mach-bf533/include/mach/bfin_serial_5xx.h +++ b/arch/blackfin/mach-bf533/include/mach/bfin_serial_5xx.h @@ -134,7 +134,7 @@ struct bfin_serial_res bfin_serial_resource[] = { CH_UART_TX, CH_UART_RX, #endif -#ifdef CONFIG_BFIN_UART0_CTSRTS +#ifdef CONFIG_SERIAL_BFIN_CTSRTS CONFIG_UART0_CTS_PIN, CONFIG_UART0_RTS_PIN, #endif diff --git a/arch/blackfin/mach-bf537/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf537/include/mach/bfin_serial_5xx.h index b3f87e1d16a2..9e34700844a2 100644 --- a/arch/blackfin/mach-bf537/include/mach/bfin_serial_5xx.h +++ b/arch/blackfin/mach-bf537/include/mach/bfin_serial_5xx.h @@ -144,7 +144,7 @@ struct bfin_serial_res bfin_serial_resource[] = { CH_UART0_TX, CH_UART0_RX, #endif -#ifdef CONFIG_BFIN_UART0_CTSRTS +#ifdef CONFIG_SERIAL_BFIN_CTSRTS CONFIG_UART0_CTS_PIN, CONFIG_UART0_RTS_PIN, #endif @@ -158,7 +158,7 @@ struct bfin_serial_res bfin_serial_resource[] = { CH_UART1_TX, CH_UART1_RX, #endif -#ifdef CONFIG_BFIN_UART1_CTSRTS +#ifdef CONFIG_SERIAL_BFIN_CTSRTS CONFIG_UART1_CTS_PIN, CONFIG_UART1_RTS_PIN, #endif diff --git a/arch/blackfin/mach-bf538/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf538/include/mach/bfin_serial_5xx.h index 40503b6b89a3..3c2811ebecdd 100644 --- a/arch/blackfin/mach-bf538/include/mach/bfin_serial_5xx.h +++ b/arch/blackfin/mach-bf538/include/mach/bfin_serial_5xx.h @@ -144,7 +144,7 @@ struct bfin_serial_res bfin_serial_resource[] = { CH_UART0_TX, CH_UART0_RX, #endif -#ifdef CONFIG_BFIN_UART0_CTSRTS +#ifdef CONFIG_SERIAL_BFIN_CTSRTS CONFIG_UART0_CTS_PIN, CONFIG_UART0_RTS_PIN, #endif @@ -158,7 +158,7 @@ struct bfin_serial_res bfin_serial_resource[] = { CH_UART1_TX, CH_UART1_RX, #endif -#ifdef CONFIG_BFIN_UART1_CTSRTS +#ifdef CONFIG_SERIAL_BFIN_CTSRTS CONFIG_UART1_CTS_PIN, CONFIG_UART1_RTS_PIN, #endif diff --git a/arch/blackfin/mach-bf548/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf548/include/mach/bfin_serial_5xx.h index e4cf35e7ab9f..c05e79cba257 100644 --- a/arch/blackfin/mach-bf548/include/mach/bfin_serial_5xx.h +++ b/arch/blackfin/mach-bf548/include/mach/bfin_serial_5xx.h @@ -63,7 +63,7 @@ #define UART_ENABLE_INTS(x, v) UART_SET_IER(x, v) #define UART_DISABLE_INTS(x) UART_CLEAR_IER(x, 0xF) -#if defined(CONFIG_BFIN_UART0_CTSRTS) || defined(CONFIG_BFIN_UART1_CTSRTS) +#if defined(CONFIG_BFIN_UART0_CTSRTS) || defined(CONFIG_BFIN_UART2_CTSRTS) # define CONFIG_SERIAL_BFIN_CTSRTS # ifndef CONFIG_UART0_CTS_PIN @@ -74,12 +74,12 @@ # define CONFIG_UART0_RTS_PIN -1 # endif -# ifndef CONFIG_UART1_CTS_PIN -# define CONFIG_UART1_CTS_PIN -1 +# ifndef CONFIG_UART2_CTS_PIN +# define CONFIG_UART2_CTS_PIN -1 # endif -# ifndef CONFIG_UART1_RTS_PIN -# define CONFIG_UART1_RTS_PIN -1 +# ifndef CONFIG_UART2_RTS_PIN +# define CONFIG_UART2_RTS_PIN -1 # endif #endif @@ -130,7 +130,7 @@ struct bfin_serial_res bfin_serial_resource[] = { CH_UART0_TX, CH_UART0_RX, #endif -#ifdef CONFIG_BFIN_UART0_CTSRTS +#ifdef CONFIG_SERIAL_BFIN_CTSRTS CONFIG_UART0_CTS_PIN, CONFIG_UART0_RTS_PIN, #endif @@ -143,6 +143,10 @@ struct bfin_serial_res bfin_serial_resource[] = { #ifdef CONFIG_SERIAL_BFIN_DMA CH_UART1_TX, CH_UART1_RX, +#endif +#ifdef CONFIG_SERIAL_BFIN_CTSRTS + 0, + 0, #endif }, #endif @@ -154,7 +158,7 @@ struct bfin_serial_res bfin_serial_resource[] = { CH_UART2_TX, CH_UART2_RX, #endif -#ifdef CONFIG_BFIN_UART2_CTSRTS +#ifdef CONFIG_SERIAL_BFIN_CTSRTS CONFIG_UART2_CTS_PIN, CONFIG_UART2_RTS_PIN, #endif @@ -167,6 +171,10 @@ struct bfin_serial_res bfin_serial_resource[] = { #ifdef CONFIG_SERIAL_BFIN_DMA CH_UART3_TX, CH_UART3_RX, +#endif +#ifdef CONFIG_SERIAL_BFIN_CTSRTS + 0, + 0, #endif }, #endif diff --git a/arch/blackfin/mach-bf561/include/mach/bfin_serial_5xx.h b/arch/blackfin/mach-bf561/include/mach/bfin_serial_5xx.h index 043bfcf26c52..ca8c5f645209 100644 --- a/arch/blackfin/mach-bf561/include/mach/bfin_serial_5xx.h +++ b/arch/blackfin/mach-bf561/include/mach/bfin_serial_5xx.h @@ -134,7 +134,7 @@ struct bfin_serial_res bfin_serial_resource[] = { CH_UART_TX, CH_UART_RX, #endif -#ifdef CONFIG_BFIN_UART0_CTSRTS +#ifdef CONFIG_SERIAL_BFIN_CTSRTS CONFIG_UART0_CTS_PIN, CONFIG_UART0_RTS_PIN, #endif From 25ef4a67e78e1322d55f0a38783537ed89addc02 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Mon, 2 Mar 2009 22:39:36 +0100 Subject: [PATCH 101/580] [ARM] 5416/1: Use unused address in v6_early_abort The target of the strex instruction to clear the exlusive monitor is currently the top of the stack. If the store succeeeds this corrupts r0 in pt_regs. Use the next stack location instead of the current one to prevent any chance of corrupting an in-use address. Signed-off-by: Seth Forshee Signed-off-by: Russell King --- arch/arm/mm/abort-ev6.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/mm/abort-ev6.S b/arch/arm/mm/abort-ev6.S index 8a7f65ba14b7..94077fbd96b7 100644 --- a/arch/arm/mm/abort-ev6.S +++ b/arch/arm/mm/abort-ev6.S @@ -23,7 +23,8 @@ ENTRY(v6_early_abort) #ifdef CONFIG_CPU_32v6K clrex #else - strex r0, r1, [sp] @ Clear the exclusive monitor + sub r1, sp, #4 @ Get unused stack location + strex r0, r1, [r1] @ Clear the exclusive monitor #endif mrc p15, 0, r1, c5, c0, 0 @ get FSR mrc p15, 0, r0, c6, c0, 0 @ get FAR From b57ee99fab25dbc12150fe66fe54dc52bc6de784 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 3 Mar 2009 11:44:12 +0100 Subject: [PATCH 102/580] [ARM] 5417/1: Set the correct cacheid for ARMv6 CPUs with ARMv7 style MMU The cacheid_init() function assumes that if cpu_architecture() returns 7, the caches are VIPT_NONALIASING. The cpu_architecture() function returns the version of the supported MMU features (e.g. TEX remapping) but it doesn't make any assumptions about the cache type. The patch adds the checking of the Cache Type Register for the ARMv7 format. Signed-off-by: Catalin Marinas Signed-off-by: Russell King --- arch/arm/kernel/setup.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 7049815d66d5..68d6494c0389 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -233,12 +233,13 @@ static void __init cacheid_init(void) unsigned int cachetype = read_cpuid_cachetype(); unsigned int arch = cpu_architecture(); - if (arch >= CPU_ARCH_ARMv7) { - cacheid = CACHEID_VIPT_NONALIASING; - if ((cachetype & (3 << 14)) == 1 << 14) - cacheid |= CACHEID_ASID_TAGGED; - } else if (arch >= CPU_ARCH_ARMv6) { - if (cachetype & (1 << 23)) + if (arch >= CPU_ARCH_ARMv6) { + if ((cachetype & (7 << 29)) == 4 << 29) { + /* ARMv7 register format */ + cacheid = CACHEID_VIPT_NONALIASING; + if ((cachetype & (3 << 14)) == 1 << 14) + cacheid |= CACHEID_ASID_TAGGED; + } else if (cachetype & (1 << 23)) cacheid = CACHEID_VIPT_ALIASING; else cacheid = CACHEID_VIPT_NONALIASING; From bdf602bd737eb07d63d6fa2da826b4751fdf9bab Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 3 Mar 2009 13:43:47 +0000 Subject: [PATCH 103/580] [ARM] fix lots of ARM __devexit sillyness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `iop_adma_remove' referenced in section `.data' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o `mv_xor_remove' referenced in section `.data' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o `mv64xxx_i2c_unmap_regs' referenced in section `.devinit.text' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o `mv64xxx_i2c_remove' referenced in section `.data' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o `orion_nand_remove' referenced in section `.data' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o `pxafb_remove' referenced in section `.data' of drivers/built-in.o: defined in discarded section `.devexit.text' of drivers/built-in.o Acked-by: Uwe Kleine-König Signed-off-by: Russell King --- drivers/dma/iop-adma.c | 2 +- drivers/dma/mv_xor.c | 2 +- drivers/i2c/busses/i2c-mv64xxx.c | 4 ++-- drivers/mtd/nand/orion_nand.c | 2 +- drivers/video/pxafb.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index ea5440dd10dc..647374acba94 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -1401,7 +1401,7 @@ MODULE_ALIAS("platform:iop-adma"); static struct platform_driver iop_adma_driver = { .probe = iop_adma_probe, - .remove = iop_adma_remove, + .remove = __devexit_p(iop_adma_remove), .driver = { .owner = THIS_MODULE, .name = "iop-adma", diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index d35cbd1ff0b3..5d5d5b31867f 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -1287,7 +1287,7 @@ mv_xor_conf_mbus_windows(struct mv_xor_shared_private *msp, static struct platform_driver mv_xor_driver = { .probe = mv_xor_probe, - .remove = mv_xor_remove, + .remove = __devexit_p(mv_xor_remove), .driver = { .owner = THIS_MODULE, .name = MV_XOR_NAME, diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c index eeda276f8f16..7f186bbcb99d 100644 --- a/drivers/i2c/busses/i2c-mv64xxx.c +++ b/drivers/i2c/busses/i2c-mv64xxx.c @@ -482,7 +482,7 @@ mv64xxx_i2c_map_regs(struct platform_device *pd, return 0; } -static void __devexit +static void mv64xxx_i2c_unmap_regs(struct mv64xxx_i2c_data *drv_data) { if (drv_data->reg_base) { @@ -577,7 +577,7 @@ mv64xxx_i2c_remove(struct platform_device *dev) static struct platform_driver mv64xxx_i2c_driver = { .probe = mv64xxx_i2c_probe, - .remove = mv64xxx_i2c_remove, + .remove = __devexit_p(mv64xxx_i2c_remove), .driver = { .owner = THIS_MODULE, .name = MV64XXX_I2C_CTLR_NAME, diff --git a/drivers/mtd/nand/orion_nand.c b/drivers/mtd/nand/orion_nand.c index 917cf8d3ae95..c2dfd3ea353d 100644 --- a/drivers/mtd/nand/orion_nand.c +++ b/drivers/mtd/nand/orion_nand.c @@ -149,7 +149,7 @@ static int __devexit orion_nand_remove(struct platform_device *pdev) static struct platform_driver orion_nand_driver = { .probe = orion_nand_probe, - .remove = orion_nand_remove, + .remove = __devexit_p(orion_nand_remove), .driver = { .name = "orion_nand", .owner = THIS_MODULE, diff --git a/drivers/video/pxafb.c b/drivers/video/pxafb.c index 48ff701d3a72..2552b9f325ee 100644 --- a/drivers/video/pxafb.c +++ b/drivers/video/pxafb.c @@ -2230,7 +2230,7 @@ static int __devexit pxafb_remove(struct platform_device *dev) static struct platform_driver pxafb_driver = { .probe = pxafb_probe, - .remove = pxafb_remove, + .remove = __devexit_p(pxafb_remove), .suspend = pxafb_suspend, .resume = pxafb_resume, .driver = { From f45964ed6971db2e7ae6cb9b164def1d23b46612 Mon Sep 17 00:00:00 2001 From: Saeed Bishara Date: Mon, 2 Mar 2009 17:30:36 +0200 Subject: [PATCH 104/580] [ARM] orion5x: pass dram mbus data to xor driver This data should be passed to the xor driver in order to initialize the address decoding windows of the xor unit. without this patch, the self tests of the xor will fail unless the address decoding windows were initialized by the boot loader. Signed-off-by: Saeed Bishara Signed-off-by: Nicolas Pitre --- arch/arm/mach-orion5x/common.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c index 0a623379789f..8a0e49d84256 100644 --- a/arch/arm/mach-orion5x/common.c +++ b/arch/arm/mach-orion5x/common.c @@ -431,6 +431,10 @@ void __init orion5x_uart1_init(void) /***************************************************************************** * XOR engine ****************************************************************************/ +struct mv_xor_platform_shared_data orion5x_xor_shared_data = { + .dram = &orion5x_mbus_dram_info, +}; + static struct resource orion5x_xor_shared_resources[] = { { .name = "xor low", @@ -448,6 +452,9 @@ static struct resource orion5x_xor_shared_resources[] = { static struct platform_device orion5x_xor_shared = { .name = MV_XOR_SHARED_NAME, .id = 0, + .dev = { + .platform_data = &orion5x_xor_shared_data, + }, .num_resources = ARRAY_SIZE(orion5x_xor_shared_resources), .resource = orion5x_xor_shared_resources, }; From 1777f1a978153e8b887c1e1eb5160ac46098b142 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Ha=C5=82asa?= Date: Wed, 4 Mar 2009 08:01:22 +0800 Subject: [PATCH 105/580] crypto: ixp4xx - Fix qmgr_request_queue build failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is another user of IXP4xx queue manager, fix it. Signed-off-by: Krzysztof Hałasa Signed-off-by: Herbert Xu --- drivers/crypto/ixp4xx_crypto.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c index 2d637e0fbc03..d9e751be8c5f 100644 --- a/drivers/crypto/ixp4xx_crypto.c +++ b/drivers/crypto/ixp4xx_crypto.c @@ -457,10 +457,12 @@ static int init_ixp_crypto(void) if (!ctx_pool) { goto err; } - ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0); + ret = qmgr_request_queue(SEND_QID, NPE_QLEN_TOTAL, 0, 0, + "ixp_crypto:out", NULL); if (ret) goto err; - ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0); + ret = qmgr_request_queue(RECV_QID, NPE_QLEN, 0, 0, + "ixp_crypto:in", NULL); if (ret) { qmgr_release_queue(SEND_QID); goto err; From fec6c6fec3e20637bee5d276fb61dd8b49a3f9cc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 3 Mar 2009 17:05:22 -0800 Subject: [PATCH 106/580] Linux 2.6.29-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index df6ce3e80090..d04ee0ad1dcc 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 29 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Erotic Pickled Herring # *DOCUMENTATION* From 4e8304758cc09a6097dbd2c4f44a5369e5c1edb0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 3 Mar 2009 11:51:39 -0800 Subject: [PATCH 107/580] x86: remove vestigial fix_ioremap prototypes The function seems to have disappeared at some point, leaving some vestigial prototypes behind... Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 683d0b4c00fc..e5383e3d2f8c 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -172,8 +172,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); - #ifdef CONFIG_X86_32 # include "io_32.h" @@ -198,7 +196,6 @@ extern void early_ioremap_reset(void); extern void __iomem *early_ioremap(unsigned long offset, unsigned long size); extern void __iomem *early_memremap(unsigned long offset, unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); -extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys); #define IO_SPACE_LIMIT 0xffff From f254f3909efaf59ca2d0f408de2d044dace60706 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 3 Mar 2009 12:02:57 -0800 Subject: [PATCH 108/580] x86: un-__init fill_pud/pmd/pte They are used by __set_fixmap->set_pte_vaddr_pud, which can be used by arch_setup_additional_pages(), and so is used after init. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 11981fc8570a..07f44d491df1 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -168,7 +168,7 @@ static __ref void *spp_getpage(void) return ptr; } -static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr) +static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) { if (pgd_none(*pgd)) { pud_t *pud = (pud_t *)spp_getpage(); @@ -180,7 +180,7 @@ static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr) return pud_offset(pgd, vaddr); } -static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr) +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) { if (pud_none(*pud)) { pmd_t *pmd = (pmd_t *) spp_getpage(); @@ -192,7 +192,7 @@ static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr) return pmd_offset(pud, vaddr); } -static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr) +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) { if (pmd_none(*pmd)) { pte_t *pte = (pte_t *) spp_getpage(); From 9bd50df6aa9bdd583793a16b0b9379dfd78c079e Mon Sep 17 00:00:00 2001 From: Philippe Gerum Date: Wed, 4 Mar 2009 16:52:38 +0800 Subject: [PATCH 109/580] Blackfin arch: Update adeos blackfin arch patch to 1.9-00 Signed-off-by: Philippe Gerum Signed-off-by: Bryan Wu --- arch/blackfin/include/asm/ipipe.h | 100 +++++------- arch/blackfin/include/asm/ipipe_base.h | 12 +- arch/blackfin/include/asm/irq.h | 36 +++-- arch/blackfin/include/asm/thread_info.h | 2 + arch/blackfin/kernel/ipipe.c | 180 ++++++---------------- arch/blackfin/kernel/irqchip.c | 14 +- arch/blackfin/kernel/time.c | 5 +- arch/blackfin/mach-common/entry.S | 61 +++++++- arch/blackfin/mach-common/interrupt.S | 12 +- arch/blackfin/mach-common/ints-priority.c | 126 ++++++--------- 10 files changed, 243 insertions(+), 305 deletions(-) diff --git a/arch/blackfin/include/asm/ipipe.h b/arch/blackfin/include/asm/ipipe.h index 76f53d8b9a0d..343b56361ec9 100644 --- a/arch/blackfin/include/asm/ipipe.h +++ b/arch/blackfin/include/asm/ipipe.h @@ -35,9 +35,9 @@ #include #include -#define IPIPE_ARCH_STRING "1.8-00" +#define IPIPE_ARCH_STRING "1.9-00" #define IPIPE_MAJOR_NUMBER 1 -#define IPIPE_MINOR_NUMBER 8 +#define IPIPE_MINOR_NUMBER 9 #define IPIPE_PATCH_NUMBER 0 #ifdef CONFIG_SMP @@ -83,9 +83,9 @@ struct ipipe_sysinfo { "%2 = CYCLES2\n" \ "CC = %2 == %0\n" \ "if ! CC jump 1b\n" \ - : "=r" (((unsigned long *)&t)[1]), \ - "=r" (((unsigned long *)&t)[0]), \ - "=r" (__cy2) \ + : "=d,a" (((unsigned long *)&t)[1]), \ + "=d,a" (((unsigned long *)&t)[0]), \ + "=d,a" (__cy2) \ : /*no input*/ : "CC"); \ t; \ }) @@ -118,35 +118,40 @@ void __ipipe_disable_irqdesc(struct ipipe_domain *ipd, #define __ipipe_disable_irq(irq) (irq_desc[irq].chip->mask(irq)) -#define __ipipe_lock_root() \ - set_bit(IPIPE_ROOTLOCK_FLAG, &ipipe_root_domain->flags) +static inline int __ipipe_check_tickdev(const char *devname) +{ + return 1; +} -#define __ipipe_unlock_root() \ - clear_bit(IPIPE_ROOTLOCK_FLAG, &ipipe_root_domain->flags) +static inline void __ipipe_lock_root(void) +{ + set_bit(IPIPE_SYNCDEFER_FLAG, &ipipe_root_cpudom_var(status)); +} + +static inline void __ipipe_unlock_root(void) +{ + clear_bit(IPIPE_SYNCDEFER_FLAG, &ipipe_root_cpudom_var(status)); +} void __ipipe_enable_pipeline(void); #define __ipipe_hook_critical_ipi(ipd) do { } while (0) -#define __ipipe_sync_pipeline(syncmask) \ - do { \ - struct ipipe_domain *ipd = ipipe_current_domain; \ - if (likely(ipd != ipipe_root_domain || !test_bit(IPIPE_ROOTLOCK_FLAG, &ipd->flags))) \ - __ipipe_sync_stage(syncmask); \ - } while (0) +#define __ipipe_sync_pipeline ___ipipe_sync_pipeline +void ___ipipe_sync_pipeline(unsigned long syncmask); void __ipipe_handle_irq(unsigned irq, struct pt_regs *regs); int __ipipe_get_irq_priority(unsigned irq); -int __ipipe_get_irqthread_priority(unsigned irq); - void __ipipe_stall_root_raw(void); void __ipipe_unstall_root_raw(void); void __ipipe_serial_debug(const char *fmt, ...); +asmlinkage void __ipipe_call_irqtail(unsigned long addr); + DECLARE_PER_CPU(struct pt_regs, __ipipe_tick_regs); extern unsigned long __ipipe_core_clock; @@ -162,42 +167,25 @@ static inline unsigned long __ipipe_ffnz(unsigned long ul) #define __ipipe_run_irqtail() /* Must be a macro */ \ do { \ - asmlinkage void __ipipe_call_irqtail(void); \ unsigned long __pending; \ - CSYNC(); \ + CSYNC(); \ __pending = bfin_read_IPEND(); \ if (__pending & 0x8000) { \ __pending &= ~0x8010; \ if (__pending && (__pending & (__pending - 1)) == 0) \ - __ipipe_call_irqtail(); \ + __ipipe_call_irqtail(__ipipe_irq_tail_hook); \ } \ } while (0) #define __ipipe_run_isr(ipd, irq) \ do { \ if (ipd == ipipe_root_domain) { \ - /* \ - * Note: the I-pipe implements a threaded interrupt model on \ - * this arch for Linux external IRQs. The interrupt handler we \ - * call here only wakes up the associated IRQ thread. \ - */ \ - if (ipipe_virtual_irq_p(irq)) { \ - /* No irqtail here; virtual interrupts have no effect \ - on IPEND so there is no need for processing \ - deferral. */ \ - local_irq_enable_nohead(ipd); \ + local_irq_enable_hw(); \ + if (ipipe_virtual_irq_p(irq)) \ ipd->irqs[irq].handler(irq, ipd->irqs[irq].cookie); \ - local_irq_disable_nohead(ipd); \ - } else \ - /* \ - * No need to run the irqtail here either; \ - * we can't be preempted by hw IRQs, so \ - * non-Linux IRQs cannot stack over the short \ - * thread wakeup code. Which in turn means \ - * that no irqtail condition could be pending \ - * for domains above Linux in the pipeline. \ - */ \ + else \ ipd->irqs[irq].handler(irq, &__raw_get_cpu_var(__ipipe_tick_regs)); \ + local_irq_disable_hw(); \ } else { \ __clear_bit(IPIPE_SYNC_FLAG, &ipipe_cpudom_var(ipd, status)); \ local_irq_enable_nohead(ipd); \ @@ -217,42 +205,24 @@ void ipipe_init_irq_threads(void); int ipipe_start_irq_thread(unsigned irq, struct irq_desc *desc); -#define IS_SYSIRQ(irq) ((irq) > IRQ_CORETMR && (irq) <= SYS_IRQS) -#define IS_GPIOIRQ(irq) ((irq) >= GPIO_IRQ_BASE && (irq) < NR_IRQS) - +#ifdef CONFIG_GENERIC_CLOCKEVENTS +#define IRQ_SYSTMR IRQ_CORETMR +#define IRQ_PRIOTMR IRQ_CORETMR +#else #define IRQ_SYSTMR IRQ_TIMER0 #define IRQ_PRIOTMR CONFIG_IRQ_TIMER0 +#endif -#if defined(CONFIG_BF531) || defined(CONFIG_BF532) || defined(CONFIG_BF533) -#define PRIO_GPIODEMUX(irq) CONFIG_PFA -#elif defined(CONFIG_BF534) || defined(CONFIG_BF536) || defined(CONFIG_BF537) -#define PRIO_GPIODEMUX(irq) CONFIG_IRQ_PROG_INTA -#elif defined(CONFIG_BF52x) -#define PRIO_GPIODEMUX(irq) ((irq) == IRQ_PORTF_INTA ? CONFIG_IRQ_PORTF_INTA : \ - (irq) == IRQ_PORTG_INTA ? CONFIG_IRQ_PORTG_INTA : \ - (irq) == IRQ_PORTH_INTA ? CONFIG_IRQ_PORTH_INTA : \ - -1) -#elif defined(CONFIG_BF561) -#define PRIO_GPIODEMUX(irq) ((irq) == IRQ_PROG0_INTA ? CONFIG_IRQ_PROG0_INTA : \ - (irq) == IRQ_PROG1_INTA ? CONFIG_IRQ_PROG1_INTA : \ - (irq) == IRQ_PROG2_INTA ? CONFIG_IRQ_PROG2_INTA : \ - -1) +#ifdef CONFIG_BF561 #define bfin_write_TIMER_DISABLE(val) bfin_write_TMRS8_DISABLE(val) #define bfin_write_TIMER_ENABLE(val) bfin_write_TMRS8_ENABLE(val) #define bfin_write_TIMER_STATUS(val) bfin_write_TMRS8_STATUS(val) #define bfin_read_TIMER_STATUS() bfin_read_TMRS8_STATUS() #elif defined(CONFIG_BF54x) -#define PRIO_GPIODEMUX(irq) ((irq) == IRQ_PINT0 ? CONFIG_IRQ_PINT0 : \ - (irq) == IRQ_PINT1 ? CONFIG_IRQ_PINT1 : \ - (irq) == IRQ_PINT2 ? CONFIG_IRQ_PINT2 : \ - (irq) == IRQ_PINT3 ? CONFIG_IRQ_PINT3 : \ - -1) #define bfin_write_TIMER_DISABLE(val) bfin_write_TIMER_DISABLE0(val) #define bfin_write_TIMER_ENABLE(val) bfin_write_TIMER_ENABLE0(val) #define bfin_write_TIMER_STATUS(val) bfin_write_TIMER_STATUS0(val) #define bfin_read_TIMER_STATUS(val) bfin_read_TIMER_STATUS0(val) -#else -# error "no PRIO_GPIODEMUX() for this part" #endif #define __ipipe_root_tick_p(regs) ((regs->ipend & 0x10) != 0) @@ -275,4 +245,6 @@ int ipipe_start_irq_thread(unsigned irq, struct irq_desc *desc); #endif /* !CONFIG_IPIPE */ +#define ipipe_update_tick_evtdev(evtdev) do { } while (0) + #endif /* !__ASM_BLACKFIN_IPIPE_H */ diff --git a/arch/blackfin/include/asm/ipipe_base.h b/arch/blackfin/include/asm/ipipe_base.h index cb1025aeabcf..3e8acbd1a3be 100644 --- a/arch/blackfin/include/asm/ipipe_base.h +++ b/arch/blackfin/include/asm/ipipe_base.h @@ -1,5 +1,5 @@ /* -*- linux-c -*- - * include/asm-blackfin/_baseipipe.h + * include/asm-blackfin/ipipe_base.h * * Copyright (C) 2007 Philippe Gerum. * @@ -27,8 +27,9 @@ #define IPIPE_NR_XIRQS NR_IRQS #define IPIPE_IRQ_ISHIFT 5 /* 2^5 for 32bits arch. */ -/* Blackfin-specific, global domain flags */ -#define IPIPE_ROOTLOCK_FLAG 1 /* Lock pipeline for root */ +/* Blackfin-specific, per-cpu pipeline status */ +#define IPIPE_SYNCDEFER_FLAG 15 +#define IPIPE_SYNCDEFER_MASK (1L << IPIPE_SYNCDEFER_MASK) /* Blackfin traps -- i.e. exception vector numbers */ #define IPIPE_NR_FAULTS 52 /* We leave a gap after VEC_ILL_RES. */ @@ -48,11 +49,6 @@ #ifndef __ASSEMBLY__ -#include - -extern int test_bit(int nr, const void *addr); - - extern unsigned long __ipipe_root_status; /* Alias to ipipe_root_cpudom_var(status) */ static inline void __ipipe_stall_root(void) diff --git a/arch/blackfin/include/asm/irq.h b/arch/blackfin/include/asm/irq.h index 3d977909ce7d..7645e85a5f6f 100644 --- a/arch/blackfin/include/asm/irq.h +++ b/arch/blackfin/include/asm/irq.h @@ -61,20 +61,38 @@ void __ipipe_restore_root(unsigned long flags); #define raw_irqs_disabled_flags(flags) (!irqs_enabled_from_flags_hw(flags)) #define local_test_iflag_hw(x) irqs_enabled_from_flags_hw(x) -#define local_save_flags(x) \ - do { \ - (x) = __ipipe_test_root() ? \ +#define local_save_flags(x) \ + do { \ + (x) = __ipipe_test_root() ? \ __all_masked_irq_flags : bfin_irq_flags; \ + barrier(); \ } while (0) -#define local_irq_save(x) \ - do { \ - (x) = __ipipe_test_and_stall_root(); \ +#define local_irq_save(x) \ + do { \ + (x) = __ipipe_test_and_stall_root() ? \ + __all_masked_irq_flags : bfin_irq_flags; \ + barrier(); \ } while (0) -#define local_irq_restore(x) __ipipe_restore_root(x) -#define local_irq_disable() __ipipe_stall_root() -#define local_irq_enable() __ipipe_unstall_root() +static inline void local_irq_restore(unsigned long x) +{ + barrier(); + __ipipe_restore_root(x == __all_masked_irq_flags); +} + +#define local_irq_disable() \ + do { \ + __ipipe_stall_root(); \ + barrier(); \ + } while (0) + +static inline void local_irq_enable(void) +{ + barrier(); + __ipipe_unstall_root(); +} + #define irqs_disabled() __ipipe_test_root() #define local_save_flags_hw(x) \ diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h index e721ce55956c..2920087516f2 100644 --- a/arch/blackfin/include/asm/thread_info.h +++ b/arch/blackfin/include/asm/thread_info.h @@ -122,6 +122,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_MEMDIE 4 #define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ #define TIF_FREEZE 6 /* is freezing for suspend */ +#define TIF_IRQ_SYNC 7 /* sync pipeline stage */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1< #include -static int create_irq_threads; - DEFINE_PER_CPU(struct pt_regs, __ipipe_tick_regs); -static DEFINE_PER_CPU(unsigned long, pending_irqthread_mask); - -static DEFINE_PER_CPU(int [IVG13 + 1], pending_irq_count); - asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs); static void __ipipe_no_irqtail(void); @@ -93,6 +87,7 @@ void __ipipe_enable_pipeline(void) */ void __ipipe_handle_irq(unsigned irq, struct pt_regs *regs) { + struct ipipe_percpu_domain_data *p = ipipe_root_cpudom_ptr(); struct ipipe_domain *this_domain, *next_domain; struct list_head *head, *pos; int m_ack, s = -1; @@ -104,7 +99,6 @@ void __ipipe_handle_irq(unsigned irq, struct pt_regs *regs) * interrupt. */ m_ack = (regs == NULL || irq == IRQ_SYSTMR || irq == IRQ_CORETMR); - this_domain = ipipe_current_domain; if (unlikely(test_bit(IPIPE_STICKY_FLAG, &this_domain->irqs[irq].control))) @@ -114,49 +108,28 @@ void __ipipe_handle_irq(unsigned irq, struct pt_regs *regs) next_domain = list_entry(head, struct ipipe_domain, p_link); if (likely(test_bit(IPIPE_WIRED_FLAG, &next_domain->irqs[irq].control))) { if (!m_ack && next_domain->irqs[irq].acknowledge != NULL) - next_domain->irqs[irq].acknowledge(irq, irq_desc + irq); - if (test_bit(IPIPE_ROOTLOCK_FLAG, &ipipe_root_domain->flags)) - s = __test_and_set_bit(IPIPE_STALL_FLAG, - &ipipe_root_cpudom_var(status)); + next_domain->irqs[irq].acknowledge(irq, irq_to_desc(irq)); + if (test_bit(IPIPE_SYNCDEFER_FLAG, &p->status)) + s = __test_and_set_bit(IPIPE_STALL_FLAG, &p->status); __ipipe_dispatch_wired(next_domain, irq); - goto finalize; - return; + goto out; } } /* Ack the interrupt. */ pos = head; - while (pos != &__ipipe_pipeline) { next_domain = list_entry(pos, struct ipipe_domain, p_link); - /* - * For each domain handling the incoming IRQ, mark it - * as pending in its log. - */ if (test_bit(IPIPE_HANDLE_FLAG, &next_domain->irqs[irq].control)) { - /* - * Domains that handle this IRQ are polled for - * acknowledging it by decreasing priority - * order. The interrupt must be made pending - * _first_ in the domain's status flags before - * the PIC is unlocked. - */ __ipipe_set_irq_pending(next_domain, irq); - if (!m_ack && next_domain->irqs[irq].acknowledge != NULL) { - next_domain->irqs[irq].acknowledge(irq, irq_desc + irq); + next_domain->irqs[irq].acknowledge(irq, irq_to_desc(irq)); m_ack = 1; } } - - /* - * If the domain does not want the IRQ to be passed - * down the interrupt pipe, exit the loop now. - */ if (!test_bit(IPIPE_PASS_FLAG, &next_domain->irqs[irq].control)) break; - pos = next_domain->p_link.next; } @@ -166,18 +139,24 @@ void __ipipe_handle_irq(unsigned irq, struct pt_regs *regs) * immediately to the current domain if the interrupt has been * marked as 'sticky'. This search does not go beyond the * current domain in the pipeline. We also enforce the - * additional root stage lock (blackfin-specific). */ + * additional root stage lock (blackfin-specific). + */ + if (test_bit(IPIPE_SYNCDEFER_FLAG, &p->status)) + s = __test_and_set_bit(IPIPE_STALL_FLAG, &p->status); - if (test_bit(IPIPE_ROOTLOCK_FLAG, &ipipe_root_domain->flags)) - s = __test_and_set_bit(IPIPE_STALL_FLAG, - &ipipe_root_cpudom_var(status)); -finalize: + /* + * If the interrupt preempted the head domain, then do not + * even try to walk the pipeline, unless an interrupt is + * pending for it. + */ + if (test_bit(IPIPE_AHEAD_FLAG, &this_domain->flags) && + ipipe_head_cpudom_var(irqpend_himask) == 0) + goto out; __ipipe_walk_pipeline(head); - +out: if (!s) - __clear_bit(IPIPE_STALL_FLAG, - &ipipe_root_cpudom_var(status)); + __clear_bit(IPIPE_STALL_FLAG, &p->status); } int __ipipe_check_root(void) @@ -187,7 +166,7 @@ int __ipipe_check_root(void) void __ipipe_enable_irqdesc(struct ipipe_domain *ipd, unsigned irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); int prio = desc->ic_prio; desc->depth = 0; @@ -199,7 +178,7 @@ EXPORT_SYMBOL(__ipipe_enable_irqdesc); void __ipipe_disable_irqdesc(struct ipipe_domain *ipd, unsigned irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); int prio = desc->ic_prio; if (ipd != &ipipe_root && @@ -236,15 +215,18 @@ int __ipipe_syscall_root(struct pt_regs *regs) { unsigned long flags; - /* We need to run the IRQ tail hook whenever we don't + /* + * We need to run the IRQ tail hook whenever we don't * propagate a syscall to higher domains, because we know that * important operations might be pending there (e.g. Xenomai - * deferred rescheduling). */ + * deferred rescheduling). + */ - if (!__ipipe_syscall_watched_p(current, regs->orig_p0)) { + if (regs->orig_p0 < NR_syscalls) { void (*hook)(void) = (void (*)(void))__ipipe_irq_tail_hook; hook(); - return 0; + if ((current->flags & PF_EVNOTIFY) == 0) + return 0; } /* @@ -312,112 +294,46 @@ int ipipe_trigger_irq(unsigned irq) { unsigned long flags; +#ifdef CONFIG_IPIPE_DEBUG if (irq >= IPIPE_NR_IRQS || (ipipe_virtual_irq_p(irq) && !test_bit(irq - IPIPE_VIRQ_BASE, &__ipipe_virtual_irq_map))) return -EINVAL; +#endif local_irq_save_hw(flags); - __ipipe_handle_irq(irq, NULL); - local_irq_restore_hw(flags); return 1; } -/* Move Linux IRQ to threads. */ - -static int do_irqd(void *__desc) +asmlinkage void __ipipe_sync_root(void) { - struct irq_desc *desc = __desc; - unsigned irq = desc - irq_desc; - int thrprio = desc->thr_prio; - int thrmask = 1 << thrprio; - int cpu = smp_processor_id(); - cpumask_t cpumask; + unsigned long flags; - sigfillset(¤t->blocked); - current->flags |= PF_NOFREEZE; - cpumask = cpumask_of_cpu(cpu); - set_cpus_allowed(current, cpumask); - ipipe_setscheduler_root(current, SCHED_FIFO, 50 + thrprio); + BUG_ON(irqs_disabled()); - while (!kthread_should_stop()) { - local_irq_disable(); - if (!(desc->status & IRQ_SCHEDULED)) { - set_current_state(TASK_INTERRUPTIBLE); -resched: - local_irq_enable(); - schedule(); - local_irq_disable(); - } - __set_current_state(TASK_RUNNING); - /* - * If higher priority interrupt servers are ready to - * run, reschedule immediately. We need this for the - * GPIO demux IRQ handler to unmask the interrupt line - * _last_, after all GPIO IRQs have run. - */ - if (per_cpu(pending_irqthread_mask, cpu) & ~(thrmask|(thrmask-1))) - goto resched; - if (--per_cpu(pending_irq_count[thrprio], cpu) == 0) - per_cpu(pending_irqthread_mask, cpu) &= ~thrmask; - desc->status &= ~IRQ_SCHEDULED; - desc->thr_handler(irq, &__raw_get_cpu_var(__ipipe_tick_regs)); - local_irq_enable(); - } - __set_current_state(TASK_RUNNING); - return 0; + local_irq_save_hw(flags); + + clear_thread_flag(TIF_IRQ_SYNC); + + if (ipipe_root_cpudom_var(irqpend_himask) != 0) + __ipipe_sync_pipeline(IPIPE_IRQMASK_ANY); + + local_irq_restore_hw(flags); } -static void kick_irqd(unsigned irq, void *cookie) +void ___ipipe_sync_pipeline(unsigned long syncmask) { - struct irq_desc *desc = irq_desc + irq; - int thrprio = desc->thr_prio; - int thrmask = 1 << thrprio; - int cpu = smp_processor_id(); + struct ipipe_domain *ipd = ipipe_current_domain; - if (!(desc->status & IRQ_SCHEDULED)) { - desc->status |= IRQ_SCHEDULED; - per_cpu(pending_irqthread_mask, cpu) |= thrmask; - ++per_cpu(pending_irq_count[thrprio], cpu); - wake_up_process(desc->thread); - } -} - -int ipipe_start_irq_thread(unsigned irq, struct irq_desc *desc) -{ - if (desc->thread || !create_irq_threads) - return 0; - - desc->thread = kthread_create(do_irqd, desc, "IRQ %d", irq); - if (desc->thread == NULL) { - printk(KERN_ERR "irqd: could not create IRQ thread %d!\n", irq); - return -ENOMEM; + if (ipd == ipipe_root_domain) { + if (test_bit(IPIPE_SYNCDEFER_FLAG, &ipipe_root_cpudom_var(status))) + return; } - wake_up_process(desc->thread); - - desc->thr_handler = ipipe_root_domain->irqs[irq].handler; - ipipe_root_domain->irqs[irq].handler = &kick_irqd; - - return 0; -} - -void __init ipipe_init_irq_threads(void) -{ - unsigned irq; - struct irq_desc *desc; - - create_irq_threads = 1; - - for (irq = 0; irq < NR_IRQS; irq++) { - desc = irq_desc + irq; - if (desc->action != NULL || - (desc->status & IRQ_NOREQUEST) != 0) - ipipe_start_irq_thread(irq, desc); - } + __ipipe_sync_stage(syncmask); } EXPORT_SYMBOL(show_stack); diff --git a/arch/blackfin/kernel/irqchip.c b/arch/blackfin/kernel/irqchip.c index 75724eee6494..7fd126564846 100644 --- a/arch/blackfin/kernel/irqchip.c +++ b/arch/blackfin/kernel/irqchip.c @@ -144,11 +144,15 @@ asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) #endif generic_handle_irq(irq); -#ifndef CONFIG_IPIPE /* Useless and bugous over the I-pipe: IRQs are threaded. */ - /* If we're the only interrupt running (ignoring IRQ15 which is for - syscalls), lower our priority to IRQ14 so that softirqs run at - that level. If there's another, lower-level interrupt, irq_exit - will defer softirqs to that. */ +#ifndef CONFIG_IPIPE + /* + * If we're the only interrupt running (ignoring IRQ15 which + * is for syscalls), lower our priority to IRQ14 so that + * softirqs run at that level. If there's another, + * lower-level interrupt, irq_exit will defer softirqs to + * that. If the interrupt pipeline is enabled, we are already + * running at IRQ14 priority, so we don't need this code. + */ CSYNC(); pending = bfin_read_IPEND() & ~0x8000; other_ints = pending & (pending - 1); diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c index 172b4c588467..1bbacfbd4c5d 100644 --- a/arch/blackfin/kernel/time.c +++ b/arch/blackfin/kernel/time.c @@ -134,7 +134,10 @@ irqreturn_t timer_interrupt(int irq, void *dummy) write_seqlock(&xtime_lock); #if defined(CONFIG_TICK_SOURCE_SYSTMR0) && !defined(CONFIG_IPIPE) -/* FIXME: Here TIMIL0 is not set when IPIPE enabled, why? */ + /* + * TIMIL0 is latched in __ipipe_grab_irq() when the I-Pipe is + * enabled. + */ if (get_gptimer_status(0) & TIMER_STATUS_TIMIL0) { #endif do_timer(1); diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S index 88de053bbe8e..21e65a339a22 100644 --- a/arch/blackfin/mach-common/entry.S +++ b/arch/blackfin/mach-common/entry.S @@ -600,6 +600,19 @@ ENTRY(_system_call) p2 = [p2]; [p2+(TASK_THREAD+THREAD_KSP)] = sp; +#ifdef CONFIG_IPIPE + r0 = sp; + SP += -12; + call ___ipipe_syscall_root; + SP += 12; + cc = r0 == 1; + if cc jump .Lsyscall_really_exit; + cc = r0 == -1; + if cc jump .Lresume_userspace; + r3 = [sp + PT_R3]; + r4 = [sp + PT_R4]; + p0 = [sp + PT_ORIG_P0]; +#endif /* CONFIG_IPIPE */ /* Check the System Call */ r7 = __NR_syscall; @@ -654,6 +667,17 @@ ENTRY(_system_call) r7 = r7 & r4; .Lsyscall_resched: +#ifdef CONFIG_IPIPE + cc = BITTST(r7, TIF_IRQ_SYNC); + if !cc jump .Lsyscall_no_irqsync; + [--sp] = reti; + r0 = [sp++]; + SP += -12; + call ___ipipe_sync_root; + SP += 12; + jump .Lresume_userspace_1; +.Lsyscall_no_irqsync: +#endif cc = BITTST(r7, TIF_NEED_RESCHED); if !cc jump .Lsyscall_sigpending; @@ -685,6 +709,10 @@ ENTRY(_system_call) .Lsyscall_really_exit: r5 = [sp + PT_RESERVED]; rets = r5; +#ifdef CONFIG_IPIPE + [--sp] = reti; + r5 = [sp++]; +#endif /* CONFIG_IPIPE */ rts; ENDPROC(_system_call) @@ -771,6 +799,15 @@ _new_old_task: ENDPROC(_resume) ENTRY(_ret_from_exception) +#ifdef CONFIG_IPIPE + [--sp] = rets; + SP += -12; + call ___ipipe_check_root + SP += 12 + rets = [sp++]; + cc = r0 == 0; + if cc jump 4f; /* not on behalf of Linux, get out */ +#endif /* CONFIG_IPIPE */ p2.l = lo(IPEND); p2.h = hi(IPEND); @@ -827,6 +864,28 @@ ENTRY(_ret_from_exception) rts; ENDPROC(_ret_from_exception) +#ifdef CONFIG_IPIPE + +_sync_root_irqs: + [--sp] = reti; /* Reenable interrupts */ + r0 = [sp++]; + jump.l ___ipipe_sync_root + +_resume_kernel_from_int: + r0.l = _sync_root_irqs + r0.h = _sync_root_irqs + [--sp] = rets; + [--sp] = ( r7:4, p5:3 ); + SP += -12; + call ___ipipe_call_irqtail + SP += 12; + ( r7:4, p5:3 ) = [sp++]; + rets = [sp++]; + rts +#else +#define _resume_kernel_from_int 2f +#endif + ENTRY(_return_from_int) /* If someone else already raised IRQ 15, do nothing. */ csync; @@ -848,7 +907,7 @@ ENTRY(_return_from_int) r1 = r0 - r1; r2 = r0 & r1; cc = r2 == 0; - if !cc jump 2f; + if !cc jump _resume_kernel_from_int; /* Lower the interrupt level to 15. */ p0.l = lo(EVT15); diff --git a/arch/blackfin/mach-common/interrupt.S b/arch/blackfin/mach-common/interrupt.S index 43c4eb9acb65..0069c2dd4625 100644 --- a/arch/blackfin/mach-common/interrupt.S +++ b/arch/blackfin/mach-common/interrupt.S @@ -235,6 +235,7 @@ ENDPROC(_evt_system_call) #ifdef CONFIG_IPIPE ENTRY(___ipipe_call_irqtail) + p0 = r0; r0.l = 1f; r0.h = 1f; reti = r0; @@ -242,9 +243,6 @@ ENTRY(___ipipe_call_irqtail) 1: [--sp] = rets; [--sp] = ( r7:4, p5:3 ); - p0.l = ___ipipe_irq_tail_hook; - p0.h = ___ipipe_irq_tail_hook; - p0 = [p0]; sp += -12; call (p0); sp += 12; @@ -259,7 +257,7 @@ ENTRY(___ipipe_call_irqtail) p0.h = hi(EVT14); [p0] = r0; csync; - r0 = 0x401f; + r0 = 0x401f (z); sti r0; raise 14; [--sp] = reti; /* IRQs on. */ @@ -277,11 +275,7 @@ ENTRY(___ipipe_call_irqtail) p0.h = _bfin_irq_flags; r0 = [p0]; sti r0; -#if 0 /* FIXME: this actually raises scheduling latencies */ - /* Reenable interrupts */ - [--sp] = reti; - r0 = [sp++]; -#endif rts; ENDPROC(___ipipe_call_irqtail) + #endif /* CONFIG_IPIPE */ diff --git a/arch/blackfin/mach-common/ints-priority.c b/arch/blackfin/mach-common/ints-priority.c index 202494568c6c..a7d7b2dd4059 100644 --- a/arch/blackfin/mach-common/ints-priority.c +++ b/arch/blackfin/mach-common/ints-priority.c @@ -161,11 +161,15 @@ static void bfin_core_unmask_irq(unsigned int irq) static void bfin_internal_mask_irq(unsigned int irq) { + unsigned long flags; + #ifdef CONFIG_BF53x + local_irq_save_hw(flags); bfin_write_SIC_IMASK(bfin_read_SIC_IMASK() & ~(1 << SIC_SYSIRQ(irq))); #else unsigned mask_bank, mask_bit; + local_irq_save_hw(flags); mask_bank = SIC_SYSIRQ(irq) / 32; mask_bit = SIC_SYSIRQ(irq) % 32; bfin_write_SIC_IMASK(mask_bank, bfin_read_SIC_IMASK(mask_bank) & @@ -175,15 +179,20 @@ static void bfin_internal_mask_irq(unsigned int irq) ~(1 << mask_bit)); #endif #endif + local_irq_restore_hw(flags); } static void bfin_internal_unmask_irq(unsigned int irq) { + unsigned long flags; + #ifdef CONFIG_BF53x + local_irq_save_hw(flags); bfin_write_SIC_IMASK(bfin_read_SIC_IMASK() | (1 << SIC_SYSIRQ(irq))); #else unsigned mask_bank, mask_bit; + local_irq_save_hw(flags); mask_bank = SIC_SYSIRQ(irq) / 32; mask_bit = SIC_SYSIRQ(irq) % 32; bfin_write_SIC_IMASK(mask_bank, bfin_read_SIC_IMASK(mask_bank) | @@ -193,6 +202,7 @@ static void bfin_internal_unmask_irq(unsigned int irq) (1 << mask_bit)); #endif #endif + local_irq_restore_hw(flags); } #ifdef CONFIG_PM @@ -390,7 +400,7 @@ static void bfin_demux_error_irq(unsigned int int_err_irq, static inline void bfin_set_irq_handler(unsigned irq, irq_flow_handler_t handle) { #ifdef CONFIG_IPIPE - _set_irq_handler(irq, handle_edge_irq); + _set_irq_handler(irq, handle_level_irq); #else struct irq_desc *desc = irq_desc + irq; /* May not call generic set_irq_handler() due to spinlock @@ -1055,13 +1065,18 @@ int __init init_arch_irq(void) #endif default: #ifdef CONFIG_IPIPE - /* - * We want internal interrupt sources to be masked, because - * ISRs may trigger interrupts recursively (e.g. DMA), but - * interrupts are _not_ masked at CPU level. So let's handle - * them as level interrupts. - */ - set_irq_handler(irq, handle_level_irq); + /* + * We want internal interrupt sources to be + * masked, because ISRs may trigger interrupts + * recursively (e.g. DMA), but interrupts are + * _not_ masked at CPU level. So let's handle + * most of them as level interrupts, except + * the timer interrupt which is special. + */ + if (irq == IRQ_SYSTMR || irq == IRQ_CORETMR) + set_irq_handler(irq, handle_simple_irq); + else + set_irq_handler(irq, handle_level_irq); #else /* !CONFIG_IPIPE */ set_irq_handler(irq, handle_simple_irq); #endif /* !CONFIG_IPIPE */ @@ -1123,9 +1138,8 @@ int __init init_arch_irq(void) #ifdef CONFIG_IPIPE for (irq = 0; irq < NR_IRQS; irq++) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); desc->ic_prio = __ipipe_get_irq_priority(irq); - desc->thr_prio = __ipipe_get_irqthread_priority(irq); } #endif /* CONFIG_IPIPE */ @@ -1208,76 +1222,21 @@ int __ipipe_get_irq_priority(unsigned irq) return IVG15; } -int __ipipe_get_irqthread_priority(unsigned irq) -{ - int ient, prio; - int demux_irq; - - /* The returned priority value is rescaled to [0..IVG13+1] - * with 0 being the lowest effective priority level. */ - - if (irq <= IRQ_CORETMR) - return IVG13 - irq + 1; - - /* GPIO IRQs are given the priority of the demux - * interrupt. */ - if (IS_GPIOIRQ(irq)) { -#if defined(CONFIG_BF54x) - u32 bank = PINT_2_BANK(irq2pint_lut[irq - SYS_IRQS]); - demux_irq = (bank == 0 ? IRQ_PINT0 : - bank == 1 ? IRQ_PINT1 : - bank == 2 ? IRQ_PINT2 : - IRQ_PINT3); -#elif defined(CONFIG_BF561) - demux_irq = (irq >= IRQ_PF32 ? IRQ_PROG2_INTA : - irq >= IRQ_PF16 ? IRQ_PROG1_INTA : - IRQ_PROG0_INTA); -#elif defined(CONFIG_BF52x) - demux_irq = (irq >= IRQ_PH0 ? IRQ_PORTH_INTA : - irq >= IRQ_PG0 ? IRQ_PORTG_INTA : - IRQ_PORTF_INTA); -#else - demux_irq = irq; -#endif - return IVG13 - PRIO_GPIODEMUX(demux_irq) + 1; - } - - /* The GPIO demux interrupt is given a lower priority - * than the GPIO IRQs, so that its threaded handler - * unmasks the interrupt line after the decoded IRQs - * have been processed. */ - prio = PRIO_GPIODEMUX(irq); - /* demux irq? */ - if (prio != -1) - return IVG13 - prio; - - for (ient = 0; ient < NR_PERI_INTS; ient++) { - struct ivgx *ivg = ivg_table + ient; - if (ivg->irqno == irq) { - for (prio = 0; prio <= IVG13-IVG7; prio++) { - if (ivg7_13[prio].ifirst <= ivg && - ivg7_13[prio].istop > ivg) - return IVG7 - prio; - } - } - } - - return 0; -} - /* Hw interrupts are disabled on entry (check SAVE_CONTEXT). */ #ifdef CONFIG_DO_IRQ_L1 __attribute__((l1_text)) #endif asmlinkage int __ipipe_grab_irq(int vec, struct pt_regs *regs) { + struct ipipe_percpu_domain_data *p = ipipe_root_cpudom_ptr(); + struct ipipe_domain *this_domain = ipipe_current_domain; struct ivgx *ivg_stop = ivg7_13[vec-IVG7].istop; struct ivgx *ivg = ivg7_13[vec-IVG7].ifirst; - int irq; + int irq, s; if (likely(vec == EVT_IVTMR_P)) { irq = IRQ_CORETMR; - goto handle_irq; + goto core_tick; } SSYNC(); @@ -1319,24 +1278,39 @@ asmlinkage int __ipipe_grab_irq(int vec, struct pt_regs *regs) irq = ivg->irqno; if (irq == IRQ_SYSTMR) { +#ifdef CONFIG_GENERIC_CLOCKEVENTS +core_tick: +#else bfin_write_TIMER_STATUS(1); /* Latch TIMIL0 */ +#endif /* This is basically what we need from the register frame. */ __raw_get_cpu_var(__ipipe_tick_regs).ipend = regs->ipend; __raw_get_cpu_var(__ipipe_tick_regs).pc = regs->pc; - if (!ipipe_root_domain_p) - __raw_get_cpu_var(__ipipe_tick_regs).ipend |= 0x10; - else + if (this_domain != ipipe_root_domain) __raw_get_cpu_var(__ipipe_tick_regs).ipend &= ~0x10; + else + __raw_get_cpu_var(__ipipe_tick_regs).ipend |= 0x10; } -handle_irq: +#ifndef CONFIG_GENERIC_CLOCKEVENTS +core_tick: +#endif + if (this_domain == ipipe_root_domain) { + s = __test_and_set_bit(IPIPE_SYNCDEFER_FLAG, &p->status); + barrier(); + } ipipe_trace_irq_entry(irq); __ipipe_handle_irq(irq, regs); - ipipe_trace_irq_exit(irq); + ipipe_trace_irq_exit(irq); - if (ipipe_root_domain_p) - return !test_bit(IPIPE_STALL_FLAG, &ipipe_root_cpudom_var(status)); + if (this_domain == ipipe_root_domain) { + set_thread_flag(TIF_IRQ_SYNC); + if (!s) { + __clear_bit(IPIPE_SYNCDEFER_FLAG, &p->status); + return !test_bit(IPIPE_STALL_FLAG, &p->status); + } + } return 0; } From 8bf6774d7fe44ada94bb8df50f089c6e60bdfb26 Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Wed, 4 Mar 2009 11:34:10 +0800 Subject: [PATCH 110/580] Blackfin arch: Enable Write Back Cache on all Blackfin Boards Signed-off-by: Michael Hennerich Signed-off-by: Bryan Wu --- arch/blackfin/configs/BF527-EZKIT_defconfig | 4 ++-- arch/blackfin/configs/BF533-EZKIT_defconfig | 4 ++-- arch/blackfin/configs/BF533-STAMP_defconfig | 4 ++-- arch/blackfin/configs/BF537-STAMP_defconfig | 4 ++-- arch/blackfin/configs/BF538-EZKIT_defconfig | 4 ++-- arch/blackfin/configs/BF548-EZKIT_defconfig | 4 ++-- arch/blackfin/configs/BF561-EZKIT_defconfig | 4 ++-- arch/blackfin/configs/BlackStamp_defconfig | 4 ++-- arch/blackfin/configs/CM-BF527_defconfig | 4 ++-- arch/blackfin/configs/CM-BF548_defconfig | 4 ++-- arch/blackfin/configs/SRV1_defconfig | 4 ++-- 11 files changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/blackfin/configs/BF527-EZKIT_defconfig b/arch/blackfin/configs/BF527-EZKIT_defconfig index 833128b39724..a50050f17706 100644 --- a/arch/blackfin/configs/BF527-EZKIT_defconfig +++ b/arch/blackfin/configs/BF527-EZKIT_defconfig @@ -327,8 +327,8 @@ CONFIG_BFIN_ICACHE=y CONFIG_BFIN_DCACHE=y # CONFIG_BFIN_DCACHE_BANKA is not set # CONFIG_BFIN_ICACHE_LOCK is not set -# CONFIG_BFIN_WB is not set -CONFIG_BFIN_WT=y +CONFIG_BFIN_WB=y +# CONFIG_BFIN_WT is not set # CONFIG_MPU is not set # diff --git a/arch/blackfin/configs/BF533-EZKIT_defconfig b/arch/blackfin/configs/BF533-EZKIT_defconfig index 334c94b51c40..0a2a00d63887 100644 --- a/arch/blackfin/configs/BF533-EZKIT_defconfig +++ b/arch/blackfin/configs/BF533-EZKIT_defconfig @@ -290,8 +290,8 @@ CONFIG_BFIN_ICACHE=y CONFIG_BFIN_DCACHE=y # CONFIG_BFIN_DCACHE_BANKA is not set # CONFIG_BFIN_ICACHE_LOCK is not set -# CONFIG_BFIN_WB is not set -CONFIG_BFIN_WT=y +CONFIG_BFIN_WB=y +# CONFIG_BFIN_WT is not set # CONFIG_MPU is not set # diff --git a/arch/blackfin/configs/BF533-STAMP_defconfig b/arch/blackfin/configs/BF533-STAMP_defconfig index 9d733436e300..eb027587a355 100644 --- a/arch/blackfin/configs/BF533-STAMP_defconfig +++ b/arch/blackfin/configs/BF533-STAMP_defconfig @@ -290,8 +290,8 @@ CONFIG_BFIN_ICACHE=y CONFIG_BFIN_DCACHE=y # CONFIG_BFIN_DCACHE_BANKA is not set # CONFIG_BFIN_ICACHE_LOCK is not set -# CONFIG_BFIN_WB is not set -CONFIG_BFIN_WT=y +CONFIG_BFIN_WB=y +# CONFIG_BFIN_WT is not set # CONFIG_MPU is not set # diff --git a/arch/blackfin/configs/BF537-STAMP_defconfig b/arch/blackfin/configs/BF537-STAMP_defconfig index c61b6000a9eb..9e62b9f40eb1 100644 --- a/arch/blackfin/configs/BF537-STAMP_defconfig +++ b/arch/blackfin/configs/BF537-STAMP_defconfig @@ -298,8 +298,8 @@ CONFIG_BFIN_ICACHE=y CONFIG_BFIN_DCACHE=y # CONFIG_BFIN_DCACHE_BANKA is not set # CONFIG_BFIN_ICACHE_LOCK is not set -# CONFIG_BFIN_WB is not set -CONFIG_BFIN_WT=y +CONFIG_BFIN_WB=y +# CONFIG_BFIN_WT is not set # CONFIG_MPU is not set # diff --git a/arch/blackfin/configs/BF538-EZKIT_defconfig b/arch/blackfin/configs/BF538-EZKIT_defconfig index cb32f5624a1b..dd6ad6be1c87 100644 --- a/arch/blackfin/configs/BF538-EZKIT_defconfig +++ b/arch/blackfin/configs/BF538-EZKIT_defconfig @@ -306,8 +306,8 @@ CONFIG_BFIN_ICACHE=y CONFIG_BFIN_DCACHE=y # CONFIG_BFIN_DCACHE_BANKA is not set # CONFIG_BFIN_ICACHE_LOCK is not set -# CONFIG_BFIN_WB is not set -CONFIG_BFIN_WT=y +CONFIG_BFIN_WB=y +# CONFIG_BFIN_WT is not set # CONFIG_MPU is not set # diff --git a/arch/blackfin/configs/BF548-EZKIT_defconfig b/arch/blackfin/configs/BF548-EZKIT_defconfig index 0f8697618aa5..b41dbd08c2c6 100644 --- a/arch/blackfin/configs/BF548-EZKIT_defconfig +++ b/arch/blackfin/configs/BF548-EZKIT_defconfig @@ -361,8 +361,8 @@ CONFIG_BFIN_ICACHE=y CONFIG_BFIN_DCACHE=y # CONFIG_BFIN_DCACHE_BANKA is not set # CONFIG_BFIN_ICACHE_LOCK is not set -# CONFIG_BFIN_WB is not set -CONFIG_BFIN_WT=y +CONFIG_BFIN_WB=y +# CONFIG_BFIN_WT is not set # CONFIG_BFIN_L2_CACHEABLE is not set # CONFIG_MPU is not set diff --git a/arch/blackfin/configs/BF561-EZKIT_defconfig b/arch/blackfin/configs/BF561-EZKIT_defconfig index 042c7adfccfa..69714fb3e608 100644 --- a/arch/blackfin/configs/BF561-EZKIT_defconfig +++ b/arch/blackfin/configs/BF561-EZKIT_defconfig @@ -329,8 +329,8 @@ CONFIG_BFIN_ICACHE=y CONFIG_BFIN_DCACHE=y # CONFIG_BFIN_DCACHE_BANKA is not set # CONFIG_BFIN_ICACHE_LOCK is not set -# CONFIG_BFIN_WB is not set -CONFIG_BFIN_WT=y +CONFIG_BFIN_WB=y +# CONFIG_BFIN_WT is not set # CONFIG_BFIN_L2_CACHEABLE is not set # CONFIG_MPU is not set diff --git a/arch/blackfin/configs/BlackStamp_defconfig b/arch/blackfin/configs/BlackStamp_defconfig index 3a20e281d23c..017c6ea071b5 100644 --- a/arch/blackfin/configs/BlackStamp_defconfig +++ b/arch/blackfin/configs/BlackStamp_defconfig @@ -288,8 +288,8 @@ CONFIG_BFIN_ICACHE=y CONFIG_BFIN_DCACHE=y # CONFIG_BFIN_DCACHE_BANKA is not set # CONFIG_BFIN_ICACHE_LOCK is not set -# CONFIG_BFIN_WB is not set -CONFIG_BFIN_WT=y +CONFIG_BFIN_WB=y +# CONFIG_BFIN_WT is not set # CONFIG_MPU is not set # diff --git a/arch/blackfin/configs/CM-BF527_defconfig b/arch/blackfin/configs/CM-BF527_defconfig index 865ed85a5760..d880ef786770 100644 --- a/arch/blackfin/configs/CM-BF527_defconfig +++ b/arch/blackfin/configs/CM-BF527_defconfig @@ -332,8 +332,8 @@ CONFIG_BFIN_ICACHE=y CONFIG_BFIN_DCACHE=y # CONFIG_BFIN_DCACHE_BANKA is not set # CONFIG_BFIN_ICACHE_LOCK is not set -# CONFIG_BFIN_WB is not set -CONFIG_BFIN_WT=y +CONFIG_BFIN_WB=y +# CONFIG_BFIN_WT is not set # CONFIG_MPU is not set # diff --git a/arch/blackfin/configs/CM-BF548_defconfig b/arch/blackfin/configs/CM-BF548_defconfig index efe9741b1f14..542ebc5436ad 100644 --- a/arch/blackfin/configs/CM-BF548_defconfig +++ b/arch/blackfin/configs/CM-BF548_defconfig @@ -336,8 +336,8 @@ CONFIG_BFIN_ICACHE=y CONFIG_BFIN_DCACHE=y # CONFIG_BFIN_DCACHE_BANKA is not set # CONFIG_BFIN_ICACHE_LOCK is not set -# CONFIG_BFIN_WB is not set -CONFIG_BFIN_WT=y +CONFIG_BFIN_WB=y +# CONFIG_BFIN_WT is not set CONFIG_L1_MAX_PIECE=16 # CONFIG_MPU is not set diff --git a/arch/blackfin/configs/SRV1_defconfig b/arch/blackfin/configs/SRV1_defconfig index fa580affc9d6..a46529c6ade3 100644 --- a/arch/blackfin/configs/SRV1_defconfig +++ b/arch/blackfin/configs/SRV1_defconfig @@ -282,8 +282,8 @@ CONFIG_BFIN_ICACHE=y CONFIG_BFIN_DCACHE=y # CONFIG_BFIN_DCACHE_BANKA is not set # CONFIG_BFIN_ICACHE_LOCK is not set -# CONFIG_BFIN_WB is not set -CONFIG_BFIN_WT=y +CONFIG_BFIN_WB=y +# CONFIG_BFIN_WT is not set CONFIG_L1_MAX_PIECE=16 # From 368a12117dd8abf6eaefa37c21ac313b517128b9 Mon Sep 17 00:00:00 2001 From: Tony Breeds Date: Tue, 3 Mar 2009 17:59:30 +0000 Subject: [PATCH 111/580] powerpc: Run sbc610 USB fixup code only on the appropriate platform. commit a969e76a7101bf5f3d369563df1ca1253dd6131b (powerpc: Correct USB support for GE Fanuc SBC610) introduced a fixup for NEC usb controllers. This fixup should only run on GEF SBC610 boards. Fixes Fedora bug #486511. (https://bugzilla.redhat.com/show_bug.cgi?id=486511) Signed-off-by: Tony Breeds Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/86xx/gef_sbc610.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/86xx/gef_sbc610.c b/arch/powerpc/platforms/86xx/gef_sbc610.c index fb371f5ce132..d6b772ba3b8f 100644 --- a/arch/powerpc/platforms/86xx/gef_sbc610.c +++ b/arch/powerpc/platforms/86xx/gef_sbc610.c @@ -142,6 +142,10 @@ static void __init gef_sbc610_nec_fixup(struct pci_dev *pdev) { unsigned int val; + /* Do not do the fixup on other platforms! */ + if (!machine_is(gef_sbc610)) + return; + printk(KERN_INFO "Running NEC uPD720101 Fixup\n"); /* Ensure ports 1, 2, 3, 4 & 5 are enabled */ From 4843b93c96ae5043c6279c4ec6fcd8ee3866ff5b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 3 Mar 2009 23:37:30 -0800 Subject: [PATCH 112/580] netlink: invert error code in netlink_set_err() The callers of netlink_set_err() currently pass a negative value as parameter for the error code. However, sk->sk_err wants a positive error value. Without this patch, skb_recv_datagram() called by netlink_recvmsg() may return a positive value to report an error. Another choice to fix this is to change callers to pass a positive error value, but this seems a bit inconsistent and error prone to me. Indeed, the callers of netlink_set_err() assumed that the (usual) negative value for error codes was fine before this patch :). This patch also includes some documentation in docbook format for netlink_set_err() to avoid this sort of confusion. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 9eb895c7a2a9..3ae3cb816563 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1084,6 +1084,13 @@ static inline int do_one_set_err(struct sock *sk, return 0; } +/** + * netlink_set_err - report error to broadcast listeners + * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() + * @pid: the PID of a process that we want to skip (if any) + * @groups: the broadcast group that will notice the error + * @code: error code, must be negative (as usual in kernelspace) + */ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) { struct netlink_set_err_data info; @@ -1093,7 +1100,8 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) info.exclude_sk = ssk; info.pid = pid; info.group = group; - info.code = code; + /* sk->sk_err wants a positive error value */ + info.code = -code; read_lock(&nl_table_lock); From a1a69c8db7f988f903349442a7538d21b56c38e9 Mon Sep 17 00:00:00 2001 From: Peter Korsgaard Date: Tue, 3 Mar 2009 23:48:16 -0800 Subject: [PATCH 113/580] dm9601: new vendor/product IDs Add vendor/product IDs for new no name dm9601 compatible usb ethernet adaptors. Reported-by: Eric Lauriault Signed-off-by: Peter Korsgaard Signed-off-by: David S. Miller --- drivers/net/usb/dm9601.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/usb/dm9601.c b/drivers/net/usb/dm9601.c index 5b67bbf1987e..81682c6defa0 100644 --- a/drivers/net/usb/dm9601.c +++ b/drivers/net/usb/dm9601.c @@ -633,6 +633,10 @@ static const struct usb_device_id products[] = { }, { USB_DEVICE(0x0a47, 0x9601), /* Hirose USB-100 */ + .driver_info = (unsigned long)&dm9601_info, + }, + { + USB_DEVICE(0x0fe6, 0x8101), /* DM9601 USB to Fast Ethernet Adapter */ .driver_info = (unsigned long)&dm9601_info, }, {}, // END From 4222474519ff5b31a526dfa1da7aa4b0e38bef5c Mon Sep 17 00:00:00 2001 From: Meelis Roos Date: Tue, 3 Mar 2009 23:48:50 -0800 Subject: [PATCH 114/580] net: fix tokenring license Currently, modular tokenring ("tr") lacks a license and fails to load: tr: module license 'unspecified' taints kernel. tr: Unknown symbol proc_net_fops_create Beacuse of this, no tokenring driver can load if it depends on modular tr. Fix this by adding GPL module license as it is in the kernel. With this fix, tr module loads fine and tms380 driver also loads. Well, it does'nt work but that's a different bug. Signed-off-by: Meelis Roos Signed-off-by: David S. Miller --- net/802/tr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/802/tr.c b/net/802/tr.c index 158150fee462..f47ae289d83b 100644 --- a/net/802/tr.c +++ b/net/802/tr.c @@ -668,3 +668,5 @@ module_init(rif_init); EXPORT_SYMBOL(tr_type_trans); EXPORT_SYMBOL(alloc_trdev); + +MODULE_LICENSE("GPL"); From 5fd3a17ed456637a224cf4ca82b9ad9d005bc8d4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 4 Mar 2009 00:57:25 -0700 Subject: [PATCH 115/580] md: fix deadlock when stopping arrays Resolve a deadlock when stopping redundant arrays, i.e. ones that require a call to sysfs_remove_group when shutdown. The deadlock is summarized below: Thread1 Thread2 ------- ------- read sysfs attribute stop array take mddev lock sysfs_remove_group sysfs_get_active wait for mddev lock wait for active Sysrq-w: -------- mdmon S 00000017 2212 4163 1 f1982ea8 00000046 2dcf6b85 00000017 c0b23100 f2f83ed0 c0b23100 f2f8413c c0b23100 c0b23100 c0b1fb98 f2f8413c 00000000 f2f8413c c0b23100 f2291ecc 00000002 c0b23100 00000000 00000017 f2f83ed0 f1982eac 00000046 c044d9dd Call Trace: [] ? debug_mutex_add_waiter+0x1d/0x58 [] __mutex_lock_common+0x1d9/0x338 [] ? __mutex_lock_common+0x1d9/0x338 [] mutex_lock_interruptible_nested+0x33/0x3a [] ? mddev_lock+0x14/0x16 [] mddev_lock+0x14/0x16 [] md_attr_show+0x2a/0x49 [] sysfs_read_file+0x93/0xf9 mdadm D 00000017 2812 4177 1 f0401d78 00000046 430456f8 00000017 f0401d58 f0401d20 c0b23100 f2da2c4c c0b23100 c0b23100 c0b1fb98 f2da2c4c 0a10fc36 00000000 c0b23100 f0401d70 00000003 c0b23100 00000000 00000017 f2da29e0 00000001 00000002 00000000 Call Trace: [] schedule_timeout+0x1b/0x95 [] ? schedule_timeout+0x1b/0x95 [] ? wait_for_common+0x34/0xdc [] ? trace_hardirqs_on_caller+0x18/0x145 [] ? trace_hardirqs_on+0xb/0xd [] wait_for_common+0xa0/0xdc [] ? default_wake_function+0x0/0x12 [] wait_for_completion+0x17/0x19 [] sysfs_addrm_finish+0x19f/0x1d1 [] sysfs_hash_and_remove+0x42/0x55 [] sysfs_remove_group+0x57/0x86 [] do_md_stop+0x13a/0x499 This has been there for a while, but is easier to trigger now that mdmon is closely watching sysfs. Cc: Reported-by: Jacek Danecki Signed-off-by: Dan Williams --- drivers/md/md.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 03b4cd0a6344..a307f87eb90e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -214,12 +214,7 @@ static inline mddev_t *mddev_get(mddev_t *mddev) return mddev; } -static void mddev_delayed_delete(struct work_struct *ws) -{ - mddev_t *mddev = container_of(ws, mddev_t, del_work); - kobject_del(&mddev->kobj); - kobject_put(&mddev->kobj); -} +static void mddev_delayed_delete(struct work_struct *ws); static void mddev_put(mddev_t *mddev) { @@ -3542,6 +3537,21 @@ static struct kobj_type md_ktype = { int mdp_major = 0; +static void mddev_delayed_delete(struct work_struct *ws) +{ + mddev_t *mddev = container_of(ws, mddev_t, del_work); + + if (mddev->private == &md_redundancy_group) { + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + if (mddev->sysfs_action) + sysfs_put(mddev->sysfs_action); + mddev->sysfs_action = NULL; + mddev->private = NULL; + } + kobject_del(&mddev->kobj); + kobject_put(&mddev->kobj); +} + static int md_alloc(dev_t dev, char *name) { static DEFINE_MUTEX(disks_mutex); @@ -4033,13 +4043,9 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; - if (mddev->pers->sync_request) { - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); - if (mddev->sysfs_action) - sysfs_put(mddev->sysfs_action); - mddev->sysfs_action = NULL; - } module_put(mddev->pers->owner); + if (mddev->pers->sync_request) + mddev->private = &md_redundancy_group; mddev->pers = NULL; /* tell userspace to handle 'inactive' */ sysfs_notify_dirent(mddev->sysfs_state); From 858b9ced6e73a0f087294c398a1ae70a7eeed94f Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 4 Mar 2009 00:11:42 -0800 Subject: [PATCH 116/580] net: more timeouts that reach -1 with while (timeout-- > 0); timeout reaches -1 after the loop, so the tests below are off by one. also don't do an '< 0' test on an unsigned. Signed-off-by: Roel Kluin Signed-off-by: David S. Miller --- drivers/net/arm/ks8695net.c | 2 +- drivers/net/jme.c | 3 ++- drivers/net/ucc_geth_mii.c | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/arm/ks8695net.c b/drivers/net/arm/ks8695net.c index 1cf2f949c0b4..f3a127434897 100644 --- a/drivers/net/arm/ks8695net.c +++ b/drivers/net/arm/ks8695net.c @@ -560,7 +560,7 @@ ks8695_reset(struct ks8695_priv *ksp) msleep(1); } - if (reset_timeout == 0) { + if (reset_timeout < 0) { dev_crit(ksp->dev, "Timeout waiting for DMA engines to reset\n"); /* And blithely carry on */ diff --git a/drivers/net/jme.c b/drivers/net/jme.c index 08b34051c646..a6e1a35a13cb 100644 --- a/drivers/net/jme.c +++ b/drivers/net/jme.c @@ -957,13 +957,14 @@ jme_process_receive(struct jme_adapter *jme, int limit) goto out_inc; i = atomic_read(&rxring->next_to_clean); - while (limit-- > 0) { + while (limit > 0) { rxdesc = rxring->desc; rxdesc += i; if ((rxdesc->descwb.flags & cpu_to_le16(RXWBFLAG_OWN)) || !(rxdesc->descwb.desccnt & RXWBDCNT_WBCPL)) goto out; + --limit; desccnt = rxdesc->descwb.desccnt & RXWBDCNT_DCNT; diff --git a/drivers/net/ucc_geth_mii.c b/drivers/net/ucc_geth_mii.c index 54635911305c..0ada4edd56eb 100644 --- a/drivers/net/ucc_geth_mii.c +++ b/drivers/net/ucc_geth_mii.c @@ -107,7 +107,7 @@ int uec_mdio_read(struct mii_bus *bus, int mii_id, int regnum) static int uec_mdio_reset(struct mii_bus *bus) { struct ucc_mii_mng __iomem *regs = (void __iomem *)bus->priv; - unsigned int timeout = PHY_INIT_TIMEOUT; + int timeout = PHY_INIT_TIMEOUT; mutex_lock(&bus->mdio_lock); @@ -123,7 +123,7 @@ static int uec_mdio_reset(struct mii_bus *bus) mutex_unlock(&bus->mdio_lock); - if (timeout <= 0) { + if (timeout < 0) { printk(KERN_ERR "%s: The MII Bus is stuck!\n", bus->name); return -EBUSY; } From b9bdcd9bd78d253dcc8e13c29f0acd67e080e7c1 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 4 Mar 2009 00:05:56 -0800 Subject: [PATCH 117/580] net pcmcia: worklimit reaches -1 with while (--worklimit >= 0); worklimit reaches -1 after the loop. In 3c589_cs.c this caused a warning not to be printed. In 3c574_cs.c contrastingly, el3_rx() treats worklimit differently: static int el3_rx(struct net_device *dev, int worklimit) { while (--worklimit >= 0) { ... } return worklimit; } el3_rx() is only called by function el3_interrupt(): twice: static irqreturn_t el3_interrupt(int irq, void *dev_id) { int work_budget = max_interrupt_work; while(...) { if (...) work_budget = el3_rx(dev, work_budget); if (...) work_budget = el3_rx(dev, work_budget); if (--work_budget < 0) { ... break; } } } The error path can occur 2 too early. Signed-off-by: Roel Kluin Signed-off-by: David S. Miller --- drivers/net/pcmcia/3c574_cs.c | 3 ++- drivers/net/pcmcia/3c589_cs.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/pcmcia/3c574_cs.c b/drivers/net/pcmcia/3c574_cs.c index e5cb6b1f0ebd..2404a838b1fe 100644 --- a/drivers/net/pcmcia/3c574_cs.c +++ b/drivers/net/pcmcia/3c574_cs.c @@ -1035,7 +1035,8 @@ static int el3_rx(struct net_device *dev, int worklimit) DEBUG(3, "%s: in rx_packet(), status %4.4x, rx_status %4.4x.\n", dev->name, inw(ioaddr+EL3_STATUS), inw(ioaddr+RxStatus)); while (!((rx_status = inw(ioaddr + RxStatus)) & 0x8000) && - (--worklimit >= 0)) { + worklimit > 0) { + worklimit--; if (rx_status & 0x4000) { /* Error, update stats. */ short error = rx_status & 0x3800; dev->stats.rx_errors++; diff --git a/drivers/net/pcmcia/3c589_cs.c b/drivers/net/pcmcia/3c589_cs.c index 73ecc657999d..1e01b8a6dbf3 100644 --- a/drivers/net/pcmcia/3c589_cs.c +++ b/drivers/net/pcmcia/3c589_cs.c @@ -857,7 +857,8 @@ static int el3_rx(struct net_device *dev) DEBUG(3, "%s: in rx_packet(), status %4.4x, rx_status %4.4x.\n", dev->name, inw(ioaddr+EL3_STATUS), inw(ioaddr+RX_STATUS)); while (!((rx_status = inw(ioaddr + RX_STATUS)) & 0x8000) && - (--worklimit >= 0)) { + worklimit > 0) { + worklimit--; if (rx_status & 0x4000) { /* Error, update stats. */ short error = rx_status & 0x3800; dev->stats.rx_errors++; From 948731115774c2e5ff7409360f35389459502211 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 4 Mar 2009 00:07:57 -0800 Subject: [PATCH 118/580] aoe: error printed 1 too early with while (i-- > 0); i reaches -1 after the loop, so the test below is printed one too early: 0 still means success. Signed-off-by: Roel Kluin Signed-off-by: David S. Miller --- drivers/block/aoe/aoedev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index cc250577d405..eeea477d9601 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -173,7 +173,7 @@ skbfree(struct sk_buff *skb) return; while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0) msleep(Sms); - if (i <= 0) { + if (i < 0) { printk(KERN_ERR "aoe: %s holds ref: %s\n", skb->dev ? skb->dev->name : "netif", From 4a8fd2cfdad4d043a1fadba2f3f340945d966825 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 4 Mar 2009 00:08:39 -0800 Subject: [PATCH 119/580] sungem: another error printed one too early Another error was printed one too early. Signed-off-by: Roel Kluin Signed-off-by: David S. Miller --- drivers/net/sungem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c index 8d64b1da0465..0fcb7503363d 100644 --- a/drivers/net/sungem.c +++ b/drivers/net/sungem.c @@ -1229,7 +1229,7 @@ static void gem_reset(struct gem *gp) break; } while (val & (GREG_SWRST_TXRST | GREG_SWRST_RXRST)); - if (limit <= 0) + if (limit < 0) printk(KERN_ERR "%s: SW reset is ghetto.\n", gp->dev->name); if (gp->phy_type == phy_serialink || gp->phy_type == phy_serdes) From c21986a76220919bb06e4c04a89a8a62aeac107e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Mar 2009 00:18:11 -0800 Subject: [PATCH 120/580] jsflash: stop defining MAJOR_NR Ever since early 2.5 kernels block drivers don't need to define MAJOR_NR anymore, so use the JSFD_MAJOR defined directly and kill it. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- drivers/sbus/char/jsflash.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index a9a9893a5f95..e6d1fc8c54f1 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -38,9 +38,6 @@ #include #include #include - -#define MAJOR_NR JSFD_MAJOR - #include #include #include From f4c13638185c91a269c63fcfb980d89180cf43a1 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 4 Mar 2009 00:19:28 -0800 Subject: [PATCH 121/580] sparc64: wait_event_interruptible_timeout may return -ERESTARTSYS wait_event_interruptible_timeout may return -ERESTARTSYS. Signed-off-by: Roel Kluin Signed-off-by: David S. Miller --- drivers/sbus/char/bbc_i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/sbus/char/bbc_i2c.c b/drivers/sbus/char/bbc_i2c.c index f08e169ba1b5..7e30e5f6e032 100644 --- a/drivers/sbus/char/bbc_i2c.c +++ b/drivers/sbus/char/bbc_i2c.c @@ -129,7 +129,7 @@ static int wait_for_pin(struct bbc_i2c_bus *bp, u8 *status) bp->waiting = 1; add_wait_queue(&bp->wq, &wait); while (limit-- > 0) { - unsigned long val; + long val; val = wait_event_interruptible_timeout( bp->wq, From 0bf3d933085509916335480121761a1b225e6c23 Mon Sep 17 00:00:00 2001 From: Sonic Zhang Date: Thu, 5 Mar 2009 16:44:53 +0800 Subject: [PATCH 122/580] Blackfin arch: fix bug - kgdb fails to continue after setting breakpoint on bf561-ezkit kernel with smp patch Free spinlock before call IPI handlers. Signed-off-by: Sonic Zhang Signed-off-by: Bryan Wu Header from folded patch 'blackfin_arch__fix_bug_-_kgdb_fails_to_continue_after_setting_breakpoint_on_bf561-ezkit_kernel_with_smp_patch-1': Blackfin arch: fix bug - kgdb fails to continue after setting breakpoint on bf561-ezkit kernel with smp patch Don't test l1 code in SMP kernel. Signed-off-by: Sonic Zhang Signed-off-by: Bryan Wu --- arch/blackfin/kernel/kgdb_test.c | 9 +++++++-- arch/blackfin/mach-common/smp.c | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/blackfin/kernel/kgdb_test.c b/arch/blackfin/kernel/kgdb_test.c index 3dba9c17304a..dbcf3e45cb0b 100644 --- a/arch/blackfin/kernel/kgdb_test.c +++ b/arch/blackfin/kernel/kgdb_test.c @@ -20,6 +20,7 @@ static char cmdline[256]; static unsigned long len; +#ifndef CONFIG_SMP static int num1 __attribute__((l1_data)); void kgdb_l1_test(void) __attribute__((l1_text)); @@ -32,6 +33,8 @@ void kgdb_l1_test(void) printk(KERN_ALERT "L1(after change) : data variable addr = 0x%p, data value is %d\n", &num1, num1); return ; } +#endif + #if L2_LENGTH static int num2 __attribute__((l2)); @@ -59,10 +62,12 @@ int kgdb_test(char *name, int len, int count, int z) static int test_proc_output(char *buf) { kgdb_test("hello world!", 12, 0x55, 0x10); +#ifndef CONFIG_SMP kgdb_l1_test(); - #if L2_LENGTH +#endif +#if L2_LENGTH kgdb_l2_test(); - #endif +#endif return 0; } diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 77c992847094..93eab6146079 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -158,10 +158,14 @@ static irqreturn_t ipi_handler(int irq, void *dev_instance) kfree(msg); break; case BFIN_IPI_CALL_FUNC: + spin_unlock(&msg_queue->lock); ipi_call_function(cpu, msg); + spin_lock(&msg_queue->lock); break; case BFIN_IPI_CPU_STOP: + spin_unlock(&msg_queue->lock); ipi_cpu_stop(cpu); + spin_lock(&msg_queue->lock); kfree(msg); break; default: @@ -457,7 +461,7 @@ void smp_icache_flush_range_others(unsigned long start, unsigned long end) smp_flush_data.start = start; smp_flush_data.end = end; - if (smp_call_function(&ipi_flush_icache, &smp_flush_data, 1)) + if (smp_call_function(&ipi_flush_icache, &smp_flush_data, 0)) printk(KERN_WARNING "SMP: failed to run I-cache flush request on other CPUs\n"); } EXPORT_SYMBOL_GPL(smp_icache_flush_range_others); From 199862892e83d04a294eebad45adc40c658b8630 Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Thu, 5 Mar 2009 16:45:55 +0800 Subject: [PATCH 123/580] Blackfin arch: PM_BFIN_WAKE_GP: update help Signed-off-by: Michael Hennerich Signed-off-by: Bryan Wu --- arch/blackfin/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 8f1f97d56e1e..be6097163b07 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -1168,6 +1168,12 @@ config PM_BFIN_WAKE_GP default n help Enable General-Purpose Wake-Up (Voltage Regulator Power-Up) + (all processors, except ADSP-BF549). This option sets + the general-purpose wake-up enable (GPWE) control bit to enable + wake-up upon detection of an active low signal on the /GPW (PH7) pin. + On ADSP-BF549 this option enables the the same functionality on the + /MRXON pin also PH7. + endmenu menu "CPU Frequency scaling" From ff19fed4fe54f2f1dd439ac02969333ea9a9b4ff Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Wed, 4 Mar 2009 17:35:51 +0800 Subject: [PATCH 124/580] Blackfin arch: Fix BUG - kernel fails to build in pm.c when allow wakeup fromi standby by GPIO This feature is not available on BF54x. Signed-off-by: Michael Hennerich Signed-off-by: Bryan Wu --- arch/blackfin/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index be6097163b07..0c1f86e3e44a 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -1129,6 +1129,7 @@ endchoice config PM_WAKEUP_BY_GPIO bool "Allow Wakeup from Standby by GPIO" + depends on PM && !BF54x config PM_WAKEUP_GPIO_NUMBER int "GPIO number" From c18e99cfba746ab0ad8d45e1f351ed990947c58c Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 4 Mar 2009 17:36:49 +0800 Subject: [PATCH 125/580] Blackfin arch: update anomaly sheets to match latest public info Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu --- .../mach-bf518/include/mach/anomaly.h | 13 ++++++++++-- .../mach-bf527/include/mach/anomaly.h | 3 ++- .../mach-bf533/include/mach/anomaly.h | 4 ++-- .../mach-bf537/include/mach/anomaly.h | 4 ++-- .../mach-bf538/include/mach/anomaly.h | 3 ++- .../mach-bf548/include/mach/anomaly.h | 21 +++++++++++++++---- .../mach-bf561/include/mach/anomaly.h | 4 ++-- 7 files changed, 38 insertions(+), 14 deletions(-) diff --git a/arch/blackfin/mach-bf518/include/mach/anomaly.h b/arch/blackfin/mach-bf518/include/mach/anomaly.h index 6e1670072829..143a29dfe870 100644 --- a/arch/blackfin/mach-bf518/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf518/include/mach/anomaly.h @@ -2,12 +2,12 @@ * File: include/asm-blackfin/mach-bf518/anomaly.h * Bugs: Enter bugs at http://blackfin.uclinux.org/ * - * Copyright (C) 2004-2008 Analog Devices Inc. + * Copyright (C) 2004-2009 Analog Devices Inc. * Licensed under the GPL-2 or later. */ /* This file shoule be up to date with: - * - ???? + * - Revision B, 02/03/2009; ADSP-BF512/BF514/BF516/BF518 Blackfin Processor Anomaly List */ #ifndef _MACH_ANOMALY_H_ @@ -19,6 +19,8 @@ #define ANOMALY_05000122 (1) /* False Hardware Error from an Access in the Shadow of a Conditional Branch */ #define ANOMALY_05000245 (1) +/* Incorrect Timer Pulse Width in Single-Shot PWM_OUT Mode with External Clock */ +#define ANOMALY_05000254 (1) /* Sensitivity To Noise with Slow Input Edge Rates on External SPORT TX and RX Clocks */ #define ANOMALY_05000265 (1) /* False Hardware Errors Caused by Fetches at the Boundary of Reserved Memory */ @@ -53,6 +55,12 @@ #define ANOMALY_05000443 (1) /* Incorrect L1 Instruction Bank B Memory Map Location */ #define ANOMALY_05000444 (1) +/* Incorrect Default Hysteresis Setting for RESET, NMI, and BMODE Signals */ +#define ANOMALY_05000452 (1) +/* PWM_TRIPB Signal Not Available on PG10 */ +#define ANOMALY_05000453 (1) +/* PPI_FS3 is Driven One Half Cycle Later Than PPI Data */ +#define ANOMALY_05000455 (1) /* Anomalies that don't exist on this proc */ #define ANOMALY_05000125 (0) @@ -67,6 +75,7 @@ #define ANOMALY_05000273 (0) #define ANOMALY_05000278 (0) #define ANOMALY_05000285 (0) +#define ANOMALY_05000305 (0) #define ANOMALY_05000307 (0) #define ANOMALY_05000311 (0) #define ANOMALY_05000312 (0) diff --git a/arch/blackfin/mach-bf527/include/mach/anomaly.h b/arch/blackfin/mach-bf527/include/mach/anomaly.h index fdc87839af60..4ffccb6de52d 100644 --- a/arch/blackfin/mach-bf527/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf527/include/mach/anomaly.h @@ -2,7 +2,7 @@ * File: include/asm-blackfin/mach-bf527/anomaly.h * Bugs: Enter bugs at http://blackfin.uclinux.org/ * - * Copyright (C) 2004-2008 Analog Devices Inc. + * Copyright (C) 2004-2009 Analog Devices Inc. * Licensed under the GPL-2 or later. */ @@ -169,6 +169,7 @@ #define ANOMALY_05000273 (0) #define ANOMALY_05000278 (0) #define ANOMALY_05000285 (0) +#define ANOMALY_05000305 (0) #define ANOMALY_05000307 (0) #define ANOMALY_05000311 (0) #define ANOMALY_05000312 (0) diff --git a/arch/blackfin/mach-bf533/include/mach/anomaly.h b/arch/blackfin/mach-bf533/include/mach/anomaly.h index 657dd1820a93..9e838f851720 100644 --- a/arch/blackfin/mach-bf533/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf533/include/mach/anomaly.h @@ -2,7 +2,7 @@ * File: include/asm-blackfin/mach-bf533/anomaly.h * Bugs: Enter bugs at http://blackfin.uclinux.org/ * - * Copyright (C) 2004-2008 Analog Devices Inc. + * Copyright (C) 2004-2009 Analog Devices Inc. * Licensed under the GPL-2 or later. */ @@ -160,7 +160,7 @@ #define ANOMALY_05000301 (__SILICON_REVISION__ < 6) /* SSYNCs After Writes To DMA MMR Registers May Not Be Handled Correctly */ #define ANOMALY_05000302 (__SILICON_REVISION__ < 5) -/* New Feature: Additional Hysteresis on SPORT Input Pins (Not Available On Older Silicon) */ +/* SPORT_HYS Bit in PLL_CTL Register Is Not Functional */ #define ANOMALY_05000305 (__SILICON_REVISION__ < 5) /* New Feature: Additional PPI Frame Sync Sampling Options (Not Available On Older Silicon) */ #define ANOMALY_05000306 (__SILICON_REVISION__ < 5) diff --git a/arch/blackfin/mach-bf537/include/mach/anomaly.h b/arch/blackfin/mach-bf537/include/mach/anomaly.h index fe1ae4c75cd6..438b43ffd41e 100644 --- a/arch/blackfin/mach-bf537/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf537/include/mach/anomaly.h @@ -2,7 +2,7 @@ * File: include/asm-blackfin/mach-bf537/anomaly.h * Bugs: Enter bugs at http://blackfin.uclinux.org/ * - * Copyright (C) 2004-2008 Analog Devices Inc. + * Copyright (C) 2004-2009 Analog Devices Inc. * Licensed under the GPL-2 or later. */ @@ -110,7 +110,7 @@ #define ANOMALY_05000301 (1) /* SSYNCs After Writes To CAN/DMA MMR Registers Are Not Always Handled Correctly */ #define ANOMALY_05000304 (__SILICON_REVISION__ < 3) -/* New Feature: Additional Hysteresis on SPORT Input Pins (Not Available On Older Silicon) */ +/* SPORT_HYS Bit in PLL_CTL Register Is Not Functional */ #define ANOMALY_05000305 (__SILICON_REVISION__ < 3) /* SCKELOW Bit Does Not Maintain State Through Hibernate */ #define ANOMALY_05000307 (__SILICON_REVISION__ < 3) diff --git a/arch/blackfin/mach-bf538/include/mach/anomaly.h b/arch/blackfin/mach-bf538/include/mach/anomaly.h index 56ea454709ca..f4ece1cfca43 100644 --- a/arch/blackfin/mach-bf538/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf538/include/mach/anomaly.h @@ -2,7 +2,7 @@ * File: include/asm-blackfin/mach-bf538/anomaly.h * Bugs: Enter bugs at http://blackfin.uclinux.org/ * - * Copyright (C) 2004-2008 Analog Devices Inc. + * Copyright (C) 2004-2009 Analog Devices Inc. * Licensed under the GPL-2 or later. */ @@ -120,6 +120,7 @@ #define ANOMALY_05000198 (0) #define ANOMALY_05000230 (0) #define ANOMALY_05000263 (0) +#define ANOMALY_05000305 (0) #define ANOMALY_05000311 (0) #define ANOMALY_05000323 (0) #define ANOMALY_05000353 (1) diff --git a/arch/blackfin/mach-bf548/include/mach/anomaly.h b/arch/blackfin/mach-bf548/include/mach/anomaly.h index d9d10a73c1dc..882e40ccf0d1 100644 --- a/arch/blackfin/mach-bf548/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf548/include/mach/anomaly.h @@ -2,12 +2,12 @@ * File: include/asm-blackfin/mach-bf548/anomaly.h * Bugs: Enter bugs at http://blackfin.uclinux.org/ * - * Copyright (C) 2004-2008 Analog Devices Inc. + * Copyright (C) 2004-2009 Analog Devices Inc. * Licensed under the GPL-2 or later. */ /* This file shoule be up to date with: - * - Revision G, 08/07/2008; ADSP-BF542/BF544/BF547/BF548/BF549 Blackfin Processor Anomaly List + * - Revision H, 01/16/2009; ADSP-BF542/BF544/BF547/BF548/BF549 Blackfin Processor Anomaly List */ #ifndef _MACH_ANOMALY_H_ @@ -91,8 +91,6 @@ #define ANOMALY_05000371 (__SILICON_REVISION__ < 2) /* USB DP/DM Data Pins May Lose State When Entering Hibernate */ #define ANOMALY_05000372 (__SILICON_REVISION__ < 1) -/* Mobile DDR Operation Not Functional */ -#define ANOMALY_05000377 (1) /* Security/Authentication Speedpath Causes Authentication To Fail To Initiate */ #define ANOMALY_05000378 (__SILICON_REVISION__ < 2) /* 16-Bit NAND FLASH Boot Mode Is Not Functional */ @@ -157,8 +155,22 @@ #define ANOMALY_05000429 (__SILICON_REVISION__ < 2) /* Software System Reset Corrupts PLL_LOCKCNT Register */ #define ANOMALY_05000430 (__SILICON_REVISION__ >= 2) +/* Incorrect Use of Stack in Lockbox Firmware During Authentication */ +#define ANOMALY_05000431 (__SILICON_REVISION__ < 3) +/* OTP Write Accesses Not Supported */ +#define ANOMALY_05000442 (__SILICON_REVISION__ < 1) /* IFLUSH Instruction at End of Hardware Loop Causes Infinite Stall */ #define ANOMALY_05000443 (1) +/* CDMAPRIO and L2DMAPRIO Bits in the SYSCR Register Are Not Functional */ +#define ANOMALY_05000446 (1) +/* UART IrDA Receiver Fails on Extended Bit Pulses */ +#define ANOMALY_05000447 (1) +/* DDR Clock Duty Cycle Spec Violation (tCH, tCL) */ +#define ANOMALY_05000448 (__SILICON_REVISION__ == 1) +/* Reduced Timing Margins on DDR Output Setup and Hold (tDS and tDH) */ +#define ANOMALY_05000449 (__SILICON_REVISION__ == 1) +/* USB DMA Mode 1 Short Packet Data Corruption */ +#define ANOMALY_05000450 (1 /* Anomalies that don't exist on this proc */ #define ANOMALY_05000125 (0) @@ -172,6 +184,7 @@ #define ANOMALY_05000266 (0) #define ANOMALY_05000273 (0) #define ANOMALY_05000278 (0) +#define ANOMALY_05000305 (0) #define ANOMALY_05000307 (0) #define ANOMALY_05000311 (0) #define ANOMALY_05000323 (0) diff --git a/arch/blackfin/mach-bf561/include/mach/anomaly.h b/arch/blackfin/mach-bf561/include/mach/anomaly.h index 24d3a2264e44..d78bc6b49e61 100644 --- a/arch/blackfin/mach-bf561/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf561/include/mach/anomaly.h @@ -2,7 +2,7 @@ * File: include/asm-blackfin/mach-bf561/anomaly.h * Bugs: Enter bugs at http://blackfin.uclinux.org/ * - * Copyright (C) 2004-2008 Analog Devices Inc. + * Copyright (C) 2004-2009 Analog Devices Inc. * Licensed under the GPL-2 or later. */ @@ -224,7 +224,7 @@ #define ANOMALY_05000301 (1) /* SSYNCs After Writes To DMA MMR Registers May Not Be Handled Correctly */ #define ANOMALY_05000302 (1) -/* New Feature: Additional Hysteresis on SPORT Input Pins (Not Available On Older Silicon) */ +/* SPORT_HYS Bit in PLL_CTL Register Is Not Functional */ #define ANOMALY_05000305 (__SILICON_REVISION__ < 5) /* SCKELOW Bit Does Not Maintain State Through Hibernate */ #define ANOMALY_05000307 (__SILICON_REVISION__ < 5) From 540aca06b737cc38965b52eeceefba3d24376461 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 4 Mar 2009 11:46:40 +0200 Subject: [PATCH 126/580] x86: move devmem_is_allowed() to common mm/init.c Impact: cleanup The function is identical on 32-bit and 64-bit configurations so move it to the common mm/init.c file. Signed-off-by: Pekka Enberg LKML-Reference: <1236160001.29024.29.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 24 ++++++++++++++++++++++++ arch/x86/mm/init_32.c | 21 --------------------- arch/x86/mm/init_64.c | 22 ---------------------- 3 files changed, 24 insertions(+), 43 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ce6a722587d8..f89df52683c5 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1,9 +1,33 @@ +#include #include + #include #include +#include #include #include +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + void free_init_pages(char *what, unsigned long begin, unsigned long end) { unsigned long addr = begin; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0b087dcd2c18..917c4e60c767 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -354,27 +354,6 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, } } -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. - * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. - */ -int devmem_is_allowed(unsigned long pagenr) -{ - if (pagenr <= 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) - return 0; - if (!page_is_ram(pagenr)) - return 1; - return 0; -} - pte_t *kmap_pte; pgprot_t kmap_prot; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 724e537432e7..074435e79824 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -876,28 +876,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. - * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. - */ -int devmem_is_allowed(unsigned long pagenr) -{ - if (pagenr <= 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) - return 0; - if (!page_is_ram(pagenr)) - return 1; - return 0; -} - - static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; From 73af76dfd1f998dba71d8e8e785cbe77a990bf17 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 4 Mar 2009 11:47:17 +0100 Subject: [PATCH 127/580] x86, mce: fix build failure in arch/x86/kernel/cpu/mcheck/threshold.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: build fix The APIC code rewrite in the x86 tree broke the x86/mce branch: arch/x86/kernel/cpu/mcheck/threshold.c: In function ‘mce_threshold_interrupt’: arch/x86/kernel/cpu/mcheck/threshold.c:24: error: implicit declaration of function ‘ack_APIC_irq’ Also tidy up the file a bit while at it. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/threshold.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index e4b8a3833fc5..23ee9e730f78 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -1,9 +1,13 @@ -/* Common corrected MCE threshold handler code */ -#include +/* + * Common corrected MCE threshold handler code: + */ #include -#include +#include + #include +#include #include +#include static void default_threshold_interrupt(void) { From fe7ca2e1e847b65c12d245cbf402af89da96888a Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Wed, 4 Mar 2009 03:18:11 -0800 Subject: [PATCH 128/580] IPv6: add "disable" module parameter support to ipv6.ko Add "disable" module parameter support to ipv6.ko by specifying "disable=1" on module load. We just do the minimum of initializing inetsw6[] so calls from other modules to inet6_register_protosw() won't OOPs, then bail out. No IPv6 addresses or sockets can be created as a result, and a reboot is required to enable IPv6. Signed-off-by: Brian Haley Signed-off-by: David S. Miller --- Documentation/networking/ipv6.txt | 35 +++++++++++++++++++++++++++++++ net/ipv6/af_inet6.c | 21 ++++++++++++++----- 2 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 Documentation/networking/ipv6.txt diff --git a/Documentation/networking/ipv6.txt b/Documentation/networking/ipv6.txt new file mode 100644 index 000000000000..268e5c103dd8 --- /dev/null +++ b/Documentation/networking/ipv6.txt @@ -0,0 +1,35 @@ + +Options for the ipv6 module are supplied as parameters at load time. + +Module options may be given as command line arguments to the insmod +or modprobe command, but are usually specified in either the +/etc/modules.conf or /etc/modprobe.conf configuration file, or in a +distro-specific configuration file. + +The available ipv6 module parameters are listed below. If a parameter +is not specified the default value is used. + +The parameters are as follows: + +disable + + Specifies whether to load the IPv6 module, but disable all + its functionality. This might be used when another module + has a dependency on the IPv6 module being loaded, but no + IPv6 addresses or operations are desired. + + The possible values and their effects are: + + 0 + IPv6 is enabled. + + This is the default value. + + 1 + IPv6 is disabled. + + No IPv6 addresses will be added to interfaces, and + it will not be possible to open an IPv6 socket. + + A reboot is required to enable IPv6. + diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c802bc1658a8..da944eca2ca6 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -72,6 +72,10 @@ MODULE_LICENSE("GPL"); static struct list_head inetsw6[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw6_lock); +static int disable_ipv6 = 0; +module_param_named(disable, disable_ipv6, int, 0); +MODULE_PARM_DESC(disable, "Disable IPv6 such that it is non-functional"); + static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); @@ -991,10 +995,21 @@ static int __init inet6_init(void) { struct sk_buff *dummy_skb; struct list_head *r; - int err; + int err = 0; BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb)); + /* Register the socket-side information for inet6_create. */ + for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) + INIT_LIST_HEAD(r); + + if (disable_ipv6) { + printk(KERN_INFO + "IPv6: Loaded, but administratively disabled, " + "reboot required to enable\n"); + goto out; + } + err = proto_register(&tcpv6_prot, 1); if (err) goto out; @@ -1012,10 +1027,6 @@ static int __init inet6_init(void) goto out_unregister_udplite_proto; - /* Register the socket-side information for inet6_create. */ - for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) - INIT_LIST_HEAD(r); - /* We MUST register RAW sockets before we create the ICMP6, * IGMP6, or NDISC control sockets. */ From fb13d9f9e450bceafd88ac8a98f7a98e8096a5fe Mon Sep 17 00:00:00 2001 From: Brian Haley Date: Wed, 4 Mar 2009 03:20:26 -0800 Subject: [PATCH 129/580] SCTP: change sctp_ctl_sock_init() to try IPv4 if IPv6 fails Change sctp_ctl_sock_init() to try IPv4 if IPv6 socket registration fails. Required if the IPv6 module is loaded with "disable=1", else SCTP will fail to load. Signed-off-by: Brian Haley Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/protocol.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 4e6638449639..c4986d0f7419 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -717,15 +717,20 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev, static int sctp_ctl_sock_init(void) { int err; - sa_family_t family; + sa_family_t family = PF_INET; if (sctp_get_pf_specific(PF_INET6)) family = PF_INET6; - else - family = PF_INET; err = inet_ctl_sock_create(&sctp_ctl_sock, family, SOCK_SEQPACKET, IPPROTO_SCTP, &init_net); + + /* If IPv6 socket could not be created, try the IPv4 socket */ + if (err < 0 && family == PF_INET6) + err = inet_ctl_sock_create(&sctp_ctl_sock, AF_INET, + SOCK_SEQPACKET, IPPROTO_SCTP, + &init_net); + if (err < 0) { printk(KERN_ERR "SCTP: Failed to create the SCTP control socket.\n"); From 5ad8b7d12605e88d1e532061699102797fdefe08 Mon Sep 17 00:00:00 2001 From: Helge Bahmann Date: Wed, 4 Mar 2009 21:49:14 +1000 Subject: [PATCH 130/580] drm: fix double lock typo [airlied: you shall not retype patches from other trees half asleep] Signed-of-by: Dave Airlie --- drivers/gpu/drm/drm_stub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_stub.c b/drivers/gpu/drm/drm_stub.c index 096e2a37446d..7c8b15b22bf2 100644 --- a/drivers/gpu/drm/drm_stub.c +++ b/drivers/gpu/drm/drm_stub.c @@ -168,7 +168,7 @@ int drm_setmaster_ioctl(struct drm_device *dev, void *data, file_priv->minor->master != file_priv->master) { mutex_lock(&dev->struct_mutex); file_priv->minor->master = drm_master_get(file_priv->master); - mutex_lock(&dev->struct_mutex); + mutex_unlock(&dev->struct_mutex); } return 0; From ff0c0874905fb312ca1491bbdac2653b0b48c20b Mon Sep 17 00:00:00 2001 From: Brian Maly Date: Tue, 3 Mar 2009 21:55:31 -0500 Subject: [PATCH 131/580] x86: fix DMI on EFI Impact: reactivate DMI quirks on EFI hardware DMI tables are loaded by EFI, so the dmi calls must happen after efi_init() and not before. Currently Apple hardware uses DMI to determine the framebuffer mappings for efifb. Without DMI working you also have no video on MacBook Pro. This patch resolves the DMI issue for EFI hardware (DMI is now properly detected at boot), and additionally efifb now loads on Apple hardware (i.e. video works). Signed-off-by: Brian Maly Acked-by: Yinghai Lu Cc: ying.huang@intel.com LKML-Reference: <49ADEDA3.1030406@redhat.com> Signed-off-by: Ingo Molnar arch/x86/kernel/setup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) --- arch/x86/kernel/setup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c461f6d69074..6a8811a69324 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -770,6 +770,9 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); + if (efi_enabled) + efi_init(); + dmi_scan_machine(); dmi_check_system(bad_bios_dmi_table); @@ -789,8 +792,6 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - if (efi_enabled) - efi_init(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { From 6298e719cf388f43b674f43799af467d3e4e5aa7 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 4 Mar 2009 10:16:07 +0200 Subject: [PATCH 132/580] x86: set_highmem_pages_init() cleanup, #2 Impact: cleanup The zones are set up at this stage so there's a highmem zone available even for the UMA case. The only difference there is that for machines that have CONFIG_HIGHMEM enabled but don't have any highmem available, ->zone_start_pfn is zero whereas highstart_pfn is non-zero). The field is left zeroed because of the !size test in free_area_init_core() but shouldn't be a problem because add_highpages_with_active_regions() handles empty ranges just fine. Signed-off-by: Pekka Enberg Cc: Mel Gorman LKML-Reference: <1236154567.29024.23.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 00f127c80b0e..d11745334a67 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -158,7 +158,6 @@ EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); -#ifdef CONFIG_NUMA void __init set_highmem_pages_init(void) { struct zone *zone; @@ -182,11 +181,3 @@ void __init set_highmem_pages_init(void) } totalram_pages += totalhigh_pages; } -#else -void __init set_highmem_pages_init(void) -{ - add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); - - totalram_pages += totalhigh_pages; -} -#endif /* CONFIG_NUMA */ From 87d99d6f7ee3ec73b2b0212427b173502ec9d6cb Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 4 Mar 2009 10:07:40 -0800 Subject: [PATCH 133/580] ARM: OMAP: Fix compile error if pm.h is included Change the error to a warning. Signed-off-by: David Brownell Signed-off-by: Tony Lindgren --- arch/arm/plat-omap/include/mach/pm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/plat-omap/include/mach/pm.h b/arch/arm/plat-omap/include/mach/pm.h index 2a9c27ad4c37..37e2f0f38b46 100644 --- a/arch/arm/plat-omap/include/mach/pm.h +++ b/arch/arm/plat-omap/include/mach/pm.h @@ -108,7 +108,7 @@ !defined(CONFIG_ARCH_OMAP15XX) && \ !defined(CONFIG_ARCH_OMAP16XX) && \ !defined(CONFIG_ARCH_OMAP24XX) -#error "Power management for this processor not implemented yet" +#warning "Power management for this processor not implemented yet" #endif #ifndef __ASSEMBLER__ From 80ea3bac3a47bc73efa334d0dd57099d0ff14216 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Wed, 4 Mar 2009 10:07:41 -0800 Subject: [PATCH 134/580] ARM: OMAP: sched_clock() corrected After my OMAP3 board has been running for a while, I'm seeing weird latency traces like this: sh-1574 0d.h2 153us : do_timer (tick_do_update_jiffies64) sh-1574 0d.h2 153us : update_wall_time (do_timer) sh-1574 0d.h2 153us!: omap_32k_read (update_wall_time) sh-1574 0d.h2 1883us : update_xtime_cache (update_wall_time) sh-1574 0d.h2 1883us : clocksource_get_next (update_wall_time) sh-1574 0d.h2 1883us+: _spin_lock_irqsave (clocksource_get_next) and after a while: sh-17818 0d.h3 153us : do_timer (tick_do_update_jiffies64) sh-17818 0d.h3 153us : update_wall_time (do_timer) sh-17818 0d.h3 153us!: omap_32k_read (update_wall_time) sh-17818 0d.h3 1915us : update_xtime_cache (update_wall_time) sh-17818 0d.h3 1915us+: clocksource_get_next (update_wall_time) sh-17818 0d.h3 1945us : _spin_lock_irqsave (clocksource_get_next) Turns out that sched_clock() is using cyc2ns(), which returns NTP adjusted time. The sched_clock() frequency should not be adjusted. The patch deletes omap_32k_ticks_to_nsecs() and rewrites sched_clock() to do the conversion using the constant multiplier. Signed-off-by: Aaro Koskinen Signed-off-by: Tony Lindgren --- arch/arm/plat-omap/common.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/arm/plat-omap/common.c b/arch/arm/plat-omap/common.c index 0843b8882f93..6825fbb5a056 100644 --- a/arch/arm/plat-omap/common.c +++ b/arch/arm/plat-omap/common.c @@ -199,21 +199,17 @@ static struct clocksource clocksource_32k = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -/* - * Rounds down to nearest nsec. - */ -unsigned long long omap_32k_ticks_to_nsecs(unsigned long ticks_32k) -{ - return cyc2ns(&clocksource_32k, ticks_32k); -} - /* * Returns current time from boot in nsecs. It's OK for this to wrap * around for now, as it's just a relative time stamp. */ unsigned long long sched_clock(void) { - return omap_32k_ticks_to_nsecs(omap_32k_read()); + unsigned long long ret; + + ret = (unsigned long long)omap_32k_read(); + ret = (ret * clocksource_32k.mult_orig) >> clocksource_32k.shift; + return ret; } static int __init omap_init_clocksource_32k(void) From e951651657610305c9af0045fd64c6a5b0bc4b80 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Wed, 4 Mar 2009 10:07:41 -0800 Subject: [PATCH 135/580] ARM: OMAP: Allow I2C bus driver to be compiled as a module Fixes a linker error when OMAP I2C bus driver is compiled as a module: ERROR: "i2c_register_board_info" [arch/arm/plat-omap/i2c.ko] undefined! The I2C utility functions used for board initialization should be always built-in. Signed-off-by: Aaro Koskinen Acked-by: Jarkko Nikula Signed-off-by: Tony Lindgren --- arch/arm/plat-omap/Makefile | 3 ++- arch/arm/plat-omap/include/mach/common.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm/plat-omap/Makefile b/arch/arm/plat-omap/Makefile index deaff58878a2..04a100cfb8e5 100644 --- a/arch/arm/plat-omap/Makefile +++ b/arch/arm/plat-omap/Makefile @@ -18,7 +18,8 @@ obj-$(CONFIG_CPU_FREQ) += cpu-omap.o obj-$(CONFIG_OMAP_DM_TIMER) += dmtimer.o obj-$(CONFIG_OMAP_DEBUG_DEVICES) += debug-devices.o obj-$(CONFIG_OMAP_DEBUG_LEDS) += debug-leds.o -obj-$(CONFIG_I2C_OMAP) += i2c.o +i2c-omap-$(CONFIG_I2C_OMAP) := i2c.o +obj-y += $(i2c-omap-m) $(i2c-omap-y) # OMAP mailbox framework obj-$(CONFIG_OMAP_MBOX_FWK) += mailbox.o diff --git a/arch/arm/plat-omap/include/mach/common.h b/arch/arm/plat-omap/include/mach/common.h index ef70e2b0f054..e746ec7e785e 100644 --- a/arch/arm/plat-omap/include/mach/common.h +++ b/arch/arm/plat-omap/include/mach/common.h @@ -35,7 +35,7 @@ extern void omap_map_common_io(void); extern struct sys_timer omap_timer; extern void omap_serial_init(void); extern void omap_serial_enable_clocks(int enable); -#ifdef CONFIG_I2C_OMAP +#if defined(CONFIG_I2C_OMAP) || defined(CONFIG_I2C_OMAP_MODULE) extern int omap_register_i2c_bus(int bus_id, u32 clkrate, struct i2c_board_info const *info, unsigned len); From 8ca7fe267f58462825729443f3e3b44ef4901cf0 Mon Sep 17 00:00:00 2001 From: Koen Kooi Date: Wed, 4 Mar 2009 10:07:42 -0800 Subject: [PATCH 136/580] ARM: OMAP: board-omap3beagle: set i2c-3 to 100kHz Changing it do 100kHz is needed to make more devices works properly. Controlling the TI DLP Pico projector[1] doesn't work properly at 400kHz, 100kHz and lower work fine. EDID readout is unaffected by this change. [1] http://focus.ti.com/dlpdmd/docs/dlpdiscovery.tsp?sectionId=60&tabId=2234 Signed-off-by: Koen Kooi Acked-by: David Brownell Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/board-omap3beagle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/mach-omap2/board-omap3beagle.c b/arch/arm/mach-omap2/board-omap3beagle.c index 38c88fbe658d..e39cd2c46cfa 100644 --- a/arch/arm/mach-omap2/board-omap3beagle.c +++ b/arch/arm/mach-omap2/board-omap3beagle.c @@ -178,7 +178,9 @@ static int __init omap3_beagle_i2c_init(void) #ifdef CONFIG_I2C2_OMAP_BEAGLE omap_register_i2c_bus(2, 400, NULL, 0); #endif - omap_register_i2c_bus(3, 400, NULL, 0); + /* Bus 3 is attached to the DVI port where devices like the pico DLP + * projector don't work reliably with 400kHz */ + omap_register_i2c_bus(3, 100, NULL, 0); return 0; } From dd39ecf522ba86c70809715af46e6557f6491131 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 4 Mar 2009 10:58:33 +0800 Subject: [PATCH 137/580] x86: EFI: Back efi_ioremap with init_memory_mapping instead of FIX_MAP Impact: Fix boot failure on EFI system with large runtime memory range Brian Maly reported that some EFI system with large runtime memory range can not boot. Because the FIX_MAP used to map runtime memory range is smaller than run time memory range. This patch fixes this issue by re-implement efi_ioremap() with init_memory_mapping(). Reported-and-tested-by: Brian Maly Signed-off-by: Huang Ying Cc: Brian Maly Cc: Yinghai Lu LKML-Reference: <1236135513.6204.306.camel@yhuang-dev.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/efi.h | 2 -- arch/x86/include/asm/fixmap_64.h | 4 ---- arch/x86/kernel/efi.c | 7 +++++-- arch/x86/kernel/efi_64.c | 21 ++++----------------- 4 files changed, 9 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index ca5ffb2856b6..edc90f23e708 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #else /* !CONFIG_X86_32 */ -#define MAX_EFI_IO_PAGES 100 - extern u64 efi_call0(void *fp); extern u64 efi_call1(void *fp, u64 arg1); extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); diff --git a/arch/x86/include/asm/fixmap_64.h b/arch/x86/include/asm/fixmap_64.h index 00a30ab9b1a5..8be740977db8 100644 --- a/arch/x86/include/asm/fixmap_64.h +++ b/arch/x86/include/asm/fixmap_64.h @@ -16,7 +16,6 @@ #include #include #include -#include /* * Here we define all the compile-time 'special' virtual @@ -43,9 +42,6 @@ enum fixed_addresses { FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, - FIX_EFI_IO_MAP_LAST_PAGE, - FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE - + MAX_EFI_IO_PAGES - 1, #ifdef CONFIG_PARAVIRT FIX_PARAVIRT_BOOTMAP, #endif diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 1119d247fe11..eb1ef3b67dd5 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -467,7 +467,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md; efi_status_t status; unsigned long size; - u64 end, systab, addr, npages; + u64 end, systab, addr, npages, end_pfn; void *p, *va; efi.systab = NULL; @@ -479,7 +479,10 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; - if (PFN_UP(end) <= max_low_pfn_mapped) + end_pfn = PFN_UP(end); + if (end_pfn <= max_low_pfn_mapped + || (end_pfn > (1UL << (32 - PAGE_SHIFT)) + && end_pfn <= max_pfn_mapped)) va = __va(md->phys_addr); else va = efi_ioremap(md->phys_addr, size); diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 652c5287215f..cb783b92c50c 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -99,24 +99,11 @@ void __init efi_call_phys_epilog(void) void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size) { - static unsigned pages_mapped __initdata; - unsigned i, pages; - unsigned long offset; + unsigned long last_map_pfn; - pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr); - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - - if (pages_mapped + pages > MAX_EFI_IO_PAGES) + last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); + if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) return NULL; - for (i = 0; i < pages; i++) { - __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, - phys_addr, PAGE_KERNEL); - phys_addr += PAGE_SIZE; - pages_mapped++; - } - - return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \ - (pages_mapped - pages)) + offset; + return (void __iomem *)__va(phys_addr); } From acaabe795a62bba089c185917af86b44654313dc Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 4 Mar 2009 12:56:05 -0600 Subject: [PATCH 138/580] x86: UV, SGI RTC: add generic system vector This patch allocates a system interrupt vector for various platform specific uses. Signed-off-by: Dimitri Sivanich Cc: Andrew Morton Cc: john stultz LKML-Reference: <20090304185605.GA24419@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/entry_arch.h | 2 ++ arch/x86/include/asm/hardirq.h | 1 + arch/x86/include/asm/hw_irq.h | 1 + arch/x86/include/asm/irq.h | 1 + arch/x86/include/asm/irq_vectors.h | 5 +++++ arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irq.c | 34 ++++++++++++++++++++++++++++++ arch/x86/kernel/irqinit_32.c | 3 +++ arch/x86/kernel/irqinit_64.c | 3 +++ 9 files changed, 52 insertions(+) diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 854d538ae857..c2e6bedaf258 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -33,6 +33,8 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7, smp_invalidate_interrupt) #endif +BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR) + /* * every pentium local APIC has two 'local interrupts', with a * soft-definable vector attached to both interrupts, one of diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 176f058e7159..039db6aa8e02 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -12,6 +12,7 @@ typedef struct { unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; #endif + unsigned int generic_irqs; /* arch dependent */ #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 370e1c83bb49..b762ea49bd70 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -27,6 +27,7 @@ /* Interrupt handlers registered during init_IRQ */ extern void apic_timer_interrupt(void); +extern void generic_interrupt(void); extern void error_interrupt(void); extern void spurious_interrupt(void); extern void thermal_interrupt(void); diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 107eb2196691..f38481bcd455 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -36,6 +36,7 @@ static inline int irq_canonicalize(int irq) extern void fixup_irqs(void); #endif +extern void (*generic_interrupt_extension)(void); extern void init_IRQ(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 8a285f356f8a..3cbd79bbb47c 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -111,6 +111,11 @@ */ #define LOCAL_PERF_VECTOR 0xee +/* + * Generic system vector for platform specific use + */ +#define GENERIC_INTERRUPT_VECTOR 0xed + /* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 83d1836b9467..7ba4621c0dfa 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \ #endif apicinterrupt LOCAL_TIMER_VECTOR \ apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt GENERIC_INTERRUPT_VECTOR \ + generic_interrupt smp_generic_interrupt #ifdef CONFIG_SMP apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index f13ca1650aaf..b864341dcc45 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -15,6 +15,9 @@ atomic_t irq_err_count; +/* Function pointer for generic interrupt vector handling */ +void (*generic_interrupt_extension)(void) = NULL; + /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. @@ -56,6 +59,12 @@ static int show_other_interrupts(struct seq_file *p) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); #endif + if (generic_interrupt_extension) { + seq_printf(p, "PLT: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); + seq_printf(p, " Platform interrupts\n"); + } #ifdef CONFIG_SMP seq_printf(p, "RES: "); for_each_online_cpu(j) @@ -163,6 +172,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; #endif + if (generic_interrupt_extension) + sum += irq_stats(cpu)->generic_irqs; #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -226,4 +237,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) return 1; } +/* + * Handler for GENERIC_INTERRUPT_VECTOR. + */ +void smp_generic_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + + exit_idle(); + + irq_enter(); + + inc_irq_stat(generic_irqs); + + if (generic_interrupt_extension) + generic_interrupt_extension(); + + irq_exit(); + + set_irq_regs(old_regs); +} + EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 50b8c3a3006c..bc1326105448 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -175,6 +175,9 @@ void __init native_init_IRQ(void) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + /* generic IPI for platform specific use */ + alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index da481a1e3f30..c7a49e0ffbfb 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -147,6 +147,9 @@ static void __init apic_intr_init(void) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + /* generic IPI for platform specific use */ + alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); From 8661984f628c6f7d9cbaac6697f26d6b0be3ad3b Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 4 Mar 2009 12:57:19 -0600 Subject: [PATCH 139/580] x86: UV, SGI RTC: loop through installed UV blades Add macro to loop through each possible blade. Signed-off-by: Dimitri Sivanich Cc: Andrew Morton Cc: john stultz LKML-Reference: <20090304185719.GB24419@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 777327ef05c1..9f4dfba33b28 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -199,6 +199,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define SCIR_CPU_ACTIVITY 0x02 /* not idle */ #define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */ +/* Loop through all installed blades */ +#define for_each_possible_blade(bid) \ + for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++) + /* * Macros for converting between kernel virtual addresses, socket local physical * addresses, and UV global physical addresses. From 5ab5ab34498f94d60884c4ccea890601e429042e Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 4 Mar 2009 12:59:18 -0600 Subject: [PATCH 140/580] x86: UV, SGI RTC: add UV RTC clocksource/clockevents This patch provides a high resolution clock/timer source using the SGI UV system-wide synchronized RTC clock/timer hardware. Signed-off-by: Dimitri Sivanich Cc: Andrew Morton Cc: john stultz LKML-Reference: <20090304185918.GC24419@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/uv_time.c | 391 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 392 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/uv_time.c diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 95f216bbfaf1..339ce35648e6 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -111,7 +111,7 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64 ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o + obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c new file mode 100644 index 000000000000..6f8e3256ab2a --- /dev/null +++ b/arch/x86/kernel/uv_time.c @@ -0,0 +1,391 @@ +/* + * SGI RTC clock/timer routines. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Dimitri Sivanich + */ +#include + +#include +#include +#include +#include + +#define RTC_NAME "sgi_rtc" + +static cycle_t uv_read_rtc(void); +static int uv_rtc_next_event(unsigned long, struct clock_event_device *); +static void uv_rtc_timer_setup(enum clock_event_mode, + struct clock_event_device *); + +static struct clocksource clocksource_uv = { + .name = RTC_NAME, + .rating = 400, + .read = uv_read_rtc, + .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, + .shift = 10, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static struct clock_event_device clock_event_device_uv = { + .name = RTC_NAME, + .features = CLOCK_EVT_FEAT_ONESHOT, + .shift = 20, + .rating = 400, + .irq = -1, + .set_next_event = uv_rtc_next_event, + .set_mode = uv_rtc_timer_setup, + .event_handler = NULL, +}; + +static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); + +/* There is one of these allocated per node */ +struct uv_rtc_timer_head { + spinlock_t lock; + /* next cpu waiting for timer, local node relative: */ + int next_cpu; + /* number of cpus on this node: */ + int ncpus; + struct { + int lcpu; /* systemwide logical cpu number */ + u64 expires; /* next timer expiration for this cpu */ + } cpu[1]; +}; + +/* + * Access to uv_rtc_timer_head via blade id. + */ +static struct uv_rtc_timer_head **blade_info __read_mostly; + +static int uv_rtc_enable; + +/* + * Hardware interface routines + */ + +/* Send IPIs to another node */ +static void uv_rtc_send_IPI(int cpu) +{ + unsigned long apicid, val; + int pnode; + + apicid = per_cpu(x86_cpu_to_apicid, cpu); + pnode = uv_apicid_to_pnode(apicid); + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (apicid << UVH_IPI_INT_APIC_ID_SHFT) | + (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); + + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); +} + +/* Check for an RTC interrupt pending */ +static int uv_intr_pending(int pnode) +{ + return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & + UVH_EVENT_OCCURRED0_RTC1_MASK; +} + +/* Setup interrupt and return non-zero if early expiration occurred. */ +static int uv_setup_intr(int cpu, u64 expires) +{ + u64 val; + int pnode = uv_cpu_to_pnode(cpu); + + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, + UVH_RTC1_INT_CONFIG_M_MASK); + uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); + + uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, + UVH_EVENT_OCCURRED0_RTC1_MASK); + + val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | + ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); + + /* Set configuration */ + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); + /* Initialize comparator value */ + uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); + + return (expires < uv_read_rtc() && !uv_intr_pending(pnode)); +} + +/* + * Per-cpu timer tracking routines + */ + +static __init void uv_rtc_deallocate_timers(void) +{ + int bid; + + for_each_possible_blade(bid) { + kfree(blade_info[bid]); + } + kfree(blade_info); +} + +/* Allocate per-node list of cpu timer expiration times. */ +static __init int uv_rtc_allocate_timers(void) +{ + int cpu; + + blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); + if (!blade_info) + return -ENOMEM; + memset(blade_info, 0, uv_possible_blades * sizeof(void *)); + + for_each_present_cpu(cpu) { + int nid = cpu_to_node(cpu); + int bid = uv_cpu_to_blade_id(cpu); + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + struct uv_rtc_timer_head *head = blade_info[bid]; + + if (!head) { + head = kmalloc_node(sizeof(struct uv_rtc_timer_head) + + (uv_blade_nr_possible_cpus(bid) * + 2 * sizeof(u64)), + GFP_KERNEL, nid); + if (!head) { + uv_rtc_deallocate_timers(); + return -ENOMEM; + } + spin_lock_init(&head->lock); + head->ncpus = uv_blade_nr_possible_cpus(bid); + head->next_cpu = -1; + blade_info[bid] = head; + } + + head->cpu[bcpu].lcpu = cpu; + head->cpu[bcpu].expires = ULLONG_MAX; + } + + return 0; +} + +/* Find and set the next expiring timer. */ +static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode) +{ + u64 lowest = ULLONG_MAX; + int c, bcpu = -1; + + head->next_cpu = -1; + for (c = 0; c < head->ncpus; c++) { + u64 exp = head->cpu[c].expires; + if (exp < lowest) { + bcpu = c; + lowest = exp; + } + } + if (bcpu >= 0) { + head->next_cpu = bcpu; + c = head->cpu[bcpu].lcpu; + if (uv_setup_intr(c, lowest)) + /* If we didn't set it up in time, trigger */ + uv_rtc_send_IPI(c); + } else { + uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, + UVH_RTC1_INT_CONFIG_M_MASK); + } +} + +/* + * Set expiration time for current cpu. + * + * Returns 1 if we missed the expiration time. + */ +static int uv_rtc_set_timer(int cpu, u64 expires) +{ + int pnode = uv_cpu_to_pnode(cpu); + int bid = uv_cpu_to_blade_id(cpu); + struct uv_rtc_timer_head *head = blade_info[bid]; + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + u64 *t = &head->cpu[bcpu].expires; + unsigned long flags; + int next_cpu; + + spin_lock_irqsave(&head->lock, flags); + + next_cpu = head->next_cpu; + *t = expires; + /* Will this one be next to go off? */ + if (next_cpu < 0 || bcpu == next_cpu || + expires < head->cpu[next_cpu].expires) { + head->next_cpu = bcpu; + if (uv_setup_intr(cpu, expires)) { + *t = ULLONG_MAX; + uv_rtc_find_next_timer(head, pnode); + spin_unlock_irqrestore(&head->lock, flags); + return 1; + } + } + + spin_unlock_irqrestore(&head->lock, flags); + return 0; +} + +/* + * Unset expiration time for current cpu. + * + * Returns 1 if this timer was pending. + */ +static int uv_rtc_unset_timer(int cpu) +{ + int pnode = uv_cpu_to_pnode(cpu); + int bid = uv_cpu_to_blade_id(cpu); + struct uv_rtc_timer_head *head = blade_info[bid]; + int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; + u64 *t = &head->cpu[bcpu].expires; + unsigned long flags; + int rc = 0; + + spin_lock_irqsave(&head->lock, flags); + + if (head->next_cpu == bcpu && uv_read_rtc() >= *t) + rc = 1; + + *t = ULLONG_MAX; + + /* Was the hardware setup for this timer? */ + if (head->next_cpu == bcpu) + uv_rtc_find_next_timer(head, pnode); + + spin_unlock_irqrestore(&head->lock, flags); + + return rc; +} + + +/* + * Kernel interface routines. + */ + +/* + * Read the RTC. + */ +static cycle_t uv_read_rtc(void) +{ + return (cycle_t)uv_read_local_mmr(UVH_RTC); +} + +/* + * Program the next event, relative to now + */ +static int uv_rtc_next_event(unsigned long delta, + struct clock_event_device *ced) +{ + int ced_cpu = cpumask_first(ced->cpumask); + + return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc()); +} + +/* + * Setup the RTC timer in oneshot mode + */ +static void uv_rtc_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + int ced_cpu = cpumask_first(evt->cpumask); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here yet */ + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + uv_rtc_unset_timer(ced_cpu); + break; + } +} + +static void uv_rtc_interrupt(void) +{ + struct clock_event_device *ced = &__get_cpu_var(cpu_ced); + int cpu = smp_processor_id(); + + if (!ced || !ced->event_handler) + return; + + if (uv_rtc_unset_timer(cpu) != 1) + return; + + ced->event_handler(ced); +} + +static int __init uv_enable_rtc(char *str) +{ + uv_rtc_enable = 1; + + return 1; +} +__setup("uvrtc", uv_enable_rtc); + +static __init void uv_rtc_register_clockevents(struct work_struct *dummy) +{ + struct clock_event_device *ced = &__get_cpu_var(cpu_ced); + + *ced = clock_event_device_uv; + ced->cpumask = cpumask_of(smp_processor_id()); + clockevents_register_device(ced); +} + +static __init int uv_rtc_setup_clock(void) +{ + int rc; + + if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) + return -ENODEV; + + generic_interrupt_extension = uv_rtc_interrupt; + + clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, + clocksource_uv.shift); + + rc = clocksource_register(&clocksource_uv); + if (rc) { + generic_interrupt_extension = NULL; + return rc; + } + + /* Setup and register clockevents */ + rc = uv_rtc_allocate_timers(); + if (rc) { + clocksource_unregister(&clocksource_uv); + generic_interrupt_extension = NULL; + return rc; + } + + clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, + NSEC_PER_SEC, clock_event_device_uv.shift); + + clock_event_device_uv.min_delta_ns = NSEC_PER_SEC / + sn_rtc_cycles_per_second; + + clock_event_device_uv.max_delta_ns = clocksource_uv.mask * + (NSEC_PER_SEC / sn_rtc_cycles_per_second); + + rc = schedule_on_each_cpu(uv_rtc_register_clockevents); + if (rc) { + clocksource_unregister(&clocksource_uv); + generic_interrupt_extension = NULL; + uv_rtc_deallocate_timers(); + } + + return rc; +} +arch_initcall(uv_rtc_setup_clock); From ab9e18587f4cdb5f3fb3854c732f27a36f98e8f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Gl=C3=B6ckner?= Date: Wed, 4 Mar 2009 19:42:27 +0100 Subject: [PATCH 141/580] x86, math-emu: fix init_fpu for task != current MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix math-emu related crash while using GDB/ptrace init_fpu() calls finit to initialize a task's xstate, while finit always works on the current task. If we use PTRACE_GETFPREGS on another process and both processes did not already use floating point, we get a null pointer exception in finit. This patch creates a new function finit_task that takes a task_struct parameter. finit becomes a wrapper that simply calls finit_task with current. On the plus side this avoids many calls to get_current which would each resolve to an inline assembler mov instruction. An empty finit_task has been added to i387.h to avoid linker errors in case the compiler still emits the call in init_fpu when CONFIG_MATH_EMULATION is not defined. The declaration of finit in i387.h has been removed as the remaining code using this function gets its prototype from fpu_proto.h. Signed-off-by: Daniel Glöckner Cc: Suresh Siddha Cc: "Pallipadi Venkatesh" Cc: Arjan van de Ven Cc: Bill Metzenthen LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 8 +++++++- arch/x86/kernel/i387.c | 2 +- arch/x86/math-emu/fpu_aux.c | 31 ++++++++++++++++++++----------- 3 files changed, 28 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 48f0004db8c9..71c9e5183982 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk) #else /* CONFIG_X86_32 */ -extern void finit(void); +#ifdef CONFIG_MATH_EMULATION +extern void finit_task(struct task_struct *tsk); +#else +static inline void finit_task(struct task_struct *tsk) +{ +} +#endif static inline void tolerant_fwait(void) { diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index b0f61f0dcd0a..f2f8540a7f3d 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk) #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { memset(tsk->thread.xstate, 0, xstate_size); - finit(); + finit_task(tsk); set_stopped_child_used_math(tsk); return 0; } diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 491e737ce547..aa0987088774 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -30,20 +30,29 @@ static void fclex(void) } /* Needs to be externally visible */ -void finit(void) +void finit_task(struct task_struct *tsk) { - control_word = 0x037f; - partial_status = 0; - top = 0; /* We don't keep top in the status word internally. */ - fpu_tag_word = 0xffff; + struct i387_soft_struct *soft = &tsk->thread.xstate->soft; + struct address *oaddr, *iaddr; + soft->cwd = 0x037f; + soft->swd = 0; + soft->ftop = 0; /* We don't keep top in the status word internally. */ + soft->twd = 0xffff; /* The behaviour is different from that detailed in Section 15.1.6 of the Intel manual */ - operand_address.offset = 0; - operand_address.selector = 0; - instruction_address.offset = 0; - instruction_address.selector = 0; - instruction_address.opcode = 0; - no_ip_update = 1; + oaddr = (struct address *)&soft->foo; + oaddr->offset = 0; + oaddr->selector = 0; + iaddr = (struct address *)&soft->fip; + iaddr->offset = 0; + iaddr->selector = 0; + iaddr->opcode = 0; + soft->no_update = 1; +} + +void finit(void) +{ + finit_task(current); } /* From a71edd1f46c8a599509bda478fb4eea27fb0da63 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Mar 2009 01:22:35 -0800 Subject: [PATCH 142/580] x86: fix bootmem cross node for 32bit numa Impact: fix panic on system 2g x4 sockets Found one system with 4 sockets and every sockets has 2g can not boot with numa32 because boot mem is crossing nodes. So try to have numa version of setup_bootmem_allocator(). Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49AE485B.8000902@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 46 +++++++++++++++++++++++++++++++++++++------ arch/x86/mm/numa_32.c | 5 +++-- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 917c4e60c767..67bdb59d4e10 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -776,9 +776,37 @@ static void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } +#ifdef CONFIG_NEED_MULTIPLE_NODES +static unsigned long __init setup_node_bootmem(int nodeid, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long bootmap) +{ + unsigned long bootmap_size; + + if (start_pfn > max_low_pfn) + return bootmap; + if (end_pfn > max_low_pfn) + end_pfn = max_low_pfn; + + /* don't touch min_low_pfn */ + bootmap_size = init_bootmem_node(NODE_DATA(nodeid), + bootmap >> PAGE_SHIFT, + start_pfn, end_pfn); + printk(KERN_INFO " node %d low ram: %08lx - %08lx\n", + nodeid, start_pfn<> PAGE_SHIFT, - min_low_pfn, max_low_pfn); printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<> PAGE_SHIFT, + min_low_pfn, max_low_pfn); printk(KERN_INFO " bootmap %08lx - %08lx\n", bootmap, bootmap + bootmap_size); - for_each_online_node(i) - free_bootmem_with_active_regions(i, max_low_pfn); + free_bootmem_with_active_regions(0, max_low_pfn); early_res_to_bootmem(0, max_low_pfn<bdata = &bootmem_node_data[nid]; + } - NODE_DATA(0)->bdata = &bootmem_node_data[0]; setup_bootmem_allocator(); } From b68adb16f29c8ea02f21f5ebf65bcabffe217e9f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Mar 2009 01:24:04 -0800 Subject: [PATCH 143/580] x86: make 32-bit init_memory_mapping range change more like 64-bit Impact: cleanup make code more readable and more like 64-bit Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49AE48B4.8010907@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 126 +++++++++++++++++++++++++++++++----------- 1 file changed, 94 insertions(+), 32 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 67bdb59d4e10..37aeaf366d5f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -885,29 +885,55 @@ static void __init find_early_table_space(unsigned long end, int use_pse) (table_start << PAGE_SHIFT) + tables); } +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +#define NR_RANGE_MR 3 + +static int save_mr(struct map_range *mr, int nr_range, + unsigned long start_pfn, unsigned long end_pfn, + unsigned long page_size_mask) +{ + if (start_pfn < end_pfn) { + if (nr_range >= NR_RANGE_MR) + panic("run out of range for init_memory_mapping\n"); + mr[nr_range].start = start_pfn<> PAGE_SHIFT; - end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT); - } else { - /* head is not big page alignment ? */ - start_pfn = start >> PAGE_SHIFT; - end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) + /* head could not be big page alignment ? */ + start_pfn = start >> PAGE_SHIFT; + pos = start_pfn << PAGE_SHIFT; + if (pos == 0) + end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + else + end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (end_pfn > (end>>PAGE_SHIFT)) + end_pfn = end>>PAGE_SHIFT; + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + pos = end_pfn << PAGE_SHIFT; } - if (start_pfn < end_pfn) - kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0); /* big page range */ - start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) + start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < (big_page_start >> PAGE_SHIFT)) - start_pfn = big_page_start >> PAGE_SHIFT; end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) - kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, - use_pse); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1< (big_page_start>>PAGE_SHIFT)) { - end_pfn = end >> PAGE_SHIFT; - if (start_pfn < end_pfn) - kernel_physical_mapping_init(pgd_base, start_pfn, - end_pfn, 0); + start_pfn = pos>>PAGE_SHIFT; + end_pfn = end>>PAGE_SHIFT; + if (start_pfn < end_pfn) + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + + /* try to merge same page size and continuous */ + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { + unsigned long old_start; + if (mr[i].end != mr[i+1].start || + mr[i].page_size_mask != mr[i+1].page_size_mask) + continue; + /* move it */ + old_start = mr[i].start; + memmove(&mr[i], &mr[i+1], + (nr_range - 1 - i) * sizeof(struct map_range)); + mr[i--].start = old_start; + nr_range--; } + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG " %08lx - %08lx page %s\n", + mr[i].start, mr[i].end, + (mr[i].page_size_mask & (1<> PAGE_SHIFT, + mr[i].end >> PAGE_SHIFT, + mr[i].page_size_mask == (1< Date: Wed, 4 Mar 2009 01:25:21 -0800 Subject: [PATCH 144/580] x86: ioremap mptable Impact: fix boot with mptable above max_low_mapped Try to use early_ioremap() to map MPC to make sure it works even it is at the end of ram. Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49AE4901.3090801@kernel.org> Signed-off-by: Ingo Molnar Reported-and-tested-by: Kevin O'Connor --- arch/x86/kernel/mpparse.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 37cb1bda1baf..ae9060cb4481 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -558,6 +558,19 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) static struct mpf_intel *mpf_found; +static unsigned long __init get_mpc_size(unsigned long physptr) +{ + struct mpc_table *mpc; + unsigned long size; + + mpc = early_ioremap(physptr, PAGE_SIZE); + size = mpc->length; + early_iounmap(mpc, PAGE_SIZE); + apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + + return size; +} + /* * Scan the memory blocks for an SMP configuration block. */ @@ -611,12 +624,16 @@ static void __init __get_smp_config(unsigned int early) construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { + struct mpc_table *mpc; + unsigned long size; + size = get_mpc_size(mpf->physptr); + mpc = early_ioremap(mpf->physptr, size); /* * Read the physical hardware table. Anything here will * override the defaults. */ - if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) { + if (!smp_read_mpc(mpc, early)) { #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 0; #endif @@ -624,8 +641,10 @@ static void __init __get_smp_config(unsigned int early) "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. " "(tell your hw vendor)\n"); + early_iounmap(mpc, size); return; } + early_iounmap(mpc, size); if (early) return; From f62432395ec54e93f113091bcb2e2017eeed7683 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Mar 2009 01:25:54 -0800 Subject: [PATCH 145/580] x86: reserve exact size of mptable Impact: save a bit of RAM Get the exact size for the reserve_bootmem() call. Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49AE4922.605@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index ae9060cb4481..e8192401da47 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -716,10 +716,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, if (!reserve) return 1; - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, + reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf), BOOTMEM_DEFAULT); if (mpf->physptr) { - unsigned long size = PAGE_SIZE; + unsigned long size = get_mpc_size(mpf->physptr); #ifdef CONFIG_X86_32 /* * We cannot access to MPC table to compute From dd4124a8a06bca89c077a16437edac010f0bb993 Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Wed, 4 Mar 2009 11:53:00 -0800 Subject: [PATCH 146/580] x86: add Dell XPS710 reboot quirk Dell XPS710 will hang on reboot. This is resolved by adding a quirk to set bios reboot. Signed-off-by: Leann Ogasawara Signed-off-by: Tim Gardner Cc: "manoj.iyer" Cc: LKML-Reference: <1236196380.3231.89.camel@emiko> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 2b46eb41643b..4526b3a75ed2 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -217,6 +217,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), }, }, + { /* Handle problems with rebooting on Dell XPS710 */ + .callback = set_bios_reboot, + .ident = "Dell XPS710", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), + }, + }, { } }; From 731ddea63600c24ff01e6e5144cea88bf7266ac5 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 4 Mar 2009 11:13:40 +0200 Subject: [PATCH 147/580] x86: move free_initrd_mem() to common mm/init.c Impact: cleanup The function is identical on 32-bit and 64-bit configurations so move it to the common mm/init.c file. Signed-off-by: Pekka Enberg LKML-Reference: <1236158020.29024.28.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 7 +++++++ arch/x86/mm/init_32.c | 7 ------- arch/x86/mm/init_64.c | 7 ------- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f89df52683c5..cc7fe660f33d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -71,3 +71,10 @@ void free_initmem(void) (unsigned long)(&__init_begin), (unsigned long)(&__init_end)); } + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + free_init_pages("initrd memory", start, end); +} +#endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 37aeaf366d5f..c69c6b1f5e55 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1275,13 +1275,6 @@ void mark_rodata_ro(void) } #endif -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_init_pages("initrd memory", start, end); -} -#endif - int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 074435e79824..d325186dd32b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -963,13 +963,6 @@ void mark_rodata_ro(void) #endif -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_init_pages("initrd memory", start, end); -} -#endif - int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { From 64ca5ab913f1594ef316556e65f5eae63ff50cee Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Mar 2009 12:11:56 -0800 Subject: [PATCH 148/580] rcu: increment quiescent state counter in ksoftirqd() If a machine is flooded by network frames, a cpu can loop 100% of its time inside ksoftirqd() without calling schedule(). This can delay RCU grace period to insane values. Adding rcu_qsctr_inc() call in ksoftirqd() solves this problem. Paul: "This regression was a result of the recent change from "schedule()" to "cond_resched()", which got rid of that quiescent state in the common case where a reschedule is not needed". Signed-off-by: Eric Dumazet Reviewed-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/softirq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8d..9041ea7948fe 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -626,6 +626,7 @@ static int ksoftirqd(void * __bind_cpu) preempt_enable_no_resched(); cond_resched(); preempt_disable(); + rcu_qsctr_inc((long)__bind_cpu); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); From fc5efe3941c47c0278fe1bbcf8cc02a03a74fcda Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Mar 2009 12:21:24 -0800 Subject: [PATCH 149/580] x86: fix bootmem cross node for 32bit numa, cleanup Impact: clean up Simplify the code, reuse some lines. Remove min_low_pfn reference, it is always 0 Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49AEE2C4.2030602@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c69c6b1f5e55..c351456d06dc 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -776,7 +776,6 @@ static void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } -#ifdef CONFIG_NEED_MULTIPLE_NODES static unsigned long __init setup_node_bootmem(int nodeid, unsigned long start_pfn, unsigned long end_pfn, @@ -802,7 +801,6 @@ static unsigned long __init setup_node_bootmem(int nodeid, return bootmap + bootmap_size; } -#endif void __init setup_bootmem_allocator(void) { @@ -812,8 +810,7 @@ void __init setup_bootmem_allocator(void) * Initialize the boot-time allocator (with low memory only): */ bootmap_size = bootmem_bootmap_pages(max_low_pfn)<> PAGE_SHIFT, - min_low_pfn, max_low_pfn); - printk(KERN_INFO " bootmap %08lx - %08lx\n", - bootmap, bootmap + bootmap_size); - free_bootmem_with_active_regions(0, max_low_pfn); - early_res_to_bootmem(0, max_low_pfn< Date: Wed, 4 Mar 2009 07:33:51 +0100 Subject: [PATCH 150/580] smack: fixes for unlabeled host support The following patch (against 2.6.29rc5) fixes a few issues in the smack/netlabel "unlabeled host support" functionnality that was added in 2.6.29rc. It should go in before -final. 1) smack_host_label disregard a "0.0.0.0/0 @" rule (or other label), preventing 'tagged' tasks to access Internet (many systems drop packets with IP options) 2) netmasks were not handled correctly, they were stored in a way _not equivalent_ to conversion to be32 (it was equivalent for /0, /8, /16, /24, /32 masks but not other masks) 3) smack_netlbladdr prefixes (IP/mask) were not consistent (mask&IP was not done), so there could have been different list entries for the same IP prefix; if those entries had different labels, well ... 4) they were not sorted 1) 2) 3) are bugs, 4) is a more cosmetic issue. The patch : -creates a new helper smk_netlbladdr_insert to insert a smk_netlbladdr, -sorted by netmask length -use the new sorted nature of smack_netlbladdrs list to simplify smack_host_label : the first match _will_ be the more specific -corrects endianness issues in smk_write_netlbladdr & netlbladdr_seq_show Signed-off-by: Acked-by: Casey Schaufler Reviewed-by: Paul Moore Signed-off-by: James Morris --- security/smack/smack_lsm.c | 43 +++++-------------------- security/smack/smackfs.c | 64 +++++++++++++++++++++++++++++--------- 2 files changed, 57 insertions(+), 50 deletions(-) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 0278bc083044..e7ded1326b0f 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1498,58 +1498,31 @@ static int smack_socket_post_create(struct socket *sock, int family, * looks for host based access restrictions * * This version will only be appropriate for really small - * sets of single label hosts. Because of the masking - * it cannot shortcut out on the first match. There are - * numerious ways to address the problem, but none of them - * have been applied here. + * sets of single label hosts. * * Returns the label of the far end or NULL if it's not special. */ static char *smack_host_label(struct sockaddr_in *sip) { struct smk_netlbladdr *snp; - char *bestlabel = NULL; struct in_addr *siap = &sip->sin_addr; - struct in_addr *liap; - struct in_addr *miap; - struct in_addr bestmask; if (siap->s_addr == 0) return NULL; - bestmask.s_addr = 0; - for (snp = smack_netlbladdrs; snp != NULL; snp = snp->smk_next) { - liap = &snp->smk_host.sin_addr; - miap = &snp->smk_mask; /* - * If the addresses match after applying the list entry mask - * the entry matches the address. If it doesn't move along to - * the next entry. + * we break after finding the first match because + * the list is sorted from longest to shortest mask + * so we have found the most specific match */ - if ((liap->s_addr & miap->s_addr) != - (siap->s_addr & miap->s_addr)) - continue; - /* - * If the list entry mask identifies a single address - * it can't get any more specific. - */ - if (miap->s_addr == 0xffffffff) + if ((&snp->smk_host.sin_addr)->s_addr == + (siap->s_addr & (&snp->smk_mask)->s_addr)) { return snp->smk_label; - /* - * If the list entry mask is less specific than the best - * already found this entry is uninteresting. - */ - if ((miap->s_addr | bestmask.s_addr) == bestmask.s_addr) - continue; - /* - * This is better than any entry found so far. - */ - bestmask.s_addr = miap->s_addr; - bestlabel = snp->smk_label; + } } - return bestlabel; + return NULL; } /** diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 8e42800878f4..51f0efc50dab 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -650,10 +650,6 @@ static void *netlbladdr_seq_next(struct seq_file *s, void *v, loff_t *pos) return skp; } -/* -#define BEMASK 0x80000000 -*/ -#define BEMASK 0x00000001 #define BEBITS (sizeof(__be32) * 8) /* @@ -663,12 +659,10 @@ static int netlbladdr_seq_show(struct seq_file *s, void *v) { struct smk_netlbladdr *skp = (struct smk_netlbladdr *) v; unsigned char *hp = (char *) &skp->smk_host.sin_addr.s_addr; - __be32 bebits; - int maskn = 0; + int maskn; + u32 temp_mask = be32_to_cpu(skp->smk_mask.s_addr); - for (bebits = BEMASK; bebits != 0; maskn++, bebits <<= 1) - if ((skp->smk_mask.s_addr & bebits) == 0) - break; + for (maskn = 0; temp_mask; temp_mask <<= 1, maskn++); seq_printf(s, "%u.%u.%u.%u/%d %s\n", hp[0], hp[1], hp[2], hp[3], maskn, skp->smk_label); @@ -701,6 +695,42 @@ static int smk_open_netlbladdr(struct inode *inode, struct file *file) return seq_open(file, &netlbladdr_seq_ops); } +/** + * smk_netlbladdr_insert + * @new : netlabel to insert + * + * This helper insert netlabel in the smack_netlbladdrs list + * sorted by netmask length (longest to smallest) + */ +static void smk_netlbladdr_insert(struct smk_netlbladdr *new) +{ + struct smk_netlbladdr *m; + + if (smack_netlbladdrs == NULL) { + smack_netlbladdrs = new; + return; + } + + /* the comparison '>' is a bit hacky, but works */ + if (new->smk_mask.s_addr > smack_netlbladdrs->smk_mask.s_addr) { + new->smk_next = smack_netlbladdrs; + smack_netlbladdrs = new; + return; + } + for (m = smack_netlbladdrs; m != NULL; m = m->smk_next) { + if (m->smk_next == NULL) { + m->smk_next = new; + return; + } + if (new->smk_mask.s_addr > m->smk_next->smk_mask.s_addr) { + new->smk_next = m->smk_next; + m->smk_next = new; + return; + } + } +} + + /** * smk_write_netlbladdr - write() for /smack/netlabel * @filp: file pointer, not actually used @@ -724,8 +754,9 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, struct netlbl_audit audit_info; struct in_addr mask; unsigned int m; - __be32 bebits = BEMASK; + u32 mask_bits = (1<<31); __be32 nsa; + u32 temp_mask; /* * Must have privilege. @@ -761,10 +792,13 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, if (sp == NULL) return -EINVAL; - for (mask.s_addr = 0; m > 0; m--) { - mask.s_addr |= bebits; - bebits <<= 1; + for (temp_mask = 0; m > 0; m--) { + temp_mask |= mask_bits; + mask_bits >>= 1; } + mask.s_addr = cpu_to_be32(temp_mask); + + newname.sin_addr.s_addr &= mask.s_addr; /* * Only allow one writer at a time. Writes should be * quite rare and small in any case. @@ -772,6 +806,7 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, mutex_lock(&smk_netlbladdr_lock); nsa = newname.sin_addr.s_addr; + /* try to find if the prefix is already in the list */ for (skp = smack_netlbladdrs; skp != NULL; skp = skp->smk_next) if (skp->smk_host.sin_addr.s_addr == nsa && skp->smk_mask.s_addr == mask.s_addr) @@ -787,9 +822,8 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, rc = 0; skp->smk_host.sin_addr.s_addr = newname.sin_addr.s_addr; skp->smk_mask.s_addr = mask.s_addr; - skp->smk_next = smack_netlbladdrs; skp->smk_label = sp; - smack_netlbladdrs = skp; + smk_netlbladdr_insert(skp); } } else { rc = netlbl_cfg_unlbl_static_del(&init_net, NULL, From d0cac39e4ec8097e4c7099d291b1fdcc0fe56b58 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 4 Mar 2009 14:43:47 -0800 Subject: [PATCH 151/580] sparc64: Fix lost interrupts on sun4u. Based upon a report by Meelis Roos. Sparc64 SBUS and PCI controllers use a combination of IMAP and ICLR registers to manage device interrupts. The IMAP register contains the "valid" enable bit as well as CPU targetting information. Whereas the ICLR register is written with zero at the end of handling an interrupt to reset the state machine for that interrupt to IDLE so it can be sent again. For PCI slot and SBUS slot devices we can have multiple interrupts sharing the same IMAP register. There are individual ICLR registers but only one IMAP register for managing those. We represent each shared case with individual virtual IRQs so the generic IRQ layer thinks there is only one user of the IRQ instance. In such shared IMAP cases this is wrong, so if there are multiple active users then a free_irq() call will prematurely turn off the interrupt by clearing the Valid bit in the IMAP register even though there are other active users. Fix this by simply doing nothing in sun4u_disable_irq() and checking IRQF_DISABLED during IRQ dispatch. This situation doesn't exist in the hypervisor sun4v cases, so I left those alone. Tested-by: Meelis Roos Signed-off-by: David S. Miller --- arch/sparc/kernel/irq_64.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index e289376198eb..1c378d8e90c5 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -323,17 +323,25 @@ static void sun4u_set_affinity(unsigned int virt_irq, sun4u_irq_enable(virt_irq); } +/* Don't do anything. The desc->status check for IRQ_DISABLED in + * handler_irq() will skip the handler call and that will leave the + * interrupt in the sent state. The next ->enable() call will hit the + * ICLR register to reset the state machine. + * + * This scheme is necessary, instead of clearing the Valid bit in the + * IMAP register, to handle the case of IMAP registers being shared by + * multiple INOs (and thus ICLR registers). Since we use a different + * virtual IRQ for each shared IMAP instance, the generic code thinks + * there is only one user so it prematurely calls ->disable() on + * free_irq(). + * + * We have to provide an explicit ->disable() method instead of using + * NULL to get the default. The reason is that if the generic code + * sees that, it also hooks up a default ->shutdown method which + * invokes ->mask() which we do not want. See irq_chip_set_defaults(). + */ static void sun4u_irq_disable(unsigned int virt_irq) { - struct irq_handler_data *data = get_irq_chip_data(virt_irq); - - if (likely(data)) { - unsigned long imap = data->imap; - unsigned long tmp = upa_readq(imap); - - tmp &= ~IMAP_VALID; - upa_writeq(tmp, imap); - } } static void sun4u_irq_eoi(unsigned int virt_irq) @@ -746,7 +754,8 @@ void handler_irq(int irq, struct pt_regs *regs) desc = irq_desc + virt_irq; - desc->handle_irq(virt_irq, desc); + if (!(desc->status & IRQ_DISABLED)) + desc->handle_irq(virt_irq, desc); bucket_pa = next_pa; } From 49bc46360d68156ce82b2b1a12badb80078453a0 Mon Sep 17 00:00:00 2001 From: Maciej Sosnowski Date: Thu, 26 Feb 2009 11:04:23 +0100 Subject: [PATCH 152/580] I/OAT: add verification for proper APICID_TAG_MAP setting by BIOS BIOS versions for systems with I/OAT ver.2 have been found which fail to program APICID_TAG_MAP for DCA. The ioatdma driver should recognize incorrectly set APICID_TAG_MAP and disable DCA in that case. Signed-off-by: Maciej Sosnowski Signed-off-by: Shannon Nelson Acked-by: Jeff Kirsher Signed-off-by: Dan Williams --- drivers/dma/ioat_dca.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/dma/ioat_dca.c b/drivers/dma/ioat_dca.c index 6cf622da0286..78705ca7d36a 100644 --- a/drivers/dma/ioat_dca.c +++ b/drivers/dma/ioat_dca.c @@ -49,6 +49,23 @@ #define DCA_TAG_MAP_MASK 0xDF +/* expected tag map bytes for I/OAT ver.2 */ +#define DCA2_TAG_MAP_BYTE0 0x80 +#define DCA2_TAG_MAP_BYTE1 0x0 +#define DCA2_TAG_MAP_BYTE2 0x81 +#define DCA2_TAG_MAP_BYTE3 0x82 +#define DCA2_TAG_MAP_BYTE4 0x82 + +/* verify if tag map matches expected values */ +static inline int dca2_tag_map_valid(u8 *tag_map) +{ + return ((tag_map[0] == DCA2_TAG_MAP_BYTE0) && + (tag_map[1] == DCA2_TAG_MAP_BYTE1) && + (tag_map[2] == DCA2_TAG_MAP_BYTE2) && + (tag_map[3] == DCA2_TAG_MAP_BYTE3) && + (tag_map[4] == DCA2_TAG_MAP_BYTE4)); +} + /* * "Legacy" DCA systems do not implement the DCA register set in the * I/OAT device. Software needs direct support for their tag mappings. @@ -452,6 +469,13 @@ struct dca_provider *ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase) ioatdca->tag_map[i] = 0; } + if (!dca2_tag_map_valid(ioatdca->tag_map)) { + dev_err(&pdev->dev, "APICID_TAG_MAP set incorrectly by BIOS, " + "disabling DCA\n"); + free_dca_provider(dca); + return NULL; + } + err = register_dca_provider(dca, &pdev->dev); if (err) { free_dca_provider(dca); From ea9c717d0148d4194f9bd04ecfa6b59b20fc0a08 Mon Sep 17 00:00:00 2001 From: Maciej Sosnowski Date: Thu, 26 Feb 2009 11:04:38 +0100 Subject: [PATCH 153/580] I/OAT: do not set DCACTRL_CMPL_WRITE_ENABLE for I/OAT ver.3 Flag DCACTRL_CMPL_WRITE_ENABLE is valid only for I/OAT ver.2 so it should not be set for I/OAT ver.3. Signed-off-by: Maciej Sosnowski Signed-off-by: Shannon Nelson Acked-by: Jeff Kirsher Signed-off-by: Dan Williams --- drivers/dma/ioat_dma.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index b3759c4b6536..879f4a06e3ca 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -189,11 +189,13 @@ static int ioat_dma_enumerate_channels(struct ioatdma_device *device) ioat_chan->xfercap = xfercap; ioat_chan->desccount = 0; INIT_DELAYED_WORK(&ioat_chan->work, ioat_dma_chan_reset_part2); - if (ioat_chan->device->version != IOAT_VER_1_2) { - writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE - | IOAT_DMA_DCA_ANY_CPU, - ioat_chan->reg_base + IOAT_DCACTRL_OFFSET); - } + if (ioat_chan->device->version == IOAT_VER_2_0) + writel(IOAT_DCACTRL_CMPL_WRITE_ENABLE | + IOAT_DMA_DCA_ANY_CPU, + ioat_chan->reg_base + IOAT_DCACTRL_OFFSET); + else if (ioat_chan->device->version == IOAT_VER_3_0) + writel(IOAT_DMA_DCA_ANY_CPU, + ioat_chan->reg_base + IOAT_DCACTRL_OFFSET); spin_lock_init(&ioat_chan->cleanup_lock); spin_lock_init(&ioat_chan->desc_lock); INIT_LIST_HEAD(&ioat_chan->free_desc); From 8b794b141c633083408d0bfb2229b3406d0ebf99 Mon Sep 17 00:00:00 2001 From: Maciej Sosnowski Date: Thu, 26 Feb 2009 11:04:54 +0100 Subject: [PATCH 154/580] I/OAT: fail initialization on zero channels detection On some systems with I/OAT ver.2 when DCA is disabled in BIOS situations have been observed that zero DMA channels are detected instead of four. To avoid kernel panic driver should fail gracefully with appropriate message. Signed-off-by: Maciej Sosnowski Signed-off-by: Shannon Nelson Acked-by: Jeff Kirsher Signed-off-by: Dan Williams --- drivers/dma/ioat_dma.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index 879f4a06e3ca..9012da7908f5 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -1659,6 +1659,13 @@ struct ioatdma_device *ioat_dma_probe(struct pci_dev *pdev, " %d channels, device version 0x%02x, driver version %s\n", device->common.chancnt, device->version, IOAT_DMA_VERSION); + if (!device->common.chancnt) { + dev_err(&device->pdev->dev, + "Intel(R) I/OAT DMA Engine problem found: " + "zero channels detected\n"); + goto err_setup_interrupts; + } + err = ioat_dma_setup_interrupts(device); if (err) goto err_setup_interrupts; From 2b8a6bf896ef47cc7d84c503079cc7b99789f9fa Mon Sep 17 00:00:00 2001 From: Maciej Sosnowski Date: Thu, 26 Feb 2009 11:05:07 +0100 Subject: [PATCH 155/580] I/OAT: cancel watchdog before dma remove Channel watchdog should be canceled before the rest of dma remove stuff. Signed-off-by: Maciej Sosnowski Signed-off-by: Shannon Nelson Acked-by: Jeff Kirsher Signed-off-by: Dan Williams --- drivers/dma/ioat_dma.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index 9012da7908f5..fc9b845ee893 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -1705,6 +1705,9 @@ void ioat_dma_remove(struct ioatdma_device *device) struct dma_chan *chan, *_chan; struct ioat_dma_chan *ioat_chan; + if (device->version != IOAT_VER_3_0) + cancel_delayed_work(&device->work); + ioat_dma_remove_interrupts(device); dma_async_device_unregister(&device->common); @@ -1716,10 +1719,6 @@ void ioat_dma_remove(struct ioatdma_device *device) pci_release_regions(device->pdev); pci_disable_device(device->pdev); - if (device->version != IOAT_VER_3_0) { - cancel_delayed_work(&device->work); - } - list_for_each_entry_safe(chan, _chan, &device->common.channels, device_node) { ioat_chan = to_ioat_chan(chan); From 5de22343b2303b278ab562e5d166ffe306566d30 Mon Sep 17 00:00:00 2001 From: Maciej Sosnowski Date: Thu, 26 Feb 2009 11:05:17 +0100 Subject: [PATCH 156/580] I/OAT: set tcp_dma_copybreak to 256k for I/OAT ver.3 Upcoming server platforms from Intel based on the Nehalem performance have significantly improved CPU based copy performance. However, the DMA engine can still be effective at higher I/O sizes for TCP traffic and at this time copybreak should be set to 256k for TCP traffic only. Signed-off-by: Maciej Sosnowski Signed-off-by: Shannon Nelson Acked-by: Jeff Kirsher Signed-off-by: Dan Williams --- drivers/dma/ioatdma.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h index a3306d0e1372..dcf8db513c3a 100644 --- a/drivers/dma/ioatdma.h +++ b/drivers/dma/ioatdma.h @@ -135,12 +135,14 @@ static inline void ioat_set_tcp_copy_break(struct ioatdma_device *dev) #ifdef CONFIG_NET_DMA switch (dev->version) { case IOAT_VER_1_2: - case IOAT_VER_3_0: sysctl_tcp_dma_copybreak = 4096; break; case IOAT_VER_2_0: sysctl_tcp_dma_copybreak = 2048; break; + case IOAT_VER_3_0: + sysctl_tcp_dma_copybreak = 262144; + break; } #endif } From aa2d0b8b97efa1033609ca89b9faa5d3a1232959 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Thu, 26 Feb 2009 11:05:30 +0100 Subject: [PATCH 157/580] I/OAT: list usage cleanup Trivial cleanup, list_del(); list_add_tail() is equivalent to list_move_tail(). Semantic patch for coccinelle can be found at www.cccmz.de/~snakebyte/list_move_tail.spatch Signed-off-by: Eric Sesterhenn Signed-off-by: Maciej Sosnowski Signed-off-by: Shannon Nelson Acked-by: Jeff Kirsher Signed-off-by: Dan Williams --- drivers/dma/ioat_dma.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index fc9b845ee893..ae8c0ce3b86a 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -1171,9 +1171,8 @@ static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *ioat_chan) * up if the client is done with the descriptor */ if (async_tx_test_ack(&desc->async_tx)) { - list_del(&desc->node); - list_add_tail(&desc->node, - &ioat_chan->free_desc); + list_move_tail(&desc->node, + &ioat_chan->free_desc); } else desc->async_tx.cookie = 0; } else { From 211a22ce08dbb27eb1a66df8a4bdae5e96092bc8 Mon Sep 17 00:00:00 2001 From: Maciej Sosnowski Date: Thu, 26 Feb 2009 11:05:43 +0100 Subject: [PATCH 158/580] I/OAT: update driver version and copyright dates Together with new fixes update driver version and extend copyright dates ranges. Signed-off-by: Maciej Sosnowski Signed-off-by: Shannon Nelson Acked-by: Jeff Kirsher Signed-off-by: Dan Williams --- drivers/dca/dca-core.c | 2 +- drivers/dma/ioat.c | 2 +- drivers/dma/ioat_dca.c | 2 +- drivers/dma/ioat_dma.c | 2 +- drivers/dma/ioatdma.h | 4 ++-- drivers/dma/ioatdma_hw.h | 2 +- drivers/dma/ioatdma_registers.h | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/dca/dca-core.c b/drivers/dca/dca-core.c index 33bd75347518..25b743abfb59 100644 --- a/drivers/dca/dca-core.c +++ b/drivers/dca/dca-core.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2007 Intel Corporation. All rights reserved. + * Copyright(c) 2007 - 2009 Intel Corporation. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free diff --git a/drivers/dma/ioat.c b/drivers/dma/ioat.c index 4105d6575b64..ed83dd9df192 100644 --- a/drivers/dma/ioat.c +++ b/drivers/dma/ioat.c @@ -1,6 +1,6 @@ /* * Intel I/OAT DMA Linux driver - * Copyright(c) 2007 Intel Corporation. + * Copyright(c) 2007 - 2009 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, diff --git a/drivers/dma/ioat_dca.c b/drivers/dma/ioat_dca.c index 78705ca7d36a..c012a1e15043 100644 --- a/drivers/dma/ioat_dca.c +++ b/drivers/dma/ioat_dca.c @@ -1,6 +1,6 @@ /* * Intel I/OAT DMA Linux driver - * Copyright(c) 2007 Intel Corporation. + * Copyright(c) 2007 - 2009 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index ae8c0ce3b86a..068b63514525 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -1,6 +1,6 @@ /* * Intel I/OAT DMA Linux driver - * Copyright(c) 2004 - 2007 Intel Corporation. + * Copyright(c) 2004 - 2009 Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h index dcf8db513c3a..a52ff4bd4601 100644 --- a/drivers/dma/ioatdma.h +++ b/drivers/dma/ioatdma.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2004 - 2007 Intel Corporation. All rights reserved. + * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -29,7 +29,7 @@ #include #include -#define IOAT_DMA_VERSION "3.30" +#define IOAT_DMA_VERSION "3.64" enum ioat_interrupt { none = 0, diff --git a/drivers/dma/ioatdma_hw.h b/drivers/dma/ioatdma_hw.h index f1ae2c776f74..afa57eef86c9 100644 --- a/drivers/dma/ioatdma_hw.h +++ b/drivers/dma/ioatdma_hw.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2004 - 2007 Intel Corporation. All rights reserved. + * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free diff --git a/drivers/dma/ioatdma_registers.h b/drivers/dma/ioatdma_registers.h index 827cb503cac6..49bc277424f8 100644 --- a/drivers/dma/ioatdma_registers.h +++ b/drivers/dma/ioatdma_registers.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2004 - 2007 Intel Corporation. All rights reserved. + * Copyright(c) 2004 - 2009 Intel Corporation. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free From 0c33e1ca3d80647f2e72e44524fd21e79214da20 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 2 Mar 2009 13:31:35 -0700 Subject: [PATCH 159/580] I/OAT: fail self-test if callback test reaches timeout If we miss interrupts in the self test then fail registration of this channel as it is unsuitable for use as a public channel. Signed-off-by: Maciej Sosnowski Signed-off-by: Dan Williams --- drivers/dma/ioat_dma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index 068b63514525..5905cd36bcd2 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -1363,6 +1363,7 @@ static int ioat_dma_self_test(struct ioatdma_device *device) dma_cookie_t cookie; int err = 0; struct completion cmp; + unsigned long tmo; src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL); if (!src) @@ -1414,9 +1415,10 @@ static int ioat_dma_self_test(struct ioatdma_device *device) } device->common.device_issue_pending(dma_chan); - wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); + tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); - if (device->common.device_is_tx_complete(dma_chan, cookie, NULL, NULL) + if (tmo == 0 || + device->common.device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { dev_err(&device->pdev->dev, "Self-test copy timed out, disabling\n"); From 900325a6ce33995688b7a680a34e7698f16f4d72 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 2 Mar 2009 15:33:46 -0700 Subject: [PATCH 160/580] fsldma: fix off by one in dma_halt Prevent dev_err from firing even if we successfully detected 'dma-idle' before the full 1ms timeout has elapsed. Acked-by: Roel Kluin Signed-off-by: Dan Williams --- drivers/dma/fsldma.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 70126a606239..86d6da47f558 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -158,7 +158,8 @@ static void dma_start(struct fsl_dma_chan *fsl_chan) static void dma_halt(struct fsl_dma_chan *fsl_chan) { - int i = 0; + int i; + DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32) | FSL_DMA_MR_CA, 32); @@ -166,8 +167,11 @@ static void dma_halt(struct fsl_dma_chan *fsl_chan) DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32) & ~(FSL_DMA_MR_CS | FSL_DMA_MR_EMS_EN | FSL_DMA_MR_CA), 32); - while (!dma_is_idle(fsl_chan) && (i++ < 100)) + for (i = 0; i < 100; i++) { + if (dma_is_idle(fsl_chan)) + break; udelay(10); + } if (i >= 100 && !dma_is_idle(fsl_chan)) dev_err(fsl_chan->dev, "DMA halt timeout!\n"); } From a09b09ae51ace43d28cd9bc1c8bb97986f2b55a6 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 25 Feb 2009 13:56:21 +0100 Subject: [PATCH 161/580] iop-adma, mv_xor: fix mem leak on self-test setup failure iop_adma_zero_sum_self_test has the brackets in the wrong place for the setup failure deallocation path. This error was duplicated in mv_xor_xor_self_test. Signed-off-by: Roel Kluin Signed-off-by: Dan Williams --- drivers/dma/iop-adma.c | 16 ++++++++-------- drivers/dma/mv_xor.c | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index ea5440dd10dc..4131ab8e5032 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -928,19 +928,19 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) for (src_idx = 0; src_idx < IOP_ADMA_NUM_SRC_TEST; src_idx++) { xor_srcs[src_idx] = alloc_page(GFP_KERNEL); - if (!xor_srcs[src_idx]) - while (src_idx--) { + if (!xor_srcs[src_idx]) { + while (src_idx--) __free_page(xor_srcs[src_idx]); - return -ENOMEM; - } + return -ENOMEM; + } } dest = alloc_page(GFP_KERNEL); - if (!dest) - while (src_idx--) { + if (!dest) { + while (src_idx--) __free_page(xor_srcs[src_idx]); - return -ENOMEM; - } + return -ENOMEM; + } /* Fill in src buffers */ for (src_idx = 0; src_idx < IOP_ADMA_NUM_SRC_TEST; src_idx++) { diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index d35cbd1ff0b3..2a4e3e30a046 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -1019,19 +1019,19 @@ mv_xor_xor_self_test(struct mv_xor_device *device) for (src_idx = 0; src_idx < MV_XOR_NUM_SRC_TEST; src_idx++) { xor_srcs[src_idx] = alloc_page(GFP_KERNEL); - if (!xor_srcs[src_idx]) - while (src_idx--) { + if (!xor_srcs[src_idx]) { + while (src_idx--) __free_page(xor_srcs[src_idx]); - return -ENOMEM; - } + return -ENOMEM; + } } dest = alloc_page(GFP_KERNEL); - if (!dest) - while (src_idx--) { + if (!dest) { + while (src_idx--) __free_page(xor_srcs[src_idx]); - return -ENOMEM; - } + return -ENOMEM; + } /* Fill in src buffers */ for (src_idx = 0; src_idx < MV_XOR_NUM_SRC_TEST; src_idx++) { From c74ef1f867d18171c8617519ee5fe40b02903934 Mon Sep 17 00:00:00 2001 From: Luotao Fu Date: Thu, 26 Feb 2009 12:29:20 +0100 Subject: [PATCH 162/580] ipu_idmac: fix spinlock type fix a probably accidently dropped reference operator while calling spin_unlock_restore to an ipu lock. Signed-off-by: Luotao Fu Cc: Guennadi Liakhovetski Signed-off-by: Andrew Morton Signed-off-by: Dan Williams --- drivers/dma/ipu/ipu_idmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c index 1f154d08e98f..ae50a9d1a4e6 100644 --- a/drivers/dma/ipu/ipu_idmac.c +++ b/drivers/dma/ipu/ipu_idmac.c @@ -729,7 +729,7 @@ static int ipu_init_channel_buffer(struct idmac_channel *ichan, ichan->status = IPU_CHANNEL_READY; - spin_unlock_irqrestore(ipu->lock, flags); + spin_unlock_irqrestore(&ipu->lock, flags); return 0; } From 7cbd4877e5b167b56a3d6033b926a9f925186e12 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 4 Mar 2009 16:06:03 -0700 Subject: [PATCH 163/580] dmatest: fix use after free in dmatest_exit dmatest_cleanup_chanel will free dtc, so grab ->chan before it goes away and use it to do the release. Reported-by: Thierry Reding Signed-off-by: Dan Williams --- drivers/dma/dmatest.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index 732fa1ec36ab..e190d8b30700 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -430,13 +430,15 @@ late_initcall(dmatest_init); static void __exit dmatest_exit(void) { struct dmatest_chan *dtc, *_dtc; + struct dma_chan *chan; list_for_each_entry_safe(dtc, _dtc, &dmatest_channels, node) { list_del(&dtc->node); + chan = dtc->chan; dmatest_cleanup_channel(dtc); pr_debug("dmatest: dropped channel %s\n", - dma_chan_name(dtc->chan)); - dma_release_channel(dtc->chan); + dma_chan_name(chan)); + dma_release_channel(chan); } } module_exit(dmatest_exit); From 9f8ac0b7b063be77f0de7a27fe5e6a0aa2cce58d Mon Sep 17 00:00:00 2001 From: Matt Carlson Date: Wed, 25 Feb 2009 14:21:20 +0000 Subject: [PATCH 164/580] tg3: Fix 5906 link problems Commit 6833c043f9fc03696fde623914c4a0277df2a0bc introduced the phy auto-powerdown capability. While the APD feature only works for 5761 and 5784 asic revisions, the (harmless portion of the) code was applied to all 5705 and newer devices. However, the 5906 phy departs from the usual design. This commit was interfering with the 5906's ability to negotiate link against some switches. This patch corrects the problem. Signed-off-by: Matt Carlson Signed-off-by: Benjamin Li Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/tg3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index b080f9493d83..dabdf59f8016 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -1473,7 +1473,8 @@ static void tg3_phy_toggle_apd(struct tg3 *tp, bool enable) { u32 reg; - if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) + if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS) || + GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5906) return; reg = MII_TG3_MISC_SHDW_WREN | From 7ce9d5d1f3c8736511daa413c64985a05b2feee3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 4 Mar 2009 18:38:18 -0500 Subject: [PATCH 165/580] ext4: fix ext4_free_inode() vs. ext4_claim_inode() race I was seeing fsck errors on inode bitmaps after a 4 thread dbench run on a 4 cpu machine: Inode bitmap differences: -50736 -(50752--50753) etc... I believe that this is because ext4_free_inode() uses atomic bitops, and although ext4_new_inode() *used* to also use atomic bitops for synchronization, commit 393418676a7602e1d7d3f6e560159c65c8cbd50e changed this to use the sb_bgl_lock, so that we could also synchronize against read_inode_bitmap and initialization of uninit inode tables. However, that change left ext4_free_inode using atomic bitops, which I think leaves no synchronization between setting & unsetting bits in the inode table. The below patch fixes it for me, although I wonder if we're getting at all heavy-handed with this spinlock... Signed-off-by: Eric Sandeen Reviewed-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f18a919be70b..627f8c3337a3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -188,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_group_desc *gdp; struct ext4_super_block *es; struct ext4_sb_info *sbi; - int fatal = 0, err, count; + int fatal = 0, err, count, cleared; ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { @@ -248,8 +248,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) goto error_return; /* Ok, now we can actually update the inode bitmaps.. */ - if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), - bit, bitmap_bh->b_data)) + spin_lock(sb_bgl_lock(sbi, block_group)); + cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + spin_unlock(sb_bgl_lock(sbi, block_group)); + if (!cleared) ext4_error(sb, "ext4_free_inode", "bit already cleared for inode %lu", ino); else { From 118e1ef6fabfc023126e6075f6ac0fc729cb5285 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 5 Mar 2009 00:31:12 +0000 Subject: [PATCH 166/580] Squashfs: Fix oops when reading fsfuzzer corrupted filesystems This fixes a code regression caused by the recent mainlining changes. The recent code changes call zlib_inflate repeatedly, decompressing into separate 4K buffers, this code didn't check for the possibility that zlib_inflate might ask for too many buffers when decompressing corrupted data. Signed-off-by: Phillip Lougher --- fs/squashfs/block.c | 13 +++++++++++-- fs/squashfs/cache.c | 4 ++-- fs/squashfs/squashfs.h | 2 +- fs/squashfs/super.c | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index c837dfc2b3c6..321728f48f2d 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -80,7 +80,7 @@ static struct buffer_head *get_block_length(struct super_block *sb, * generated a larger block - this does occasionally happen with zlib). */ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, - int length, u64 *next_index, int srclength) + int length, u64 *next_index, int srclength, int pages) { struct squashfs_sb_info *msblk = sb->s_fs_info; struct buffer_head **bh; @@ -185,6 +185,14 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, } if (msblk->stream.avail_out == 0) { + if (page == pages) { + ERROR("zlib_inflate tried to " + "decompress too much data, " + "expected %d bytes. Zlib " + "data probably corrupt\n", + srclength); + goto release_mutex; + } msblk->stream.next_out = buffer[page++]; msblk->stream.avail_out = PAGE_CACHE_SIZE; } @@ -268,7 +276,8 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, put_bh(bh[k]); read_failure: - ERROR("sb_bread failed reading block 0x%llx\n", cur_index); + ERROR("squashfs_read_data failed to read block 0x%llx\n", + (unsigned long long) index); kfree(bh); return -EIO; } diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index f29eda16d25e..1c4739e33af6 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -119,7 +119,7 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, entry->length = squashfs_read_data(sb, entry->data, block, length, &entry->next_index, - cache->block_size); + cache->block_size, cache->pages); spin_lock(&cache->lock); @@ -406,7 +406,7 @@ int squashfs_read_table(struct super_block *sb, void *buffer, u64 block, for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) data[i] = buffer; res = squashfs_read_data(sb, data, block, length | - SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length); + SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages); kfree(data); return res; } diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 6b2515d027d5..0e9feb6adf7e 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -34,7 +34,7 @@ static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) /* block.c */ extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, - int); + int, int); /* cache.c */ extern struct squashfs_cache *squashfs_cache_init(char *, int, int); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 071df5b5b491..681ec0d83799 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -389,7 +389,7 @@ static int __init init_squashfs_fs(void) return err; } - printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) " + printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) " "Phillip Lougher\n"); return 0; From edf2e2811efa9304ebe14f778d33b764cfd58b7a Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 5 Mar 2009 00:40:13 +0000 Subject: [PATCH 167/580] Squashfs: fix documentation typo, Cramfs filesystem limit is 256 MiB Signed-off-by: Phillip Lougher --- Documentation/filesystems/squashfs.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt index 3e79e4a7a392..b324c033035a 100644 --- a/Documentation/filesystems/squashfs.txt +++ b/Documentation/filesystems/squashfs.txt @@ -22,7 +22,7 @@ Squashfs filesystem features versus Cramfs: Squashfs Cramfs -Max filesystem size: 2^64 16 MiB +Max filesystem size: 2^64 256 MiB Max file size: ~ 2 TiB 16 MiB Max files: unlimited unlimited Max directories: unlimited unlimited From f4f8056a862a9950320429dfda708c88b4ce6025 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 5 Mar 2009 00:55:31 +0000 Subject: [PATCH 168/580] Squashfs: frag_size should be signed, as it can hold an error result Signed-off-by: Roel Kluin Signed-off-by: Phillip Lougher --- fs/squashfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 7a63398bb855..9101dbde39ec 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -133,7 +133,8 @@ int squashfs_read_inode(struct inode *inode, long long ino) type = le16_to_cpu(sqshb_ino->inode_type); switch (type) { case SQUASHFS_REG_TYPE: { - unsigned int frag_offset, frag_size, frag; + unsigned int frag_offset, frag; + int frag_size; u64 frag_blk; struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; @@ -175,7 +176,8 @@ int squashfs_read_inode(struct inode *inode, long long ino) break; } case SQUASHFS_LREG_TYPE: { - unsigned int frag_offset, frag_size, frag; + unsigned int frag_offset, frag; + int frag_size; u64 frag_blk; struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; From a883bf564ea555447a76682bb2d8d4bc92e23e0e Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Wed, 4 Mar 2009 17:38:10 -0800 Subject: [PATCH 169/580] pkt_sched: act_police: Fix a rate estimator test. A commit c1b56878fb68e9c14070939ea4537ad4db79ffae "tc: policing requires a rate estimator" introduced a test which invalidates previously working configs, based on examples from iproute2: doc/actions/actions-general. This is too rigorous: a rate estimator is needed only when police's "avrate" option is used. Reported-by: Joao Correia Diagnosed-by: John Dykstra Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/sched/act_police.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 5c72a116b1a4..f8f047b61245 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -183,13 +183,6 @@ static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est, if (R_tab == NULL) goto failure; - if (!est && (ret == ACT_P_CREATED || - !gen_estimator_active(&police->tcf_bstats, - &police->tcf_rate_est))) { - err = -EINVAL; - goto failure; - } - if (parm->peakrate.rate) { P_tab = qdisc_get_rtab(&parm->peakrate, tb[TCA_POLICE_PEAKRATE]); @@ -205,6 +198,12 @@ static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est, &police->tcf_lock, est); if (err) goto failure_unlock; + } else if (tb[TCA_POLICE_AVRATE] && + (ret == ACT_P_CREATED || + !gen_estimator_active(&police->tcf_bstats, + &police->tcf_rate_est))) { + err = -EINVAL; + goto failure_unlock; } /* No failure allowed after this point */ From 87786945fe4b0e60e8f1db62d5ee8a3cec539a67 Mon Sep 17 00:00:00 2001 From: Meelis Roos Date: Wed, 4 Mar 2009 04:59:41 +0000 Subject: [PATCH 170/580] tmspci: fix request_irq race Currently, tmspci tokenring driver crashes on device initialization because it requests its irq before initializing corresponding data structures. Fix this by moving request_irq call to a safer place. Signed-off-by: Meelis Roos Signed-off-by: David S. Miller --- drivers/net/tokenring/tmspci.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/tokenring/tmspci.c b/drivers/net/tokenring/tmspci.c index 5f601773c260..e2150b3c83d9 100644 --- a/drivers/net/tokenring/tmspci.c +++ b/drivers/net/tokenring/tmspci.c @@ -121,11 +121,6 @@ static int __devinit tms_pci_attach(struct pci_dev *pdev, const struct pci_devic goto err_out_trdev; } - ret = request_irq(pdev->irq, tms380tr_interrupt, IRQF_SHARED, - dev->name, dev); - if (ret) - goto err_out_region; - dev->base_addr = pci_ioaddr; dev->irq = pci_irq_line; dev->dma = 0; @@ -142,7 +137,7 @@ static int __devinit tms_pci_attach(struct pci_dev *pdev, const struct pci_devic ret = tmsdev_init(dev, &pdev->dev); if (ret) { printk("%s: unable to get memory for dev->priv.\n", dev->name); - goto err_out_irq; + goto err_out_region; } tp = netdev_priv(dev); @@ -157,6 +152,11 @@ static int __devinit tms_pci_attach(struct pci_dev *pdev, const struct pci_devic tp->tmspriv = cardinfo; + ret = request_irq(pdev->irq, tms380tr_interrupt, IRQF_SHARED, + dev->name, dev); + if (ret) + goto err_out_tmsdev; + dev->open = tms380tr_open; dev->stop = tms380tr_close; pci_set_drvdata(pdev, dev); @@ -164,15 +164,15 @@ static int __devinit tms_pci_attach(struct pci_dev *pdev, const struct pci_devic ret = register_netdev(dev); if (ret) - goto err_out_tmsdev; + goto err_out_irq; return 0; +err_out_irq: + free_irq(pdev->irq, dev); err_out_tmsdev: pci_set_drvdata(pdev, NULL); tmsdev_term(dev); -err_out_irq: - free_irq(pdev->irq, dev); err_out_region: release_region(pci_ioaddr, TMS_PCI_IO_EXTENT); err_out_trdev: From 7acab7a9ca6b0c5b820f083424c57e6ac2031d9d Mon Sep 17 00:00:00 2001 From: Enrik Berkhan Date: Thu, 5 Mar 2009 14:42:30 +0800 Subject: [PATCH 171/580] Blackfin arch: fix bug - The SPORT_HYS bit is not set for BF561 0.5 IMHO the setting should depend on ANOMALY_05000305 which is about the availability of the bit, not ANOMALY_05000265 which only describes the SPORT sensitivity to noise (checked for BF561 only, though). If that's not true for other BF variants, maybe the definition of ANOMALY_05000265 for BF561 should be changed to '(1)' instead. Signed-off-by: Enrik Berkhan Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu --- arch/blackfin/mach-common/clocks-init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/blackfin/mach-common/clocks-init.c b/arch/blackfin/mach-common/clocks-init.c index 9dddb6f8cc85..35393651359b 100644 --- a/arch/blackfin/mach-common/clocks-init.c +++ b/arch/blackfin/mach-common/clocks-init.c @@ -17,7 +17,7 @@ #define SDGCTL_WIDTH (1 << 31) /* SDRAM external data path width */ #define PLL_CTL_VAL \ (((CONFIG_VCO_MULT & 63) << 9) | CLKIN_HALF | \ - (PLL_BYPASS << 8) | (ANOMALY_05000265 ? 0x8000 : 0)) + (PLL_BYPASS << 8) | (ANOMALY_05000305 ? 0 : 0x8000)) __attribute__((l1_text)) static void do_sync(void) From 54acd0efab072cb70e87206329d561b297f93bbb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 4 Mar 2009 23:01:02 -0800 Subject: [PATCH 172/580] net: Fix missing dev->neigh_setup in register_netdevice(). Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev.c b/net/core/dev.c index 9e4afe650e7a..2dd484ed3dbb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4339,6 +4339,7 @@ int register_netdevice(struct net_device *dev) dev->do_ioctl = ops->ndo_do_ioctl; dev->set_config = ops->ndo_set_config; dev->change_mtu = ops->ndo_change_mtu; + dev->neigh_setup = ops->ndo_neigh_setup; dev->tx_timeout = ops->ndo_tx_timeout; dev->get_stats = ops->ndo_get_stats; dev->vlan_rx_register = ops->ndo_vlan_rx_register; From 9d40bbda599def1e1d155d7f7dca14fe8744bd2b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 4 Mar 2009 23:46:25 -0800 Subject: [PATCH 173/580] vlan: Fix vlan-in-vlan crashes. As analyzed by Patrick McHardy, vlan needs to reset it's netdev_ops pointer in it's ->init() function but this leaves the compat method pointers stale. Add a netdev_resync_ops() and call it from the vlan code. Any other driver which changes ->netdev_ops after register_netdevice() will need to call this new function after doing so too. With help from Patrick McHardy. Tested-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/8021q/vlan_dev.c | 3 ++- net/core/dev.c | 56 ++++++++++++++++++++++++--------------- 3 files changed, 37 insertions(+), 23 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ec54785d34f9..659366734f3f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1079,6 +1079,7 @@ extern void synchronize_net(void); extern int register_netdevice_notifier(struct notifier_block *nb); extern int unregister_netdevice_notifier(struct notifier_block *nb); extern int init_dummy_netdev(struct net_device *dev); +extern void netdev_resync_ops(struct net_device *dev); extern int call_netdevice_notifiers(unsigned long val, struct net_device *dev); extern struct net_device *dev_get_by_index(struct net *net, int ifindex); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 4a19acd3a32b..1b34135cf990 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -553,7 +553,7 @@ static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa) int err = 0; if (netif_device_present(real_dev) && ops->ndo_neigh_setup) - err = ops->ndo_neigh_setup(dev, pa); + err = ops->ndo_neigh_setup(real_dev, pa); return err; } @@ -639,6 +639,7 @@ static int vlan_dev_init(struct net_device *dev) dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN; dev->netdev_ops = &vlan_netdev_ops; } + netdev_resync_ops(dev); if (is_vlan_dev(real_dev)) subclass = 1; diff --git a/net/core/dev.c b/net/core/dev.c index 2dd484ed3dbb..f1129706ce7b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4282,6 +4282,39 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } EXPORT_SYMBOL(netdev_fix_features); +/* Some devices need to (re-)set their netdev_ops inside + * ->init() or similar. If that happens, we have to setup + * the compat pointers again. + */ +void netdev_resync_ops(struct net_device *dev) +{ +#ifdef CONFIG_COMPAT_NET_DEV_OPS + const struct net_device_ops *ops = dev->netdev_ops; + + dev->init = ops->ndo_init; + dev->uninit = ops->ndo_uninit; + dev->open = ops->ndo_open; + dev->change_rx_flags = ops->ndo_change_rx_flags; + dev->set_rx_mode = ops->ndo_set_rx_mode; + dev->set_multicast_list = ops->ndo_set_multicast_list; + dev->set_mac_address = ops->ndo_set_mac_address; + dev->validate_addr = ops->ndo_validate_addr; + dev->do_ioctl = ops->ndo_do_ioctl; + dev->set_config = ops->ndo_set_config; + dev->change_mtu = ops->ndo_change_mtu; + dev->neigh_setup = ops->ndo_neigh_setup; + dev->tx_timeout = ops->ndo_tx_timeout; + dev->get_stats = ops->ndo_get_stats; + dev->vlan_rx_register = ops->ndo_vlan_rx_register; + dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; + dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; +#ifdef CONFIG_NET_POLL_CONTROLLER + dev->poll_controller = ops->ndo_poll_controller; +#endif +#endif +} +EXPORT_SYMBOL(netdev_resync_ops); + /** * register_netdevice - register a network device * @dev: device to register @@ -4326,28 +4359,7 @@ int register_netdevice(struct net_device *dev) * This is temporary until all network devices are converted. */ if (dev->netdev_ops) { - const struct net_device_ops *ops = dev->netdev_ops; - - dev->init = ops->ndo_init; - dev->uninit = ops->ndo_uninit; - dev->open = ops->ndo_open; - dev->change_rx_flags = ops->ndo_change_rx_flags; - dev->set_rx_mode = ops->ndo_set_rx_mode; - dev->set_multicast_list = ops->ndo_set_multicast_list; - dev->set_mac_address = ops->ndo_set_mac_address; - dev->validate_addr = ops->ndo_validate_addr; - dev->do_ioctl = ops->ndo_do_ioctl; - dev->set_config = ops->ndo_set_config; - dev->change_mtu = ops->ndo_change_mtu; - dev->neigh_setup = ops->ndo_neigh_setup; - dev->tx_timeout = ops->ndo_tx_timeout; - dev->get_stats = ops->ndo_get_stats; - dev->vlan_rx_register = ops->ndo_vlan_rx_register; - dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; - dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; -#ifdef CONFIG_NET_POLL_CONTROLLER - dev->poll_controller = ops->ndo_poll_controller; -#endif + netdev_resync_ops(dev); } else { char drivername[64]; pr_info("%s (%s): not using net_device_ops yet\n", From f7e989ab64f926e34bafea0752431e1fd6a618f3 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Thu, 5 Mar 2009 17:33:36 +0800 Subject: [PATCH 174/580] Blackfin arch: make sure people do not set the kernel load address too high Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu --- arch/blackfin/mach-common/arch_checks.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/blackfin/mach-common/arch_checks.c b/arch/blackfin/mach-common/arch_checks.c index 98133b968f7b..b3a2e3fa15e3 100644 --- a/arch/blackfin/mach-common/arch_checks.c +++ b/arch/blackfin/mach-common/arch_checks.c @@ -62,3 +62,8 @@ #if (CONFIG_BOOT_LOAD & 0x3) # error "The kernel load address must be 4 byte aligned" #endif + +/* The entire kernel must be able to make a 24bit pcrel call to start of L1 */ +#if ((0xffffffff - L1_CODE_START + 1) + CONFIG_BOOT_LOAD) > 0x1000000 +# error "The kernel load address is too high; keep it below 10meg for safety" +#endif From c19577e34e641f802ad35fc0204ff68ea16fe7a3 Mon Sep 17 00:00:00 2001 From: Graf Yang Date: Thu, 5 Mar 2009 17:35:59 +0800 Subject: [PATCH 175/580] Blackfin arch: Fix bug - make ksz8893m driver available when bfin_mac is enabled Signed-off-by: Graf Yang Signed-off-by: Bryan Wu --- arch/blackfin/mach-bf518/boards/ezbrd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/blackfin/mach-bf518/boards/ezbrd.c b/arch/blackfin/mach-bf518/boards/ezbrd.c index 0e175342112e..b044de49ecf0 100644 --- a/arch/blackfin/mach-bf518/boards/ezbrd.c +++ b/arch/blackfin/mach-bf518/boards/ezbrd.c @@ -113,7 +113,6 @@ static struct platform_device bfin_mac_device = { .name = "bfin_mac", .dev.platform_data = &bfin_mii_bus, }; -#endif #if defined(CONFIG_NET_DSA_KSZ8893M) || defined(CONFIG_NET_DSA_KSZ8893M_MODULE) static struct dsa_platform_data ksz8893m_switch_data = { @@ -132,6 +131,7 @@ static struct platform_device ksz8893m_switch_device = { .dev.platform_data = &ksz8893m_switch_data, }; #endif +#endif #if defined(CONFIG_MTD_M25P80) \ || defined(CONFIG_MTD_M25P80_MODULE) @@ -171,6 +171,7 @@ static struct bfin5xx_spi_chip spi_adc_chip_info = { }; #endif +#if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE) #if defined(CONFIG_NET_DSA_KSZ8893M) \ || defined(CONFIG_NET_DSA_KSZ8893M_MODULE) /* SPI SWITCH CHIP */ @@ -179,6 +180,7 @@ static struct bfin5xx_spi_chip spi_switch_info = { .bits_per_word = 8, }; #endif +#endif #if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) static struct bfin5xx_spi_chip spi_mmc_chip_info = { @@ -259,6 +261,7 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { }, #endif +#if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE) #if defined(CONFIG_NET_DSA_KSZ8893M) \ || defined(CONFIG_NET_DSA_KSZ8893M_MODULE) { @@ -271,6 +274,7 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { .mode = SPI_MODE_3, }, #endif +#endif #if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) { @@ -630,11 +634,10 @@ static struct platform_device *stamp_devices[] __initdata = { #if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE) &bfin_mii_bus, &bfin_mac_device, -#endif - #if defined(CONFIG_NET_DSA_KSZ8893M) || defined(CONFIG_NET_DSA_KSZ8893M_MODULE) &ksz8893m_switch_device, #endif +#endif #if defined(CONFIG_SPI_BFIN) || defined(CONFIG_SPI_BFIN_MODULE) &bfin_spi0_device, From d7ff1a90b2d3d122b896056d63db919617e9b6cd Mon Sep 17 00:00:00 2001 From: Sonic Zhang Date: Thu, 5 Mar 2009 18:26:59 +0800 Subject: [PATCH 176/580] Blackfin arch: Fix bug - KGDB single step into the middle of a 4 bytes instruction on bf561 after soft bp is hit Run IFLUSH twice to avoid loading wrong instruction after invalidating icache and following sequence is met. 1) The one instruction address is cached in the icache. 2) This instruction in SDRAM is changed. 3) IFLASH[P0] is executed only once in lackfin_icache_flush_range(). 4) This instruction is executed again, but not the changed new one. Signed-off-by: Sonic Zhang Signed-off-by: Bryan Wu --- arch/blackfin/mach-common/cache.S | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/blackfin/mach-common/cache.S b/arch/blackfin/mach-common/cache.S index 3c98dacbf289..aa0648c6a9fe 100644 --- a/arch/blackfin/mach-common/cache.S +++ b/arch/blackfin/mach-common/cache.S @@ -66,11 +66,33 @@ /* Invalidate all instruction cache lines assocoiated with this memory area */ ENTRY(_blackfin_icache_flush_range) +/* + * Walkaround to avoid loading wrong instruction after invalidating icache + * and following sequence is met. + * + * 1) One instruction address is cached in the instruction cache. + * 2) This instruction in SDRAM is changed. + * 3) IFLASH[P0] is executed only once in blackfin_icache_flush_range(). + * 4) This instruction is executed again, but the old one is loaded. + */ + P0 = R0; + IFLUSH[P0]; do_flush IFLUSH, , nop ENDPROC(_blackfin_icache_flush_range) /* Flush all cache lines assocoiated with this area of memory. */ ENTRY(_blackfin_icache_dcache_flush_range) +/* + * Walkaround to avoid loading wrong instruction after invalidating icache + * and following sequence is met. + * + * 1) One instruction address is cached in the instruction cache. + * 2) This instruction in SDRAM is changed. + * 3) IFLASH[P0] is executed only once in blackfin_icache_flush_range(). + * 4) This instruction is executed again, but the old one is loaded. + */ + P0 = R0; + IFLUSH[P0]; do_flush FLUSH, IFLUSH ENDPROC(_blackfin_icache_dcache_flush_range) From 5047607f0170b1f797a2fe37bb45310c1e1d5ce0 Mon Sep 17 00:00:00 2001 From: Graf Yang Date: Thu, 5 Mar 2009 17:32:41 +0800 Subject: [PATCH 177/580] Blackfin arch: update default kernel config, select KSZ8893M driver for BF518 Signed-off-by: Graf Yang Signed-off-by: Bryan Wu --- arch/blackfin/configs/BF518F-EZBRD_defconfig | 63 ++++++++++++++++++-- 1 file changed, 59 insertions(+), 4 deletions(-) diff --git a/arch/blackfin/configs/BF518F-EZBRD_defconfig b/arch/blackfin/configs/BF518F-EZBRD_defconfig index 4fdb9e04759f..281f4b60e603 100644 --- a/arch/blackfin/configs/BF518F-EZBRD_defconfig +++ b/arch/blackfin/configs/BF518F-EZBRD_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.28-rc2 -# Fri Jan 9 17:58:41 2009 +# Linux kernel version: 2.6.28 +# Fri Feb 20 10:01:44 2009 # # CONFIG_MMU is not set # CONFIG_FPU is not set @@ -133,10 +133,15 @@ CONFIG_BF518=y # CONFIG_BF538 is not set # CONFIG_BF539 is not set # CONFIG_BF542 is not set +# CONFIG_BF542M is not set # CONFIG_BF544 is not set +# CONFIG_BF544M is not set # CONFIG_BF547 is not set +# CONFIG_BF547M is not set # CONFIG_BF548 is not set +# CONFIG_BF548M is not set # CONFIG_BF549 is not set +# CONFIG_BF549M is not set # CONFIG_BF561 is not set CONFIG_BF_REV_MIN=0 CONFIG_BF_REV_MAX=2 @@ -426,7 +431,17 @@ CONFIG_DEFAULT_TCP_CONG="cubic" # CONFIG_TIPC is not set # CONFIG_ATM is not set # CONFIG_BRIDGE is not set -# CONFIG_NET_DSA is not set +CONFIG_NET_DSA=y +# CONFIG_NET_DSA_TAG_DSA is not set +# CONFIG_NET_DSA_TAG_EDSA is not set +# CONFIG_NET_DSA_TAG_TRAILER is not set +CONFIG_NET_DSA_TAG_STPID=y +# CONFIG_NET_DSA_MV88E6XXX is not set +# CONFIG_NET_DSA_MV88E6060 is not set +# CONFIG_NET_DSA_MV88E6XXX_NEED_PPU is not set +# CONFIG_NET_DSA_MV88E6131 is not set +# CONFIG_NET_DSA_MV88E6123_61_65 is not set +CONFIG_NET_DSA_KSZ8893M=y # CONFIG_VLAN_8021Q is not set # CONFIG_DECNET is not set # CONFIG_LLC2 is not set @@ -529,6 +544,8 @@ CONFIG_MTD_COMPLEX_MAPPINGS=y # # Self-contained MTD device drivers # +# CONFIG_MTD_DATAFLASH is not set +# CONFIG_MTD_M25P80 is not set # CONFIG_MTD_SLRAM is not set # CONFIG_MTD_PHRAM is not set # CONFIG_MTD_MTDRAM is not set @@ -561,7 +578,9 @@ CONFIG_BLK_DEV_RAM_SIZE=4096 # CONFIG_BLK_DEV_HD is not set CONFIG_MISC_DEVICES=y # CONFIG_EEPROM_93CX6 is not set +# CONFIG_ICS932S401 is not set # CONFIG_ENCLOSURE_SERVICES is not set +# CONFIG_C2PORT is not set CONFIG_HAVE_IDE=y # CONFIG_IDE is not set @@ -607,6 +626,7 @@ CONFIG_BFIN_RX_DESC_NUM=20 # CONFIG_SMC91X is not set # CONFIG_SMSC911X is not set # CONFIG_DM9000 is not set +# CONFIG_ENC28J60 is not set # CONFIG_IBM_NEW_EMAC_ZMII is not set # CONFIG_IBM_NEW_EMAC_RGMII is not set # CONFIG_IBM_NEW_EMAC_TAH is not set @@ -764,7 +784,23 @@ CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 # CONFIG_I2C_DEBUG_ALGO is not set # CONFIG_I2C_DEBUG_BUS is not set # CONFIG_I2C_DEBUG_CHIP is not set -# CONFIG_SPI is not set +CONFIG_SPI=y +# CONFIG_SPI_DEBUG is not set +CONFIG_SPI_MASTER=y + +# +# SPI Master Controller Drivers +# +CONFIG_SPI_BFIN=y +# CONFIG_SPI_BFIN_LOCK is not set +# CONFIG_SPI_BITBANG is not set + +# +# SPI Protocol Masters +# +# CONFIG_SPI_AT25 is not set +# CONFIG_SPI_SPIDEV is not set +# CONFIG_SPI_TLE62X0 is not set CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y # CONFIG_GPIOLIB is not set # CONFIG_W1 is not set @@ -788,8 +824,10 @@ CONFIG_BFIN_WDT=y # CONFIG_MFD_SM501 is not set # CONFIG_HTC_PASIC3 is not set # CONFIG_MFD_TMIO is not set +# CONFIG_PMIC_DA903X is not set # CONFIG_MFD_WM8400 is not set # CONFIG_MFD_WM8350_I2C is not set +# CONFIG_REGULATOR is not set # # Multimedia devices @@ -861,10 +899,18 @@ CONFIG_RTC_INTF_DEV=y # CONFIG_RTC_DRV_M41T80 is not set # CONFIG_RTC_DRV_S35390A is not set # CONFIG_RTC_DRV_FM3130 is not set +# CONFIG_RTC_DRV_RX8581 is not set # # SPI RTC drivers # +# CONFIG_RTC_DRV_M41T94 is not set +# CONFIG_RTC_DRV_DS1305 is not set +# CONFIG_RTC_DRV_DS1390 is not set +# CONFIG_RTC_DRV_MAX6902 is not set +# CONFIG_RTC_DRV_R9701 is not set +# CONFIG_RTC_DRV_RS5C348 is not set +# CONFIG_RTC_DRV_DS3234 is not set # # Platform RTC drivers @@ -1062,12 +1108,20 @@ CONFIG_DEBUG_INFO=y # CONFIG_DEBUG_BLOCK_EXT_DEVT is not set # CONFIG_FAULT_INJECTION is not set CONFIG_SYSCTL_SYSCALL_CHECK=y + +# +# Tracers +# +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_BOOT_TRACER is not set # CONFIG_DYNAMIC_PRINTK_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set # CONFIG_DEBUG_STACKOVERFLOW is not set # CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_KGDB_TESTCASE is not set CONFIG_DEBUG_VERBOSE=y CONFIG_DEBUG_MMRS=y # CONFIG_DEBUG_HWERR is not set @@ -1100,6 +1154,7 @@ CONFIG_CRYPTO=y # # CONFIG_CRYPTO_FIPS is not set # CONFIG_CRYPTO_MANAGER is not set +# CONFIG_CRYPTO_MANAGER2 is not set # CONFIG_CRYPTO_GF128MUL is not set # CONFIG_CRYPTO_NULL is not set # CONFIG_CRYPTO_CRYPTD is not set From 72e2240f181871675d3a979766330c91d48a1673 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 5 Mar 2009 01:57:44 -0800 Subject: [PATCH 178/580] bonding: Fix device passed into ->ndo_neigh_setup(). Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 9fb388388fb7..e0578fe8c0db 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4113,7 +4113,7 @@ static int bond_neigh_setup(struct net_device *dev, struct neigh_parms *parms) const struct net_device_ops *slave_ops = slave->dev->netdev_ops; if (slave_ops->ndo_neigh_setup) - return slave_ops->ndo_neigh_setup(dev, parms); + return slave_ops->ndo_neigh_setup(slave->dev, parms); } return 0; } From a1aade478862fca8710904532cbc6efed97fd05a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Mar 2009 16:11:35 -0800 Subject: [PATCH 179/580] x86/doc: mini-howto for using earlyprintk=dbgp [ mingo: small edits and extensions. ] Signed-off-by: Yinghai Lu Cc: "Eric W. Biederman" Cc: Greg KH Cc: Randy Dunlap Cc: Alan Stern Cc: Sarah Sharp Cc: Andrew Morton LKML-Reference: <49AF18B7.4050305@kernel.org> Signed-off-by: Ingo Molnar --- Documentation/x86/earlyprintk.txt | 101 ++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 Documentation/x86/earlyprintk.txt diff --git a/Documentation/x86/earlyprintk.txt b/Documentation/x86/earlyprintk.txt new file mode 100644 index 000000000000..607b1a016064 --- /dev/null +++ b/Documentation/x86/earlyprintk.txt @@ -0,0 +1,101 @@ + +Mini-HOWTO for using the earlyprintk=dbgp boot option with a +USB2 Debug port key and a debug cable, on x86 systems. + +You need two computers, the 'USB debug key' special gadget and +and two USB cables, connected like this: + + [host/target] <-------> [USB debug key] <-------> [client/console] + +1. There are three specific hardware requirements: + + a.) Host/target system needs to have USB debug port capability. + + You can check this capability by looking at a 'Debug port' bit in + the lspci -vvv output: + + # lspci -vvv + ... + 00:1d.7 USB Controller: Intel Corporation 82801H (ICH8 Family) USB2 EHCI Controller #1 (rev 03) (prog-if 20 [EHCI]) + Subsystem: Lenovo ThinkPad T61 + Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx- + Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- SERR- /proc/sysrq-trigger + + On the host/target system you should see this help line in "dmesg" output: + + SysRq : HELP : loglevel(0-9) reBoot Crashdump terminate-all-tasks(E) memory-full-oom-kill(F) kill-all-tasks(I) saK show-backtrace-all-active-cpus(L) show-memory-usage(M) nice-all-RT-tasks(N) powerOff show-registers(P) show-all-timers(Q) unRaw Sync show-task-states(T) Unmount show-blocked-tasks(W) dump-ftrace-buffer(Z) + + On the client/console system do: + + cat /dev/ttyUSB0 + + And you should see the help line above displayed shortly after you've + provoked it on the host system. + +If it does not work then please ask about it on the linux-kernel@vger.kernel.org +mailing list or contact the x86 maintainers. From 00049522425e8390d1815e1579733644ad2bb0c7 Mon Sep 17 00:00:00 2001 From: Robin Getz Date: Thu, 5 Mar 2009 18:18:49 +0800 Subject: [PATCH 180/580] Blackfin arch: Random read/write errors are a bad thing Random read/write errors are a bad thing - so don't let anyone (including the test bench) run on something we know is bad. Signed-off-by: Robin Getz Signed-off-by: Bryan Wu --- arch/blackfin/kernel/setup.c | 4 ++++ arch/blackfin/mach-common/arch_checks.c | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/arch/blackfin/kernel/setup.c b/arch/blackfin/kernel/setup.c index e5c116230800..4646b985cc9d 100644 --- a/arch/blackfin/kernel/setup.c +++ b/arch/blackfin/kernel/setup.c @@ -889,6 +889,10 @@ void __init setup_arch(char **cmdline_p) CPU, bfin_revid()); } + /* We can't run on BF548-0.1 due to ANOMALY 05000448 */ + if (bfin_cpuid() == 0x27de && bfin_revid() == 1) + panic("You can't run on this processor due to 05000448\n"); + printk(KERN_INFO "Blackfin Linux support by http://blackfin.uclinux.org/\n"); printk(KERN_INFO "Processor Speed: %lu MHz core clock and %lu MHz System Clock\n", diff --git a/arch/blackfin/mach-common/arch_checks.c b/arch/blackfin/mach-common/arch_checks.c index b3a2e3fa15e3..a2ca26aa210d 100644 --- a/arch/blackfin/mach-common/arch_checks.c +++ b/arch/blackfin/mach-common/arch_checks.c @@ -67,3 +67,9 @@ #if ((0xffffffff - L1_CODE_START + 1) + CONFIG_BOOT_LOAD) > 0x1000000 # error "The kernel load address is too high; keep it below 10meg for safety" #endif + +#ifdef ANOMALY_05000448 +# if ANOMALY_05000448 +# error You are using a part with anomaly 05000448, this issue causes random memory read/write failures - that means random crashes. +# endif +#endif From 1400b3faab8fedfffde5a7fe47098e2732d4aa76 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 4 Mar 2009 16:02:46 -0600 Subject: [PATCH 181/580] x86: UV, SGI RTC: fix uv_time.c for UP Fix non-smp build of uv_time.c. Signed-off-by: Dimitri Sivanich LKML-Reference: <20090304220246.GC6288@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_time.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 6f8e3256ab2a..2ffb6c53326e 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #define RTC_NAME "sgi_rtc" @@ -84,7 +86,7 @@ static void uv_rtc_send_IPI(int cpu) unsigned long apicid, val; int pnode; - apicid = per_cpu(x86_cpu_to_apicid, cpu); + apicid = cpu_physical_id(cpu); pnode = uv_apicid_to_pnode(apicid); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | From ba0dade4bbe6bb26b975323726f7214278d5e994 Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Thu, 5 Mar 2009 18:41:24 +0800 Subject: [PATCH 182/580] Blackfin arch: fix bug - On bf548-ezkit, ethernet fails to work after wakeup from "mem" Signed-off-by: Michael Hennerich Signed-off-by: Bryan Wu --- arch/blackfin/mach-common/dpmc_modes.S | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/blackfin/mach-common/dpmc_modes.S b/arch/blackfin/mach-common/dpmc_modes.S index 4da50bcd9300..8009a512fb11 100644 --- a/arch/blackfin/mach-common/dpmc_modes.S +++ b/arch/blackfin/mach-common/dpmc_modes.S @@ -376,10 +376,22 @@ ENTRY(_do_hibernate) #endif #ifdef PINT0_ASSIGN + PM_SYS_PUSH(PINT0_MASK_SET) + PM_SYS_PUSH(PINT1_MASK_SET) + PM_SYS_PUSH(PINT2_MASK_SET) + PM_SYS_PUSH(PINT3_MASK_SET) PM_SYS_PUSH(PINT0_ASSIGN) PM_SYS_PUSH(PINT1_ASSIGN) PM_SYS_PUSH(PINT2_ASSIGN) PM_SYS_PUSH(PINT3_ASSIGN) + PM_SYS_PUSH(PINT0_INVERT_SET) + PM_SYS_PUSH(PINT1_INVERT_SET) + PM_SYS_PUSH(PINT2_INVERT_SET) + PM_SYS_PUSH(PINT3_INVERT_SET) + PM_SYS_PUSH(PINT0_EDGE_SET) + PM_SYS_PUSH(PINT1_EDGE_SET) + PM_SYS_PUSH(PINT2_EDGE_SET) + PM_SYS_PUSH(PINT3_EDGE_SET) #endif PM_SYS_PUSH(EBIU_AMBCTL0) @@ -714,10 +726,22 @@ ENTRY(_do_hibernate) PM_SYS_POP(EBIU_AMBCTL0) #ifdef PINT0_ASSIGN + PM_SYS_POP(PINT3_EDGE_SET) + PM_SYS_POP(PINT2_EDGE_SET) + PM_SYS_POP(PINT1_EDGE_SET) + PM_SYS_POP(PINT0_EDGE_SET) + PM_SYS_POP(PINT3_INVERT_SET) + PM_SYS_POP(PINT2_INVERT_SET) + PM_SYS_POP(PINT1_INVERT_SET) + PM_SYS_POP(PINT0_INVERT_SET) PM_SYS_POP(PINT3_ASSIGN) PM_SYS_POP(PINT2_ASSIGN) PM_SYS_POP(PINT1_ASSIGN) PM_SYS_POP(PINT0_ASSIGN) + PM_SYS_POP(PINT3_MASK_SET) + PM_SYS_POP(PINT2_MASK_SET) + PM_SYS_POP(PINT1_MASK_SET) + PM_SYS_POP(PINT0_MASK_SET) #endif #ifdef SICA_IWR1 From 8e2a769414604e016fcb3d0f15a2050b5d0d90c0 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Thu, 5 Mar 2009 18:45:07 +0800 Subject: [PATCH 183/580] Blackfin arch: mark init_pda as __init as only __init funcs all it Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu --- arch/blackfin/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c index d0532b72bba5..9c3629b9a689 100644 --- a/arch/blackfin/mm/init.c +++ b/arch/blackfin/mm/init.c @@ -104,7 +104,7 @@ void __init paging_init(void) } } -asmlinkage void init_pda(void) +asmlinkage void __init init_pda(void) { unsigned int cpu = raw_smp_processor_id(); From 27276ba21fe590edc43305f483565aa02010bbbb Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Thu, 5 Mar 2009 18:47:20 +0800 Subject: [PATCH 184/580] Blackfin arch: remove spurious dash when dcache is off Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu --- arch/blackfin/kernel/setup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/blackfin/kernel/setup.c b/arch/blackfin/kernel/setup.c index 4646b985cc9d..a58687bdee6a 100644 --- a/arch/blackfin/kernel/setup.c +++ b/arch/blackfin/kernel/setup.c @@ -1145,12 +1145,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) icache_size = 0; seq_printf(m, "cache size\t: %d KB(L1 icache) " - "%d KB(L1 dcache-%s) %d KB(L2 cache)\n", + "%d KB(L1 dcache%s) %d KB(L2 cache)\n", icache_size, dcache_size, #if defined CONFIG_BFIN_WB - "wb" + "-wb" #elif defined CONFIG_BFIN_WT - "wt" + "-wt" #endif "", 0); From 7786ce823b1c9a3de01a63857e3451890cb4e81c Mon Sep 17 00:00:00 2001 From: Jie Zhang Date: Thu, 5 Mar 2009 18:50:26 +0800 Subject: [PATCH 185/580] Blackfin arch: fix bug - gdb signull case make trunk kernel panic frequently Use copy_to_user_page and copy_from_user_page instead of memcpy. copy_to_user_page does cache flush when necessary. Signed-off-by: Jie Zhang Signed-off-by: Bryan Wu --- arch/blackfin/kernel/ptrace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index 594e325b40e4..d76618db50df 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #define TEXT_OFFSET 0 @@ -240,7 +241,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) } else if (addr >= FIXED_CODE_START && addr + sizeof(tmp) <= FIXED_CODE_END) { - memcpy(&tmp, (const void *)(addr), sizeof(tmp)); + copy_from_user_page(0, 0, 0, &tmp, (const void *)(addr), sizeof(tmp)); copied = sizeof(tmp); } else @@ -320,7 +321,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) } else if (addr >= FIXED_CODE_START && addr + sizeof(data) <= FIXED_CODE_END) { - memcpy((void *)(addr), &data, sizeof(data)); + copy_to_user_page(0, 0, 0, (void *)(addr), &data, sizeof(data)); copied = sizeof(data); } else From a1a15ac5f9aeee521c048a88fc1aec848e623de7 Mon Sep 17 00:00:00 2001 From: Kris Shannon Date: Mon, 2 Mar 2009 19:47:37 +1100 Subject: [PATCH 186/580] Fix kernel NULL pointer dereference in xen-blkfront When booting Xen Dom0 on a pre-release 3.2.1 hypervisor the system Oopses on a "Unable to handle kernel NULL pointer dereference" in xenwatch. From the backtrace it looks like backend_changed is calling bdget_disk with a NULL pointer. Checking for NULL and returning ENODEV instead allows the kernel to boot. --- drivers/block/xen-blkfront.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index b6c8ce254359..8f905089b72b 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -977,6 +977,8 @@ static void backend_changed(struct xenbus_device *dev, break; case XenbusStateClosing: + if (info->gd == NULL) + xenbus_dev_fatal(dev, -ENODEV, "gd is NULL"); bd = bdget_disk(info->gd, 0); if (bd == NULL) xenbus_dev_fatal(dev, -ENODEV, "bdget failed"); From 5e18cfd04feca78cc08a6b8b71a60a610de81eaa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Feb 2009 08:10:26 +0100 Subject: [PATCH 187/580] cciss: remove 30 second initial timeout on controller reset Commit 5e4c91c84b194b26cf592779e451f4b5be777cba forgot to remove the initial sleep, get rid of it. Thanks to Randy Dunlap for spotting this error. Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index b5a061114630..4f9b6d792017 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -3606,11 +3606,9 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, if (cciss_hard_reset_controller(pdev) || cciss_reset_msi(pdev)) return -ENODEV; - /* Some devices (notably the HP Smart Array 5i Controller) - need a little pause here */ - schedule_timeout_uninterruptible(30*HZ); - - /* Now try to get the controller to respond to a no-op */ + /* Now try to get the controller to respond to a no-op. Some + devices (notably the HP Smart Array 5i Controller) need + up to 30 seconds to respond. */ for (i=0; i<30; i++) { if (cciss_noop(pdev) == 0) break; From a3941ec101a5ec54c1e929730afeb196441a171e Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 5 Mar 2009 08:03:53 +0100 Subject: [PATCH 188/580] loop: don't increment p->offset with (size_t) -EINVAL Upon a 'transfer error block' size is set to -EINVAL, but this becomes positive since size is unsigned: p->offset still gets incremented. Signed-off-by: Roel Kluin Signed-off-by: Jens Axboe --- drivers/block/loop.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index edbaac6c0573..bf0345577672 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -392,8 +392,7 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct loop_device *lo = p->lo; struct page *page = buf->page; sector_t IV; - size_t size; - int ret; + int size, ret; ret = buf->ops->confirm(pipe, buf); if (unlikely(ret)) From fb7b75abe58acfa586445e430c29412090ad2058 Mon Sep 17 00:00:00 2001 From: Alon Bar-Lev Date: Thu, 5 Mar 2009 19:42:43 +0800 Subject: [PATCH 189/580] Blackfin arch: cleanup bfin_sport.h header and export it to userspace Signed-off-by: Alon Bar-Lev Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu --- arch/blackfin/include/asm/Kbuild | 1 + arch/blackfin/include/asm/bfin_sport.h | 45 ++++++++------------------ 2 files changed, 15 insertions(+), 31 deletions(-) diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild index 606ecfdcc962..09c31418cc08 100644 --- a/arch/blackfin/include/asm/Kbuild +++ b/arch/blackfin/include/asm/Kbuild @@ -1,3 +1,4 @@ include include/asm-generic/Kbuild.asm +unifdef-y += bfin_sport.h unifdef-y += fixed_code.h diff --git a/arch/blackfin/include/asm/bfin_sport.h b/arch/blackfin/include/asm/bfin_sport.h index fe88a2c19213..65a651db5b07 100644 --- a/arch/blackfin/include/asm/bfin_sport.h +++ b/arch/blackfin/include/asm/bfin_sport.h @@ -1,30 +1,9 @@ /* - * File: include/asm-blackfin/bfin_sport.h - * Based on: - * Author: Roy Huang (roy.huang@analog.com) + * bfin_sport.h - userspace header for bfin sport driver * - * Created: Thu Aug. 24 2006 - * Description: + * Copyright 2004-2008 Analog Devices Inc. * - * Modified: - * Copyright 2004-2006 Analog Devices Inc. - * - * Bugs: Enter bugs at http://blackfin.uclinux.org/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see the file COPYING, or write - * to the Free Software Foundation, Inc., - * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * Licensed under the GPL-2 or later. */ #ifndef __BFIN_SPORT_H__ @@ -42,11 +21,10 @@ #define NORM_FORMAT 0x0 #define ALAW_FORMAT 0x2 #define ULAW_FORMAT 0x3 -struct sport_register; /* Function driver which use sport must initialize the structure */ struct sport_config { - /*TDM (multichannels), I2S or other mode */ + /* TDM (multichannels), I2S or other mode */ unsigned int mode:3; /* if TDM mode is selected, channels must be set */ @@ -72,12 +50,18 @@ struct sport_config { int serial_clk; int fsync_clk; - unsigned int data_format:2; /*Normal, u-law or a-law */ + unsigned int data_format:2; /* Normal, u-law or a-law */ int word_len; /* How length of the word in bits, 3-32 bits */ int dma_enabled; }; +/* Userspace interface */ +#define SPORT_IOC_MAGIC 'P' +#define SPORT_IOC_CONFIG _IOWR('P', 0x01, struct sport_config) + +#ifdef __KERNEL__ + struct sport_register { unsigned short tcr1; unsigned short reserved0; @@ -117,9 +101,6 @@ struct sport_register { unsigned long mrcs3; }; -#define SPORT_IOC_MAGIC 'P' -#define SPORT_IOC_CONFIG _IOWR('P', 0x01, struct sport_config) - struct sport_dev { struct cdev cdev; /* Char device structure */ @@ -149,6 +130,8 @@ struct sport_dev { struct sport_config config; }; +#endif + #define SPORT_TCR1 0 #define SPORT_TCR2 1 #define SPORT_TCLKDIV 2 @@ -169,4 +152,4 @@ struct sport_dev { #define SPORT_MRCS2 22 #define SPORT_MRCS3 23 -#endif /*__BFIN_SPORT_H__*/ +#endif From e7d3ef13d52a126438f687a1a32da65ff926ed57 Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Wed, 4 Mar 2009 11:59:46 -0800 Subject: [PATCH 190/580] libata: change drive ready wait after hard reset to 5s This fixes problems during resume with drives that take longer than 1s to be ready. The ATA-6 spec appears to allow 5 seconds for a drive to be ready. On one affected system, this patch changes "PM: resume devices took..." message from 17 seconds to 4 seconds, and gets rid of a lot of ugly timeout/error messages. Without this patch, the libata code moves on after 1s, tries to send a soft reset (which the drive doesn't see because it isn't ready) which also times out, then an IDENTIFY command is sent to the drive which times out, and finally the error handler will try to send another hard reset which will finally get things working. Signed-off-by: Stuart Hayes Signed-off-by: Andrew Morton Signed-off-by: Jeff Garzik --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/libata.h b/include/linux/libata.h index 5d87bc09a1f5..dd818c7decd7 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -275,7 +275,7 @@ enum { * advised to wait only for the following duration before * doing SRST. */ - ATA_TMOUT_PMP_SRST_WAIT = 1000, + ATA_TMOUT_PMP_SRST_WAIT = 5000, /* ATA bus states */ BUS_UNKNOWN = 0, From 5825627c9463581fd9e70f8285685889ae5bb9bb Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 27 Feb 2009 17:35:43 +0900 Subject: [PATCH 191/580] libata: fix dma_unmap_sg misuse libata passes the returned value of dma_map_sg() to dma_unmap_sg(),which is the misuse of dma_unmap_sg(). DMA-mapping.txt says: To unmap a scatterlist, just call: pci_unmap_sg(pdev, sglist, nents, direction); Again, make sure DMA activity has already finished. PLEASE NOTE: The 'nents' argument to the pci_unmap_sg call must be the _same_ one you passed into the pci_map_sg call, it should _NOT_ be the 'count' value _returned_ from the pci_map_sg call. Signed-off-by: FUJITA Tomonori Acked-by: Bartlomiej Zolnierkiewicz Acked-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 4 ++-- include/linux/libata.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 9fbf0595f3d4..5e324cea3019 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4612,7 +4612,7 @@ void ata_sg_clean(struct ata_queued_cmd *qc) VPRINTK("unmapping %u sg elements\n", qc->n_elem); if (qc->n_elem) - dma_unmap_sg(ap->dev, sg, qc->n_elem, dir); + dma_unmap_sg(ap->dev, sg, qc->orig_n_elem, dir); qc->flags &= ~ATA_QCFLAG_DMAMAP; qc->sg = NULL; @@ -4727,7 +4727,7 @@ static int ata_sg_setup(struct ata_queued_cmd *qc) return -1; DPRINTK("%d sg elements mapped\n", n_elem); - + qc->orig_n_elem = qc->n_elem; qc->n_elem = n_elem; qc->flags |= ATA_QCFLAG_DMAMAP; diff --git a/include/linux/libata.h b/include/linux/libata.h index dd818c7decd7..fbf064e13ad5 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -530,6 +530,7 @@ struct ata_queued_cmd { unsigned long flags; /* ATA_QCFLAG_xxx */ unsigned int tag; unsigned int n_elem; + unsigned int orig_n_elem; int dma_dir; From 84bda12af31f930e4200c5244aa111de2485d7b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 Mar 2009 18:53:26 +0900 Subject: [PATCH 192/580] libata: align ap->sector_buf ap->sector_buf is used as DMA target and should at least be aligned on cacheline. This caused problems on some embedded machines. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- include/linux/libata.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/libata.h b/include/linux/libata.h index fbf064e13ad5..dc18b87ed722 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -751,7 +751,8 @@ struct ata_port { acpi_handle acpi_handle; struct ata_acpi_gtm __acpi_init_gtm; /* use ata_acpi_init_gtm() */ #endif - u8 sector_buf[ATA_SECT_SIZE]; /* owned by EH */ + /* owned by EH */ + u8 sector_buf[ATA_SECT_SIZE] ____cacheline_aligned; }; /* The following initializer overrides a method to NULL whether one of From b53570814692db79c3525523b6e9ec9874416c04 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 Mar 2009 18:55:16 +0900 Subject: [PATCH 193/580] libata: don't use on-stack sense buffer sense_buffer is used as DMA target and shouldn't be allocated on stack. Use ap->sector_buf instead. This problem is spotted by Chuck Ebbert. Signed-off-by: Tejun Heo Reported-by: Chuck Ebbert Signed-off-by: Jeff Garzik --- drivers/ata/libata-eh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index ce2ef0475339..009ccc7c9865 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -2901,7 +2901,7 @@ static int atapi_eh_clear_ua(struct ata_device *dev) int i; for (i = 0; i < ATA_EH_UA_TRIES; i++) { - u8 sense_buffer[SCSI_SENSE_BUFFERSIZE]; + u8 *sense_buffer = dev->link->ap->sector_buf; u8 sense_key = 0; unsigned int err_mask; From 7adbe46b9289794f8fe629cd78c876169741177f Mon Sep 17 00:00:00 2001 From: peerchen Date: Fri, 27 Feb 2009 16:58:41 +0800 Subject: [PATCH 194/580] ahci: Add the Device IDs for MCP89 and remove IDs of MCP7B to/from ahci.c Added the Device IDs for MCP89 AHCI controller. Removed the IDs of MCP7B because this chipset had been cancelled. Signed-off-by: Peer Chen Signed-off-by: Jeff Garzik --- drivers/ata/ahci.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index a603bbf9b1b7..66e012cd3271 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -582,18 +582,18 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(NVIDIA, 0x0abd), board_ahci }, /* MCP79 */ { PCI_VDEVICE(NVIDIA, 0x0abe), board_ahci }, /* MCP79 */ { PCI_VDEVICE(NVIDIA, 0x0abf), board_ahci }, /* MCP79 */ - { PCI_VDEVICE(NVIDIA, 0x0bc8), board_ahci }, /* MCP7B */ - { PCI_VDEVICE(NVIDIA, 0x0bc9), board_ahci }, /* MCP7B */ - { PCI_VDEVICE(NVIDIA, 0x0bca), board_ahci }, /* MCP7B */ - { PCI_VDEVICE(NVIDIA, 0x0bcb), board_ahci }, /* MCP7B */ - { PCI_VDEVICE(NVIDIA, 0x0bcc), board_ahci }, /* MCP7B */ - { PCI_VDEVICE(NVIDIA, 0x0bcd), board_ahci }, /* MCP7B */ - { PCI_VDEVICE(NVIDIA, 0x0bce), board_ahci }, /* MCP7B */ - { PCI_VDEVICE(NVIDIA, 0x0bcf), board_ahci }, /* MCP7B */ - { PCI_VDEVICE(NVIDIA, 0x0bc4), board_ahci }, /* MCP7B */ - { PCI_VDEVICE(NVIDIA, 0x0bc5), board_ahci }, /* MCP7B */ - { PCI_VDEVICE(NVIDIA, 0x0bc6), board_ahci }, /* MCP7B */ - { PCI_VDEVICE(NVIDIA, 0x0bc7), board_ahci }, /* MCP7B */ + { PCI_VDEVICE(NVIDIA, 0x0d84), board_ahci }, /* MCP89 */ + { PCI_VDEVICE(NVIDIA, 0x0d85), board_ahci }, /* MCP89 */ + { PCI_VDEVICE(NVIDIA, 0x0d86), board_ahci }, /* MCP89 */ + { PCI_VDEVICE(NVIDIA, 0x0d87), board_ahci }, /* MCP89 */ + { PCI_VDEVICE(NVIDIA, 0x0d88), board_ahci }, /* MCP89 */ + { PCI_VDEVICE(NVIDIA, 0x0d89), board_ahci }, /* MCP89 */ + { PCI_VDEVICE(NVIDIA, 0x0d8a), board_ahci }, /* MCP89 */ + { PCI_VDEVICE(NVIDIA, 0x0d8b), board_ahci }, /* MCP89 */ + { PCI_VDEVICE(NVIDIA, 0x0d8c), board_ahci }, /* MCP89 */ + { PCI_VDEVICE(NVIDIA, 0x0d8d), board_ahci }, /* MCP89 */ + { PCI_VDEVICE(NVIDIA, 0x0d8e), board_ahci }, /* MCP89 */ + { PCI_VDEVICE(NVIDIA, 0x0d8f), board_ahci }, /* MCP89 */ /* SiS */ { PCI_VDEVICE(SI, 0x1184), board_ahci }, /* SiS 966 */ From 55f784c826af2506e417bcc484d7e0e4d27f1977 Mon Sep 17 00:00:00 2001 From: Brandon Ehle Date: Sun, 1 Mar 2009 00:02:49 -0800 Subject: [PATCH 195/580] sata_nv: fix module parameter description Update MODULE_PARM_DESC for ADMA to reflect the fact that the option is disabled by default. Signed-off-by: Brandon Ehle Signed-off-by: Jeff Garzik --- drivers/ata/sata_nv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c index 55a8eed3f3a3..f65b53785a8f 100644 --- a/drivers/ata/sata_nv.c +++ b/drivers/ata/sata_nv.c @@ -2523,7 +2523,7 @@ static void __exit nv_exit(void) module_init(nv_init); module_exit(nv_exit); module_param_named(adma, adma_enabled, bool, 0444); -MODULE_PARM_DESC(adma, "Enable use of ADMA (Default: true)"); +MODULE_PARM_DESC(adma, "Enable use of ADMA (Default: false)"); module_param_named(swncq, swncq_enabled, bool, 0444); MODULE_PARM_DESC(swncq, "Enable use of SWNCQ (Default: true)"); From d6515e6ff4ad3db4bd5ef2dd4e1026a7aca2482e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Mar 2009 15:59:30 +0900 Subject: [PATCH 196/580] libata: make sure port is thawed when skipping resets When SCR access is available and the link is offline, softreset is skipped as it only wastes time and some controllers don't respond very well. However, the skip path forgot to thaw the port, which not only blocks further event notification from the port but also causes repeated EH invocations on the same event on drivers which rely on ->thaw() to clear events if the IRQ is shared with another device or port. This problem has always been there but is uncovered by recent sata_nv nf2/3 change which dropped hardreset support while maintaining SCR access. nf2/3 doesn't clear hotplug event mask from the interrupt handler but relies on ->thaw() to clear them. When the hardreset was there, the reset action was never skipped and the port was always thawed but, with the hardreset gone, ->prereset() determines that there's no need for softreset and both ->softreset() and ->thaw() are skipped. This leads to stuck hotplug event in the IRQ status register triggering hotplug event whenever IRQ is delieverd on the same IRQ. As the controller shares the same IRQ for both ports, this happens on every IO if one port is occpupied and the other isn't. This patch fixes the problem by making sure that the port is thawed on reset-skip path. bko#11615 reports this problem. Signed-off-by: Tejun Heo Cc: Robert Hancock Reported-by: Dan Andresan Reported-by: Arne Woerner Reported-by: Stefan Lippers-Hollmann Signed-off-by: Jeff Garzik --- drivers/ata/libata-eh.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 009ccc7c9865..ea890911d4fa 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -2423,11 +2423,14 @@ int ata_eh_reset(struct ata_link *link, int classify, } /* prereset() might have cleared ATA_EH_RESET. If so, - * bang classes and return. + * bang classes, thaw and return. */ if (reset && !(ehc->i.action & ATA_EH_RESET)) { ata_for_each_dev(dev, link, ALL) classes[dev->devno] = ATA_DEV_NONE; + if ((ap->pflags & ATA_PFLAG_FROZEN) && + ata_is_host_link(link)) + ata_eh_thaw_port(ap); rc = 0; goto out; } From 968e594afdbc40b4270f9d4032ae8350475749d6 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Mon, 16 Feb 2009 20:15:08 -0600 Subject: [PATCH 197/580] libata: Don't trust current capacity values in identify words 57-58 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hanno Böck reported a problem where an old Conner CP30254 240MB hard drive was reported as 1.1TB in capacity by libata: http://lkml.org/lkml/2009/2/13/134 This was caused by libata trusting the drive's reported current capacity in sectors in identify words 57 and 58 if the drive does not support LBA and the current CHS translation values appear valid. Unfortunately it seems older ATA specs were vague about what this field should contain and a number of drives used values with wrong byte order or that were totally bogus. There's no unique information that it conveys and so we can just calculate the number of sectors from the reported current CHS values. While we're at it, clean up this function to use named constants for the identify word values. Signed-off-by: Robert Hancock Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 5e324cea3019..060bcd601f57 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -1322,14 +1322,16 @@ static u64 ata_id_n_sectors(const u16 *id) { if (ata_id_has_lba(id)) { if (ata_id_has_lba48(id)) - return ata_id_u64(id, 100); + return ata_id_u64(id, ATA_ID_LBA_CAPACITY_2); else - return ata_id_u32(id, 60); + return ata_id_u32(id, ATA_ID_LBA_CAPACITY); } else { if (ata_id_current_chs_valid(id)) - return ata_id_u32(id, 57); + return id[ATA_ID_CUR_CYLS] * id[ATA_ID_CUR_HEADS] * + id[ATA_ID_CUR_SECTORS]; else - return id[1] * id[3] * id[6]; + return id[ATA_ID_CYLS] * id[ATA_ID_HEADS] * + id[ATA_ID_SECTORS]; } } From c3f5d2d8b5fa6eb0cc1c47fd162bf6432f206f42 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:54:52 +0200 Subject: [PATCH 198/580] x86: init_memory_mapping() trivial cleanups Impact: cleanup To reduce the diff between the 32-bit and 64-bit versions of init_memory_mapping(), fix up all trivial issues. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-1-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 42 +++++++++++++++++++++++++----------------- arch/x86/mm/init_64.c | 26 +++++++++++++++----------- 2 files changed, 40 insertions(+), 28 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c351456d06dc..ad4e03c2d4df 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -868,11 +868,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse) table_start >>= PAGE_SHIFT; table_end = table_start; - table_top = table_start + (tables>>PAGE_SHIFT); + table_top = table_start + (tables >> PAGE_SHIFT); printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, - (table_start << PAGE_SHIFT) + tables); + end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); } struct map_range { @@ -899,8 +898,13 @@ static int save_mr(struct map_range *mr, int nr_range, return nr_range; } +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ unsigned long __init_refok init_memory_mapping(unsigned long start, - unsigned long end) + unsigned long end) { pgd_t *pgd_base = swapper_pg_dir; unsigned long page_size_mask = 0; @@ -911,7 +915,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, int nr_range, i; int use_pse; - printk(KERN_INFO "init_memory_mapping: %08lx-%08lx\n", start, end); + printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); #ifdef CONFIG_DEBUG_PAGEALLOC /* @@ -940,19 +944,19 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __supported_pte_mask |= _PAGE_GLOBAL; } - memset(mr, 0, sizeof(mr)); - nr_range = 0; - if (use_pse) page_size_mask |= 1 << PG_LEVEL_2M; + memset(mr, 0, sizeof(mr)); + nr_range = 0; + /* * Don't use a large page for the first 2/4MB of memory * because there are often fixed size MTRRs in there * and overlapping MTRRs into large pages can cause * slowdowns. */ - /* head could not be big page alignment ? */ + /* head if not big page alignment ? */ start_pfn = start >> PAGE_SHIFT; pos = start_pfn << PAGE_SHIFT; if (pos == 0) @@ -960,14 +964,14 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, else end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (end_pfn > (end>>PAGE_SHIFT)) - end_pfn = end>>PAGE_SHIFT; + if (end_pfn > (end >> PAGE_SHIFT)) + end_pfn = end >> PAGE_SHIFT; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); pos = end_pfn << PAGE_SHIFT; } - /* big page range */ + /* big page (2M) range */ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); @@ -977,7 +981,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, pos = end_pfn << PAGE_SHIFT; } - /* tail is not big page alignment ? */ + /* tail is not big page (2M) alignment */ start_pfn = pos>>PAGE_SHIFT; end_pfn = end>>PAGE_SHIFT; if (start_pfn < end_pfn) @@ -998,13 +1002,17 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, } for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " %08lx - %08lx page %s\n", - mr[i].start, mr[i].end, - (mr[i].page_size_mask & (1<> PUD_SHIFT; tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + if (use_gbpages) { unsigned long extra; + extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; } else pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); if (use_pse) { unsigned long extra; + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); /* @@ -647,7 +652,6 @@ static int save_mr(struct map_range *mr, int nr_range, unsigned long start_pfn, unsigned long end_pfn, unsigned long page_size_mask) { - if (start_pfn < end_pfn) { if (nr_range >= NR_RANGE_MR) panic("run out of range for init_memory_mapping\n"); @@ -679,13 +683,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ if (!after_bootmem) init_gbpages(); @@ -709,7 +706,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, memset(mr, 0, sizeof(mr)); nr_range = 0; - /* head if not big page alignment ?*/ + /* head if not big page alignment ? */ start_pfn = start >> PAGE_SHIFT; pos = start_pfn << PAGE_SHIFT; end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) @@ -721,7 +718,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, pos = end_pfn << PAGE_SHIFT; } - /* big page (2M) range*/ + /* big page (2M) range */ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) @@ -769,7 +766,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, /* move it */ old_start = mr[i].start; memmove(&mr[i], &mr[i+1], - (nr_range - 1 - i) * sizeof (struct map_range)); + (nr_range - 1 - i) * sizeof(struct map_range)); mr[i--].start = old_start; nr_range--; } @@ -780,6 +777,13 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, (mr[i].page_size_mask & (1< Date: Thu, 5 Mar 2009 14:54:53 +0200 Subject: [PATCH 199/580] x86: add gbpages support to 32-bit init_memory_mapping() Impact: cleanup To reduce the diff between the 32-bit and 64-bit versions of init_memory_mapping(), add gbpages support to the 32-bit version. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-2-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ad4e03c2d4df..5fad0f95d5a3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -65,6 +65,8 @@ static unsigned long __meminitdata table_top; static int __initdata after_init_bootmem; +int direct_gbpages; + static __init void *alloc_low_page(void) { unsigned long pfn = table_end++; @@ -831,14 +833,22 @@ void __init setup_bootmem_allocator(void) after_init_bootmem = 1; } -static void __init find_early_table_space(unsigned long end, int use_pse) +static void __init find_early_table_space(unsigned long end, int use_pse, + int use_gbpages) { unsigned long puds, pmds, ptes, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + if (use_gbpages) { + unsigned long extra; + + extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); + pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); if (use_pse) { @@ -913,7 +923,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, struct map_range mr[NR_RANGE_MR]; int nr_range, i; - int use_pse; + int use_pse, use_gbpages; printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); @@ -923,9 +933,10 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ - use_pse = 0; + use_pse = use_gbpages = 0; #else use_pse = cpu_has_pse; + use_gbpages = direct_gbpages; #endif #ifdef CONFIG_X86_PAE @@ -944,6 +955,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __supported_pte_mask |= _PAGE_GLOBAL; } + if (use_gbpages) + page_size_mask |= 1 << PG_LEVEL_1G; if (use_pse) page_size_mask |= 1 << PG_LEVEL_2M; @@ -1015,7 +1028,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * nodes are discovered. */ if (!after_init_bootmem) - find_early_table_space(end, use_pse); + find_early_table_space(end, use_pse, use_gbpages); for (i = 0; i < nr_range; i++) kernel_physical_mapping_init(pgd_base, From 49a2bf7303b0dc5fccbb3ff7cf2e7751f0e3953d Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:54:54 +0200 Subject: [PATCH 200/580] x86: find_early_table_space() unification Impact: cleanup There are some minor differences between the 32-bit and 64-bit find_early_table_space() functions. This patch wraps those differences under CONFIG_X86_32 to make the function identical on both configurations. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-3-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 9 +++++++++ arch/x86/mm/init_64.c | 14 ++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5fad0f95d5a3..86a99947455b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -855,24 +855,33 @@ static void __init find_early_table_space(unsigned long end, int use_pse, unsigned long extra; extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); +#ifdef CONFIG_X86_32 extra += PMD_SIZE; +#endif ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); +#ifdef CONFIG_X86_32 /* for fixmap */ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); +#endif /* * RED-PEN putting page tables only on node 0 could * cause a hotspot and fill up ZONE_DMA. The page tables * need roughly 0.5KB per GB. */ +#ifdef CONFIG_X86_32 start = 0x7000; table_start = find_e820_area(start, max_pfn_mapped<>PMD_SHIFT) << PMD_SHIFT); +#ifdef CONFIG_X86_32 + extra += PMD_SIZE; +#endif ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); +#ifdef CONFIG_X86_32 + /* for fixmap */ + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); +#endif + /* * RED-PEN putting page tables only on node 0 could * cause a hotspot and fill up ZONE_DMA. The page tables * need roughly 0.5KB per GB. */ +#ifdef CONFIG_X86_32 + start = 0x7000; + table_start = find_e820_area(start, max_pfn_mapped< Date: Thu, 5 Mar 2009 14:54:55 +0200 Subject: [PATCH 201/580] x86: move pgd_base out of init_memory_mapping() Impact: cleanup This patch moves pgd_base out of init_memory_mapping() to reduce the diff between the 32-bit version and the 64-bit version of the function. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-4-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 86a99947455b..cfc68d601380 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -227,11 +227,11 @@ static inline int is_kernel_text(unsigned long addr) * of max_low_pfn pages, by creating page tables starting from address * PAGE_OFFSET: */ -static void __init kernel_physical_mapping_init(pgd_t *pgd_base, - unsigned long start_pfn, +static void __init kernel_physical_mapping_init(unsigned long start_pfn, unsigned long end_pfn, int use_pse) { + pgd_t *pgd_base = swapper_pg_dir; int pgd_idx, pmd_idx, pte_ofs; unsigned long pfn; pgd_t *pgd; @@ -509,8 +509,9 @@ void __init native_pagetable_setup_done(pgd_t *base) * be partially populated, and so it avoids stomping on any existing * mappings. */ -static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) +static void __init early_ioremap_page_table_range_init(void) { + pgd_t *pgd_base = swapper_pg_dir; unsigned long vaddr, end; /* @@ -925,7 +926,6 @@ static int save_mr(struct map_range *mr, int nr_range, unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) { - pgd_t *pgd_base = swapper_pg_dir; unsigned long page_size_mask = 0; unsigned long start_pfn, end_pfn; unsigned long pos; @@ -1040,12 +1040,12 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, find_early_table_space(end, use_pse, use_gbpages); for (i = 0; i < nr_range; i++) - kernel_physical_mapping_init(pgd_base, + kernel_physical_mapping_init( mr[i].start >> PAGE_SHIFT, mr[i].end >> PAGE_SHIFT, mr[i].page_size_mask == (1< Date: Thu, 5 Mar 2009 14:54:56 +0200 Subject: [PATCH 202/580] x86: ifdef 32-bit specific setup in init_memory_mapping() Impact: cleanup Enabling NX, PSE, and PGE are only required on 32-bit so ifdef them in both versions of the function. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-5-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 ++ arch/x86/mm/init_64.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cfc68d601380..eb98cb90cb39 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -948,6 +948,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, use_gbpages = direct_gbpages; #endif +#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_PAE set_nx(); if (nx_enabled) @@ -963,6 +964,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, set_in_cr4(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } +#endif if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 151e5ba34412..c3c0be5b6373 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -712,6 +712,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, use_gbpages = direct_gbpages; #endif +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_PAE + set_nx(); + if (nx_enabled) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +#endif + + /* Enable PSE if available */ + if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); + + /* Enable PGE if available */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); + __supported_pte_mask |= _PAGE_GLOBAL; + } +#endif + if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; if (use_pse) From 96083ca11bc85265c7ef9e791a57e3514d8f605a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:54:57 +0200 Subject: [PATCH 203/580] x86: remove unnecessary save_mr() sanity check Impact: cleanup The save_mr() function already checks that start_pfn is less than end_pfn so we can remove the unnecessary check which reduces the diff between the 32-bit and the 64-bit versions of init_memory_mapping(). Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-6-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index eb98cb90cb39..559715b488bb 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1008,8 +1008,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, /* tail is not big page (2M) alignment */ start_pfn = pos>>PAGE_SHIFT; end_pfn = end>>PAGE_SHIFT; - if (start_pfn < end_pfn) - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); /* try to merge same page size and continuous */ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { From c464573cb3d3bdd45eed8f5f59596f84ede95a0c Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:54:58 +0200 Subject: [PATCH 204/580] x86: rename after_init_bootmem to after_bootmem in mm/init_32.c Impact: cleanup This patch renames after_init_bootmem to after_bootmem in mm/init_32.c to reduce the diff to the 64-bit version of of init_memory_mapping(). Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-7-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 559715b488bb..cc5c3992385e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -63,7 +63,7 @@ static unsigned long __initdata table_start; static unsigned long __meminitdata table_end; static unsigned long __meminitdata table_top; -static int __initdata after_init_bootmem; +int after_bootmem; int direct_gbpages; @@ -92,7 +92,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { - if (after_init_bootmem) + if (after_bootmem) pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); else pmd_table = (pmd_t *)alloc_low_page(); @@ -119,7 +119,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = NULL; - if (after_init_bootmem) { + if (after_bootmem) { #ifdef CONFIG_DEBUG_PAGEALLOC page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif @@ -158,7 +158,7 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, pte_t *newpte; int i; - BUG_ON(after_init_bootmem); + BUG_ON(after_bootmem); newpte = alloc_low_page(); for (i = 0; i < PTRS_PER_PTE; i++) set_pte(newpte + i, pte[i]); @@ -831,7 +831,7 @@ void __init setup_bootmem_allocator(void) bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap); #endif - after_init_bootmem = 1; + after_bootmem = 1; } static void __init find_early_table_space(unsigned long end, int use_pse, @@ -1037,7 +1037,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * memory mapped. Unfortunately this is done currently before the * nodes are discovered. */ - if (!after_init_bootmem) + if (!after_bootmem) find_early_table_space(end, use_pse, use_gbpages); for (i = 0; i < nr_range; i++) @@ -1052,11 +1052,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); - if (!after_init_bootmem) + if (!after_bootmem) reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); - if (!after_init_bootmem) + if (!after_bootmem) early_memtest(start, end); return end >> PAGE_SHIFT; From cbba65796df99f3ca9bf70d14e5a19384c54b6a1 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:54:59 +0200 Subject: [PATCH 205/580] x86: unify kernel_physical_mapping_init() call in init_memory_mapping() Impact: cleanup The 64-bit version of init_memory_mapping() uses the last mapped address returned from kernel_physical_mapping_init() whereas the 32-bit version doesn't. This patch adds relevant ifdefs to both versions of the function to reduce the diff between them. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-8-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 10 +++++++++- arch/x86/mm/init_64.c | 21 +++++++++++++-------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cc5c3992385e..00c1d8508258 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -929,6 +929,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long page_size_mask = 0; unsigned long start_pfn, end_pfn; unsigned long pos; + unsigned long ret; struct map_range mr[NR_RANGE_MR]; int nr_range, i; @@ -1040,11 +1041,18 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, if (!after_bootmem) find_early_table_space(end, use_pse, use_gbpages); +#ifdef CONFIG_X86_32 for (i = 0; i < nr_range; i++) kernel_physical_mapping_init( mr[i].start >> PAGE_SHIFT, mr[i].end >> PAGE_SHIFT, mr[i].page_size_mask == (1<> PAGE_SHIFT; + return ret >> PAGE_SHIFT; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index c3c0be5b6373..e4fadea2e521 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -686,10 +686,10 @@ static int save_mr(struct map_range *mr, int nr_range, unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) { - unsigned long last_map_addr = 0; unsigned long page_size_mask = 0; unsigned long start_pfn, end_pfn; unsigned long pos; + unsigned long ret; struct map_range mr[NR_RANGE_MR]; int nr_range, i; @@ -819,10 +819,18 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, if (!after_bootmem) find_early_table_space(end, use_pse, use_gbpages); +#ifdef CONFIG_X86_32 for (i = 0; i < nr_range; i++) - last_map_addr = kernel_physical_mapping_init( - mr[i].start, mr[i].end, - mr[i].page_size_mask); + kernel_physical_mapping_init( + mr[i].start >> PAGE_SHIFT, + mr[i].end >> PAGE_SHIFT, + mr[i].page_size_mask == (1<> PAGE_SHIFT; + return ret >> PAGE_SHIFT; } #ifndef CONFIG_NUMA From d58e854e36ddf241ebc243e4122c5ab087bf38df Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:00 +0200 Subject: [PATCH 206/580] x86: add table start and end sanity checks to 32-bit init_memory_mapping() Impact: cleanup This patch adds a sanity check to the 32-bit version of init_memory_mapping() to reduce the diff to the 64-bit version. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-9-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 00c1d8508258..0a3707fb973b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1060,7 +1060,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); - if (!after_bootmem) + if (!after_bootmem && table_end > table_start) reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); From 01ced9ec14ad1b4f8a533c2f2b5a4fe4c92c1099 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:01 +0200 Subject: [PATCH 207/580] x86: ifdef 32-bit and 64-bit setup in init_memory_mapping() Impact: cleanup To reduce the diff between the 32-bit and 64-bit versions of init_memory_mapping(), ifdef configuration specific setup code in the function. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-10-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 6 ++++++ arch/x86/mm/init_64.c | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0a3707fb973b..3f91bdc20971 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1054,10 +1054,16 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, mr[i].page_size_mask); #endif +#ifdef CONFIG_X86_32 early_ioremap_page_table_range_init(); load_cr3(swapper_pg_dir); +#endif +#ifdef CONFIG_X86_64 + if (!after_bootmem) + mmu_cr4_features = read_cr4(); +#endif __flush_tlb_all(); if (!after_bootmem && table_end > table_start) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e4fadea2e521..5ecb23a57d2f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -832,8 +832,16 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, mr[i].page_size_mask); #endif +#ifdef CONFIG_X86_32 + early_ioremap_page_table_range_init(); + + load_cr3(swapper_pg_dir); +#endif + +#ifdef CONFIG_X86_64 if (!after_bootmem) mmu_cr4_features = read_cr4(); +#endif __flush_tlb_all(); if (!after_bootmem && table_end > table_start) From c338d6f60fc29dfc74bd82b91526ef43ba992bab Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:02 +0200 Subject: [PATCH 208/580] x86: ifdef 32-bit and 64-bit pfn setup in init_memory_mapping() Impact: cleanup To reduce the diff between the 32-bit and 64-bit versions of init_memory_mapping(), ifdef configuration specific pfn setup code in the function. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-11-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 42 +++++++++++++++++++++++++++++++++++++++--- arch/x86/mm/init_64.c | 21 +++++++++++++++++++++ 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3f91bdc20971..34760e483972 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -975,20 +975,25 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, memset(mr, 0, sizeof(mr)); nr_range = 0; + /* head if not big page alignment ? */ + start_pfn = start >> PAGE_SHIFT; + pos = start_pfn << PAGE_SHIFT; +#ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory * because there are often fixed size MTRRs in there * and overlapping MTRRs into large pages can cause * slowdowns. */ - /* head if not big page alignment ? */ - start_pfn = start >> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; if (pos == 0) end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); else end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#endif if (end_pfn > (end >> PAGE_SHIFT)) end_pfn = end >> PAGE_SHIFT; if (start_pfn < end_pfn) { @@ -999,13 +1004,44 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, /* big page (2M) range */ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#ifdef CONFIG_X86_32 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) + end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); +#endif + if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & + ((1<>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<>PAGE_SHIFT; end_pfn = end>>PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5ecb23a57d2f..d99bc6ac4884 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -741,8 +741,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, /* head if not big page alignment ? */ start_pfn = start >> PAGE_SHIFT; pos = start_pfn << PAGE_SHIFT; +#ifdef CONFIG_X86_32 + /* + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. + */ + if (pos == 0) + end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + else + end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#endif if (end_pfn > (end >> PAGE_SHIFT)) end_pfn = end >> PAGE_SHIFT; if (start_pfn < end_pfn) { @@ -753,16 +767,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, /* big page (2M) range */ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#ifdef CONFIG_X86_32 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); +#endif + if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); @@ -783,6 +803,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, page_size_mask & (1<>PAGE_SHIFT; From b47e3418c52b26f6143fc696326ae52a21324551 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:03 +0200 Subject: [PATCH 209/580] x86: ifdef 32-bit and 64-bit NR_RANGE_MR for save_mr() unification Impact: cleanup As a trivial preparation for moving common code to arc/x86/mm/init.c, ifdef the 32-bit and 64-bit versions of NR_RANGE_MR. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-12-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++++ arch/x86/mm/init_64.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 34760e483972..f59e9b851637 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -900,7 +900,11 @@ struct map_range { unsigned page_size_mask; }; +#ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 +#else /* CONFIG_X86_64 */ +#define NR_RANGE_MR 5 +#endif static int save_mr(struct map_range *mr, int nr_range, unsigned long start_pfn, unsigned long end_pfn, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d99bc6ac4884..d101990e4635 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -660,7 +660,11 @@ struct map_range { unsigned page_size_mask; }; +#ifdef CONFIG_X86_32 +#define NR_RANGE_MR 3 +#else /* CONFIG_X86_64 */ #define NR_RANGE_MR 5 +#endif static int save_mr(struct map_range *mr, int nr_range, unsigned long start_pfn, unsigned long end_pfn, From 0c0f756fd679d9747d52dad51fce3a5bb362eec3 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:04 +0200 Subject: [PATCH 210/580] x86: add stub init_gbpages() for 32-bit init_memory_mapping() Impact: cleanup This patch adds an empty static inline init_gbpages() for the 32-bit version of init_memory_mapping() making both versions identical. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-13-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f59e9b851637..cd3c24b490a1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -922,6 +922,10 @@ static int save_mr(struct map_range *mr, int nr_range, return nr_range; } +static inline void init_gbpages(void) +{ +} + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from @@ -941,6 +945,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); + if (!after_bootmem) + init_gbpages(); + #ifdef CONFIG_DEBUG_PAGEALLOC /* * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. From f765090a2617b8d9cb73b71e0aa850c29460d8be Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:05 +0200 Subject: [PATCH 211/580] x86: move init_memory_mapping() to common mm/init.c Impact: cleanup This patch moves the init_memory_mapping() function to common mm/init.c. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-14-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 328 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/init_32.c | 308 +-------------------------------------- arch/x86/mm/init_64.c | 314 +--------------------------------------- 3 files changed, 342 insertions(+), 608 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index cc7fe660f33d..3a21b136da24 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -2,10 +2,338 @@ #include #include +#include #include #include #include #include +#include + +#ifdef CONFIG_X86_32 +extern void __init early_ioremap_page_table_range_init(void); +extern void __init kernel_physical_mapping_init(unsigned long start_pfn, + unsigned long end_pfn, + int use_pse); +#endif + +#ifdef CONFIG_X86_64 +extern unsigned long __meminit +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask); +#endif + +unsigned long __initdata table_start; +unsigned long __meminitdata table_end; +unsigned long __meminitdata table_top; + +int after_bootmem; + +int direct_gbpages +#ifdef CONFIG_DIRECT_GBPAGES + = 1 +#endif +; + +static void __init find_early_table_space(unsigned long end, int use_pse, + int use_gbpages) +{ + unsigned long puds, pmds, ptes, tables, start; + + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + + if (use_gbpages) { + unsigned long extra; + + extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); + pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + + if (use_pse) { + unsigned long extra; + + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); +#ifdef CONFIG_X86_32 + extra += PMD_SIZE; +#endif + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); + +#ifdef CONFIG_X86_32 + /* for fixmap */ + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); +#endif + + /* + * RED-PEN putting page tables only on node 0 could + * cause a hotspot and fill up ZONE_DMA. The page tables + * need roughly 0.5KB per GB. + */ +#ifdef CONFIG_X86_32 + start = 0x7000; + table_start = find_e820_area(start, max_pfn_mapped<>= PAGE_SHIFT; + table_end = table_start; + table_top = table_start + (tables >> PAGE_SHIFT); + + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", + end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); +} + +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +#ifdef CONFIG_X86_32 +#define NR_RANGE_MR 3 +#else /* CONFIG_X86_64 */ +#define NR_RANGE_MR 5 +#endif + +static int save_mr(struct map_range *mr, int nr_range, + unsigned long start_pfn, unsigned long end_pfn, + unsigned long page_size_mask) +{ + if (start_pfn < end_pfn) { + if (nr_range >= NR_RANGE_MR) + panic("run out of range for init_memory_mapping\n"); + mr[nr_range].start = start_pfn<> PAGE_SHIFT; + pos = start_pfn << PAGE_SHIFT; +#ifdef CONFIG_X86_32 + /* + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. + */ + if (pos == 0) + end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + else + end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#endif + if (end_pfn > (end >> PAGE_SHIFT)) + end_pfn = end >> PAGE_SHIFT; + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + pos = end_pfn << PAGE_SHIFT; + } + + /* big page (2M) range */ + start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#ifdef CONFIG_X86_32 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) + end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); +#endif + + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & + ((1<>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<>PAGE_SHIFT; + end_pfn = end>>PAGE_SHIFT; + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + + /* try to merge same page size and continuous */ + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { + unsigned long old_start; + if (mr[i].end != mr[i+1].start || + mr[i].page_size_mask != mr[i+1].page_size_mask) + continue; + /* move it */ + old_start = mr[i].start; + memmove(&mr[i], &mr[i+1], + (nr_range - 1 - i) * sizeof(struct map_range)); + mr[i--].start = old_start; + nr_range--; + } + + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG " %010lx - %010lx page %s\n", + mr[i].start, mr[i].end, + (mr[i].page_size_mask & (1<> PAGE_SHIFT, + mr[i].end >> PAGE_SHIFT, + mr[i].page_size_mask == (1< table_start) + reserve_early(table_start << PAGE_SHIFT, + table_end << PAGE_SHIFT, "PGTABLE"); + + if (!after_bootmem) + early_memtest(start, end); + + return ret >> PAGE_SHIFT; +} + /* * devmem_is_allowed() checks to see if /dev/mem access to a certain address diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cd3c24b490a1..187522a0c66b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -59,13 +59,9 @@ unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); -static unsigned long __initdata table_start; -static unsigned long __meminitdata table_end; -static unsigned long __meminitdata table_top; - -int after_bootmem; - -int direct_gbpages; +extern unsigned long __initdata table_start; +extern unsigned long __meminitdata table_end; +extern unsigned long __meminitdata table_top; static __init void *alloc_low_page(void) { @@ -227,9 +223,9 @@ static inline int is_kernel_text(unsigned long addr) * of max_low_pfn pages, by creating page tables starting from address * PAGE_OFFSET: */ -static void __init kernel_physical_mapping_init(unsigned long start_pfn, - unsigned long end_pfn, - int use_pse) +void __init kernel_physical_mapping_init(unsigned long start_pfn, + unsigned long end_pfn, + int use_pse) { pgd_t *pgd_base = swapper_pg_dir; int pgd_idx, pmd_idx, pte_ofs; @@ -509,7 +505,7 @@ void __init native_pagetable_setup_done(pgd_t *base) * be partially populated, and so it avoids stomping on any existing * mappings. */ -static void __init early_ioremap_page_table_range_init(void) +void __init early_ioremap_page_table_range_init(void) { pgd_t *pgd_base = swapper_pg_dir; unsigned long vaddr, end; @@ -834,296 +830,6 @@ void __init setup_bootmem_allocator(void) after_bootmem = 1; } -static void __init find_early_table_space(unsigned long end, int use_pse, - int use_gbpages) -{ - unsigned long puds, pmds, ptes, tables, start; - - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - - if (use_gbpages) { - unsigned long extra; - - extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); - pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - - if (use_pse) { - unsigned long extra; - - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); - -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - - /* - * RED-PEN putting page tables only on node 0 could - * cause a hotspot and fill up ZONE_DMA. The page tables - * need roughly 0.5KB per GB. - */ -#ifdef CONFIG_X86_32 - start = 0x7000; - table_start = find_e820_area(start, max_pfn_mapped<>= PAGE_SHIFT; - table_end = table_start; - table_top = table_start + (tables >> PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); -} - -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; - -#ifdef CONFIG_X86_32 -#define NR_RANGE_MR 3 -#else /* CONFIG_X86_64 */ -#define NR_RANGE_MR 5 -#endif - -static int save_mr(struct map_range *mr, int nr_range, - unsigned long start_pfn, unsigned long end_pfn, - unsigned long page_size_mask) -{ - if (start_pfn < end_pfn) { - if (nr_range >= NR_RANGE_MR) - panic("run out of range for init_memory_mapping\n"); - mr[nr_range].start = start_pfn<> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; -#ifdef CONFIG_X86_32 - /* - * Don't use a large page for the first 2/4MB of memory - * because there are often fixed size MTRRs in there - * and overlapping MTRRs into large pages can cause - * slowdowns. - */ - if (pos == 0) - end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); - else - end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); -#else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); -#endif - if (end_pfn > (end >> PAGE_SHIFT)) - end_pfn = end >> PAGE_SHIFT; - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = end_pfn << PAGE_SHIFT; - } - - /* big page (2M) range */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); -#ifdef CONFIG_X86_32 - end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); -#else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); -#endif - - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & - ((1<>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<>PAGE_SHIFT; - end_pfn = end>>PAGE_SHIFT; - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - - /* try to merge same page size and continuous */ - for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { - unsigned long old_start; - if (mr[i].end != mr[i+1].start || - mr[i].page_size_mask != mr[i+1].page_size_mask) - continue; - /* move it */ - old_start = mr[i].start; - memmove(&mr[i], &mr[i+1], - (nr_range - 1 - i) * sizeof(struct map_range)); - mr[i--].start = old_start; - nr_range--; - } - - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " %010lx - %010lx page %s\n", - mr[i].start, mr[i].end, - (mr[i].page_size_mask & (1<> PAGE_SHIFT, - mr[i].end >> PAGE_SHIFT, - mr[i].page_size_mask == (1< table_start) - reserve_early(table_start << PAGE_SHIFT, - table_end << PAGE_SHIFT, "PGTABLE"); - - if (!after_bootmem) - early_memtest(start, end); - - return ret >> PAGE_SHIFT; -} - - /* * paging_init() sets up the page tables - note that the first 8MB are * already mapped by head.S. diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d101990e4635..a32fe0756088 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -61,12 +61,6 @@ static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -int direct_gbpages -#ifdef CONFIG_DIRECT_GBPAGES - = 1 -#endif -; - static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -87,8 +81,6 @@ early_param("gbpages", parse_direct_gbpages_on); * around without checking the pgd every time. */ -int after_bootmem; - pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; EXPORT_SYMBOL_GPL(__supported_pte_mask); @@ -291,9 +283,9 @@ void __init cleanup_highmap(void) } } -static unsigned long __initdata table_start; -static unsigned long __meminitdata table_end; -static unsigned long __meminitdata table_top; +extern unsigned long __initdata table_start; +extern unsigned long __meminitdata table_end; +extern unsigned long __meminitdata table_top; static __ref void *alloc_low_page(unsigned long *phys) { @@ -547,77 +539,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, return phys_pud_init(pud, addr, end, page_size_mask); } -static void __init find_early_table_space(unsigned long end, int use_pse, - int use_gbpages) -{ - unsigned long puds, pmds, ptes, tables, start; - - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - - if (use_gbpages) { - unsigned long extra; - - extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); - pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - - if (use_pse) { - unsigned long extra; - - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); - -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - - /* - * RED-PEN putting page tables only on node 0 could - * cause a hotspot and fill up ZONE_DMA. The page tables - * need roughly 0.5KB per GB. - */ -#ifdef CONFIG_X86_32 - start = 0x7000; - table_start = find_e820_area(start, max_pfn_mapped<>= PAGE_SHIFT; - table_end = table_start; - table_top = table_start + (tables >> PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); -} - -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -} - -static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, - unsigned long end, - unsigned long page_size_mask) +unsigned long __meminit +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) { unsigned long next, last_map_addr = end; @@ -654,231 +579,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, return last_map_addr; } -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; - -#ifdef CONFIG_X86_32 -#define NR_RANGE_MR 3 -#else /* CONFIG_X86_64 */ -#define NR_RANGE_MR 5 -#endif - -static int save_mr(struct map_range *mr, int nr_range, - unsigned long start_pfn, unsigned long end_pfn, - unsigned long page_size_mask) -{ - if (start_pfn < end_pfn) { - if (nr_range >= NR_RANGE_MR) - panic("run out of range for init_memory_mapping\n"); - mr[nr_range].start = start_pfn<> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; -#ifdef CONFIG_X86_32 - /* - * Don't use a large page for the first 2/4MB of memory - * because there are often fixed size MTRRs in there - * and overlapping MTRRs into large pages can cause - * slowdowns. - */ - if (pos == 0) - end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); - else - end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); -#else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); -#endif - if (end_pfn > (end >> PAGE_SHIFT)) - end_pfn = end >> PAGE_SHIFT; - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = end_pfn << PAGE_SHIFT; - } - - /* big page (2M) range */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); -#ifdef CONFIG_X86_32 - end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); -#else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); -#endif - - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & - ((1<>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<>PAGE_SHIFT; - end_pfn = end>>PAGE_SHIFT; - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - - /* try to merge same page size and continuous */ - for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { - unsigned long old_start; - if (mr[i].end != mr[i+1].start || - mr[i].page_size_mask != mr[i+1].page_size_mask) - continue; - /* move it */ - old_start = mr[i].start; - memmove(&mr[i], &mr[i+1], - (nr_range - 1 - i) * sizeof(struct map_range)); - mr[i--].start = old_start; - nr_range--; - } - - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " %010lx - %010lx page %s\n", - mr[i].start, mr[i].end, - (mr[i].page_size_mask & (1<> PAGE_SHIFT, - mr[i].end >> PAGE_SHIFT, - mr[i].page_size_mask == (1< table_start) - reserve_early(table_start << PAGE_SHIFT, - table_end << PAGE_SHIFT, "PGTABLE"); - - if (!after_bootmem) - early_memtest(start, end); - - return ret >> PAGE_SHIFT; -} - #ifndef CONFIG_NUMA void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { From 298af9d89f3f5292e81a0a00f729c415adc4d8fb Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:06 +0200 Subject: [PATCH 212/580] x86: fix up some bad global variable names in mm/init.c Impact: cleanup The table_start, table_end, and table_top are too generic for global namespace so rename them to be more specific. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-15-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 26 +++++++++++++------------- arch/x86/mm/init_32.c | 14 +++++++------- arch/x86/mm/init_64.c | 10 +++++----- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 3a21b136da24..5bbdfe7459d2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -23,9 +23,9 @@ kernel_physical_mapping_init(unsigned long start, unsigned long page_size_mask); #endif -unsigned long __initdata table_start; -unsigned long __meminitdata table_end; -unsigned long __meminitdata table_top; +unsigned long __initdata e820_table_start; +unsigned long __meminitdata e820_table_end; +unsigned long __meminitdata e820_table_top; int after_bootmem; @@ -78,21 +78,21 @@ static void __init find_early_table_space(unsigned long end, int use_pse, */ #ifdef CONFIG_X86_32 start = 0x7000; - table_start = find_e820_area(start, max_pfn_mapped<>= PAGE_SHIFT; - table_end = table_start; - table_top = table_start + (tables >> PAGE_SHIFT); + e820_table_start >>= PAGE_SHIFT; + e820_table_end = e820_table_start; + e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); + end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); } struct map_range { @@ -324,9 +324,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, #endif __flush_tlb_all(); - if (!after_bootmem && table_end > table_start) - reserve_early(table_start << PAGE_SHIFT, - table_end << PAGE_SHIFT, "PGTABLE"); + if (!after_bootmem && e820_table_end > e820_table_start) + reserve_early(e820_table_start << PAGE_SHIFT, + e820_table_end << PAGE_SHIFT, "PGTABLE"); if (!after_bootmem) early_memtest(start, end); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 187522a0c66b..e9df0d9cdeb6 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -59,16 +59,16 @@ unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); -extern unsigned long __initdata table_start; -extern unsigned long __meminitdata table_end; -extern unsigned long __meminitdata table_top; +extern unsigned long __initdata e820_table_start; +extern unsigned long __meminitdata e820_table_end; +extern unsigned long __meminitdata e820_table_top; static __init void *alloc_low_page(void) { - unsigned long pfn = table_end++; + unsigned long pfn = e820_table_end++; void *adr; - if (pfn >= table_top) + if (pfn >= e820_table_top) panic("alloc_low_page: ran out of memory"); adr = __va(pfn * PAGE_SIZE); @@ -149,8 +149,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, if (pmd_idx_kmap_begin != pmd_idx_kmap_end && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end - && ((__pa(pte) >> PAGE_SHIFT) < table_start - || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { + && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start + || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { pte_t *newpte; int i; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a32fe0756088..a1d33c58b497 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -283,13 +283,13 @@ void __init cleanup_highmap(void) } } -extern unsigned long __initdata table_start; -extern unsigned long __meminitdata table_end; -extern unsigned long __meminitdata table_top; +extern unsigned long __initdata e820_table_start; +extern unsigned long __meminitdata e820_table_end; +extern unsigned long __meminitdata e820_table_top; static __ref void *alloc_low_page(unsigned long *phys) { - unsigned long pfn = table_end++; + unsigned long pfn = e820_table_end++; void *adr; if (after_bootmem) { @@ -299,7 +299,7 @@ static __ref void *alloc_low_page(unsigned long *phys) return adr; } - if (pfn >= table_top) + if (pfn >= e820_table_top) panic("alloc_low_page: ran out of memory"); adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); From e53fb04fce6d246ebed755b904ed1b0b814a754c Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:07 +0200 Subject: [PATCH 213/580] x86: unify kernel_physical_mapping_init() function signatures Impact: cleanup In preparation for moving the function declaration to a header file, unify 32-bit and 64-bit signatures. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-16-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 13 +++---------- arch/x86/mm/init_32.c | 13 ++++++++++--- arch/x86/mm/init_64.c | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 5bbdfe7459d2..6475693a81ab 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -11,17 +11,12 @@ #ifdef CONFIG_X86_32 extern void __init early_ioremap_page_table_range_init(void); -extern void __init kernel_physical_mapping_init(unsigned long start_pfn, - unsigned long end_pfn, - int use_pse); #endif -#ifdef CONFIG_X86_64 -extern unsigned long __meminit +extern unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask); -#endif unsigned long __initdata e820_table_start; unsigned long __meminitdata e820_table_end; @@ -301,10 +296,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, #ifdef CONFIG_X86_32 for (i = 0; i < nr_range; i++) - kernel_physical_mapping_init( - mr[i].start >> PAGE_SHIFT, - mr[i].end >> PAGE_SHIFT, - mr[i].page_size_mask == (1<> PAGE_SHIFT; + end_pfn = end >> PAGE_SHIFT; + /* * First iteration will setup identity mapping using large/small pages * based on use_pse, with other attributes same as set by @@ -350,6 +356,7 @@ void __init kernel_physical_mapping_init(unsigned long start_pfn, mapping_iter = 2; goto repeat; } + return 0; } pte_t *kmap_pte; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a1d33c58b497..f441ae316312 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -539,7 +539,7 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, return phys_pud_init(pud, addr, end, page_size_mask); } -unsigned long __meminit +unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) From 4fcb208391be5cf82c6fe2779c5eb9245ac97e91 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:08 +0200 Subject: [PATCH 214/580] x86: move function and variable declarations to asm/init.h Impact: cleanup Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-17-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/init.h | 18 ++++++++++++++++++ arch/x86/mm/init.c | 10 +--------- arch/x86/mm/init_32.c | 6 +----- arch/x86/mm/init_64.c | 5 +---- 4 files changed, 21 insertions(+), 18 deletions(-) create mode 100644 arch/x86/include/asm/init.h diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h new file mode 100644 index 000000000000..36fb1a6a5109 --- /dev/null +++ b/arch/x86/include/asm/init.h @@ -0,0 +1,18 @@ +#ifndef _ASM_X86_INIT_32_H +#define _ASM_X86_INIT_32_H + +#ifdef CONFIG_X86_32 +extern void __init early_ioremap_page_table_range_init(void); +#endif + +extern unsigned long __init +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask); + + +extern unsigned long __initdata e820_table_start; +extern unsigned long __meminitdata e820_table_end; +extern unsigned long __meminitdata e820_table_top; + +#endif /* _ASM_X86_INIT_32_H */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6475693a81ab..6d63e3d1253d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -3,21 +3,13 @@ #include #include +#include #include #include #include #include #include -#ifdef CONFIG_X86_32 -extern void __init early_ioremap_page_table_range_init(void); -#endif - -extern unsigned long __init -kernel_physical_mapping_init(unsigned long start, - unsigned long end, - unsigned long page_size_mask); - unsigned long __initdata e820_table_start; unsigned long __meminitdata e820_table_end; unsigned long __meminitdata e820_table_top; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5ca9c6c3439e..1669693e97de 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -49,6 +49,7 @@ #include #include #include +#include unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; @@ -58,11 +59,6 @@ unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); - -extern unsigned long __initdata e820_table_start; -extern unsigned long __meminitdata e820_table_end; -extern unsigned long __meminitdata e820_table_top; - static __init void *alloc_low_page(void) { unsigned long pfn = e820_table_end++; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f441ae316312..7dd7ce49d69b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -48,6 +48,7 @@ #include #include #include +#include /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -283,10 +284,6 @@ void __init cleanup_highmap(void) } } -extern unsigned long __initdata e820_table_start; -extern unsigned long __meminitdata e820_table_end; -extern unsigned long __meminitdata e820_table_top; - static __ref void *alloc_low_page(unsigned long *phys) { unsigned long pfn = e820_table_end++; From 62436fe9ee10f5e0dd087b106d69d93c9549935a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 14:39:03 +0100 Subject: [PATCH 215/580] x86: move init_memory_mapping() to common mm/init.c, build fix on 32-bit PAE Impact: build fix Cc: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-14-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_types.h | 1 + arch/x86/mm/init_32.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 4d258ad76a0f..b8238dc8786d 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -273,6 +273,7 @@ typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern int nx_enabled; +extern void set_nx(void); #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 1669693e97de..5e5126e0d544 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -605,7 +605,7 @@ static int __init noexec_setup(char *str) } early_param("noexec", noexec_setup); -static void __init set_nx(void) +void __init set_nx(void) { unsigned int v[4], l, h; From d4cc510c61b050ef38e842a12feef71c56d7cf81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Mar 2009 11:48:46 +0100 Subject: [PATCH 216/580] [ARM] 5418/1: restore lr before leaving mcount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc seems to expect that lr isn't clobbered by mcount, because for a function starting with: static int func(void) { void *ra = __builtin_return_address(0); printk(KERN_EMERG "__builtin_return_address(0) = %pS\n", ra) ... the following assembler is generated by gcc 4.3.2: 0: e1a0c00d mov ip, sp 4: e92dd810 push {r4, fp, ip, lr, pc} 8: e24cb004 sub fp, ip, #4 ; 0x4 c: ebfffffe bl 0 10: e59f0034 ldr r0, [pc, #52] 14: e1a0100e mov r1, lr 18: ebfffffe bl 0 Without this patch obviously __builtin_return_address(0) yields func+0x10 instead of the return address of the caller. Note this patch fixes a similar issue for the routines used with dynamic ftrace even though this isn't currently selectable for ARM. Cc: Abhishek Sagar Cc: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Uwe Kleine-König Signed-off-by: Russell King --- arch/arm/kernel/entry-common.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 49a6ba926c2b..159d0416f270 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -111,6 +111,7 @@ ENTRY(mcount) .globl mcount_call mcount_call: bl ftrace_stub + ldr lr, [fp, #-4] @ restore lr ldmia sp!, {r0-r3, pc} ENTRY(ftrace_caller) @@ -122,6 +123,7 @@ ENTRY(ftrace_caller) .globl ftrace_call ftrace_call: bl ftrace_stub + ldr lr, [fp, #-4] @ restore lr ldmia sp!, {r0-r3, pc} #else @@ -133,6 +135,7 @@ ENTRY(mcount) adr r0, ftrace_stub cmp r0, r2 bne trace + ldr lr, [fp, #-4] @ restore lr ldmia sp!, {r0-r3, pc} trace: @@ -141,6 +144,7 @@ trace: sub r0, r0, #MCOUNT_INSN_SIZE mov lr, pc mov pc, r2 + mov lr, r1 @ restore lr ldmia sp!, {r0-r3, pc} #endif /* CONFIG_DYNAMIC_FTRACE */ From a964e33c5d7c0ea46376d20c2f02edf01c9db251 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 17:14:30 -0800 Subject: [PATCH 217/580] x86: clean up old gcc warnings gcc 3.2.2 reports: In file included from /usr/src/all/linux-next/arch/x86/include/asm/page.h:8, from /usr/src/all/linux-next/arch/x86/include/asm/processor.h:18, from /usr/src/all/linux-next/arch/x86/include/asm/atomic_32.h:6, from /usr/src/all/linux-next/arch/x86/include/asm/atomic.h:2, from include/linux/crypto.h:20, from arch/x86/kernel/asm-offsets_32.c:7, from arch/x86/kernel/asm-offsets.c:2: /usr/src/all/linux-next/arch/x86/include/asm/page_types.h:54: warning: parameter has incomplete type /usr/src/all/linux-next/arch/x86/include/asm/page_types.h:56: warning: parameter has incomplete type In file included from /usr/src/all/linux-next/arch/x86/include/asm/page.h:8, from /usr/src/all/linux-next/arch/x86/include/asm/processor.h:18, from include/linux/prefetch.h:14, from include/linux/list.h:6, from include/linux/module.h:9, from init/main.c:13: /usr/src/all/linux-next/arch/x86/include/asm/page_types.h:54: warning: parameter has incomplete type /usr/src/all/linux-next/arch/x86/include/asm/page_types.h:56: warning: parameter has incomplete type This is a bogus warning, but moving the pat-related functions into asm/pat.h and including asm/pgtable_types.h should fix it. Signed-off-by: Jeremy Fitzhardinge Reported-by: Tetsuo Handa Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page_types.h | 6 ------ arch/x86/include/asm/pat.h | 5 +++++ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 2d625da6603c..826ad37006ab 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -40,14 +40,8 @@ #ifndef __ASSEMBLY__ -struct pgprot; - extern int page_is_ram(unsigned long pagenr); extern int devmem_is_allowed(unsigned long pagenr); -extern void map_devmem(unsigned long pfn, unsigned long size, - struct pgprot vma_prot); -extern void unmap_devmem(unsigned long pfn, unsigned long size, - struct pgprot vma_prot); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index b0e70056838e..2cd07b9422f4 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -2,6 +2,7 @@ #define _ASM_X86_PAT_H #include +#include #ifdef CONFIG_X86_PAT extern int pat_enabled; @@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end); extern int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flag); +extern void map_devmem(unsigned long pfn, unsigned long size, + struct pgprot vma_prot); +extern void unmap_devmem(unsigned long pfn, unsigned long size, + struct pgprot vma_prot); #endif /* _ASM_X86_PAT_H */ From dc16ecf7fd1fad7436832121435d4926a81d469e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 16:10:44 -0800 Subject: [PATCH 218/580] x86-32: use specific __vmalloc_start_set flag in __virt_addr_valid Rather than relying on the ever-unreliable system_state, add a specific __vmalloc_start_set flag to indicate whether the vmalloc area has meaningful boundaries yet, and use that in x86-32's __phys_addr and __virt_addr_valid. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_32_types.h | 5 +++++ arch/x86/mm/init_32.c | 4 ++++ arch/x86/mm/ioremap.c | 7 +++---- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index bd8df3b2fe04..2733fad45f98 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -25,6 +25,11 @@ * area for the same reason. ;) */ #define VMALLOC_OFFSET (8 * 1024 * 1024) + +#ifndef __ASSEMBLER__ +extern bool __vmalloc_start_set; /* set once high_memory is set */ +#endif + #define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET) #ifdef CONFIG_X86_PAE #define LAST_PKMAP 512 diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5e5126e0d544..d57dfffb0213 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -59,6 +59,8 @@ unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); +bool __read_mostly __vmalloc_start_set = false; + static __init void *alloc_low_page(void) { unsigned long pfn = e820_table_end++; @@ -757,6 +759,8 @@ void __init initmem_init(unsigned long start_pfn, #ifdef CONFIG_FLATMEM max_mapnr = num_physpages; #endif + __vmalloc_start_set = true; + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 433f7bd4648a..a23ca5b5bf24 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -76,10 +76,9 @@ static inline int phys_addr_valid(unsigned long addr) #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { - /* VMALLOC_* aren't constants; not available at the boot time */ + /* VMALLOC_* aren't constants */ VIRTUAL_BUG_ON(x < PAGE_OFFSET); - VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && - is_vmalloc_addr((void *) x)); + VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); return x - PAGE_OFFSET; } EXPORT_SYMBOL(__phys_addr); @@ -89,7 +88,7 @@ bool __virt_addr_valid(unsigned long x) { if (x < PAGE_OFFSET) return false; - if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) + if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) return false; return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); } From ed26dbe5ae045e5bf95c6dc27497397a3fde52e1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 16:16:51 -0800 Subject: [PATCH 219/580] x86: pre-initialize boot_cpu_data.x86_phys_bits to avoid system_state tests Impact: cleanup, micro-optimization Pre-initialize boot_cpu_data.x86_phys_bits to a reasonable default to remove the use of system_state tests in __virt_addr_valid() and __phys_addr(). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 4 +++- arch/x86/mm/ioremap.c | 7 ++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b746deb9ebc6..f28c56e6bf94 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -202,7 +202,9 @@ struct ist_info ist_info; #endif #else -struct cpuinfo_x86 boot_cpu_data __read_mostly; +struct cpuinfo_x86 boot_cpu_data __read_mostly = { + .x86_phys_bits = MAX_PHYSMEM_BITS, +}; EXPORT_SYMBOL(boot_cpu_data); #endif diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index a23ca5b5bf24..62773abdf088 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -38,8 +38,7 @@ unsigned long __phys_addr(unsigned long x) } else { VIRTUAL_BUG_ON(x < PAGE_OFFSET); x -= PAGE_OFFSET; - VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : - !phys_addr_valid(x)); + VIRTUAL_BUG_ON(!phys_addr_valid(x)); } return x; } @@ -56,10 +55,8 @@ bool __virt_addr_valid(unsigned long x) if (x < PAGE_OFFSET) return false; x -= PAGE_OFFSET; - if (system_state == SYSTEM_BOOTING ? - x > MAXMEM : !phys_addr_valid(x)) { + if (!phys_addr_valid(x)) return false; - } } return pfn_valid(x >> PAGE_SHIFT); From b02c387892fc6b3cc59c78ab2f79413d55f50190 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 12 Feb 2009 13:42:41 +0300 Subject: [PATCH 220/580] [WATCHDOG] ks8695_wdt.c: 'CLOCK_TICK_RATE' undeclared On arm-acs5k_tiny: drivers/watchdog/ks8695_wdt.c:68: error: 'CLOCK_TICK_RATE' undeclared (first use in this function) Signed-off-by: Alexey Dobriyan Signed-off-by: Wim Van Sebroeck Cc: stable --- drivers/watchdog/ks8695_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/ks8695_wdt.c b/drivers/watchdog/ks8695_wdt.c index 0b798fdaa378..74c92d384112 100644 --- a/drivers/watchdog/ks8695_wdt.c +++ b/drivers/watchdog/ks8695_wdt.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #define WDT_DEFAULT_TIME 5 /* seconds */ From 26952669f01646c3e7d0832c99b310b199fe2b20 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 3 Mar 2009 15:10:05 +0100 Subject: [PATCH 221/580] [WATCHDOG] gef_wdt.c: fsl_get_sys_freq() failure not noticed fsl_get_sys_freq() may return -1 when 'soc' isn't found, but in gef_wdt_probe() 'freq' is unsigned, so the test doesn't catch that. Signed-off-by: Roel Kluin Signed-off-by: Wim Van Sebroeck Signed-off-by: Andrew Morton --- drivers/watchdog/gef_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/gef_wdt.c b/drivers/watchdog/gef_wdt.c index f0c2b7a1a175..734d9806a872 100644 --- a/drivers/watchdog/gef_wdt.c +++ b/drivers/watchdog/gef_wdt.c @@ -269,7 +269,7 @@ static int __devinit gef_wdt_probe(struct of_device *dev, bus_clk = 133; /* in MHz */ freq = fsl_get_sys_freq(); - if (freq > 0) + if (freq != -1) bus_clk = freq; /* Map devices registers into memory */ From b2b352590d94651579e6914ecdb08d30b2cb5f19 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 15:15:44 +0100 Subject: [PATCH 222/580] x86: UV, SGI RTC: add generic system vector, build fix on UP Make ack_APIC_irq() build on !SMP && !APIC too. Cc: Dimitri Sivanich LKML-Reference: <20090304185605.GA24419@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 4ef949c1972e..394d177d721b 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -379,6 +379,7 @@ static inline u32 safe_apic_wait_icr_idle(void) static inline void ack_APIC_irq(void) { +#ifdef CONFIG_X86_LOCAL_APIC /* * ack_APIC_irq() actually gets compiled as a single instruction * ... yummie. @@ -386,6 +387,7 @@ static inline void ack_APIC_irq(void) /* Docs say use 0 for future compatibility */ apic_write(APIC_EOI, 0); +#endif } static inline unsigned default_get_apic_id(unsigned long x) From e0c6dcd8d4257a129fc813ac68f20b77c6ae2a7a Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 5 Mar 2009 16:10:55 +0100 Subject: [PATCH 223/580] ide: expiry() returns int, negative expiry() return values won't be noticed bart: It seems like the bug could cause insanely long timeouts for: - ATA_DMA_ERR error in dma_timer_expiry() - commands without ->expiry in tc86c001_timer_expiry() (TC86C001 IDE controller only) Signed-off-by: Roel Kluin Cc: Sergei Shtylyov Cc: Andrew Morton [bart: port it to the current tree] Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 9ee51adf567f..9ff90cb1dbf1 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -908,7 +908,7 @@ void ide_timer_expiry (unsigned long data) ide_drive_t *uninitialized_var(drive); ide_handler_t *handler; unsigned long flags; - unsigned long wait = -1; + int wait = -1; int plug_device = 0; spin_lock_irqsave(&hwif->lock, flags); From 71bfc7a7c73eaf1f99e309dba60822ba050e3ec5 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Thu, 5 Mar 2009 16:10:56 +0100 Subject: [PATCH 224/580] ide: NULL noise: drivers/ide/ide-*.c Fix this sparse warnings: drivers/ide/ide-disk_proc.c:130:11: warning: Using plain integer as NULL pointer drivers/ide/ide-floppy_proc.c:32:11: warning: Using plain integer as NULL pointer drivers/ide/ide-proc.c:234:11: warning: Using plain integer as NULL pointer drivers/ide/ide-tape.c:2141:11: warning: Using plain integer as NULL pointer Signed-off-by: Hannes Eder Cc: trivial@kernel.org Cc: kernel-janitors@vger.kernel.org Cc: Al Viro Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-disk_proc.c | 2 +- drivers/ide/ide-floppy_proc.c | 2 +- drivers/ide/ide-proc.c | 2 +- drivers/ide/ide-tape.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/ide/ide-disk_proc.c b/drivers/ide/ide-disk_proc.c index 1146f4204c6e..1f86dcbd2b1c 100644 --- a/drivers/ide/ide-disk_proc.c +++ b/drivers/ide/ide-disk_proc.c @@ -125,5 +125,5 @@ const struct ide_proc_devset ide_disk_settings[] = { IDE_PROC_DEVSET(multcount, 0, 16), IDE_PROC_DEVSET(nowerr, 0, 1), IDE_PROC_DEVSET(wcache, 0, 1), - { 0 }, + { NULL }, }; diff --git a/drivers/ide/ide-floppy_proc.c b/drivers/ide/ide-floppy_proc.c index 3ec762cb60ab..fcd4d8153df5 100644 --- a/drivers/ide/ide-floppy_proc.c +++ b/drivers/ide/ide-floppy_proc.c @@ -29,5 +29,5 @@ const struct ide_proc_devset ide_floppy_settings[] = { IDE_PROC_DEVSET(bios_head, 0, 255), IDE_PROC_DEVSET(bios_sect, 0, 63), IDE_PROC_DEVSET(ticks, 0, 255), - { 0 }, + { NULL }, }; diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c index 1d8978b3314a..a7b9287ee0d4 100644 --- a/drivers/ide/ide-proc.c +++ b/drivers/ide/ide-proc.c @@ -231,7 +231,7 @@ static const struct ide_proc_devset ide_generic_settings[] = { IDE_PROC_DEVSET(pio_mode, 0, 255), IDE_PROC_DEVSET(unmaskirq, 0, 1), IDE_PROC_DEVSET(using_dma, 0, 1), - { 0 }, + { NULL }, }; static void proc_ide_settings_warn(void) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index bb450a7608c2..4e6181c7bbda 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -2166,7 +2166,7 @@ static const struct ide_proc_devset idetape_settings[] = { __IDE_PROC_DEVSET(speed, 0, 0xffff, NULL, NULL), __IDE_PROC_DEVSET(tdsc, IDETAPE_DSC_RW_MIN, IDETAPE_DSC_RW_MAX, mulf_tdsc, divf_tdsc), - { 0 }, + { NULL }, }; #endif From a509538d4fb4f99cdf0a095213d57cc3b2347615 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Thu, 5 Mar 2009 16:10:56 +0100 Subject: [PATCH 225/580] ide-iops: fix odd-length ATAPI PIO transfers Commit 9567b349f7e7dd7e2483db99ee8e4a6fe0caca38 (ide: merge ->atapi_*put_bytes and ->ata_*put_data methods) introduced a regression WRT the odd-length ATAPI PIO transfers -- the final word didn't get written (causing command timeouts). Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-iops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 753b92ebe0ae..b1892bd95c6f 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -315,6 +315,8 @@ void ide_output_data(ide_drive_t *drive, struct request *rq, void *buf, u8 io_32bit = drive->io_32bit; u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; + len++; + if (io_32bit) { unsigned long uninitialized_var(flags); From 849d7130001ab740a5a4778a561049841fdd77c9 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 5 Mar 2009 16:10:57 +0100 Subject: [PATCH 226/580] ide: allow to wrap interrupt handler Signed-off-by: Stanislaw Gruszka Cc: Andrew Victor [bart: minor checkpatch.pl / CodingStyle fixups] Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io.c | 1 + drivers/ide/ide-probe.c | 7 ++++++- include/linux/ide.h | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 9ff90cb1dbf1..a9a6c208288a 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -1162,6 +1162,7 @@ irqreturn_t ide_intr (int irq, void *dev_id) return irq_ret; } +EXPORT_SYMBOL_GPL(ide_intr); /** * ide_do_drive_cmd - issue IDE special command diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index ce0818a993f6..ee8e3e7cad51 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -950,6 +950,7 @@ static int ide_port_setup_devices(ide_hwif_t *hwif) static int init_irq (ide_hwif_t *hwif) { struct ide_io_ports *io_ports = &hwif->io_ports; + irq_handler_t irq_handler; int sa = 0; mutex_lock(&ide_cfg_mtx); @@ -959,6 +960,10 @@ static int init_irq (ide_hwif_t *hwif) hwif->timer.function = &ide_timer_expiry; hwif->timer.data = (unsigned long)hwif; + irq_handler = hwif->host->irq_handler; + if (irq_handler == NULL) + irq_handler = ide_intr; + #if defined(__mc68000__) sa = IRQF_SHARED; #endif /* __mc68000__ */ @@ -969,7 +974,7 @@ static int init_irq (ide_hwif_t *hwif) if (io_ports->ctl_addr) hwif->tp_ops->set_irq(hwif, 1); - if (request_irq(hwif->irq, &ide_intr, sa, hwif->name, hwif)) + if (request_irq(hwif->irq, irq_handler, sa, hwif->name, hwif)) goto out_up; if (!hwif->rqsize) { diff --git a/include/linux/ide.h b/include/linux/ide.h index fe235b65207e..e0cedfe9fad4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -866,6 +866,7 @@ struct ide_host { unsigned int n_ports; struct device *dev[2]; unsigned int (*init_chipset)(struct pci_dev *); + irq_handler_t irq_handler; unsigned long host_flags; void *host_priv; ide_hwif_t *cur_port; /* for hosts requiring serialization */ From 6e5f1e1115bb041993f9f247036996364b4c84d5 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 5 Mar 2009 16:10:58 +0100 Subject: [PATCH 227/580] ide: add at91_ide driver This is IDE host driver for AT91 (SAM9, CAP9, AT572D940HF) Static Memory Controller with Compact Flash True IDE Mode logic. Driver have to switch 8/16 bit bus width when accessing Task Tile or Data Register. Moreover some extra things need to be done when setting PIO mode. Only PIO mode is used, hardware have no DMA support. If interrupt line is connected through GPIO extra quirk is needed to cope with fake interrupts. Signed-off-by: Stanislaw Gruszka Cc: Andrew Victor Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- arch/arm/mach-at91/include/mach/board.h | 3 + drivers/ide/Kconfig | 5 + drivers/ide/Makefile | 1 + drivers/ide/at91_ide.c | 467 ++++++++++++++++++++++++ 4 files changed, 476 insertions(+) create mode 100644 drivers/ide/at91_ide.c diff --git a/arch/arm/mach-at91/include/mach/board.h b/arch/arm/mach-at91/include/mach/board.h index 0b3ae21b4565..793fe7b25f36 100644 --- a/arch/arm/mach-at91/include/mach/board.h +++ b/arch/arm/mach-at91/include/mach/board.h @@ -56,6 +56,9 @@ struct at91_cf_data { u8 vcc_pin; /* power switching */ u8 rst_pin; /* card reset */ u8 chipselect; /* EBI Chip Select number */ + u8 flags; +#define AT91_CF_TRUE_IDE 0x01 +#define AT91_IDE_SWAP_A0_A2 0x02 }; extern void __init at91_add_device_cf(struct at91_cf_data *data); diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index e072903b12f0..5ea3bfad172a 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -721,6 +721,11 @@ config BLK_DEV_IDE_TX4939 depends on SOC_TX4939 select BLK_DEV_IDEDMA_SFF +config BLK_DEV_IDE_AT91 + tristate "Atmel AT91 (SAM9, CAP9, AT572D940HF) IDE support" + depends on ARM && ARCH_AT91 && !ARCH_AT91RM9200 && !ARCH_AT91X40 + select IDE_TIMINGS + config IDE_ARM tristate "ARM IDE support" depends on ARM && (ARCH_RPC || ARCH_SHARK) diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile index d0e3d7d5b467..1c326d94aa6d 100644 --- a/drivers/ide/Makefile +++ b/drivers/ide/Makefile @@ -116,3 +116,4 @@ obj-$(CONFIG_BLK_DEV_IDE_AU1XXX) += au1xxx-ide.o obj-$(CONFIG_BLK_DEV_IDE_TX4938) += tx4938ide.o obj-$(CONFIG_BLK_DEV_IDE_TX4939) += tx4939ide.o +obj-$(CONFIG_BLK_DEV_IDE_AT91) += at91_ide.o diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c new file mode 100644 index 000000000000..1bb50f46388d --- /dev/null +++ b/drivers/ide/at91_ide.c @@ -0,0 +1,467 @@ +/* + * IDE host driver for AT91 (SAM9, CAP9, AT572D940HF) Static Memory Controller + * with Compact Flash True IDE logic + * + * Copyright (c) 2008, 2009 Kelvatek Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define DRV_NAME "at91_ide" + +#define perr(fmt, args...) pr_err(DRV_NAME ": " fmt, ##args) +#define pdbg(fmt, args...) pr_debug("%s " fmt, __func__, ##args) + +/* + * Access to IDE device is possible through EBI Static Memory Controller + * with Compact Flash logic. For details see EBI and SMC datasheet sections + * of any microcontroller from AT91SAM9 family. + * + * Within SMC chip select address space, lines A[23:21] distinguish Compact + * Flash modes (I/O, common memory, attribute memory, True IDE). IDE modes are: + * 0x00c0000 - True IDE + * 0x00e0000 - Alternate True IDE (Alt Status Register) + * + * On True IDE mode Task File and Data Register are mapped at the same address. + * To distinguish access between these two different bus data width is used: + * 8Bit for Task File, 16Bit for Data I/O. + * + * After initialization we do 8/16 bit flipping (changes in SMC MODE register) + * only inside IDE callback routines which are serialized by IDE layer, + * so no additional locking needed. + */ + +#define TASK_FILE 0x00c00000 +#define ALT_MODE 0x00e00000 +#define REGS_SIZE 8 + +#define enter_16bit(cs, mode) do { \ + mode = at91_sys_read(AT91_SMC_MODE(cs)); \ + at91_sys_write(AT91_SMC_MODE(cs), mode | AT91_SMC_DBW_16); \ +} while (0) + +#define leave_16bit(cs, mode) at91_sys_write(AT91_SMC_MODE(cs), mode); + +static void set_smc_timings(const u8 chipselect, const u16 cycle, + const u16 setup, const u16 pulse, + const u16 data_float, int use_iordy) +{ + unsigned long mode = AT91_SMC_READMODE | AT91_SMC_WRITEMODE | + AT91_SMC_BAT_SELECT; + + /* disable or enable waiting for IORDY signal */ + if (use_iordy) + mode |= AT91_SMC_EXNWMODE_READY; + + /* add data float cycles if needed */ + if (data_float) + mode |= AT91_SMC_TDF_(data_float); + + at91_sys_write(AT91_SMC_MODE(chipselect), mode); + + /* setup timings in SMC */ + at91_sys_write(AT91_SMC_SETUP(chipselect), AT91_SMC_NWESETUP_(setup) | + AT91_SMC_NCS_WRSETUP_(0) | + AT91_SMC_NRDSETUP_(setup) | + AT91_SMC_NCS_RDSETUP_(0)); + at91_sys_write(AT91_SMC_PULSE(chipselect), AT91_SMC_NWEPULSE_(pulse) | + AT91_SMC_NCS_WRPULSE_(cycle) | + AT91_SMC_NRDPULSE_(pulse) | + AT91_SMC_NCS_RDPULSE_(cycle)); + at91_sys_write(AT91_SMC_CYCLE(chipselect), AT91_SMC_NWECYCLE_(cycle) | + AT91_SMC_NRDCYCLE_(cycle)); +} + +static unsigned int calc_mck_cycles(unsigned int ns, unsigned int mck_hz) +{ + u64 tmp = ns; + + tmp *= mck_hz; + tmp += 1000*1000*1000 - 1; /* round up */ + do_div(tmp, 1000*1000*1000); + return (unsigned int) tmp; +} + +static void apply_timings(const u8 chipselect, const u8 pio, + const struct ide_timing *timing, int use_iordy) +{ + unsigned int t0, t1, t2, t6z; + unsigned int cycle, setup, pulse, data_float; + unsigned int mck_hz; + struct clk *mck; + + /* see table 22 of Compact Flash standard 4.1 for the meaning, + * we do not stretch active (t2) time, so setup (t1) + hold time (th) + * assure at least minimal recovery (t2i) time */ + t0 = timing->cyc8b; + t1 = timing->setup; + t2 = timing->act8b; + t6z = (pio < 5) ? 30 : 20; + + pdbg("t0=%u t1=%u t2=%u t6z=%u\n", t0, t1, t2, t6z); + + mck = clk_get(NULL, "mck"); + BUG_ON(IS_ERR(mck)); + mck_hz = clk_get_rate(mck); + pdbg("mck_hz=%u\n", mck_hz); + + cycle = calc_mck_cycles(t0, mck_hz); + setup = calc_mck_cycles(t1, mck_hz); + pulse = calc_mck_cycles(t2, mck_hz); + data_float = calc_mck_cycles(t6z, mck_hz); + + pdbg("cycle=%u setup=%u pulse=%u data_float=%u\n", + cycle, setup, pulse, data_float); + + set_smc_timings(chipselect, cycle, setup, pulse, data_float, use_iordy); +} + +static void at91_ide_input_data(ide_drive_t *drive, struct request *rq, + void *buf, unsigned int len) +{ + ide_hwif_t *hwif = drive->hwif; + struct ide_io_ports *io_ports = &hwif->io_ports; + u8 chipselect = hwif->select_data; + unsigned long mode; + + pdbg("cs %u buf %p len %d\n", chipselect, buf, len); + + len++; + + enter_16bit(chipselect, mode); + __ide_mm_insw((void __iomem *) io_ports->data_addr, buf, len / 2); + leave_16bit(chipselect, mode); +} + +static void at91_ide_output_data(ide_drive_t *drive, struct request *rq, + void *buf, unsigned int len) +{ + ide_hwif_t *hwif = drive->hwif; + struct ide_io_ports *io_ports = &hwif->io_ports; + u8 chipselect = hwif->select_data; + unsigned long mode; + + pdbg("cs %u buf %p len %d\n", chipselect, buf, len); + + enter_16bit(chipselect, mode); + __ide_mm_outsw((void __iomem *) io_ports->data_addr, buf, len / 2); + leave_16bit(chipselect, mode); +} + +static u8 ide_mm_inb(unsigned long port) +{ + return readb((void __iomem *) port); +} + +static void ide_mm_outb(u8 value, unsigned long port) +{ + writeb(value, (void __iomem *) port); +} + +static void at91_ide_tf_load(ide_drive_t *drive, ide_task_t *task) +{ + ide_hwif_t *hwif = drive->hwif; + struct ide_io_ports *io_ports = &hwif->io_ports; + struct ide_taskfile *tf = &task->tf; + u8 HIHI = (task->tf_flags & IDE_TFLAG_LBA48) ? 0xE0 : 0xEF; + + if (task->tf_flags & IDE_TFLAG_FLAGGED) + HIHI = 0xFF; + + if (task->tf_flags & IDE_TFLAG_OUT_DATA) { + u16 data = (tf->hob_data << 8) | tf->data; + + at91_ide_output_data(drive, NULL, &data, 2); + } + + if (task->tf_flags & IDE_TFLAG_OUT_HOB_FEATURE) + ide_mm_outb(tf->hob_feature, io_ports->feature_addr); + if (task->tf_flags & IDE_TFLAG_OUT_HOB_NSECT) + ide_mm_outb(tf->hob_nsect, io_ports->nsect_addr); + if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAL) + ide_mm_outb(tf->hob_lbal, io_ports->lbal_addr); + if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAM) + ide_mm_outb(tf->hob_lbam, io_ports->lbam_addr); + if (task->tf_flags & IDE_TFLAG_OUT_HOB_LBAH) + ide_mm_outb(tf->hob_lbah, io_ports->lbah_addr); + + if (task->tf_flags & IDE_TFLAG_OUT_FEATURE) + ide_mm_outb(tf->feature, io_ports->feature_addr); + if (task->tf_flags & IDE_TFLAG_OUT_NSECT) + ide_mm_outb(tf->nsect, io_ports->nsect_addr); + if (task->tf_flags & IDE_TFLAG_OUT_LBAL) + ide_mm_outb(tf->lbal, io_ports->lbal_addr); + if (task->tf_flags & IDE_TFLAG_OUT_LBAM) + ide_mm_outb(tf->lbam, io_ports->lbam_addr); + if (task->tf_flags & IDE_TFLAG_OUT_LBAH) + ide_mm_outb(tf->lbah, io_ports->lbah_addr); + + if (task->tf_flags & IDE_TFLAG_OUT_DEVICE) + ide_mm_outb((tf->device & HIHI) | drive->select, io_ports->device_addr); +} + +static void at91_ide_tf_read(ide_drive_t *drive, ide_task_t *task) +{ + ide_hwif_t *hwif = drive->hwif; + struct ide_io_ports *io_ports = &hwif->io_ports; + struct ide_taskfile *tf = &task->tf; + + if (task->tf_flags & IDE_TFLAG_IN_DATA) { + u16 data; + + at91_ide_input_data(drive, NULL, &data, 2); + tf->data = data & 0xff; + tf->hob_data = (data >> 8) & 0xff; + } + + /* be sure we're looking at the low order bits */ + ide_mm_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); + + if (task->tf_flags & IDE_TFLAG_IN_FEATURE) + tf->feature = ide_mm_inb(io_ports->feature_addr); + if (task->tf_flags & IDE_TFLAG_IN_NSECT) + tf->nsect = ide_mm_inb(io_ports->nsect_addr); + if (task->tf_flags & IDE_TFLAG_IN_LBAL) + tf->lbal = ide_mm_inb(io_ports->lbal_addr); + if (task->tf_flags & IDE_TFLAG_IN_LBAM) + tf->lbam = ide_mm_inb(io_ports->lbam_addr); + if (task->tf_flags & IDE_TFLAG_IN_LBAH) + tf->lbah = ide_mm_inb(io_ports->lbah_addr); + if (task->tf_flags & IDE_TFLAG_IN_DEVICE) + tf->device = ide_mm_inb(io_ports->device_addr); + + if (task->tf_flags & IDE_TFLAG_LBA48) { + ide_mm_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); + + if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) + tf->hob_feature = ide_mm_inb(io_ports->feature_addr); + if (task->tf_flags & IDE_TFLAG_IN_HOB_NSECT) + tf->hob_nsect = ide_mm_inb(io_ports->nsect_addr); + if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAL) + tf->hob_lbal = ide_mm_inb(io_ports->lbal_addr); + if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAM) + tf->hob_lbam = ide_mm_inb(io_ports->lbam_addr); + if (task->tf_flags & IDE_TFLAG_IN_HOB_LBAH) + tf->hob_lbah = ide_mm_inb(io_ports->lbah_addr); + } +} + +static void at91_ide_set_pio_mode(ide_drive_t *drive, const u8 pio) +{ + struct ide_timing *timing; + u8 chipselect = drive->hwif->select_data; + int use_iordy = 0; + + pdbg("chipselect %u pio %u\n", chipselect, pio); + + timing = ide_timing_find_mode(XFER_PIO_0 + pio); + BUG_ON(!timing); + + if ((pio > 2 || ata_id_has_iordy(drive->id)) && + !(ata_id_is_cfa(drive->id) && pio > 4)) + use_iordy = 1; + + apply_timings(chipselect, pio, timing, use_iordy); +} + +static const struct ide_tp_ops at91_ide_tp_ops = { + .exec_command = ide_exec_command, + .read_status = ide_read_status, + .read_altstatus = ide_read_altstatus, + .set_irq = ide_set_irq, + + .tf_load = at91_ide_tf_load, + .tf_read = at91_ide_tf_read, + + .input_data = at91_ide_input_data, + .output_data = at91_ide_output_data, +}; + +static const struct ide_port_ops at91_ide_port_ops = { + .set_pio_mode = at91_ide_set_pio_mode, +}; + +static const struct ide_port_info at91_ide_port_info __initdata = { + .port_ops = &at91_ide_port_ops, + .tp_ops = &at91_ide_tp_ops, + .host_flags = IDE_HFLAG_MMIO | IDE_HFLAG_NO_DMA | IDE_HFLAG_SINGLE | + IDE_HFLAG_NO_IO_32BIT | IDE_HFLAG_UNMASK_IRQS, + .pio_mask = ATA_PIO5, +}; + +/* + * If interrupt is delivered through GPIO, IRQ are triggered on falling + * and rising edge of signal. Whereas IDE device request interrupt on high + * level (rising edge in our case). This mean we have fake interrupts, so + * we need to check interrupt pin and exit instantly from ISR when line + * is on low level. + */ + +irqreturn_t at91_irq_handler(int irq, void *dev_id) +{ + int ntries = 8; + int pin_val1, pin_val2; + + /* additional deglitch, line can be noisy in badly designed PCB */ + do { + pin_val1 = at91_get_gpio_value(irq); + pin_val2 = at91_get_gpio_value(irq); + } while (pin_val1 != pin_val2 && --ntries > 0); + + if (pin_val1 == 0 || ntries <= 0) + return IRQ_HANDLED; + + return ide_intr(irq, dev_id); +} + +static int __init at91_ide_probe(struct platform_device *pdev) +{ + int ret; + hw_regs_t hw; + hw_regs_t *hws[] = { &hw, NULL, NULL, NULL }; + struct ide_host *host; + struct resource *res; + unsigned long tf_base = 0, ctl_base = 0; + struct at91_cf_data *board = pdev->dev.platform_data; + + if (!board) + return -ENODEV; + + if (board->det_pin && at91_get_gpio_value(board->det_pin) != 0) { + perr("no device detected\n"); + return -ENODEV; + } + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + perr("can't get memory resource\n"); + return -ENODEV; + } + + if (!devm_request_mem_region(&pdev->dev, res->start + TASK_FILE, + REGS_SIZE, "ide") || + !devm_request_mem_region(&pdev->dev, res->start + ALT_MODE, + REGS_SIZE, "alt")) { + perr("memory resources in use\n"); + return -EBUSY; + } + + pdbg("chipselect %u irq %u res %08lx\n", board->chipselect, + board->irq_pin, (unsigned long) res->start); + + tf_base = (unsigned long) devm_ioremap(&pdev->dev, res->start + TASK_FILE, + REGS_SIZE); + ctl_base = (unsigned long) devm_ioremap(&pdev->dev, res->start + ALT_MODE, + REGS_SIZE); + if (!tf_base || !ctl_base) { + perr("can't map memory regions\n"); + return -EBUSY; + } + + memset(&hw, 0, sizeof(hw)); + + if (board->flags & AT91_IDE_SWAP_A0_A2) { + /* workaround for stupid hardware bug */ + hw.io_ports.data_addr = tf_base + 0; + hw.io_ports.error_addr = tf_base + 4; + hw.io_ports.nsect_addr = tf_base + 2; + hw.io_ports.lbal_addr = tf_base + 6; + hw.io_ports.lbam_addr = tf_base + 1; + hw.io_ports.lbah_addr = tf_base + 5; + hw.io_ports.device_addr = tf_base + 3; + hw.io_ports.command_addr = tf_base + 7; + hw.io_ports.ctl_addr = ctl_base + 3; + } else + ide_std_init_ports(&hw, tf_base, ctl_base + 6); + + hw.irq = board->irq_pin; + hw.chipset = ide_generic; + hw.dev = &pdev->dev; + + host = ide_host_alloc(&at91_ide_port_info, hws); + if (!host) { + perr("failed to allocate ide host\n"); + return -ENOMEM; + } + + /* setup Static Memory Controller - PIO 0 as default */ + apply_timings(board->chipselect, 0, ide_timing_find_mode(XFER_PIO_0), 0); + + /* with GPIO interrupt we have to do quirks in handler */ + if (board->irq_pin >= PIN_BASE) + host->irq_handler = at91_irq_handler; + + host->ports[0]->select_data = board->chipselect; + + ret = ide_host_register(host, &at91_ide_port_info, hws); + if (ret) { + perr("failed to register ide host\n"); + goto err_free_host; + } + platform_set_drvdata(pdev, host); + return 0; + +err_free_host: + ide_host_free(host); + return ret; +} + +static int __exit at91_ide_remove(struct platform_device *pdev) +{ + struct ide_host *host = platform_get_drvdata(pdev); + + ide_host_remove(host); + return 0; +} + +static struct platform_driver at91_ide_driver = { + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + }, + .remove = __exit_p(at91_ide_remove), +}; + +static int __init at91_ide_init(void) +{ + return platform_driver_probe(&at91_ide_driver, at91_ide_probe); +} + +static void __exit at91_ide_exit(void) +{ + platform_driver_unregister(&at91_ide_driver); +} + +module_init(at91_ide_init); +module_exit(at91_ide_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Stanislaw Gruszka "); + From e565f206082a725108a75bcf00bbe3db21a56474 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 5 Mar 2009 16:10:58 +0100 Subject: [PATCH 228/580] AT91: initialize Compact Flash on AT91SAM9263 cpu Signed-off-by: Stanislaw Gruszka Cc: Andrew Victor Acked-by: Sergei Shtylyov Acked-by: Andrew Victor Signed-off-by: Bartlomiej Zolnierkiewicz --- arch/arm/mach-at91/at91sam9263_devices.c | 105 +++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/arch/arm/mach-at91/at91sam9263_devices.c b/arch/arm/mach-at91/at91sam9263_devices.c index 134af97ff340..b7f233242315 100644 --- a/arch/arm/mach-at91/at91sam9263_devices.c +++ b/arch/arm/mach-at91/at91sam9263_devices.c @@ -347,6 +347,111 @@ void __init at91_add_device_mmc(short mmc_id, struct at91_mmc_data *data) void __init at91_add_device_mmc(short mmc_id, struct at91_mmc_data *data) {} #endif +/* -------------------------------------------------------------------- + * Compact Flash (PCMCIA or IDE) + * -------------------------------------------------------------------- */ + +#if defined(CONFIG_AT91_CF) || defined(CONFIG_AT91_CF_MODULE) || \ + defined(CONFIG_BLK_DEV_IDE_AT91) || defined(CONFIG_BLK_DEV_IDE_AT91_MODULE) + +static struct at91_cf_data cf0_data; + +static struct resource cf0_resources[] = { + [0] = { + .start = AT91_CHIPSELECT_4, + .end = AT91_CHIPSELECT_4 + SZ_256M - 1, + .flags = IORESOURCE_MEM | IORESOURCE_MEM_8AND16BIT, + } +}; + +static struct platform_device cf0_device = { + .id = 0, + .dev = { + .platform_data = &cf0_data, + }, + .resource = cf0_resources, + .num_resources = ARRAY_SIZE(cf0_resources), +}; + +static struct at91_cf_data cf1_data; + +static struct resource cf1_resources[] = { + [0] = { + .start = AT91_CHIPSELECT_5, + .end = AT91_CHIPSELECT_5 + SZ_256M - 1, + .flags = IORESOURCE_MEM | IORESOURCE_MEM_8AND16BIT, + } +}; + +static struct platform_device cf1_device = { + .id = 1, + .dev = { + .platform_data = &cf1_data, + }, + .resource = cf1_resources, + .num_resources = ARRAY_SIZE(cf1_resources), +}; + +void __init at91_add_device_cf(struct at91_cf_data *data) +{ + unsigned long ebi0_csa; + struct platform_device *pdev; + + if (!data) + return; + + /* + * assign CS4 or CS5 to SMC with Compact Flash logic support, + * we assume SMC timings are configured by board code, + * except True IDE where timings are controlled by driver + */ + ebi0_csa = at91_sys_read(AT91_MATRIX_EBI0CSA); + switch (data->chipselect) { + case 4: + at91_set_A_periph(AT91_PIN_PD6, 0); /* EBI0_NCS4/CFCS0 */ + ebi0_csa |= AT91_MATRIX_EBI0_CS4A_SMC_CF1; + cf0_data = *data; + pdev = &cf0_device; + break; + case 5: + at91_set_A_periph(AT91_PIN_PD7, 0); /* EBI0_NCS5/CFCS1 */ + ebi0_csa |= AT91_MATRIX_EBI0_CS5A_SMC_CF2; + cf1_data = *data; + pdev = &cf1_device; + break; + default: + printk(KERN_ERR "AT91 CF: bad chip-select requested (%u)\n", + data->chipselect); + return; + } + at91_sys_write(AT91_MATRIX_EBI0CSA, ebi0_csa); + + if (data->det_pin) { + at91_set_gpio_input(data->det_pin, 1); + at91_set_deglitch(data->det_pin, 1); + } + + if (data->irq_pin) { + at91_set_gpio_input(data->irq_pin, 1); + at91_set_deglitch(data->irq_pin, 1); + } + + if (data->vcc_pin) + /* initially off */ + at91_set_gpio_output(data->vcc_pin, 0); + + /* enable EBI controlled pins */ + at91_set_A_periph(AT91_PIN_PD5, 1); /* NWAIT */ + at91_set_A_periph(AT91_PIN_PD8, 0); /* CFCE1 */ + at91_set_A_periph(AT91_PIN_PD9, 0); /* CFCE2 */ + at91_set_A_periph(AT91_PIN_PD14, 0); /* CFNRW */ + + pdev->name = (data->flags & AT91_CF_TRUE_IDE) ? "at91_ide" : "at91_cf"; + platform_device_register(pdev); +} +#else +void __init at91_add_device_cf(struct at91_cf_data *data) {} +#endif /* -------------------------------------------------------------------- * NAND / SmartMedia From ebcad5aaea26da3cb2ca90b7f31a67a027eb60db Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 5 Mar 2009 16:10:59 +0100 Subject: [PATCH 229/580] remove stale comment from HDIO_GET_IDENTITY returns 256 words currently. Noticed by Norman Diamond. Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/hdreg.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/hdreg.h b/include/linux/hdreg.h index c37e9241fae7..ed21bd3dbd25 100644 --- a/include/linux/hdreg.h +++ b/include/linux/hdreg.h @@ -511,7 +511,6 @@ struct hd_driveid { unsigned short words69_70[2]; /* reserved words 69-70 * future command overlap and queuing */ - /* HDIO_GET_IDENTITY currently returns only words 0 through 70 */ unsigned short words71_74[4]; /* reserved words 71-74 * for IDENTIFY PACKET DEVICE command */ From 7dbc3f6ead13cb87d2318443ebd8f19a987149aa Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 6 Mar 2009 00:20:49 +0800 Subject: [PATCH 230/580] Blackfin arch: add stubs for anomalies 447 and 448 Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu --- arch/blackfin/mach-bf518/include/mach/anomaly.h | 2 ++ arch/blackfin/mach-bf527/include/mach/anomaly.h | 2 ++ arch/blackfin/mach-bf533/include/mach/anomaly.h | 2 ++ arch/blackfin/mach-bf537/include/mach/anomaly.h | 2 ++ arch/blackfin/mach-bf538/include/mach/anomaly.h | 2 ++ arch/blackfin/mach-bf561/include/mach/anomaly.h | 2 ++ 6 files changed, 12 insertions(+) diff --git a/arch/blackfin/mach-bf518/include/mach/anomaly.h b/arch/blackfin/mach-bf518/include/mach/anomaly.h index 143a29dfe870..c847bb101076 100644 --- a/arch/blackfin/mach-bf518/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf518/include/mach/anomaly.h @@ -86,5 +86,7 @@ #define ANOMALY_05000386 (0) #define ANOMALY_05000412 (0) #define ANOMALY_05000432 (0) +#define ANOMALY_05000447 (0) +#define ANOMALY_05000448 (0) #endif diff --git a/arch/blackfin/mach-bf527/include/mach/anomaly.h b/arch/blackfin/mach-bf527/include/mach/anomaly.h index 4ffccb6de52d..df6808d8a6ef 100644 --- a/arch/blackfin/mach-bf527/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf527/include/mach/anomaly.h @@ -176,5 +176,7 @@ #define ANOMALY_05000323 (0) #define ANOMALY_05000363 (0) #define ANOMALY_05000412 (0) +#define ANOMALY_05000447 (0) +#define ANOMALY_05000448 (0) #endif diff --git a/arch/blackfin/mach-bf533/include/mach/anomaly.h b/arch/blackfin/mach-bf533/include/mach/anomaly.h index 9e838f851720..1cf893e2e55b 100644 --- a/arch/blackfin/mach-bf533/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf533/include/mach/anomaly.h @@ -283,5 +283,7 @@ #define ANOMALY_05000412 (0) #define ANOMALY_05000432 (0) #define ANOMALY_05000435 (0) +#define ANOMALY_05000447 (0) +#define ANOMALY_05000448 (0) #endif diff --git a/arch/blackfin/mach-bf537/include/mach/anomaly.h b/arch/blackfin/mach-bf537/include/mach/anomaly.h index 438b43ffd41e..1bfd80c26c90 100644 --- a/arch/blackfin/mach-bf537/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf537/include/mach/anomaly.h @@ -173,5 +173,7 @@ #define ANOMALY_05000412 (0) #define ANOMALY_05000432 (0) #define ANOMALY_05000435 (0) +#define ANOMALY_05000447 (0) +#define ANOMALY_05000448 (0) #endif diff --git a/arch/blackfin/mach-bf538/include/mach/anomaly.h b/arch/blackfin/mach-bf538/include/mach/anomaly.h index f4ece1cfca43..3a5699827363 100644 --- a/arch/blackfin/mach-bf538/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf538/include/mach/anomaly.h @@ -130,5 +130,7 @@ #define ANOMALY_05000412 (0) #define ANOMALY_05000432 (0) #define ANOMALY_05000435 (0) +#define ANOMALY_05000447 (0) +#define ANOMALY_05000448 (0) #endif diff --git a/arch/blackfin/mach-bf561/include/mach/anomaly.h b/arch/blackfin/mach-bf561/include/mach/anomaly.h index d78bc6b49e61..d0b0b3506440 100644 --- a/arch/blackfin/mach-bf561/include/mach/anomaly.h +++ b/arch/blackfin/mach-bf561/include/mach/anomaly.h @@ -287,5 +287,7 @@ #define ANOMALY_05000386 (1) #define ANOMALY_05000432 (0) #define ANOMALY_05000435 (0) +#define ANOMALY_05000447 (0) +#define ANOMALY_05000448 (0) #endif From d42ad15b759d05a87f22b484af63987eff38ea88 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Thu, 5 Mar 2009 17:20:55 +0100 Subject: [PATCH 231/580] ata: add CFA specific identify data words Declare CFA specific identify data words 162 and 163 for future use. Signed-off-by: Sergei Shtylyov Signed-off-by: Jeff Garzik [bart: update patch summary/description] Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ata.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/ata.h b/include/linux/ata.h index 08a86d5cdf1b..9a061accd8b8 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -89,6 +89,8 @@ enum { ATA_ID_DLF = 128, ATA_ID_CSFO = 129, ATA_ID_CFA_POWER = 160, + ATA_ID_CFA_KEY_MGMT = 162, + ATA_ID_CFA_MODES = 163, ATA_ID_ROT_SPEED = 217, ATA_ID_PIO4 = (1 << 1), From 357fd373e194ec5bb02fe875a67c0874ab74b5a6 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 6 Mar 2009 00:24:01 +0800 Subject: [PATCH 232/580] Blackfin arch: remove duplicated ANOMALY_05000448 ifdef check Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu --- arch/blackfin/mach-common/arch_checks.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/blackfin/mach-common/arch_checks.c b/arch/blackfin/mach-common/arch_checks.c index a2ca26aa210d..80d39b2f9db2 100644 --- a/arch/blackfin/mach-common/arch_checks.c +++ b/arch/blackfin/mach-common/arch_checks.c @@ -68,8 +68,6 @@ # error "The kernel load address is too high; keep it below 10meg for safety" #endif -#ifdef ANOMALY_05000448 -# if ANOMALY_05000448 -# error You are using a part with anomaly 05000448, this issue causes random memory read/write failures - that means random crashes. -# endif +#if ANOMALY_05000448 +# error You are using a part with anomaly 05000448, this issue causes random memory read/write failures - that means random crashes. #endif From 4aad7ec373a559177ee2c488455f6bf8928c030c Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 6 Mar 2009 00:27:17 +0800 Subject: [PATCH 233/580] Blackfin arch: disable legacy /proc/scsi/ support by default Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu --- arch/blackfin/configs/BF548-EZKIT_defconfig | 2 +- arch/blackfin/configs/CM-BF548_defconfig | 2 +- arch/blackfin/configs/IP0X_defconfig | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/blackfin/configs/BF548-EZKIT_defconfig b/arch/blackfin/configs/BF548-EZKIT_defconfig index b41dbd08c2c6..6bc2fb1b2a70 100644 --- a/arch/blackfin/configs/BF548-EZKIT_defconfig +++ b/arch/blackfin/configs/BF548-EZKIT_defconfig @@ -680,7 +680,7 @@ CONFIG_SCSI=y CONFIG_SCSI_DMA=y # CONFIG_SCSI_TGT is not set # CONFIG_SCSI_NETLINK is not set -CONFIG_SCSI_PROC_FS=y +# CONFIG_SCSI_PROC_FS is not set # # SCSI support type (disk, tape, CD-ROM) diff --git a/arch/blackfin/configs/CM-BF548_defconfig b/arch/blackfin/configs/CM-BF548_defconfig index 542ebc5436ad..f410430b4e3d 100644 --- a/arch/blackfin/configs/CM-BF548_defconfig +++ b/arch/blackfin/configs/CM-BF548_defconfig @@ -595,7 +595,7 @@ CONFIG_SCSI=y CONFIG_SCSI_DMA=y # CONFIG_SCSI_TGT is not set # CONFIG_SCSI_NETLINK is not set -CONFIG_SCSI_PROC_FS=y +# CONFIG_SCSI_PROC_FS is not set # # SCSI support type (disk, tape, CD-ROM) diff --git a/arch/blackfin/configs/IP0X_defconfig b/arch/blackfin/configs/IP0X_defconfig index eae83b5de92f..7db93874c987 100644 --- a/arch/blackfin/configs/IP0X_defconfig +++ b/arch/blackfin/configs/IP0X_defconfig @@ -612,7 +612,7 @@ CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 CONFIG_SCSI=y # CONFIG_SCSI_TGT is not set # CONFIG_SCSI_NETLINK is not set -CONFIG_SCSI_PROC_FS=y +# CONFIG_SCSI_PROC_FS is not set # # SCSI support type (disk, tape, CD-ROM) From f3f704d375fcc92950f688ccb3dd0f650acace92 Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Fri, 6 Mar 2009 00:27:57 +0800 Subject: [PATCH 234/580] Blackfin arch: SPI_MMC is now mainlined MMC_SPI Signed-off-by: Mike Frysinger Signed-off-by: Michael Hennerich Signed-off-by: Bryan Wu --- arch/blackfin/mach-bf518/boards/ezbrd.c | 24 ++++++------------ arch/blackfin/mach-bf527/boards/cm_bf527.c | 26 ++++++-------------- arch/blackfin/mach-bf527/boards/ezbrd.c | 24 ++++++------------ arch/blackfin/mach-bf533/boards/blackstamp.c | 24 ++++++------------ arch/blackfin/mach-bf533/boards/cm_bf533.c | 24 ++++++------------ arch/blackfin/mach-bf533/boards/ip0x.c | 13 +++++----- arch/blackfin/mach-bf537/boards/cm_bf537.c | 26 ++++++-------------- arch/blackfin/mach-bf537/boards/minotaur.c | 24 ++++++------------ arch/blackfin/mach-bf537/boards/pnav10.c | 24 ++++++------------ arch/blackfin/mach-bf537/boards/tcm_bf537.c | 24 ++++++------------ arch/blackfin/mach-bf561/boards/cm_bf561.c | 15 ++++++----- 11 files changed, 78 insertions(+), 170 deletions(-) diff --git a/arch/blackfin/mach-bf518/boards/ezbrd.c b/arch/blackfin/mach-bf518/boards/ezbrd.c index b044de49ecf0..41f2eacfef20 100644 --- a/arch/blackfin/mach-bf518/boards/ezbrd.c +++ b/arch/blackfin/mach-bf518/boards/ezbrd.c @@ -182,9 +182,9 @@ static struct bfin5xx_spi_chip spi_switch_info = { #endif #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) -static struct bfin5xx_spi_chip spi_mmc_chip_info = { - .enable_dma = 1, +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) +static struct bfin5xx_spi_chip mmc_spi_chip_info = { + .enable_dma = 0, .bits_per_word = 8, }; #endif @@ -276,23 +276,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { #endif #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) { - .modalias = "spi_mmc_dummy", + .modalias = "mmc_spi", .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ .bus_num = 0, - .chip_select = 0, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, - .mode = SPI_MODE_3, - }, - { - .modalias = "spi_mmc", - .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = CONFIG_SPI_MMC_CS_CHAN, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, + .chip_select = 5, + .controller_data = &mmc_spi_chip_info, .mode = SPI_MODE_3, }, #endif diff --git a/arch/blackfin/mach-bf527/boards/cm_bf527.c b/arch/blackfin/mach-bf527/boards/cm_bf527.c index 856c097b5317..48e69eecdba4 100644 --- a/arch/blackfin/mach-bf527/boards/cm_bf527.c +++ b/arch/blackfin/mach-bf527/boards/cm_bf527.c @@ -487,9 +487,9 @@ static struct bfin5xx_spi_chip ad9960_spi_chip_info = { }; #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) -static struct bfin5xx_spi_chip spi_mmc_chip_info = { - .enable_dma = 1, +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) +static struct bfin5xx_spi_chip mmc_spi_chip_info = { + .enable_dma = 0, .bits_per_word = 8, }; #endif @@ -585,23 +585,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { .controller_data = &ad9960_spi_chip_info, }, #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) { - .modalias = "spi_mmc_dummy", - .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ + .modalias = "mmc_spi", + .max_speed_hz = 20000000, /* max spi clock (SCK) speed in HZ */ .bus_num = 0, - .chip_select = 0, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, - .mode = SPI_MODE_3, - }, - { - .modalias = "spi_mmc", - .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = CONFIG_SPI_MMC_CS_CHAN, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, + .chip_select = 5, + .controller_data = &mmc_spi_chip_info, .mode = SPI_MODE_3, }, #endif diff --git a/arch/blackfin/mach-bf527/boards/ezbrd.c b/arch/blackfin/mach-bf527/boards/ezbrd.c index 83606fcdde27..7fe480e4ebe8 100644 --- a/arch/blackfin/mach-bf527/boards/ezbrd.c +++ b/arch/blackfin/mach-bf527/boards/ezbrd.c @@ -256,9 +256,9 @@ static struct bfin5xx_spi_chip spi_adc_chip_info = { }; #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) -static struct bfin5xx_spi_chip spi_mmc_chip_info = { - .enable_dma = 1, +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) +static struct bfin5xx_spi_chip mmc_spi_chip_info = { + .enable_dma = 0, .bits_per_word = 8, }; #endif @@ -366,23 +366,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { }, #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) { - .modalias = "spi_mmc_dummy", + .modalias = "mmc_spi", .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ .bus_num = 0, - .chip_select = 0, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, - .mode = SPI_MODE_3, - }, - { - .modalias = "spi_mmc", - .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = CONFIG_SPI_MMC_CS_CHAN, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, + .chip_select = 5, + .controller_data = &mmc_spi_chip_info, .mode = SPI_MODE_3, }, #endif diff --git a/arch/blackfin/mach-bf533/boards/blackstamp.c b/arch/blackfin/mach-bf533/boards/blackstamp.c index 015c18f85e7f..0765872a8ada 100644 --- a/arch/blackfin/mach-bf533/boards/blackstamp.c +++ b/arch/blackfin/mach-bf533/boards/blackstamp.c @@ -101,9 +101,9 @@ static struct bfin5xx_spi_chip spi_flash_chip_info = { }; #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) -static struct bfin5xx_spi_chip spi_mmc_chip_info = { - .enable_dma = 1, +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) +static struct bfin5xx_spi_chip mmc_spi_chip_info = { + .enable_dma = 0, .bits_per_word = 8, }; #endif @@ -129,23 +129,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { }, #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) { - .modalias = "spi_mmc_dummy", + .modalias = "mmc_spi", .max_speed_hz = 20000000, /* max spi clock (SCK) speed in HZ */ .bus_num = 0, - .chip_select = 0, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, - .mode = SPI_MODE_3, - }, - { - .modalias = "spi_mmc", - .max_speed_hz = 20000000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = CONFIG_SPI_MMC_CS_CHAN, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, + .chip_select = 5, + .controller_data = &mmc_spi_chip_info, .mode = SPI_MODE_3, }, #endif diff --git a/arch/blackfin/mach-bf533/boards/cm_bf533.c b/arch/blackfin/mach-bf533/boards/cm_bf533.c index e7061c7e8c42..e8974878d8c2 100644 --- a/arch/blackfin/mach-bf533/boards/cm_bf533.c +++ b/arch/blackfin/mach-bf533/boards/cm_bf533.c @@ -96,9 +96,9 @@ static struct bfin5xx_spi_chip ad1836_spi_chip_info = { }; #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) -static struct bfin5xx_spi_chip spi_mmc_chip_info = { - .enable_dma = 1, +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) +static struct bfin5xx_spi_chip mmc_spi_chip_info = { + .enable_dma = 0, .bits_per_word = 8, }; #endif @@ -138,23 +138,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { }, #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) { - .modalias = "spi_mmc_dummy", + .modalias = "mmc_spi", .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ .bus_num = 0, - .chip_select = 0, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, - .mode = SPI_MODE_3, - }, - { - .modalias = "spi_mmc", - .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = CONFIG_SPI_MMC_CS_CHAN, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, + .chip_select = 5, + .controller_data = &mmc_spi_chip_info, .mode = SPI_MODE_3, }, #endif diff --git a/arch/blackfin/mach-bf533/boards/ip0x.c b/arch/blackfin/mach-bf533/boards/ip0x.c index e30b1b7d1442..f19b63378b12 100644 --- a/arch/blackfin/mach-bf533/boards/ip0x.c +++ b/arch/blackfin/mach-bf533/boards/ip0x.c @@ -127,8 +127,8 @@ static struct platform_device dm9000_device2 = { #if defined(CONFIG_SPI_BFIN) || defined(CONFIG_SPI_BFIN_MODULE) /* all SPI peripherals info goes here */ -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) -static struct bfin5xx_spi_chip spi_mmc_chip_info = { +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) +static struct bfin5xx_spi_chip mmc_spi_chip_info = { /* * CPOL (Clock Polarity) * 0 - Active high SCK @@ -152,14 +152,13 @@ static struct bfin5xx_spi_chip spi_mmc_chip_info = { /* Notice: for blackfin, the speed_hz is the value of register * SPI_BAUD, not the real baudrate */ static struct spi_board_info bfin_spi_board_info[] __initdata = { -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) { - .modalias = "spi_mmc", + .modalias = "mmc_spi", .max_speed_hz = 2, .bus_num = 1, - .chip_select = CONFIG_SPI_MMC_CS_CHAN, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, + .chip_select = 5, + .controller_data = &mmc_spi_chip_info, }, #endif }; diff --git a/arch/blackfin/mach-bf537/boards/cm_bf537.c b/arch/blackfin/mach-bf537/boards/cm_bf537.c index 9cd8fb2a30d3..41c75b9bfac0 100644 --- a/arch/blackfin/mach-bf537/boards/cm_bf537.c +++ b/arch/blackfin/mach-bf537/boards/cm_bf537.c @@ -108,9 +108,9 @@ static struct bfin5xx_spi_chip ad9960_spi_chip_info = { }; #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) -static struct bfin5xx_spi_chip spi_mmc_chip_info = { - .enable_dma = 1, +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) +static struct bfin5xx_spi_chip mmc_spi_chip_info = { + .enable_dma = 0, .bits_per_word = 8, }; #endif @@ -160,23 +160,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { }, #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) { - .modalias = "spi_mmc_dummy", - .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ + .modalias = "mmc_spi", + .max_speed_hz = 20000000, /* max spi clock (SCK) speed in HZ */ .bus_num = 0, - .chip_select = 7, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, - .mode = SPI_MODE_3, - }, - { - .modalias = "spi_mmc", - .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = CONFIG_SPI_MMC_CS_CHAN, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, + .chip_select = 1, + .controller_data = &mmc_spi_chip_info, .mode = SPI_MODE_3, }, #endif diff --git a/arch/blackfin/mach-bf537/boards/minotaur.c b/arch/blackfin/mach-bf537/boards/minotaur.c index db7d3a385e4b..3c159819e555 100644 --- a/arch/blackfin/mach-bf537/boards/minotaur.c +++ b/arch/blackfin/mach-bf537/boards/minotaur.c @@ -134,9 +134,9 @@ static struct bfin5xx_spi_chip spi_flash_chip_info = { }; #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) -static struct bfin5xx_spi_chip spi_mmc_chip_info = { - .enable_dma = 1, +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) +static struct bfin5xx_spi_chip mmc_spi_chip_info = { + .enable_dma = 0, .bits_per_word = 8, }; #endif @@ -156,23 +156,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { }, #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) { - .modalias = "spi_mmc_dummy", + .modalias = "mmc_spi", .max_speed_hz = 5000000, /* max spi clock (SCK) speed in HZ */ .bus_num = 0, - .chip_select = 0, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, - .mode = SPI_MODE_3, - }, - { - .modalias = "spi_mmc", - .max_speed_hz = 5000000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = CONFIG_SPI_MMC_CS_CHAN, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, + .chip_select = 5, + .controller_data = &mmc_spi_chip_info, .mode = SPI_MODE_3, }, #endif diff --git a/arch/blackfin/mach-bf537/boards/pnav10.c b/arch/blackfin/mach-bf537/boards/pnav10.c index 590eb3a139b7..4e1de1e53f89 100644 --- a/arch/blackfin/mach-bf537/boards/pnav10.c +++ b/arch/blackfin/mach-bf537/boards/pnav10.c @@ -289,9 +289,9 @@ static struct bfin5xx_spi_chip ad9960_spi_chip_info = { }; #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) -static struct bfin5xx_spi_chip spi_mmc_chip_info = { - .enable_dma = 1, +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) +static struct bfin5xx_spi_chip mmc_spi_chip_info = { + .enable_dma = 0, .bits_per_word = 8, }; #endif @@ -364,23 +364,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { .controller_data = &ad9960_spi_chip_info, }, #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) { - .modalias = "spi_mmc_dummy", + .modalias = "mmc_spi", .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ .bus_num = 0, - .chip_select = 7, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, - .mode = SPI_MODE_3, - }, - { - .modalias = "spi_mmc", - .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = CONFIG_SPI_MMC_CS_CHAN, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, + .chip_select = 5, + .controller_data = &mmc_spi_chip_info, .mode = SPI_MODE_3, }, #endif diff --git a/arch/blackfin/mach-bf537/boards/tcm_bf537.c b/arch/blackfin/mach-bf537/boards/tcm_bf537.c index 3f4f203a06ec..53ad10f3cd76 100644 --- a/arch/blackfin/mach-bf537/boards/tcm_bf537.c +++ b/arch/blackfin/mach-bf537/boards/tcm_bf537.c @@ -108,9 +108,9 @@ static struct bfin5xx_spi_chip ad9960_spi_chip_info = { }; #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) -static struct bfin5xx_spi_chip spi_mmc_chip_info = { - .enable_dma = 1, +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) +static struct bfin5xx_spi_chip mmc_spi_chip_info = { + .enable_dma = 0, .bits_per_word = 8, }; #endif @@ -160,23 +160,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { }, #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) { - .modalias = "spi_mmc_dummy", + .modalias = "mmc_spi", .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ .bus_num = 0, - .chip_select = 7, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, - .mode = SPI_MODE_3, - }, - { - .modalias = "spi_mmc", - .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ - .bus_num = 0, - .chip_select = CONFIG_SPI_MMC_CS_CHAN, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, + .chip_select = 5, + .controller_data = &mmc_spi_chip_info, .mode = SPI_MODE_3, }, #endif diff --git a/arch/blackfin/mach-bf561/boards/cm_bf561.c b/arch/blackfin/mach-bf561/boards/cm_bf561.c index 6880d1ebfe60..f623c6b0719f 100644 --- a/arch/blackfin/mach-bf561/boards/cm_bf561.c +++ b/arch/blackfin/mach-bf561/boards/cm_bf561.c @@ -105,9 +105,9 @@ static struct bfin5xx_spi_chip ad9960_spi_chip_info = { }; #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) -static struct bfin5xx_spi_chip spi_mmc_chip_info = { - .enable_dma = 1, +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) +static struct bfin5xx_spi_chip mmc_spi_chip_info = { + .enable_dma = 0, .bits_per_word = 8, }; #endif @@ -155,14 +155,13 @@ static struct spi_board_info bfin_spi_board_info[] __initdata = { .controller_data = &ad9960_spi_chip_info, }, #endif -#if defined(CONFIG_SPI_MMC) || defined(CONFIG_SPI_MMC_MODULE) +#if defined(CONFIG_MMC_SPI) || defined(CONFIG_MMC_SPI_MODULE) { - .modalias = "spi_mmc", + .modalias = "mmc_spi", .max_speed_hz = 25000000, /* max spi clock (SCK) speed in HZ */ .bus_num = 0, - .chip_select = CONFIG_SPI_MMC_CS_CHAN, - .platform_data = NULL, - .controller_data = &spi_mmc_chip_info, + .chip_select = 5, + .controller_data = &mmc_spi_chip_info, .mode = SPI_MODE_3, }, #endif From 33dd6f92a1a7ad85c54d47fd9d73371a32c0bde4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 Feb 2009 06:53:48 -0700 Subject: [PATCH 235/580] [SCSI] sd: Don't try to spin up drives that are connected to an inactive port We currently try to spin up drives connected to standby and unavailable ports. This will never succeed and wastes a lot of time. Fail quickly if the sense data reports the port is in standby or unavailable state. Reported-by: Narayanan Rengarajan Tested-by: Narayanan Rengarajan Signed-off-by: Matthew Wilcox Signed-off-by: James Bottomley --- drivers/scsi/sd.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 55310dbc10a6..4970ae4a62d6 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1167,23 +1167,19 @@ sd_spinup_disk(struct scsi_disk *sdkp) /* * The device does not want the automatic start to be issued. */ - if (sdkp->device->no_start_on_add) { + if (sdkp->device->no_start_on_add) break; - } - /* - * If manual intervention is required, or this is an - * absent USB storage device, a spinup is meaningless. - */ - if (sense_valid && - sshdr.sense_key == NOT_READY && - sshdr.asc == 4 && sshdr.ascq == 3) { - break; /* manual intervention required */ - - /* - * Issue command to spin up drive when not ready - */ - } else if (sense_valid && sshdr.sense_key == NOT_READY) { + if (sense_valid && sshdr.sense_key == NOT_READY) { + if (sshdr.asc == 4 && sshdr.ascq == 3) + break; /* manual intervention required */ + if (sshdr.asc == 4 && sshdr.ascq == 0xb) + break; /* standby */ + if (sshdr.asc == 4 && sshdr.ascq == 0xc) + break; /* unavailable */ + /* + * Issue command to spin up drive when not ready + */ if (!spintime) { sd_printk(KERN_NOTICE, sdkp, "Spinning up disk..."); cmd[0] = START_STOP; From ef449e6d21b6e560c7ec36ed285ec3a34e0b12ed Mon Sep 17 00:00:00 2001 From: Hartley Sweeten Date: Thu, 5 Mar 2009 17:33:50 +0100 Subject: [PATCH 236/580] [ARM] 5419/1: ep93xx: fix build warnings about struct i2c_board_info Fix build warnings due to struct i2c_board_info in Patch "5311/1: add core support for built in i2c bus" is causing 11 of 39 the build warnings with Kautobuild for ep93xx_defconfig on kernel 2.6.29-rc5-git4. This patch fixes it. Signed-off-by: H Hartley Sweeten Signed-off-by: Russell King --- arch/arm/mach-ep93xx/include/mach/platform.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/mach-ep93xx/include/mach/platform.h b/arch/arm/mach-ep93xx/include/mach/platform.h index 88f7e88f152f..05f0f4f2f3ce 100644 --- a/arch/arm/mach-ep93xx/include/mach/platform.h +++ b/arch/arm/mach-ep93xx/include/mach/platform.h @@ -4,6 +4,8 @@ #ifndef __ASSEMBLY__ +struct i2c_board_info; + struct ep93xx_eth_data { unsigned char dev_addr[6]; From 9eb77ab0762d8cfc4686e1ec5e9a52905f3a5009 Mon Sep 17 00:00:00 2001 From: Xose Vazquez Perez Date: Sat, 28 Feb 2009 00:26:09 +0100 Subject: [PATCH 237/580] rt2x00 : more devices to rt2500usb.c add more usb_dev to rt2500usb.c . IDs 'stolen' from the windows inf file(02/12/2009, 2.01.01.0015). Signed-off-by: Xose Vazquez Perez Signed-off-by: John W. Linville --- drivers/net/wireless/rt2x00/rt2500usb.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/wireless/rt2x00/rt2500usb.c b/drivers/net/wireless/rt2x00/rt2500usb.c index af6b5847be5c..3e2ac2bbb12f 100644 --- a/drivers/net/wireless/rt2x00/rt2500usb.c +++ b/drivers/net/wireless/rt2x00/rt2500usb.c @@ -1952,6 +1952,8 @@ static struct usb_device_id rt2500usb_device_table[] = { { USB_DEVICE(0x13b1, 0x000d), USB_DEVICE_DATA(&rt2500usb_ops) }, { USB_DEVICE(0x13b1, 0x0011), USB_DEVICE_DATA(&rt2500usb_ops) }, { USB_DEVICE(0x13b1, 0x001a), USB_DEVICE_DATA(&rt2500usb_ops) }, + /* CNet */ + { USB_DEVICE(0x1371, 0x9022), USB_DEVICE_DATA(&rt2500usb_ops) }, /* Conceptronic */ { USB_DEVICE(0x14b2, 0x3c02), USB_DEVICE_DATA(&rt2500usb_ops) }, /* D-LINK */ @@ -1976,14 +1978,20 @@ static struct usb_device_id rt2500usb_device_table[] = { { USB_DEVICE(0x148f, 0x2570), USB_DEVICE_DATA(&rt2500usb_ops) }, { USB_DEVICE(0x148f, 0x2573), USB_DEVICE_DATA(&rt2500usb_ops) }, { USB_DEVICE(0x148f, 0x9020), USB_DEVICE_DATA(&rt2500usb_ops) }, + /* Sagem */ + { USB_DEVICE(0x079b, 0x004b), USB_DEVICE_DATA(&rt2500usb_ops) }, /* Siemens */ { USB_DEVICE(0x0681, 0x3c06), USB_DEVICE_DATA(&rt2500usb_ops) }, /* SMC */ { USB_DEVICE(0x0707, 0xee13), USB_DEVICE_DATA(&rt2500usb_ops) }, /* Spairon */ { USB_DEVICE(0x114b, 0x0110), USB_DEVICE_DATA(&rt2500usb_ops) }, + /* SURECOM */ + { USB_DEVICE(0x0769, 0x11f3), USB_DEVICE_DATA(&rt2500usb_ops) }, /* Trust */ { USB_DEVICE(0x0eb0, 0x9020), USB_DEVICE_DATA(&rt2500usb_ops) }, + /* VTech */ + { USB_DEVICE(0x0f88, 0x3012), USB_DEVICE_DATA(&rt2500usb_ops) }, /* Zinwell */ { USB_DEVICE(0x5a57, 0x0260), USB_DEVICE_DATA(&rt2500usb_ops) }, { 0, } From ef4bb70d876b4ae5787cc43727477d1a36cdc2e8 Mon Sep 17 00:00:00 2001 From: Xose Vazquez Perez Date: Sat, 28 Feb 2009 00:34:23 +0100 Subject: [PATCH 238/580] rt2x00 : more devices to rt73usb.c add more usb_dev to rt73usb.c . IDs 'stolen' from the windows inf file(10/21/2008, 1.03.02.0000) plus some from the Ralink linux driver(2009_0206_RT73_Linux_STA_Drv1.1.0.2.tar.bz2) Signed-off-by: Xose Vazquez Perez Signed-off-by: John W. Linville --- drivers/net/wireless/rt2x00/rt73usb.c | 32 ++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/rt2x00/rt73usb.c b/drivers/net/wireless/rt2x00/rt73usb.c index 96a8d69f8790..cefee1b26cd8 100644 --- a/drivers/net/wireless/rt2x00/rt73usb.c +++ b/drivers/net/wireless/rt2x00/rt73usb.c @@ -2281,7 +2281,18 @@ static const struct rt2x00_ops rt73usb_ops = { */ static struct usb_device_id rt73usb_device_table[] = { /* AboCom */ + { USB_DEVICE(0x07b8, 0xb21b), USB_DEVICE_DATA(&rt73usb_ops) }, + { USB_DEVICE(0x07b8, 0xb21c), USB_DEVICE_DATA(&rt73usb_ops) }, { USB_DEVICE(0x07b8, 0xb21d), USB_DEVICE_DATA(&rt73usb_ops) }, + { USB_DEVICE(0x07b8, 0xb21e), USB_DEVICE_DATA(&rt73usb_ops) }, + { USB_DEVICE(0x07b8, 0xb21f), USB_DEVICE_DATA(&rt73usb_ops) }, + /* AL */ + { USB_DEVICE(0x14b2, 0x3c10), USB_DEVICE_DATA(&rt73usb_ops) }, + /* Amigo */ + { USB_DEVICE(0x148f, 0x9021), USB_DEVICE_DATA(&rt73usb_ops) }, + { USB_DEVICE(0x0eb0, 0x9021), USB_DEVICE_DATA(&rt73usb_ops) }, + /* AMIT */ + { USB_DEVICE(0x18c5, 0x0002), USB_DEVICE_DATA(&rt73usb_ops) }, /* Askey */ { USB_DEVICE(0x1690, 0x0722), USB_DEVICE_DATA(&rt73usb_ops) }, /* ASUS */ @@ -2294,7 +2305,9 @@ static struct usb_device_id rt73usb_device_table[] = { { USB_DEVICE(0x050d, 0x905c), USB_DEVICE_DATA(&rt73usb_ops) }, /* Billionton */ { USB_DEVICE(0x1631, 0xc019), USB_DEVICE_DATA(&rt73usb_ops) }, + { USB_DEVICE(0x08dd, 0x0120), USB_DEVICE_DATA(&rt73usb_ops) }, /* Buffalo */ + { USB_DEVICE(0x0411, 0x00d8), USB_DEVICE_DATA(&rt73usb_ops) }, { USB_DEVICE(0x0411, 0x00f4), USB_DEVICE_DATA(&rt73usb_ops) }, /* CNet */ { USB_DEVICE(0x1371, 0x9022), USB_DEVICE_DATA(&rt73usb_ops) }, @@ -2308,6 +2321,11 @@ static struct usb_device_id rt73usb_device_table[] = { { USB_DEVICE(0x07d1, 0x3c04), USB_DEVICE_DATA(&rt73usb_ops) }, { USB_DEVICE(0x07d1, 0x3c06), USB_DEVICE_DATA(&rt73usb_ops) }, { USB_DEVICE(0x07d1, 0x3c07), USB_DEVICE_DATA(&rt73usb_ops) }, + /* Edimax */ + { USB_DEVICE(0x7392, 0x7318), USB_DEVICE_DATA(&rt73usb_ops) }, + { USB_DEVICE(0x7392, 0x7618), USB_DEVICE_DATA(&rt73usb_ops) }, + /* EnGenius */ + { USB_DEVICE(0x1740, 0x3701), USB_DEVICE_DATA(&rt73usb_ops) }, /* Gemtek */ { USB_DEVICE(0x15a9, 0x0004), USB_DEVICE_DATA(&rt73usb_ops) }, /* Gigabyte */ @@ -2328,22 +2346,34 @@ static struct usb_device_id rt73usb_device_table[] = { { USB_DEVICE(0x0db0, 0xa861), USB_DEVICE_DATA(&rt73usb_ops) }, { USB_DEVICE(0x0db0, 0xa874), USB_DEVICE_DATA(&rt73usb_ops) }, /* Ralink */ + { USB_DEVICE(0x04bb, 0x093d), USB_DEVICE_DATA(&rt73usb_ops) }, { USB_DEVICE(0x148f, 0x2573), USB_DEVICE_DATA(&rt73usb_ops) }, { USB_DEVICE(0x148f, 0x2671), USB_DEVICE_DATA(&rt73usb_ops) }, /* Qcom */ { USB_DEVICE(0x18e8, 0x6196), USB_DEVICE_DATA(&rt73usb_ops) }, { USB_DEVICE(0x18e8, 0x6229), USB_DEVICE_DATA(&rt73usb_ops) }, { USB_DEVICE(0x18e8, 0x6238), USB_DEVICE_DATA(&rt73usb_ops) }, + /* Samsung */ + { USB_DEVICE(0x04e8, 0x4471), USB_DEVICE_DATA(&rt73usb_ops) }, /* Senao */ { USB_DEVICE(0x1740, 0x7100), USB_DEVICE_DATA(&rt73usb_ops) }, /* Sitecom */ - { USB_DEVICE(0x0df6, 0x9712), USB_DEVICE_DATA(&rt73usb_ops) }, + { USB_DEVICE(0x0df6, 0x0024), USB_DEVICE_DATA(&rt73usb_ops) }, + { USB_DEVICE(0x0df6, 0x0027), USB_DEVICE_DATA(&rt73usb_ops) }, + { USB_DEVICE(0x0df6, 0x002f), USB_DEVICE_DATA(&rt73usb_ops) }, { USB_DEVICE(0x0df6, 0x90ac), USB_DEVICE_DATA(&rt73usb_ops) }, + { USB_DEVICE(0x0df6, 0x9712), USB_DEVICE_DATA(&rt73usb_ops) }, /* Surecom */ { USB_DEVICE(0x0769, 0x31f3), USB_DEVICE_DATA(&rt73usb_ops) }, + /* Philips */ + { USB_DEVICE(0x0471, 0x200a), USB_DEVICE_DATA(&rt73usb_ops) }, /* Planex */ { USB_DEVICE(0x2019, 0xab01), USB_DEVICE_DATA(&rt73usb_ops) }, { USB_DEVICE(0x2019, 0xab50), USB_DEVICE_DATA(&rt73usb_ops) }, + /* Zcom */ + { USB_DEVICE(0x0cde, 0x001c), USB_DEVICE_DATA(&rt73usb_ops) }, + /* ZyXEL */ + { USB_DEVICE(0x0586, 0x3415), USB_DEVICE_DATA(&rt73usb_ops) }, { 0, } }; From 623d563e52d4d4041612e24b33a5610a900dd778 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 3 Mar 2009 11:37:04 -0800 Subject: [PATCH 239/580] iwlwifi: fix error flow in iwl*_pci_probe Both the agn and 3945 drivers has some problems with dealing with errors in their probe functions. Ensure that a goto will undo only things that was done before the goto was called. Signed-off-by: Reinette Chatre Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-agn.c | 6 ++++-- drivers/net/wireless/iwlwifi/iwl3945-base.c | 17 +++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c index 36bafeb353ce..129e2d330abb 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn.c @@ -3868,7 +3868,7 @@ static int iwl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } err = iwl_eeprom_check_version(priv); if (err) - goto out_iounmap; + goto out_free_eeprom; /* extract MAC Address */ iwl_eeprom_get_mac(priv, priv->mac_addr); @@ -3945,6 +3945,8 @@ static int iwl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; out_remove_sysfs: + destroy_workqueue(priv->workqueue); + priv->workqueue = NULL; sysfs_remove_group(&pdev->dev.kobj, &iwl_attribute_group); out_uninit_drv: iwl_uninit_drv(priv); @@ -3953,8 +3955,8 @@ static int iwl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) out_iounmap: pci_iounmap(pdev, priv->hw_base); out_pci_release_regions: - pci_release_regions(pdev); pci_set_drvdata(pdev, NULL); + pci_release_regions(pdev); out_pci_disable_device: pci_disable_device(pdev); out_ieee80211_free_hw: diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index 93be74a1f139..57dd34e256d8 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -7911,7 +7911,7 @@ static int iwl3945_pci_probe(struct pci_dev *pdev, const struct pci_device_id *e CSR_GP_CNTRL_REG_FLAG_MAC_CLOCK_READY, 25000); if (err < 0) { IWL_DEBUG_INFO("Failed to init the card\n"); - goto out_remove_sysfs; + goto out_iounmap; } /*********************** @@ -7921,7 +7921,7 @@ static int iwl3945_pci_probe(struct pci_dev *pdev, const struct pci_device_id *e err = iwl3945_eeprom_init(priv); if (err) { IWL_ERROR("Unable to init EEPROM\n"); - goto out_remove_sysfs; + goto out_iounmap; } /* MAC Address location in EEPROM same for 3945/4965 */ get_eeprom_mac(priv, priv->mac_addr); @@ -7975,7 +7975,7 @@ static int iwl3945_pci_probe(struct pci_dev *pdev, const struct pci_device_id *e err = iwl3945_init_channel_map(priv); if (err) { IWL_ERROR("initializing regulatory failed: %d\n", err); - goto out_release_irq; + goto out_unset_hw_setting; } err = iwl3945_init_geos(priv); @@ -8045,25 +8045,22 @@ static int iwl3945_pci_probe(struct pci_dev *pdev, const struct pci_device_id *e return 0; out_remove_sysfs: + destroy_workqueue(priv->workqueue); + priv->workqueue = NULL; sysfs_remove_group(&pdev->dev.kobj, &iwl3945_attribute_group); out_free_geos: iwl3945_free_geos(priv); out_free_channel_map: iwl3945_free_channel_map(priv); - - - out_release_irq: - destroy_workqueue(priv->workqueue); - priv->workqueue = NULL; + out_unset_hw_setting: iwl3945_unset_hw_setting(priv); - out_iounmap: pci_iounmap(pdev, priv->hw_base); out_pci_release_regions: pci_release_regions(pdev); out_pci_disable_device: - pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); + pci_disable_device(pdev); out_ieee80211_free_hw: ieee80211_free_hw(priv->hw); out: From c9a0c8a6845b5efb64841f40b8efb4c387051d46 Mon Sep 17 00:00:00 2001 From: Wim Van Sebroeck Date: Wed, 25 Feb 2009 13:33:01 +0100 Subject: [PATCH 240/580] [WATCHDOG] orion5x_wdt.c: 'ORION5X_TCLK' undeclared orion5x_wdt no longer compiled after the changes in commit ebe35aff883496c07248df82c8576c3b6e84bbbe ("Orion: prepare for runtime-determined timer tick rate"). The tick rate define (ORION5X_TCLK) was removed in favor of a runtime detection. The quick fix is to add the define in the watchdog driver. The fix is not correct for all supported orion5x platforms, but since the supported platforms right now are 133 Mhz and 166 Mhz, it won't be _that_ far off. ;-) A fix that uses the runtime-determined timer tick rate will be applied later. Signed-off-by: Wim Van Sebroeck Cc: Kristof Provost Acked-by: Lennert Buytenhek Cc: Nicolas Pitre Cc: Russell King Cc: Andrew Morton --- drivers/watchdog/orion5x_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/orion5x_wdt.c b/drivers/watchdog/orion5x_wdt.c index 14a339f58b6a..b64ae1a17832 100644 --- a/drivers/watchdog/orion5x_wdt.c +++ b/drivers/watchdog/orion5x_wdt.c @@ -29,6 +29,7 @@ #define WDT_EN 0x0010 #define WDT_VAL (TIMER_VIRT_BASE + 0x0024) +#define ORION5X_TCLK 166666667 #define WDT_MAX_DURATION (0xffffffff / ORION5X_TCLK) #define WDT_IN_USE 0 #define WDT_OK_TO_CLOSE 1 From d2c452306ab402d7a3572bc3bf8e575796529bf8 Mon Sep 17 00:00:00 2001 From: Gregory Lardiere Date: Sun, 22 Feb 2009 17:54:11 -0300 Subject: [PATCH 241/580] V4L/DVB (10789): m5602-s5k4aa: Split up the initial sensor probe in chunks. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous probe rotine tried to read 6 bytes in one chunk which currently isn't allowed. This is the rev. 10346 243399e67c41 readded with a high priority. Signed-off-by: Gregory Lardiere Signed-off-by: Erik Andrén Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/gspca/m5602/m5602_s5k4aa.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/media/video/gspca/m5602/m5602_s5k4aa.c b/drivers/media/video/gspca/m5602/m5602_s5k4aa.c index e564a61a72d7..48892b5715d5 100644 --- a/drivers/media/video/gspca/m5602/m5602_s5k4aa.c +++ b/drivers/media/video/gspca/m5602/m5602_s5k4aa.c @@ -102,7 +102,11 @@ int s5k4aa_probe(struct sd *sd) } /* Test some registers, but we don't know their exact meaning yet */ - if (m5602_read_sensor(sd, 0x00, prod_id, sizeof(prod_id))) + if (m5602_read_sensor(sd, 0x00, prod_id, 2)) + return -ENODEV; + if (m5602_read_sensor(sd, 0x02, prod_id+2, 2)) + return -ENODEV; + if (m5602_read_sensor(sd, 0x04, prod_id+4, 2)) return -ENODEV; if (memcmp(prod_id, expected_prod_id, sizeof(prod_id))) From 4c6c390eb8ba0c569279266a98c604508c695ef8 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Thu, 5 Mar 2009 13:03:32 -0300 Subject: [PATCH 242/580] V4L/DVB (10832): tvaudio: Avoid breakage with tda9874a The 'bytes' array is 64 bytes large but the easy standard programming (TDA9874A_ESP) has a number of 255, outside the shadow array size. This patch increases the size of the shadow array in order to accomodate this register. Signed-off-by: Vitaly Wool Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/tvaudio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/video/tvaudio.c b/drivers/media/video/tvaudio.c index 5aeccb301cea..076ed5bf48b1 100644 --- a/drivers/media/video/tvaudio.c +++ b/drivers/media/video/tvaudio.c @@ -54,7 +54,7 @@ MODULE_LICENSE("GPL"); /* ---------------------------------------------------------------------- */ /* our structs */ -#define MAXREGS 64 +#define MAXREGS 256 struct CHIPSTATE; typedef int (*getvalue)(int); From e08e7b5f01de7ec246b996c65e9c26c7cea0c62d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 5 Mar 2009 16:19:14 -0300 Subject: [PATCH 243/580] V4L/DVB (10834): zoran: auto-select bt866 for AverMedia 6 Eyes AFAIK, the bt866 is only seen on AverMedia 6 Eyes. However, no module selects it. Adds a proper select for this driver. Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/zoran/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/video/zoran/Kconfig b/drivers/media/video/zoran/Kconfig index 4ea5fa71de89..8666e19f31a7 100644 --- a/drivers/media/video/zoran/Kconfig +++ b/drivers/media/video/zoran/Kconfig @@ -68,6 +68,7 @@ config VIDEO_ZORAN_AVS6EYES tristate "AverMedia 6 Eyes support (EXPERIMENTAL)" depends on VIDEO_ZORAN_ZR36060 && EXPERIMENTAL && VIDEO_V4L1 select VIDEO_BT856 if VIDEO_HELPER_CHIPS_AUTO + select VIDEO_BT866 if VIDEO_HELPER_CHIPS_AUTO select VIDEO_KS0127 if VIDEO_HELPER_CHIPS_AUTO help Support for the AverMedia 6 Eyes video surveillance card. From 6a242909b01120f6f3d571c0b75e20ec61f0d8d3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:58 +0900 Subject: [PATCH 244/580] percpu: clean up percpu constants Impact: cleaup Make the following cleanups. * There isn't much arch-specific about PERCPU_MODULE_RESERVE. Always define it whether arch overrides PERCPU_ENOUGH_ROOM or not. * blackfin overrides PERCPU_ENOUGH_ROOM to align static area size. Do it by default. * percpu allocation sizes doesn't have much to do with the page size. Don't use PAGE_SHIFT in their definition. Signed-off-by: Tejun Heo Cc: Bryan Wu --- arch/blackfin/include/asm/percpu.h | 10 ---------- include/linux/percpu.h | 24 +++++++++++++----------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/arch/blackfin/include/asm/percpu.h b/arch/blackfin/include/asm/percpu.h index 797c0c165069..c94c7bc88c71 100644 --- a/arch/blackfin/include/asm/percpu.h +++ b/arch/blackfin/include/asm/percpu.h @@ -3,14 +3,4 @@ #include -#ifdef CONFIG_MODULES -#define PERCPU_MODULE_RESERVE 8192 -#else -#define PERCPU_MODULE_RESERVE 0 -#endif - -#define PERCPU_ENOUGH_ROOM \ - (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ - PERCPU_MODULE_RESERVE) - #endif /* __ARCH_BLACKFIN_PERCPU__ */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 545b068bcb70..2d34b038fe70 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -5,6 +5,7 @@ #include /* For kmalloc() */ #include #include +#include #include @@ -52,17 +53,18 @@ #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) -/* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */ -#ifndef PERCPU_ENOUGH_ROOM +/* enough to cover all DEFINE_PER_CPUs in modules */ #ifdef CONFIG_MODULES -#define PERCPU_MODULE_RESERVE 8192 +#define PERCPU_MODULE_RESERVE (8 << 10) #else -#define PERCPU_MODULE_RESERVE 0 +#define PERCPU_MODULE_RESERVE 0 #endif +#ifndef PERCPU_ENOUGH_ROOM #define PERCPU_ENOUGH_ROOM \ - (__per_cpu_end - __per_cpu_start + PERCPU_MODULE_RESERVE) -#endif /* PERCPU_ENOUGH_ROOM */ + (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ + PERCPU_MODULE_RESERVE) +#endif /* * Must be an lvalue. Since @var must be a simple identifier, @@ -79,7 +81,7 @@ #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA /* minimum unit size, also is the maximum supported allocation size */ -#define PCPU_MIN_UNIT_SIZE (16UL << PAGE_SHIFT) +#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10) /* * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy @@ -96,15 +98,15 @@ #ifndef PERCPU_DYNAMIC_RESERVE # if BITS_PER_LONG > 32 # ifdef CONFIG_MODULES -# define PERCPU_DYNAMIC_RESERVE (6 << PAGE_SHIFT) +# define PERCPU_DYNAMIC_RESERVE (24 << 10) # else -# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) +# define PERCPU_DYNAMIC_RESERVE (16 << 10) # endif # else # ifdef CONFIG_MODULES -# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT) +# define PERCPU_DYNAMIC_RESERVE (16 << 10) # else -# define PERCPU_DYNAMIC_RESERVE (2 << PAGE_SHIFT) +# define PERCPU_DYNAMIC_RESERVE (8 << 10) # endif # endif #endif /* PERCPU_DYNAMIC_RESERVE */ From 2441d15c97d498b18f03ae9fba262ffeae42a08b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: [PATCH 245/580] percpu: cosmetic renames in pcpu_setup_first_chunk() Impact: cosmetic, preparation for future changes Make the following renames in pcpur_setup_first_chunk() in preparation for future changes. * s/free_size/dyn_size/ * s/static_vm/first_vm/ * s/static_chunk/schunk/ Signed-off-by: Tejun Heo --- include/linux/percpu.h | 2 +- mm/percpu.c | 58 +++++++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 2d34b038fe70..a0b4ea2a3354 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -118,7 +118,7 @@ typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t unit_size, - size_t free_size, void *base_addr, + size_t dyn_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn); /* diff --git a/mm/percpu.c b/mm/percpu.c index 3d0f5456827c..9531590e6b69 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -831,7 +831,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto - * @free_size: free size in bytes, 0 for auto + * @dyn_size: free size for dynamic allocation in bytes, 0 for auto * @base_addr: mapped address, NULL for auto * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary * @@ -849,12 +849,12 @@ EXPORT_SYMBOL_GPL(free_percpu); * return the same number of pages for all cpus. * * @unit_size, if non-zero, determines unit size and must be aligned - * to PAGE_SIZE and equal to or larger than @static_size + @free_size. + * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size. * - * @free_size determines the number of free bytes after the static + * @dyn_size determines the number of free bytes after the static * area in the first chunk. If zero, whatever left is available. * Specifying non-zero value make percpu leave the area after - * @static_size + @free_size alone. + * @static_size + @dyn_size alone. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. percpu must not mess @@ -870,19 +870,19 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t unit_size, - size_t free_size, void *base_addr, + size_t dyn_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { - static struct vm_struct static_vm; - struct pcpu_chunk *static_chunk; + static struct vm_struct first_vm; + struct pcpu_chunk *schunk; unsigned int cpu; int nr_pages; int err, i; /* santiy checks */ BUG_ON(!static_size); - BUG_ON(!unit_size && free_size); - BUG_ON(unit_size && unit_size < static_size + free_size); + BUG_ON(!unit_size && dyn_size); + BUG_ON(unit_size && unit_size < static_size + dyn_size); BUG_ON(unit_size & ~PAGE_MASK); BUG_ON(base_addr && !unit_size); BUG_ON(base_addr && populate_pte_fn); @@ -908,24 +908,24 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); - /* init static_chunk */ - static_chunk = alloc_bootmem(pcpu_chunk_struct_size); - INIT_LIST_HEAD(&static_chunk->list); - static_chunk->vm = &static_vm; + /* init static chunk */ + schunk = alloc_bootmem(pcpu_chunk_struct_size); + INIT_LIST_HEAD(&schunk->list); + schunk->vm = &first_vm; - if (free_size) - static_chunk->free_size = free_size; + if (dyn_size) + schunk->free_size = dyn_size; else - static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + schunk->free_size = pcpu_unit_size - pcpu_static_size; - static_chunk->contig_hint = static_chunk->free_size; + schunk->contig_hint = schunk->free_size; /* allocate vm address */ - static_vm.flags = VM_ALLOC; - static_vm.size = pcpu_chunk_size; + first_vm.flags = VM_ALLOC; + first_vm.size = pcpu_chunk_size; if (!base_addr) - vm_area_register_early(&static_vm, PAGE_SIZE); + vm_area_register_early(&first_vm, PAGE_SIZE); else { /* * Pages already mapped. No need to remap into @@ -933,8 +933,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, * be mapped or unmapped by percpu and is marked * immutable. */ - static_vm.addr = base_addr; - static_chunk->immutable = true; + first_vm.addr = base_addr; + schunk->immutable = true; } /* assign pages */ @@ -945,7 +945,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (!page) break; - *pcpu_chunk_pagep(static_chunk, cpu, i) = page; + *pcpu_chunk_pagep(schunk, cpu, i) = page; } BUG_ON(i < PFN_UP(pcpu_static_size)); @@ -960,20 +960,20 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (populate_pte_fn) { for_each_possible_cpu(cpu) for (i = 0; i < nr_pages; i++) - populate_pte_fn(pcpu_chunk_addr(static_chunk, + populate_pte_fn(pcpu_chunk_addr(schunk, cpu, i)); - err = pcpu_map(static_chunk, 0, nr_pages); + err = pcpu_map(schunk, 0, nr_pages); if (err) panic("failed to setup static percpu area, err=%d\n", err); } - /* link static_chunk in */ - pcpu_chunk_relocate(static_chunk, -1); - pcpu_chunk_addr_insert(static_chunk); + /* link the first chunk in */ + pcpu_chunk_relocate(schunk, -1); + pcpu_chunk_addr_insert(schunk); /* we're done */ - pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0); + pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); return pcpu_unit_size; } From 61ace7fa2fff9c4b6641c506b6b3f1a9394a1b11 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: [PATCH 246/580] percpu: improve first chunk initial area map handling Impact: no functional change When the first chunk is created, its initial area map is not allocated because kmalloc isn't online yet. The map is allocated and initialized on the first allocation request on the chunk. This works fine but the scattering of initialization logic between the init function and allocation path is a bit confusing. This patch makes the first chunk initialize and use minimal statically allocated map from pcpu_setpu_first_chunk(). The map resizing path still needs to handle this specially but it's more straight-forward and gives more latitude to the init path. This will ease future changes. Signed-off-by: Tejun Heo --- mm/percpu.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 9531590e6b69..503ccad091af 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -93,9 +93,6 @@ static size_t pcpu_chunk_struct_size __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); -/* the size of kernel static area */ -static int pcpu_static_size __read_mostly; - /* * One mutex to rule them all. * @@ -316,15 +313,28 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) /* reallocation required? */ if (chunk->map_alloc < target) { - int new_alloc = chunk->map_alloc; + int new_alloc; int *new; + new_alloc = PCPU_DFL_MAP_ALLOC; while (new_alloc < target) new_alloc *= 2; - new = pcpu_realloc(chunk->map, - chunk->map_alloc * sizeof(new[0]), - new_alloc * sizeof(new[0])); + if (chunk->map_alloc < PCPU_DFL_MAP_ALLOC) { + /* + * map_alloc smaller than the default size + * indicates that the chunk is one of the + * first chunks and still using static map. + * Allocate a dynamic one and copy. + */ + new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0])); + if (new) + memcpy(new, chunk->map, + chunk->map_alloc * sizeof(new[0])); + } else + new = pcpu_realloc(chunk->map, + chunk->map_alloc * sizeof(new[0]), + new_alloc * sizeof(new[0])); if (!new) return -ENOMEM; @@ -367,22 +377,6 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) int max_contig = 0; int i, off; - /* - * The static chunk initially doesn't have map attached - * because kmalloc wasn't available during init. Give it one. - */ - if (unlikely(!chunk->map)) { - chunk->map = pcpu_realloc(NULL, 0, - PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); - if (!chunk->map) - return -ENOMEM; - - chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = -pcpu_static_size; - if (chunk->free_size) - chunk->map[chunk->map_used++] = chunk->free_size; - } - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { bool is_last = i + 1 == chunk->map_used; int head, tail; @@ -874,12 +868,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; + static int smap[2]; struct pcpu_chunk *schunk; unsigned int cpu; int nr_pages; int err, i; /* santiy checks */ + BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); BUG_ON(!unit_size && dyn_size); BUG_ON(unit_size && unit_size < static_size + dyn_size); @@ -893,7 +889,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, PFN_UP(static_size)); - pcpu_static_size = static_size; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) @@ -912,14 +907,20 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); schunk->vm = &first_vm; + schunk->map = smap; + schunk->map_alloc = ARRAY_SIZE(smap); if (dyn_size) schunk->free_size = dyn_size; else - schunk->free_size = pcpu_unit_size - pcpu_static_size; + schunk->free_size = pcpu_unit_size - static_size; schunk->contig_hint = schunk->free_size; + schunk->map[schunk->map_used++] = -static_size; + if (schunk->free_size) + schunk->map[schunk->map_used++] = schunk->free_size; + /* allocate vm address */ first_vm.flags = VM_ALLOC; first_vm.size = pcpu_chunk_size; @@ -948,7 +949,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, *pcpu_chunk_pagep(schunk, cpu, i) = page; } - BUG_ON(i < PFN_UP(pcpu_static_size)); + BUG_ON(i < PFN_UP(static_size)); if (nr_pages < 0) nr_pages = i; From cafe8816b217b98dc3f268d3b77445da498beb4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: [PATCH 247/580] percpu: use negative for auto for pcpu_setup_first_chunk() arguments Impact: argument semantic cleanup In pcpu_setup_first_chunk(), zero @unit_size and @dyn_size meant auto-sizing. It's okay for @unit_size as 0 doesn't make sense but 0 dynamic reserve size is valid. Alos, if arch @dyn_size is calculated from other parameters, it might end up passing in 0 @dyn_size and malfunction when the size is automatically adjusted. This patch makes both @unit_size and @dyn_size ssize_t and use -1 for auto sizing. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 2 +- include/linux/percpu.h | 5 ++-- mm/percpu.c | 46 ++++++++++++++++++---------------- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index c29f301d3885..ef3a2cd3fe64 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -344,7 +344,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL, + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL, pcpu4k_populate_pte); goto out_free_ar; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index a0b4ea2a3354..a96fc53bbd62 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -117,8 +117,9 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t unit_size, - size_t dyn_size, void *base_addr, + size_t static_size, + ssize_t unit_size, ssize_t dyn_size, + void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn); /* diff --git a/mm/percpu.c b/mm/percpu.c index 503ccad091af..a84cf9977faf 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -824,8 +824,8 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto - * @dyn_size: free size for dynamic allocation in bytes, 0 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @base_addr: mapped address, NULL for auto * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary * @@ -842,13 +842,14 @@ EXPORT_SYMBOL_GPL(free_percpu); * indicates end of pages for the cpu. Note that @get_page_fn() must * return the same number of pages for all cpus. * - * @unit_size, if non-zero, determines unit size and must be aligned - * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size. + * @unit_size, if non-negative, specifies unit size and must be + * aligned to PAGE_SIZE and equal to or larger than @static_size + + * @dyn_size. * - * @dyn_size determines the number of free bytes after the static - * area in the first chunk. If zero, whatever left is available. - * Specifying non-zero value make percpu leave the area after - * @static_size + @dyn_size alone. + * @dyn_size, if non-negative, limits the number of bytes available + * for dynamic allocation in the first chunk. Specifying non-negative + * value make percpu leave alone the area beyond @static_size + + * @dyn_size. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. percpu must not mess @@ -863,8 +864,9 @@ EXPORT_SYMBOL_GPL(free_percpu); * percpu access. */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t unit_size, - size_t dyn_size, void *base_addr, + size_t static_size, + ssize_t unit_size, ssize_t dyn_size, + void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; @@ -877,13 +879,17 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, /* santiy checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); - BUG_ON(!unit_size && dyn_size); - BUG_ON(unit_size && unit_size < static_size + dyn_size); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(base_addr && !unit_size); + if (unit_size >= 0) { + BUG_ON(unit_size < static_size + + (dyn_size >= 0 ? dyn_size : 0)); + BUG_ON(unit_size & ~PAGE_MASK); + } else { + BUG_ON(dyn_size >= 0); + BUG_ON(base_addr); + } BUG_ON(base_addr && populate_pte_fn); - if (unit_size) + if (unit_size >= 0) pcpu_unit_pages = unit_size >> PAGE_SHIFT; else pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, @@ -894,6 +900,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + if (dyn_size < 0) + dyn_size = pcpu_unit_size - static_size; + /* * Allocate chunk slots. The additional last slot is for * empty chunks. @@ -909,12 +918,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); - - if (dyn_size) - schunk->free_size = dyn_size; - else - schunk->free_size = pcpu_unit_size - static_size; - + schunk->free_size = dyn_size; schunk->contig_hint = schunk->free_size; schunk->map[schunk->map_used++] = -static_size; From 9a4f8a878b68d5a5d9ee60908a52cf6a55e1b823 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: [PATCH 248/580] x86: make embedding percpu allocator return excessive free space Impact: reduce unnecessary memory usage on certain configurations Embedding percpu allocator allocates unit_size * smp_num_possible_cpus() bytes consecutively and use it for the first chunk. However, if the static area is small, this can result in excessive prellocated free space in the first chunk due to PCPU_MIN_UNIT_SIZE restriction. This patch makes embedding percpu allocator preallocate only what's necessary as described by PERPCU_DYNAMIC_RESERVE and return the leftover to the bootmem allocator. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 44 +++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ef3a2cd3fe64..38e2b2a470a5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -241,24 +241,31 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * Embedding allocator * * The first chunk is sized to just contain the static area plus - * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using - * bootmem allocator and used as-is without being mapped into vmalloc - * area. This enables the first chunk to piggy back on the linear - * physical PMD mapping and doesn't add any additional pressure to - * TLB. + * module and dynamic reserves, and allocated as a contiguous area + * using bootmem allocator and used as-is without being mapped into + * vmalloc area. This enables the first chunk to piggy back on the + * linear physical PMD mapping and doesn't add any additional pressure + * to TLB. Note that if the needed size is smaller than the minimum + * unit size, the leftover is returned to the bootmem allocator. */ static void *pcpue_ptr __initdata; +static size_t pcpue_size __initdata; static size_t pcpue_unit_size __initdata; static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) { - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size - + ((size_t)pageno << PAGE_SHIFT)); + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpue_size) + return NULL; + + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); } static ssize_t __init setup_pcpu_embed(size_t static_size) { unsigned int cpu; + size_t dyn_size; /* * If large page isn't supported, there's no benefit in doing @@ -269,25 +276,30 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) return -EINVAL; /* allocate and copy */ - pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); - pcpue_unit_size = max_t(size_t, pcpue_unit_size, PCPU_MIN_UNIT_SIZE); + pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); + dyn_size = pcpue_size - static_size; + pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, PAGE_SIZE); if (!pcpue_ptr) return -ENOMEM; - for_each_possible_cpu(cpu) - memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load, - static_size); + for_each_possible_cpu(cpu) { + void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } /* we're ready, commit */ pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size); + pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); return pcpu_setup_first_chunk(pcpue_get_page, static_size, - pcpue_unit_size, - pcpue_unit_size - static_size, pcpue_ptr, - NULL); + pcpue_unit_size, dyn_size, + pcpue_ptr, NULL); } /* From 3e24aa58907c62bc79d1094e941a374568f62522 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: [PATCH 249/580] percpu: add an indirection ptr for chunk page map access Impact: allow sharing page map, no functional difference yet Make chunk->page access indirect by adding a pointer and renaming the actual array to page_ar. This will be used by future changes. Signed-off-by: Tejun Heo --- mm/percpu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/percpu.c b/mm/percpu.c index a84cf9977faf..5b47d9fe65f5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -80,7 +80,8 @@ struct pcpu_chunk { int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ bool immutable; /* no [de]population allowed */ - struct page *page[]; /* #cpus * UNIT_PAGES */ + struct page **page; /* points to page array */ + struct page *page_ar[]; /* #cpus * UNIT_PAGES */ }; static int pcpu_unit_pages __read_mostly; @@ -696,6 +697,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; + chunk->page = chunk->page_ar; chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); if (!chunk->vm) { @@ -918,6 +920,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); + schunk->page = schunk->page_ar; schunk->free_size = dyn_size; schunk->contig_hint = schunk->free_size; From edcb463997ed7b2ffa3bac76e3e75957318f2e01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: [PATCH 250/580] percpu, module: implement reserved allocation and use it for module percpu variables Impact: add reserved allocation functionality and use it for module percpu variables This patch implements reserved allocation from the first chunk. When setting up the first chunk, arch can ask to set aside certain number of bytes right after the core static area which is available only through a separate reserved allocator. This will be used primarily for module static percpu variables on architectures with limited relocation range to ensure that the module perpcu symbols are inside the relocatable range. If reserved area is requested, the first chunk becomes reserved and isn't available for regular allocation. If the first chunk also includes piggy-back dynamic allocation area, a separate chunk mapping the same region is created to serve dynamic allocation. The first one is called static first chunk and the second dynamic first chunk. Although they share the page map, their different area map initializations guarantee they serve disjoint areas according to their purposes. If arch doesn't setup reserved area, reserved allocation is handled like any other allocation. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 8 +- include/linux/percpu.h | 10 ++- kernel/module.c | 2 +- mm/percpu.c | 153 ++++++++++++++++++++++++++++----- 4 files changed, 144 insertions(+), 29 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 38e2b2a470a5..dd4eabc747c8 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -217,7 +217,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE, pcpur_size - static_size, vm.addr, NULL); goto out_free_ar; @@ -297,7 +297,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, + return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0, pcpue_unit_size, dyn_size, pcpue_ptr, NULL); } @@ -356,8 +356,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL, - pcpu4k_populate_pte); + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1, + NULL, pcpu4k_populate_pte); goto out_free_ar; enomem: diff --git a/include/linux/percpu.h b/include/linux/percpu.h index a96fc53bbd62..8ff15153ae20 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -117,10 +117,10 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, - ssize_t unit_size, ssize_t dyn_size, - void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn); + size_t static_size, size_t reserved_size, + ssize_t unit_size, ssize_t dyn_size, + void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn); /* * Use this to get to a cpu's version of the per-cpu object @@ -129,6 +129,8 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, */ #define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) +extern void *__alloc_reserved_percpu(size_t size, size_t align); + #else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ struct percpu_data { diff --git a/kernel/module.c b/kernel/module.c index 1f0657ae555b..f0e04d6b67d8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -381,7 +381,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, align = PAGE_SIZE; } - ptr = __alloc_percpu(size, align); + ptr = __alloc_reserved_percpu(size, align); if (!ptr) printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", size); diff --git a/mm/percpu.c b/mm/percpu.c index 5b47d9fe65f5..ef8e169b7731 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -94,6 +94,11 @@ static size_t pcpu_chunk_struct_size __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); +/* optional reserved chunk, only accessible for reserved allocations */ +static struct pcpu_chunk *pcpu_reserved_chunk; +/* offset limit of the reserved chunk */ +static int pcpu_reserved_chunk_limit; + /* * One mutex to rule them all. * @@ -201,13 +206,14 @@ static void *pcpu_realloc(void *p, size_t size, size_t new_size) * * This function is called after an allocation or free changed @chunk. * New slot according to the changed state is determined and @chunk is - * moved to the slot. + * moved to the slot. Note that the reserved chunk is never put on + * chunk slots. */ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); - if (oslot != nslot) { + if (chunk != pcpu_reserved_chunk && oslot != nslot) { if (oslot < nslot) list_move(&chunk->list, &pcpu_slot[nslot]); else @@ -255,6 +261,15 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) struct rb_node *n, *parent; struct pcpu_chunk *chunk; + /* is it in the reserved chunk? */ + if (pcpu_reserved_chunk) { + void *start = pcpu_reserved_chunk->vm->addr; + + if (addr >= start && addr < start + pcpu_reserved_chunk_limit) + return pcpu_reserved_chunk; + } + + /* nah... search the regular ones */ n = *pcpu_chunk_rb_search(addr, &parent); if (!n) { /* no exactly matching chunk, the parent is the closest */ @@ -713,9 +728,10 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) } /** - * __alloc_percpu - allocate percpu area + * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) + * @reserved: allocate from the reserved chunk if available * * Allocate percpu area of @size bytes aligned at @align. Might * sleep. Might trigger writeouts. @@ -723,7 +739,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -void *__alloc_percpu(size_t size, size_t align) +static void *pcpu_alloc(size_t size, size_t align, bool reserved) { void *ptr = NULL; struct pcpu_chunk *chunk; @@ -737,7 +753,18 @@ void *__alloc_percpu(size_t size, size_t align) mutex_lock(&pcpu_mutex); - /* allocate area */ + /* serve reserved allocations from the reserved chunk if available */ + if (reserved && pcpu_reserved_chunk) { + chunk = pcpu_reserved_chunk; + if (size > chunk->contig_hint) + goto out_unlock; + off = pcpu_alloc_area(chunk, size, align); + if (off >= 0) + goto area_found; + goto out_unlock; + } + + /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry(chunk, &pcpu_slot[slot], list) { if (size > chunk->contig_hint) @@ -773,8 +800,41 @@ void *__alloc_percpu(size_t size, size_t align) mutex_unlock(&pcpu_mutex); return ptr; } + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align. Might + * sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void *__alloc_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, false); +} EXPORT_SYMBOL_GPL(__alloc_percpu); +/** + * __alloc_reserved_percpu - allocate reserved percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align from reserved + * percpu area if arch has set it up; otherwise, allocation is served + * from the same dynamic area. Might sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void *__alloc_reserved_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, true); +} + static void pcpu_kill_chunk(struct pcpu_chunk *chunk) { WARN_ON(chunk->immutable); @@ -826,6 +886,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @base_addr: mapped address, NULL for auto @@ -844,14 +905,22 @@ EXPORT_SYMBOL_GPL(free_percpu); * indicates end of pages for the cpu. Note that @get_page_fn() must * return the same number of pages for all cpus. * + * @reserved_size, if non-zero, specifies the amount of bytes to + * reserve after the static area in the first chunk. This reserves + * the first chunk such that it's available only through reserved + * percpu allocation. This is primarily used to serve module percpu + * static areas on architectures where the addressing model has + * limited offset range for symbol relocations to guarantee module + * percpu symbols fall inside the relocatable range. + * * @unit_size, if non-negative, specifies unit size and must be * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @dyn_size. + * @reserved_size + @dyn_size. * * @dyn_size, if non-negative, limits the number of bytes available * for dynamic allocation in the first chunk. Specifying non-negative * value make percpu leave alone the area beyond @static_size + - * @dyn_size. + * @reserved_size + @dyn_size. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. percpu must not mess @@ -861,28 +930,36 @@ EXPORT_SYMBOL_GPL(free_percpu); * @populate_pte_fn is used to populate the pagetable. NULL means the * caller already populated the pagetable. * + * If the first chunk ends up with both reserved and dynamic areas, it + * is served by two chunks - one to serve the core static and reserved + * areas and the other for the dynamic area. They share the same vm + * and page map but uses different area allocation map to stay away + * from each other. The latter chunk is circulated in the chunk slots + * and available for dynamic allocation like any other chunks. + * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access. */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, + size_t static_size, size_t reserved_size, ssize_t unit_size, ssize_t dyn_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; - static int smap[2]; - struct pcpu_chunk *schunk; + static int smap[2], dmap[2]; + struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu; int nr_pages; int err, i; /* santiy checks */ - BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC); + BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || + ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); if (unit_size >= 0) { - BUG_ON(unit_size < static_size + + BUG_ON(unit_size < static_size + reserved_size + (dyn_size >= 0 ? dyn_size : 0)); BUG_ON(unit_size & ~PAGE_MASK); } else { @@ -895,7 +972,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_unit_pages = unit_size >> PAGE_SHIFT; else pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(static_size)); + PFN_UP(static_size + reserved_size)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; @@ -903,7 +980,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); if (dyn_size < 0) - dyn_size = pcpu_unit_size - static_size; + dyn_size = pcpu_unit_size - static_size - reserved_size; /* * Allocate chunk slots. The additional last slot is for @@ -914,20 +991,49 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); - /* init static chunk */ + /* + * Initialize static chunk. If reserved_size is zero, the + * static chunk covers static area + dynamic allocation area + * in the first chunk. If reserved_size is not zero, it + * covers static area + reserved area (mostly used for module + * static percpu allocation). + */ schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->page = schunk->page_ar; - schunk->free_size = dyn_size; + + if (reserved_size) { + schunk->free_size = reserved_size; + pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ + } else { + schunk->free_size = dyn_size; + dyn_size = 0; /* dynamic area covered */ + } schunk->contig_hint = schunk->free_size; schunk->map[schunk->map_used++] = -static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; + pcpu_reserved_chunk_limit = static_size + schunk->free_size; + + /* init dynamic chunk if necessary */ + if (dyn_size) { + dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); + INIT_LIST_HEAD(&dchunk->list); + dchunk->vm = &first_vm; + dchunk->map = dmap; + dchunk->map_alloc = ARRAY_SIZE(dmap); + dchunk->page = schunk->page_ar; /* share page map with schunk */ + + dchunk->contig_hint = dchunk->free_size = dyn_size; + dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; + dchunk->map[dchunk->map_used++] = dchunk->free_size; + } + /* allocate vm address */ first_vm.flags = VM_ALLOC; first_vm.size = pcpu_chunk_size; @@ -937,12 +1043,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, else { /* * Pages already mapped. No need to remap into - * vmalloc area. In this case the static chunk can't - * be mapped or unmapped by percpu and is marked + * vmalloc area. In this case the first chunks can't + * be mapped or unmapped by percpu and are marked * immutable. */ first_vm.addr = base_addr; schunk->immutable = true; + if (dchunk) + dchunk->immutable = true; } /* assign pages */ @@ -978,8 +1086,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, } /* link the first chunk in */ - pcpu_chunk_relocate(schunk, -1); - pcpu_chunk_addr_insert(schunk); + if (!dchunk) { + pcpu_chunk_relocate(schunk, -1); + pcpu_chunk_addr_insert(schunk); + } else { + pcpu_chunk_relocate(dchunk, -1); + pcpu_chunk_addr_insert(dchunk); + } /* we're done */ pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); From 6b19b0c2400437a3c10059ede0e59b517092e1bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: [PATCH 251/580] x86, percpu: setup reserved percpu area for x86_64 Impact: fix relocation overflow during module load x86_64 uses 32bit relocations for symbol access and static percpu symbols whether in core or modules must be inside 2GB of the percpu segement base which the dynamic percpu allocator doesn't guarantee. This patch makes x86_64 reserve PERCPU_MODULE_RESERVE bytes in the first chunk so that module percpu areas are always allocated from the first chunk which is always inside the relocatable range. This problem exists for any percpu allocator but is easily triggered when using the embedding allocator because the second chunk is located beyond 2GB on it. This patch also changes the meaning of PERCPU_DYNAMIC_RESERVE such that it only indicates the size of the area to reserve for dynamic allocation as static and dynamic areas can be separate. New PERCPU_DYNAMIC_RESERVED is increased by 4k for both 32 and 64bits as the reserved area separation eats away some allocatable space and having slightly more headroom (currently between 4 and 8k after minimal boot sans module area) makes sense for common case performance. x86_32 can address anywhere from anywhere and doesn't need reserving. Mike Galbraith first reported the problem first and bisected it to the embedding percpu allocator commit. Signed-off-by: Tejun Heo Reported-by: Mike Galbraith Reported-by: Jaswinder Singh Rajput --- arch/x86/kernel/setup_percpu.c | 37 +++++++++++++++++++++++++--------- include/linux/percpu.h | 35 +++++++++++--------------------- 2 files changed, 40 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index dd4eabc747c8..efa615f2bf43 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -42,6 +42,19 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { }; EXPORT_SYMBOL(__per_cpu_offset); +/* + * On x86_64 symbols referenced from code should be reachable using + * 32bit relocations. Reserve space for static percpu variables in + * modules so that they are always served from the first chunk which + * is located at the percpu segment base. On x86_32, anything can + * address anywhere. No need to reserve space in the first chunk. + */ +#ifdef CONFIG_X86_64 +#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE +#else +#define PERCPU_FIRST_CHUNK_RESERVE 0 +#endif + /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -141,7 +154,7 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) { static struct vm_struct vm; pg_data_t *last; - size_t ptrs_size; + size_t ptrs_size, dyn_size; unsigned int cpu; ssize_t ret; @@ -169,12 +182,14 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * Currently supports only single page. Supporting multiple * pages won't be too difficult if it ever becomes necessary. */ - pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); if (pcpur_size > PMD_SIZE) { pr_warning("PERCPU: static data is larger than large page, " "can't use large page\n"); return -EINVAL; } + dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; /* allocate pointer array and alloc large pages */ ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); @@ -217,8 +232,9 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE, - pcpur_size - static_size, vm.addr, NULL); + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, + PMD_SIZE, dyn_size, vm.addr, NULL); goto out_free_ar; enomem: @@ -276,9 +292,10 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) return -EINVAL; /* allocate and copy */ - pcpue_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE); + pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - dyn_size = pcpue_size - static_size; + dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, PAGE_SIZE); @@ -297,7 +314,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0, + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, pcpue_unit_size, dyn_size, pcpue_ptr, NULL); } @@ -356,8 +374,9 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1, - NULL, pcpu4k_populate_pte); + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, + PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL, + pcpu4k_populate_pte); goto out_free_ar; enomem: diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 8ff15153ae20..54a968b4b924 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -85,31 +85,20 @@ /* * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy - * back on the first chunk if arch is manually allocating and mapping - * it for faster access (as a part of large page mapping for example). - * Note that dynamic percpu allocator covers both static and dynamic - * areas, so these values are bigger than PERCPU_MODULE_RESERVE. + * back on the first chunk for dynamic percpu allocation if arch is + * manually allocating and mapping it for faster access (as a part of + * large page mapping for example). * - * On typical configuration with modules, the following values leave - * about 8k of free space on the first chunk after boot on both x86_32 - * and 64 when module support is enabled. When module support is - * disabled, it's much tighter. + * The following values give between one and two pages of free space + * after typical minimal boot (2-way SMP, single disk and NIC) with + * both defconfig and a distro config on x86_64 and 32. More + * intelligent way to determine this would be nice. */ -#ifndef PERCPU_DYNAMIC_RESERVE -# if BITS_PER_LONG > 32 -# ifdef CONFIG_MODULES -# define PERCPU_DYNAMIC_RESERVE (24 << 10) -# else -# define PERCPU_DYNAMIC_RESERVE (16 << 10) -# endif -# else -# ifdef CONFIG_MODULES -# define PERCPU_DYNAMIC_RESERVE (16 << 10) -# else -# define PERCPU_DYNAMIC_RESERVE (8 << 10) -# endif -# endif -#endif /* PERCPU_DYNAMIC_RESERVE */ +#if BITS_PER_LONG > 32 +#define PERCPU_DYNAMIC_RESERVE (20 << 10) +#else +#define PERCPU_DYNAMIC_RESERVE (12 << 10) +#endif extern void *pcpu_base_addr; From 59247eaea50cc68cc6ce3d3fd3855f3301b65c96 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Mar 2009 08:55:24 +0100 Subject: [PATCH 252/580] block: fix missing bio back/front segment size setting in blk_recount_segments() Commit 1e42807918d17e8c93bf14fbb74be84b141334c1 introduced a bug where we don't get front/back segment sizes in the bio in blk_recount_segments(). Fix this by tracking the back bio as well as the front bio in __blk_recalc_rq_segments(), this also cleans up the interface by getting rid of the segment size pointer passing. Tested-by: Thomas Gleixner Tested-by: Ingo Molnar Signed-off-by: Jens Axboe --- block/blk-merge.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index a104593e70c3..5a244f05360f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -39,14 +39,13 @@ void blk_recalc_rq_sectors(struct request *rq, int nsect) } static unsigned int __blk_recalc_rq_segments(struct request_queue *q, - struct bio *bio, - unsigned int *seg_size_ptr) + struct bio *bio) { unsigned int phys_size; struct bio_vec *bv, *bvprv = NULL; int cluster, i, high, highprv = 1; unsigned int seg_size, nr_phys_segs; - struct bio *fbio; + struct bio *fbio, *bbio; if (!bio) return 0; @@ -87,26 +86,20 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, seg_size = bv->bv_len; highprv = high; } + bbio = bio; } - if (seg_size_ptr) - *seg_size_ptr = seg_size; + if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size) + fbio->bi_seg_front_size = seg_size; + if (seg_size > bbio->bi_seg_back_size) + bbio->bi_seg_back_size = seg_size; return nr_phys_segs; } void blk_recalc_rq_segments(struct request *rq) { - unsigned int seg_size = 0, phys_segs; - - phys_segs = __blk_recalc_rq_segments(rq->q, rq->bio, &seg_size); - - if (phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size) - rq->bio->bi_seg_front_size = seg_size; - if (seg_size > rq->biotail->bi_seg_back_size) - rq->biotail->bi_seg_back_size = seg_size; - - rq->nr_phys_segments = phys_segs; + rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio); } void blk_recount_segments(struct request_queue *q, struct bio *bio) @@ -114,7 +107,7 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio) struct bio *nxt = bio->bi_next; bio->bi_next = NULL; - bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, NULL); + bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio); bio->bi_next = nxt; bio->bi_flags |= (1 << BIO_SEG_VALID); } From 14b97595e0e1f47b6f809e180e5bcd8dcd995690 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 6 Mar 2009 09:42:07 +0100 Subject: [PATCH 253/580] ALSA: hda - Fix typos in slave controls in patch_sigmatel.c "Headphone Playback ..." appears twice in slave_vols[] and slave_sws[]. They should be "Headphone Playback2 ..." Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_sigmatel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index 3bc427645da8..995b413078f5 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -1207,7 +1207,7 @@ static const char *slave_vols[] = { "LFE Playback Volume", "Side Playback Volume", "Headphone Playback Volume", - "Headphone Playback Volume", + "Headphone2 Playback Volume", "Speaker Playback Volume", "External Speaker Playback Volume", "Speaker2 Playback Volume", @@ -1221,7 +1221,7 @@ static const char *slave_sws[] = { "LFE Playback Switch", "Side Playback Switch", "Headphone Playback Switch", - "Headphone Playback Switch", + "Headphone2 Playback Switch", "Speaker Playback Switch", "External Speaker Playback Switch", "Speaker2 Playback Switch", From c50ff7c04225c945b13d410d50fde6ff6c59d7ee Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 6 Mar 2009 09:43:58 +0100 Subject: [PATCH 254/580] ALSA: hda - Fix headphone-detect regression with multiple HP jacks The recent changes over the DAC detection mechanism in patch_sigmatel.c breaks the HP detection on the machines with multiple HP jacks. It's basically because of the workaround to support the multi-channel output. Since the HP detection is more important feature, disable the HP-swap workaroud temporarily. Reference: Novell bnc#482052 https://bugzilla.novell.com/show_bug.cgi?id=482052 Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_sigmatel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index 995b413078f5..6094344fb223 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -3516,6 +3516,7 @@ static int stac92xx_parse_auto_config(struct hda_codec *codec, hda_nid_t dig_out if (! spec->autocfg.line_outs) return 0; /* can't find valid pin config */ +#if 0 /* FIXME: temporarily disabled */ /* If we have no real line-out pin and multiple hp-outs, HPs should * be set up as multi-channel outputs. */ @@ -3535,6 +3536,7 @@ static int stac92xx_parse_auto_config(struct hda_codec *codec, hda_nid_t dig_out spec->autocfg.line_out_type = AUTO_PIN_HP_OUT; spec->autocfg.hp_outs = 0; } +#endif /* FIXME: temporarily disabled */ if (spec->autocfg.mono_out_pin) { int dir = get_wcaps(codec, spec->autocfg.mono_out_pin) & (AC_WCAP_OUT_AMP | AC_WCAP_IN_AMP); From d1a8e7792047f7dca7eb5759250e2c12800bf262 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 6 Mar 2009 03:12:50 -0800 Subject: [PATCH 255/580] x86: make "memtest" like "memtest=17" Impact: make boot command line "memtest" do one loop by default So don't need to guess many patterns in one loop. Signed-off-by: Yinghai Lu LKML-Reference: <49B10532.3020105@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 0bcd7883d036..605c8be06217 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg) { if (arg) memtest_pattern = simple_strtoul(arg, NULL, 0); + else + memtest_pattern = ARRAY_SIZE(patterns); + return 0; } From c77a3b59c624454c501cbfa1a3611d5a00bf9532 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 17:04:26 +0200 Subject: [PATCH 256/580] x86: fix uninitialized variable in init_memory_mapping() Signed-off-by: Pekka Enberg LKML-Reference: <1236265466.31324.9.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6d63e3d1253d..15219e0d1243 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -134,8 +134,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, { unsigned long page_size_mask = 0; unsigned long start_pfn, end_pfn; + unsigned long ret = 0; unsigned long pos; - unsigned long ret; struct map_range mr[NR_RANGE_MR]; int nr_range, i; From 5dd61dfabcaa5bfb67afb8a2d255bd1e156562e3 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 17:04:57 +0200 Subject: [PATCH 257/580] x86: rename do_not_nx to disable_nx in mm/init_64.c As a preparational step for unifying noexec handling on 32-bit and 64-bit, rename the do_not_nx variable to disable_nx on 64-bit. Signed-off-by: Pekka Enberg LKML-Reference: <1236265497.31324.11.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8a853bc3b287..54efa57d1c03 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -85,7 +85,7 @@ early_param("gbpages", parse_direct_gbpages_on); pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; EXPORT_SYMBOL_GPL(__supported_pte_mask); -static int do_not_nx __cpuinitdata; +static int disable_nx __cpuinitdata; /* * noexec=on|off @@ -100,9 +100,9 @@ static int __init nonx_setup(char *str) return -EINVAL; if (!strncmp(str, "on", 2)) { __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; + disable_nx = 0; } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; + disable_nx = 1; __supported_pte_mask &= ~_PAGE_NX; } return 0; @@ -114,7 +114,7 @@ void __cpuinit check_efer(void) unsigned long efer; rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || do_not_nx) + if (!(efer & EFER_NX) || disable_nx) __supported_pte_mask &= ~_PAGE_NX; } From 9ca0791dcaa666e8e8f4b4ca028b65b4bde9cb28 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 5 Mar 2009 08:49:54 +0100 Subject: [PATCH 258/580] x86, bts: remove bad warning In case a ptraced task is reaped (while the tracer is still attached), ds_exit_thread() is called before ptrace_exit(). The latter will release the bts_tracer and remove the thread's ds_ctx. The former will WARN() if the context is not NULL. Oleg Nesterov submitted patches that move ptrace_exit() before exit_thread() and thus reverse the order of the above calls. Remove the bad warning. I will add it again when Oleg's changes are in. Signed-off-by: Markus Metzger LKML-Reference: <20090305084954.A22000@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 169a120587be..de7cdbda1c36 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1029,5 +1029,4 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) void ds_exit_thread(struct task_struct *tsk) { - WARN_ON(tsk->thread.ds_ctx); } From 73bf1b62f561fc8ecb00e2810efe4fe769f4933e Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Thu, 5 Mar 2009 08:57:21 +0100 Subject: [PATCH 259/580] x86, pebs: correct qualifier passed to ds_write_config() from ds_request_pebs() ds_write_config() can write the BTS as well as the PEBS part of the DS config. ds_request_pebs() passes the wrong qualifier, which results in the wrong configuration to be written. Reported-by: Stephane Eranian Signed-off-by: Markus Metzger LKML-Reference: <20090305085721.A22550@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index de7cdbda1c36..87b67e3a765a 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -729,7 +729,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, spin_unlock_irqrestore(&ds_lock, irq); - ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); ds_resume_pebs(tracer); return tracer; From 1880d93b80acc3171850e9df5048bcb26b75c2f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2009 00:44:09 +0900 Subject: [PATCH 260/580] percpu: replace pcpu_realloc() with pcpu_mem_alloc() and pcpu_mem_free() Impact: code reorganization for later changes With static map handling moved to pcpu_split_block(), pcpu_realloc() only clutters the code and it's also unsuitable for scheduled locking changes. Implement and use pcpu_mem_alloc/free() instead. Signed-off-by: Tejun Heo --- mm/percpu.c | 87 ++++++++++++++++++++++++++--------------------------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index ef8e169b7731..f1d0e905850c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -164,39 +164,41 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, } /** - * pcpu_realloc - versatile realloc - * @p: the current pointer (can be NULL for new allocations) - * @size: the current size in bytes (can be 0 for new allocations) - * @new_size: the wanted new size in bytes (can be 0 for free) + * pcpu_mem_alloc - allocate memory + * @size: bytes to allocate * - * More robust realloc which can be used to allocate, resize or free a - * memory area of arbitrary size. If the needed size goes over - * PAGE_SIZE, kernel VM is used. + * Allocate @size bytes. If @size is smaller than PAGE_SIZE, + * kzalloc() is used; otherwise, vmalloc() is used. The returned + * memory is always zeroed. * * RETURNS: - * The new pointer on success, NULL on failure. + * Pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_realloc(void *p, size_t size, size_t new_size) +static void *pcpu_mem_alloc(size_t size) { - void *new; - - if (new_size <= PAGE_SIZE) - new = kmalloc(new_size, GFP_KERNEL); - else - new = vmalloc(new_size); - if (new_size && !new) - return NULL; - - memcpy(new, p, min(size, new_size)); - if (new_size > size) - memset(new + size, 0, new_size - size); - if (size <= PAGE_SIZE) - kfree(p); - else - vfree(p); + return kzalloc(size, GFP_KERNEL); + else { + void *ptr = vmalloc(size); + if (ptr) + memset(ptr, 0, size); + return ptr; + } +} - return new; +/** + * pcpu_mem_free - free memory + * @ptr: memory to free + * @size: size of the area + * + * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc(). + */ +static void pcpu_mem_free(void *ptr, size_t size) +{ + if (size <= PAGE_SIZE) + kfree(ptr); + else + vfree(ptr); } /** @@ -331,29 +333,27 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) if (chunk->map_alloc < target) { int new_alloc; int *new; + size_t size; new_alloc = PCPU_DFL_MAP_ALLOC; while (new_alloc < target) new_alloc *= 2; - if (chunk->map_alloc < PCPU_DFL_MAP_ALLOC) { - /* - * map_alloc smaller than the default size - * indicates that the chunk is one of the - * first chunks and still using static map. - * Allocate a dynamic one and copy. - */ - new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0])); - if (new) - memcpy(new, chunk->map, - chunk->map_alloc * sizeof(new[0])); - } else - new = pcpu_realloc(chunk->map, - chunk->map_alloc * sizeof(new[0]), - new_alloc * sizeof(new[0])); + new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); if (!new) return -ENOMEM; + size = chunk->map_alloc * sizeof(chunk->map[0]); + memcpy(new, chunk->map, size); + + /* + * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the + * chunk is one of the first chunks and still using + * static map. + */ + if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) + pcpu_mem_free(chunk->map, size); + chunk->map_alloc = new_alloc; chunk->map = new; } @@ -696,7 +696,7 @@ static void free_pcpu_chunk(struct pcpu_chunk *chunk) return; if (chunk->vm) free_vm_area(chunk->vm); - pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0); + pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); kfree(chunk); } @@ -708,8 +708,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) if (!chunk) return NULL; - chunk->map = pcpu_realloc(NULL, 0, - PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; chunk->page = chunk->page_ar; From 9f7dcf224bd09ec9ebcbfb383bf2c465e0e0b03d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2009 00:44:09 +0900 Subject: [PATCH 261/580] percpu: move chunk area map extension out of area allocation Impact: code reorganization for later changes Separate out chunk area map extension into a separate function - pcpu_extend_area_map() - and call it directly from pcpu_alloc() such that pcpu_alloc_area() is guaranteed to have enough area map slots on invocation. With this change, pcpu_alloc_area() does only area allocation and the only failure mode is when the chunk doens't have enough room, so there's no need to distinguish it from memory allocation failures. Make it return -1 on such cases instead of hacky -ENOSPC. Signed-off-by: Tejun Heo --- mm/percpu.c | 108 +++++++++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 48 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index f1d0e905850c..7d9bc35e8ed2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -306,6 +306,50 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) rb_insert_color(&new->rb_node, &pcpu_addr_root); } +/** + * pcpu_extend_area_map - extend area map for allocation + * @chunk: target chunk + * + * Extend area map of @chunk so that it can accomodate an allocation. + * A single allocation can split an area into three areas, so this + * function makes sure that @chunk->map has at least two extra slots. + * + * RETURNS: + * 0 if noop, 1 if successfully extended, -errno on failure. + */ +static int pcpu_extend_area_map(struct pcpu_chunk *chunk) +{ + int new_alloc; + int *new; + size_t size; + + /* has enough? */ + if (chunk->map_alloc >= chunk->map_used + 2) + return 0; + + new_alloc = PCPU_DFL_MAP_ALLOC; + while (new_alloc < chunk->map_used + 2) + new_alloc *= 2; + + new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); + if (!new) + return -ENOMEM; + + size = chunk->map_alloc * sizeof(chunk->map[0]); + memcpy(new, chunk->map, size); + + /* + * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is + * one of the first chunks and still using static map. + */ + if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) + pcpu_mem_free(chunk->map, size); + + chunk->map_alloc = new_alloc; + chunk->map = new; + return 0; +} + /** * pcpu_split_block - split a map block * @chunk: chunk of interest @@ -321,44 +365,16 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) * depending on @head, is reduced by @tail bytes and @tail byte block * is inserted after the target block. * - * RETURNS: - * 0 on success, -errno on failure. + * @chunk->map must have enough free slots to accomodate the split. */ -static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) +static void pcpu_split_block(struct pcpu_chunk *chunk, int i, + int head, int tail) { int nr_extra = !!head + !!tail; - int target = chunk->map_used + nr_extra; - /* reallocation required? */ - if (chunk->map_alloc < target) { - int new_alloc; - int *new; - size_t size; + BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra); - new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < target) - new_alloc *= 2; - - new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); - if (!new) - return -ENOMEM; - - size = chunk->map_alloc * sizeof(chunk->map[0]); - memcpy(new, chunk->map, size); - - /* - * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the - * chunk is one of the first chunks and still using - * static map. - */ - if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) - pcpu_mem_free(chunk->map, size); - - chunk->map_alloc = new_alloc; - chunk->map = new; - } - - /* insert a new subblock */ + /* insert new subblocks */ memmove(&chunk->map[i + nr_extra], &chunk->map[i], sizeof(chunk->map[0]) * (chunk->map_used - i)); chunk->map_used += nr_extra; @@ -371,7 +387,6 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) chunk->map[i++] -= tail; chunk->map[i] = tail; } - return 0; } /** @@ -384,8 +399,11 @@ static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) * Note that this function only allocates the offset. It doesn't * populate or map the area. * + * @chunk->map must have at least two free slots. + * * RETURNS: - * Allocated offset in @chunk on success, -errno on failure. + * Allocated offset in @chunk on success, -1 if no matching area is + * found. */ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) { @@ -433,8 +451,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) /* split if warranted */ if (head || tail) { - if (pcpu_split_block(chunk, i, head, tail)) - return -ENOMEM; + pcpu_split_block(chunk, i, head, tail); if (head) { i++; off += head; @@ -461,14 +478,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) chunk->contig_hint = max_contig; /* fully scanned */ pcpu_chunk_relocate(chunk, oslot); - /* - * Tell the upper layer that this chunk has no area left. - * Note that this is not an error condition but a notification - * to upper layer that it needs to look at other chunks. - * -ENOSPC is chosen as it isn't used in memory subsystem and - * matches the meaning in a way. - */ - return -ENOSPC; + /* tell the upper layer that this chunk has no matching area */ + return -1; } /** @@ -755,7 +766,8 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; - if (size > chunk->contig_hint) + if (size > chunk->contig_hint || + pcpu_extend_area_map(chunk) < 0) goto out_unlock; off = pcpu_alloc_area(chunk, size, align); if (off >= 0) @@ -768,11 +780,11 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) list_for_each_entry(chunk, &pcpu_slot[slot], list) { if (size > chunk->contig_hint) continue; + if (pcpu_extend_area_map(chunk) < 0) + goto out_unlock; off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; - if (off != -ENOSPC) - goto out_unlock; } } From a56dbddf06b653ef9c04ca3767f260fd31ccebab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2009 00:44:11 +0900 Subject: [PATCH 262/580] percpu: move fully free chunk reclamation into a work Impact: code reorganization for later changes Do fully free chunk reclamation using a work. This change is to prepare for locking changes. Signed-off-by: Tejun Heo --- mm/percpu.c | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 7d9bc35e8ed2..4c8a419119da 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -118,6 +119,10 @@ static DEFINE_MUTEX(pcpu_mutex); static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ +/* reclaim work to release fully free chunks, scheduled from free path */ +static void pcpu_reclaim(struct work_struct *work); +static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); + static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ @@ -846,13 +851,37 @@ void *__alloc_reserved_percpu(size_t size, size_t align) return pcpu_alloc(size, align, true); } -static void pcpu_kill_chunk(struct pcpu_chunk *chunk) +/** + * pcpu_reclaim - reclaim fully free chunks, workqueue function + * @work: unused + * + * Reclaim all fully free chunks except for the first one. + */ +static void pcpu_reclaim(struct work_struct *work) { - WARN_ON(chunk->immutable); - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); - list_del(&chunk->list); - rb_erase(&chunk->rb_node, &pcpu_addr_root); - free_pcpu_chunk(chunk); + LIST_HEAD(todo); + struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; + struct pcpu_chunk *chunk, *next; + + mutex_lock(&pcpu_mutex); + + list_for_each_entry_safe(chunk, next, head, list) { + WARN_ON(chunk->immutable); + + /* spare the first one */ + if (chunk == list_first_entry(head, struct pcpu_chunk, list)) + continue; + + rb_erase(&chunk->rb_node, &pcpu_addr_root); + list_move(&chunk->list, &todo); + } + + mutex_unlock(&pcpu_mutex); + + list_for_each_entry_safe(chunk, next, &todo, list) { + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + free_pcpu_chunk(chunk); + } } /** @@ -877,14 +906,13 @@ void free_percpu(void *ptr) pcpu_free_area(chunk, off); - /* the chunk became fully free, kill one if there are other free ones */ + /* if there are more than one fully free chunks, wake up grim reaper */ if (chunk->free_size == pcpu_unit_size) { struct pcpu_chunk *pos; - list_for_each_entry(pos, - &pcpu_slot[pcpu_chunk_slot(chunk)], list) + list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - pcpu_kill_chunk(pos); + schedule_work(&pcpu_reclaim_work); break; } } From 7ab152470e8416ef2a44c800fdc157e2192f2974 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 6 Mar 2009 19:08:34 +0300 Subject: [PATCH 263/580] x86: linkage.h - guard assembler specifics by __ASSEMBLY__ Stephen Rothwell reported: |Today's linux-next build (x86_64 allmodconfig) produced this warning: | |In file included from drivers/char/epca.c:49: |drivers/char/digiFep1.h:7:1: warning: "GLOBAL" redefined |In file included from include/linux/linkage.h:5, | from include/linux/kernel.h:11, | from arch/x86/include/asm/system.h:10, | from arch/x86/include/asm/processor.h:17, | from include/linux/prefetch.h:14, | from include/linux/list.h:6, | from include/linux/module.h:9, | from drivers/char/epca.c:29: |arch/x86/include/asm/linkage.h:55:1: warning: this is the location of the previous definition | |Probably introduced by commit 95695547a7db44b88a7ee36cf5df188de267e99e |("x86: asm linkage - introduce GLOBAL macro") from the x86 tree. Any assembler specific snippets being placed in headers are to be protected by __ASSEMBLY__. Fixed. Also move __ALIGN definition under the same protection as well. Reported-by: Stephen Rothwell Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090306160833.GB7420@localhost> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/linkage.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 9320e2a8a26a..a0d70b46c27c 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -4,11 +4,6 @@ #undef notrace #define notrace __attribute__((no_instrument_function)) -#ifdef CONFIG_X86_64 -#define __ALIGN .p2align 4,,15 -#define __ALIGN_STR ".p2align 4,,15" -#endif - #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) /* @@ -50,16 +45,25 @@ __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \ "g" (arg4), "g" (arg5), "g" (arg6)) -#endif +#endif /* CONFIG_X86_32 */ + +#ifdef __ASSEMBLY__ #define GLOBAL(name) \ .globl name; \ name: +#ifdef CONFIG_X86_64 +#define __ALIGN .p2align 4,,15 +#define __ALIGN_STR ".p2align 4,,15" +#endif + #ifdef CONFIG_X86_ALIGNMENT_16 #define __ALIGN .align 16,0x90 #define __ALIGN_STR ".align 16,0x90" #endif +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_X86_LINKAGE_H */ From 20214fcd74bddc16d65d466212f5dd32aafe8868 Mon Sep 17 00:00:00 2001 From: Darius Augulis Date: Mon, 12 Jan 2009 16:22:57 +0200 Subject: [PATCH 264/580] MX1 fix include Includes missed irqs.h in devices.c and mx1ads.c. Signed-off-by: Darius Augulis Signed-off-by: Sascha Hauer --- arch/arm/mach-mx1/devices.c | 2 ++ arch/arm/mach-mx1/mx1ads.c | 1 + 2 files changed, 3 insertions(+) diff --git a/arch/arm/mach-mx1/devices.c b/arch/arm/mach-mx1/devices.c index 686d8d2dbb24..a95644193f3f 100644 --- a/arch/arm/mach-mx1/devices.c +++ b/arch/arm/mach-mx1/devices.c @@ -23,6 +23,8 @@ #include #include #include + +#include #include static struct resource imx_csi_resources[] = { diff --git a/arch/arm/mach-mx1/mx1ads.c b/arch/arm/mach-mx1/mx1ads.c index 2e4b185fe4a9..3200cf60e384 100644 --- a/arch/arm/mach-mx1/mx1ads.c +++ b/arch/arm/mach-mx1/mx1ads.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include From c63c58056e268b0d6bd6994b69030c144567990d Mon Sep 17 00:00:00 2001 From: Jeremy Higdon Date: Wed, 4 Mar 2009 12:09:46 -0800 Subject: [PATCH 265/580] [IA64] fix PCI DMA flag propagation on SN (Altix) with PICs We recently discovered a problem with passing of DMA attributes on SN systems with the older PIC chips. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jeremy Higdon Cc: Cc: Jesse Barnes Signed-off-by: Andrew Morton Signed-off-by: Tony Luck --- arch/ia64/sn/pci/pcibr/pcibr_dma.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/ia64/sn/pci/pcibr/pcibr_dma.c b/arch/ia64/sn/pci/pcibr/pcibr_dma.c index e626e50a938a..060df4aa9916 100644 --- a/arch/ia64/sn/pci/pcibr/pcibr_dma.c +++ b/arch/ia64/sn/pci/pcibr/pcibr_dma.c @@ -135,11 +135,10 @@ pcibr_dmatrans_direct64(struct pcidev_info * info, u64 paddr, if (SN_DMA_ADDRTYPE(dma_flags) == SN_DMA_ADDR_PHYS) pci_addr = IS_PIC_SOFT(pcibus_info) ? PHYS_TO_DMA(paddr) : - PHYS_TO_TIODMA(paddr) | dma_attributes; + PHYS_TO_TIODMA(paddr); else - pci_addr = IS_PIC_SOFT(pcibus_info) ? - paddr : - paddr | dma_attributes; + pci_addr = paddr; + pci_addr |= dma_attributes; /* Handle Bus mode */ if (IS_PCIX(pcibus_info)) From bd05f28e1a15ae62994fe309a524695fe26dd834 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 3 Mar 2009 22:55:21 +0100 Subject: [PATCH 266/580] cfg80211: test before subtraction on unsigned freq_diff is unsigned, so test before subtraction Signed-off-by: Roel Kluin Signed-off-by: John W. Linville --- net/wireless/reg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 85c9034c59b2..bd0a16c3de5e 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -380,7 +380,8 @@ static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule) freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz; - if (freq_diff <= 0 || freq_range->max_bandwidth_khz > freq_diff) + if (freq_range->end_freq_khz <= freq_range->start_freq_khz || + freq_range->max_bandwidth_khz > freq_diff) return false; return true; From c0350024723b4a69e38655816484d934aca8eb30 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Fri, 6 Mar 2009 00:53:59 +0100 Subject: [PATCH 267/580] p54: fix race condition in memory management This patch fixes a number of race conditions in the driver. Up until now, "entry" pointer was initialized before acquiring the right lock. Signed-off-by: Christian Lamparter Signed-off-by: John W. Linville --- drivers/net/wireless/p54/p54common.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/p54/p54common.c b/drivers/net/wireless/p54/p54common.c index 34561e6e816b..f170106bf0ae 100644 --- a/drivers/net/wireless/p54/p54common.c +++ b/drivers/net/wireless/p54/p54common.c @@ -710,10 +710,11 @@ static struct sk_buff *p54_find_tx_entry(struct ieee80211_hw *dev, __le32 req_id) { struct p54_common *priv = dev->priv; - struct sk_buff *entry = priv->tx_queue.next; + struct sk_buff *entry; unsigned long flags; spin_lock_irqsave(&priv->tx_queue.lock, flags); + entry = priv->tx_queue.next; while (entry != (struct sk_buff *)&priv->tx_queue) { struct p54_hdr *hdr = (struct p54_hdr *) entry->data; @@ -732,7 +733,7 @@ static void p54_rx_frame_sent(struct ieee80211_hw *dev, struct sk_buff *skb) struct p54_common *priv = dev->priv; struct p54_hdr *hdr = (struct p54_hdr *) skb->data; struct p54_frame_sent *payload = (struct p54_frame_sent *) hdr->data; - struct sk_buff *entry = (struct sk_buff *) priv->tx_queue.next; + struct sk_buff *entry; u32 addr = le32_to_cpu(hdr->req_id) - priv->headroom; struct memrecord *range = NULL; u32 freed = 0; @@ -741,6 +742,7 @@ static void p54_rx_frame_sent(struct ieee80211_hw *dev, struct sk_buff *skb) int count, idx; spin_lock_irqsave(&priv->tx_queue.lock, flags); + entry = (struct sk_buff *) priv->tx_queue.next; while (entry != (struct sk_buff *)&priv->tx_queue) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(entry); struct p54_hdr *entry_hdr; @@ -976,7 +978,7 @@ static int p54_assign_address(struct ieee80211_hw *dev, struct sk_buff *skb, struct p54_hdr *data, u32 len) { struct p54_common *priv = dev->priv; - struct sk_buff *entry = priv->tx_queue.next; + struct sk_buff *entry; struct sk_buff *target_skb = NULL; struct ieee80211_tx_info *info; struct memrecord *range; @@ -1014,6 +1016,7 @@ static int p54_assign_address(struct ieee80211_hw *dev, struct sk_buff *skb, } } + entry = priv->tx_queue.next; while (left--) { u32 hole_size; info = IEEE80211_SKB_CB(entry); From 1f6ff364ceda516f88351a8ab640e656beed0b26 Mon Sep 17 00:00:00 2001 From: Abhijeet Joglekar Date: Fri, 27 Feb 2009 10:54:35 -0800 Subject: [PATCH 268/580] [SCSI] libfc: Pass lport in exch_mgr_reset fc_exch_mgr structure is private to fc_exch.c. To export exch_mgr_reset to transport, transport needs access to the exch manager. Change exch_mgr_reset to use lport param which is the shared structure between libFC and transport. Alternatively, fc_exch_mgr definition can be moved to libfc.h so that lport can be accessed from mp*. Signed-off-by: Abhijeet Joglekar Signed-off-by: Robert Love Signed-off-by: James Bottomley --- drivers/scsi/libfc/fc_exch.c | 3 ++- drivers/scsi/libfc/fc_lport.c | 4 ++-- drivers/scsi/libfc/fc_rport.c | 4 ++-- include/scsi/libfc.h | 4 ++-- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c index 66db08a5f27f..a09416fd843c 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c @@ -1480,10 +1480,11 @@ static void fc_exch_reset(struct fc_exch *ep) * If sid is non-zero, reset only exchanges we source from that FID. * If did is non-zero, reset only exchanges destined to that FID. */ -void fc_exch_mgr_reset(struct fc_exch_mgr *mp, u32 sid, u32 did) +void fc_exch_mgr_reset(struct fc_lport *lp, u32 sid, u32 did) { struct fc_exch *ep; struct fc_exch *next; + struct fc_exch_mgr *mp = lp->emp; spin_lock_bh(&mp->em_lock); restart: diff --git a/drivers/scsi/libfc/fc_lport.c b/drivers/scsi/libfc/fc_lport.c index 0b9bdb1fb807..5db223ce3b25 100644 --- a/drivers/scsi/libfc/fc_lport.c +++ b/drivers/scsi/libfc/fc_lport.c @@ -663,7 +663,7 @@ int fc_lport_destroy(struct fc_lport *lport) { lport->tt.frame_send = fc_frame_drop; lport->tt.fcp_abort_io(lport); - lport->tt.exch_mgr_reset(lport->emp, 0, 0); + lport->tt.exch_mgr_reset(lport, 0, 0); return 0; } EXPORT_SYMBOL(fc_lport_destroy); @@ -973,7 +973,7 @@ static void fc_lport_enter_reset(struct fc_lport *lport) lport->tt.disc_stop(lport); - lport->tt.exch_mgr_reset(lport->emp, 0, 0); + lport->tt.exch_mgr_reset(lport, 0, 0); fc_host_fabric_name(lport->host) = 0; fc_host_port_id(lport->host) = 0; diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c index e780d8caf70e..dec7bae0e56d 100644 --- a/drivers/scsi/libfc/fc_rport.c +++ b/drivers/scsi/libfc/fc_rport.c @@ -1285,7 +1285,7 @@ void fc_rport_terminate_io(struct fc_rport *rport) struct fc_rport_libfc_priv *rdata = rport->dd_data; struct fc_lport *lport = rdata->local_port; - lport->tt.exch_mgr_reset(lport->emp, 0, rport->port_id); - lport->tt.exch_mgr_reset(lport->emp, rport->port_id, 0); + lport->tt.exch_mgr_reset(lport, 0, rport->port_id); + lport->tt.exch_mgr_reset(lport, rport->port_id, 0); } EXPORT_SYMBOL(fc_rport_terminate_io); diff --git a/include/scsi/libfc.h b/include/scsi/libfc.h index 9f2876397dda..042f4ade73d7 100644 --- a/include/scsi/libfc.h +++ b/include/scsi/libfc.h @@ -472,7 +472,7 @@ struct libfc_function_template { * If s_id is non-zero, reset only exchanges originating from that FID. * If d_id is non-zero, reset only exchanges sending to that FID. */ - void (*exch_mgr_reset)(struct fc_exch_mgr *, + void (*exch_mgr_reset)(struct fc_lport *, u32 s_id, u32 d_id); void (*rport_flush_queue)(void); @@ -916,7 +916,7 @@ struct fc_seq *fc_seq_start_next(struct fc_seq *sp); * If s_id is non-zero, reset only exchanges originating from that FID. * If d_id is non-zero, reset only exchanges sending to that FID. */ -void fc_exch_mgr_reset(struct fc_exch_mgr *, u32 s_id, u32 d_id); +void fc_exch_mgr_reset(struct fc_lport *, u32 s_id, u32 d_id); /* * Functions for fc_functions_template From 571f824c3cd7b7f5a40ba100f7e576b6b0fe826a Mon Sep 17 00:00:00 2001 From: Abhijeet Joglekar Date: Fri, 27 Feb 2009 10:54:41 -0800 Subject: [PATCH 269/580] [SCSI] libfc: when rport goes away (re-plogi), clean up exchanges to/from rport When a rport goes away, libFC does a plogi which will reset exchanges at the rport. Clean exchanges at our end, both in transport and libFC. If transport hooks into exch_mgr_reset, it will call back into fc_exch_mgr_reset() to clean up libFC exchanges. Signed-off-by: Abhijeet Joglekar Signed-off-by: Robert Love Signed-off-by: James Bottomley --- drivers/scsi/libfc/fc_rport.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c index dec7bae0e56d..717575934152 100644 --- a/drivers/scsi/libfc/fc_rport.c +++ b/drivers/scsi/libfc/fc_rport.c @@ -214,6 +214,7 @@ static void fc_rport_state_enter(struct fc_rport *rport, static void fc_rport_work(struct work_struct *work) { + u32 port_id; struct fc_rport_libfc_priv *rdata = container_of(work, struct fc_rport_libfc_priv, event_work); enum fc_rport_event event; @@ -279,8 +280,12 @@ static void fc_rport_work(struct work_struct *work) rport_ops->event_callback(lport, rport, event); if (trans_state == FC_PORTSTATE_ROGUE) put_device(&rport->dev); - else + else { + port_id = rport->port_id; fc_remote_port_delete(rport); + lport->tt.exch_mgr_reset(lport, 0, port_id); + lport->tt.exch_mgr_reset(lport, port_id, 0); + } } else mutex_unlock(&rdata->rp_mutex); } From 78342da3682ec843e3e6301af5c723c88a46c408 Mon Sep 17 00:00:00 2001 From: Vasu Dev Date: Fri, 27 Feb 2009 10:54:46 -0800 Subject: [PATCH 270/580] [SCSI] libfc: handle RRQ exch timeout Cleanup exchange held due to RRQ when RRQ exch times out, in this case the ABTS is already done causing RRQ req therefore proceeding with cleanup in fc_exch_rrq_resp should be okay to restore exch resource. Signed-off-by: Vasu Dev Signed-off-by: Robert Love Signed-off-by: James Bottomley --- drivers/scsi/libfc/fc_exch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c index a09416fd843c..e874e77b740c 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c @@ -1608,7 +1608,7 @@ static void fc_exch_rrq_resp(struct fc_seq *sp, struct fc_frame *fp, void *arg) if (IS_ERR(fp)) { int err = PTR_ERR(fp); - if (err == -FC_EX_CLOSED) + if (err == -FC_EX_CLOSED || err == -FC_EX_TIMEOUT) goto cleanup; FC_DBG("Cannot process RRQ, because of frame error %d\n", err); return; From a7e84f2b83f17f8f11da34ccef3ba5a862dc0182 Mon Sep 17 00:00:00 2001 From: Vasu Dev Date: Fri, 27 Feb 2009 10:54:51 -0800 Subject: [PATCH 271/580] [SCSI] libfc: fixed a soft lockup issue in fc_exch_recv_abts The fc_seq_start_next grabs ep->ex_lock but this lock was already held here, so instead called fc_seq_start_next_locked to avoid soft lockup. Signed-off-by: Vasu Dev Signed-off-by: Robert Love Signed-off-by: James Bottomley --- drivers/scsi/libfc/fc_exch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c index e874e77b740c..dd269e5f953e 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c @@ -1096,7 +1096,7 @@ static void fc_exch_recv_abts(struct fc_exch *ep, struct fc_frame *rx_fp) ap->ba_high_seq_cnt = fh->fh_seq_cnt; ap->ba_low_seq_cnt = htons(sp->cnt); } - sp = fc_seq_start_next(sp); + sp = fc_seq_start_next_locked(sp); spin_unlock_bh(&ep->ex_lock); fc_seq_send_last(sp, fp, FC_RCTL_BA_ACC, FC_TYPE_BLS); fc_frame_free(rx_fp); From bc0e17f691085315ae9303eb5b0883fe16dfe6b1 Mon Sep 17 00:00:00 2001 From: Vasu Dev Date: Fri, 27 Feb 2009 10:54:57 -0800 Subject: [PATCH 272/580] [SCSI] libfc, fcoe: fixed locking issues with lport->lp_mutex around lport->link_status The fcoe_xmit could call fc_pause in case the pending skb queue len is larger than FCOE_MAX_QUEUE_DEPTH, the fc_pause was trying to grab lport->lp_muex to change lport->link_status and that had these issues :- 1. The fcoe_xmit was getting called with bh disabled, thus causing "BUG: scheduling while atomic" when grabbing lport->lp_muex with bh disabled. 2. fc_linkup and fc_linkdown function calls lport_enter function with lport->lp_mutex held and these enter function in turn calls fcoe_xmit to send lport related FC frame, e.g. fc_linkup => fc_lport_enter_flogi to send flogi req. In this case grabbing the same lport->lp_mutex again in fc_puase from fcoe_xmit would cause deadlock. The lport->lp_mutex was used for setting FC_PAUSE in fcoe_xmit path but FC_PAUSE bit was not used anywhere beside just setting and clear this bit in lport->link_status, instead used a separate field qfull in fc_lport to eliminate need for lport->lp_mutex to track pending queue full condition and in turn avoid above described two locking issues. Also added check for lp->qfull in fc_fcp_lport_queue_ready to trigger SCSI_MLQUEUE_HOST_BUSY when lp->qfull is set to prevent more scsi-ml cmds while lp->qfull is set. This patch eliminated FC_LINK_UP and FC_PAUSE and instead used dedicated fields in fc_lport for this, this simplified all related conditional code. Also removed fc_pause and fc_unpause functions and instead used newly added lport->qfull directly in fcoe. Signed-off-by: Vasu Dev Signed-off-by: Robert Love Signed-off-by: James Bottomley --- drivers/scsi/fcoe/fcoe_sw.c | 6 ++--- drivers/scsi/fcoe/libfcoe.c | 41 +++++++++++++++-------------------- drivers/scsi/libfc/fc_fcp.c | 4 ++-- drivers/scsi/libfc/fc_lport.c | 36 +++++------------------------- include/scsi/libfc.h | 12 ++-------- 5 files changed, 30 insertions(+), 69 deletions(-) diff --git a/drivers/scsi/fcoe/fcoe_sw.c b/drivers/scsi/fcoe/fcoe_sw.c index dc4cd5e25760..cf83675a0fb9 100644 --- a/drivers/scsi/fcoe/fcoe_sw.c +++ b/drivers/scsi/fcoe/fcoe_sw.c @@ -116,7 +116,8 @@ static int fcoe_sw_lport_config(struct fc_lport *lp) { int i = 0; - lp->link_status = 0; + lp->link_up = 0; + lp->qfull = 0; lp->max_retry_count = 3; lp->e_d_tov = 2 * 1000; /* FC-FS default */ lp->r_a_tov = 2 * 2 * 1000; @@ -181,9 +182,8 @@ static int fcoe_sw_netdev_config(struct fc_lport *lp, struct net_device *netdev) if (fc_set_mfs(lp, mfs)) return -EINVAL; - lp->link_status = ~FC_PAUSE & ~FC_LINK_UP; if (!fcoe_link_ok(lp)) - lp->link_status |= FC_LINK_UP; + lp->link_up = 1; /* offload features support */ if (fc->real_dev->features & NETIF_F_SG) diff --git a/drivers/scsi/fcoe/libfcoe.c b/drivers/scsi/fcoe/libfcoe.c index e419f486cdb3..296071043f55 100644 --- a/drivers/scsi/fcoe/libfcoe.c +++ b/drivers/scsi/fcoe/libfcoe.c @@ -504,7 +504,7 @@ int fcoe_xmit(struct fc_lport *lp, struct fc_frame *fp) if (rc) { fcoe_insert_wait_queue(lp, skb); if (fc->fcoe_pending_queue.qlen > FCOE_MAX_QUEUE_DEPTH) - fc_pause(lp); + lp->qfull = 1; } return 0; @@ -718,7 +718,7 @@ static void fcoe_recv_flogi(struct fcoe_softc *fc, struct fc_frame *fp, u8 *sa) * fcoe_watchdog - fcoe timer callback * @vp: * - * This checks the pending queue length for fcoe and put fcoe to be paused state + * This checks the pending queue length for fcoe and set lport qfull * if the FCOE_MAX_QUEUE_DEPTH is reached. This is done for all fc_lport on the * fcoe_hostlist. * @@ -728,17 +728,17 @@ void fcoe_watchdog(ulong vp) { struct fc_lport *lp; struct fcoe_softc *fc; - int paused = 0; + int qfilled = 0; read_lock(&fcoe_hostlist_lock); list_for_each_entry(fc, &fcoe_hostlist, list) { lp = fc->lp; if (lp) { if (fc->fcoe_pending_queue.qlen > FCOE_MAX_QUEUE_DEPTH) - paused = 1; + qfilled = 1; if (fcoe_check_wait_queue(lp) < FCOE_MAX_QUEUE_DEPTH) { - if (paused) - fc_unpause(lp); + if (qfilled) + lp->qfull = 0; } } } @@ -767,8 +767,7 @@ void fcoe_watchdog(ulong vp) **/ static int fcoe_check_wait_queue(struct fc_lport *lp) { - int rc, unpause = 0; - int paused = 0; + int rc; struct sk_buff *skb; struct fcoe_softc *fc; @@ -776,10 +775,10 @@ static int fcoe_check_wait_queue(struct fc_lport *lp) spin_lock_bh(&fc->fcoe_pending_queue.lock); /* - * is this interface paused? + * if interface pending queue full then set qfull in lport. */ if (fc->fcoe_pending_queue.qlen > FCOE_MAX_QUEUE_DEPTH) - paused = 1; + lp->qfull = 1; if (fc->fcoe_pending_queue.qlen) { while ((skb = __skb_dequeue(&fc->fcoe_pending_queue)) != NULL) { spin_unlock_bh(&fc->fcoe_pending_queue.lock); @@ -791,11 +790,9 @@ static int fcoe_check_wait_queue(struct fc_lport *lp) spin_lock_bh(&fc->fcoe_pending_queue.lock); } if (fc->fcoe_pending_queue.qlen < FCOE_MAX_QUEUE_DEPTH) - unpause = 1; + lp->qfull = 0; } spin_unlock_bh(&fc->fcoe_pending_queue.lock); - if ((unpause) && (paused)) - fc_unpause(lp); return fc->fcoe_pending_queue.qlen; } @@ -873,7 +870,7 @@ static int fcoe_device_notification(struct notifier_block *notifier, struct net_device *real_dev = ptr; struct fcoe_softc *fc; struct fcoe_dev_stats *stats; - u16 new_status; + u32 new_link_up; u32 mfs; int rc = NOTIFY_OK; @@ -890,17 +887,15 @@ static int fcoe_device_notification(struct notifier_block *notifier, goto out; } - new_status = lp->link_status; + new_link_up = lp->link_up; switch (event) { case NETDEV_DOWN: case NETDEV_GOING_DOWN: - new_status &= ~FC_LINK_UP; + new_link_up = 0; break; case NETDEV_UP: case NETDEV_CHANGE: - new_status &= ~FC_LINK_UP; - if (!fcoe_link_ok(lp)) - new_status |= FC_LINK_UP; + new_link_up = !fcoe_link_ok(lp); break; case NETDEV_CHANGEMTU: mfs = fc->real_dev->mtu - @@ -908,17 +903,15 @@ static int fcoe_device_notification(struct notifier_block *notifier, sizeof(struct fcoe_crc_eof)); if (mfs >= FC_MIN_MAX_FRAME) fc_set_mfs(lp, mfs); - new_status &= ~FC_LINK_UP; - if (!fcoe_link_ok(lp)) - new_status |= FC_LINK_UP; + new_link_up = !fcoe_link_ok(lp); break; case NETDEV_REGISTER: break; default: FC_DBG("unknown event %ld call", event); } - if (lp->link_status != new_status) { - if ((new_status & FC_LINK_UP) == FC_LINK_UP) + if (lp->link_up != new_link_up) { + if (new_link_up) fc_linkup(lp); else { stats = lp->dev_stats[smp_processor_id()]; diff --git a/drivers/scsi/libfc/fc_fcp.c b/drivers/scsi/libfc/fc_fcp.c index 404e63ff46b8..f440aaca39c2 100644 --- a/drivers/scsi/libfc/fc_fcp.c +++ b/drivers/scsi/libfc/fc_fcp.c @@ -1621,7 +1621,7 @@ static void fc_fcp_srr_error(struct fc_fcp_pkt *fsp, struct fc_frame *fp) static inline int fc_fcp_lport_queue_ready(struct fc_lport *lp) { /* lock ? */ - return (lp->state == LPORT_ST_READY) && (lp->link_status & FC_LINK_UP); + return (lp->state == LPORT_ST_READY) && lp->link_up && !lp->qfull; } /** @@ -1890,7 +1890,7 @@ int fc_eh_abort(struct scsi_cmnd *sc_cmd) lp = shost_priv(sc_cmd->device->host); if (lp->state != LPORT_ST_READY) return rc; - else if (!(lp->link_status & FC_LINK_UP)) + else if (!lp->link_up) return rc; spin_lock_irqsave(lp->host->host_lock, flags); diff --git a/drivers/scsi/libfc/fc_lport.c b/drivers/scsi/libfc/fc_lport.c index 5db223ce3b25..a6ab692f5f51 100644 --- a/drivers/scsi/libfc/fc_lport.c +++ b/drivers/scsi/libfc/fc_lport.c @@ -250,7 +250,7 @@ void fc_get_host_port_state(struct Scsi_Host *shost) { struct fc_lport *lp = shost_priv(shost); - if ((lp->link_status & FC_LINK_UP) == FC_LINK_UP) + if (lp->link_up) fc_host_port_state(shost) = FC_PORTSTATE_ONLINE; else fc_host_port_state(shost) = FC_PORTSTATE_OFFLINE; @@ -577,8 +577,8 @@ void fc_linkup(struct fc_lport *lport) fc_host_port_id(lport->host)); mutex_lock(&lport->lp_mutex); - if ((lport->link_status & FC_LINK_UP) != FC_LINK_UP) { - lport->link_status |= FC_LINK_UP; + if (!lport->link_up) { + lport->link_up = 1; if (lport->state == LPORT_ST_RESET) fc_lport_enter_flogi(lport); @@ -597,8 +597,8 @@ void fc_linkdown(struct fc_lport *lport) FC_DEBUG_LPORT("Link is down for port (%6x)\n", fc_host_port_id(lport->host)); - if ((lport->link_status & FC_LINK_UP) == FC_LINK_UP) { - lport->link_status &= ~(FC_LINK_UP); + if (lport->link_up) { + lport->link_up = 0; fc_lport_enter_reset(lport); lport->tt.fcp_cleanup(lport); } @@ -606,30 +606,6 @@ void fc_linkdown(struct fc_lport *lport) } EXPORT_SYMBOL(fc_linkdown); -/** - * fc_pause - Pause the flow of frames - * @lport: The lport to be paused - */ -void fc_pause(struct fc_lport *lport) -{ - mutex_lock(&lport->lp_mutex); - lport->link_status |= FC_PAUSE; - mutex_unlock(&lport->lp_mutex); -} -EXPORT_SYMBOL(fc_pause); - -/** - * fc_unpause - Unpause the flow of frames - * @lport: The lport to be unpaused - */ -void fc_unpause(struct fc_lport *lport) -{ - mutex_lock(&lport->lp_mutex); - lport->link_status &= ~(FC_PAUSE); - mutex_unlock(&lport->lp_mutex); -} -EXPORT_SYMBOL(fc_unpause); - /** * fc_fabric_logoff - Logout of the fabric * @lport: fc_lport pointer to logoff the fabric @@ -977,7 +953,7 @@ static void fc_lport_enter_reset(struct fc_lport *lport) fc_host_fabric_name(lport->host) = 0; fc_host_port_id(lport->host) = 0; - if ((lport->link_status & FC_LINK_UP) == FC_LINK_UP) + if (lport->link_up) fc_lport_enter_flogi(lport); } diff --git a/include/scsi/libfc.h b/include/scsi/libfc.h index 042f4ade73d7..b9e6c1cd8914 100644 --- a/include/scsi/libfc.h +++ b/include/scsi/libfc.h @@ -68,9 +68,6 @@ /* * FC HBA status */ -#define FC_PAUSE (1 << 1) -#define FC_LINK_UP (1 << 0) - enum fc_lport_state { LPORT_ST_NONE = 0, LPORT_ST_FLOGI, @@ -603,7 +600,8 @@ struct fc_lport { /* Operational Information */ struct libfc_function_template tt; - u16 link_status; + u8 link_up; + u8 qfull; enum fc_lport_state state; unsigned long boot_time; @@ -703,12 +701,6 @@ void fc_linkup(struct fc_lport *); */ void fc_linkdown(struct fc_lport *); -/* - * Pause and unpause traffic. - */ -void fc_pause(struct fc_lport *); -void fc_unpause(struct fc_lport *); - /* * Configure the local port. */ From 6755db1cd4587084be85f860b7aa7c0cc9d776dc Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Fri, 27 Feb 2009 10:55:02 -0800 Subject: [PATCH 273/580] [SCSI] libfc: rport retry on LS_RJT from certain ELS This allows any rport ELS to retry on LS_RJT. The rport error handling would only retry on resource allocation failures and exchange timeouts. I have a target that will occasionally reject PLOGI when we do a quick LOGO/PLOGI. When a critical ELS was rejected, libfc would fail silently leaving the rport in a dead state. The retry count and delay are managed by fc_rport_error_retry. If the retry count is exceeded fc_rport_error will be called. When retrying is not the correct course of action, fc_rport_error can be called directly. Signed-off-by: Chris Leech Signed-off-by: Robert Love Signed-off-by: James Bottomley --- drivers/scsi/libfc/fc_exch.c | 2 - drivers/scsi/libfc/fc_rport.c | 111 ++++++++++++++++++++-------------- include/scsi/fc/fc_fs.h | 5 ++ 3 files changed, 69 insertions(+), 49 deletions(-) diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c index dd269e5f953e..8c4018956d4c 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c @@ -32,8 +32,6 @@ #include #include -#define FC_DEF_R_A_TOV (10 * 1000) /* resource allocation timeout */ - /* * fc_exch_debug can be set in debugger or at compile time to get more logs. */ diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c index 717575934152..600a8fffa940 100644 --- a/drivers/scsi/libfc/fc_rport.c +++ b/drivers/scsi/libfc/fc_rport.c @@ -81,6 +81,7 @@ static void fc_rport_recv_logo_req(struct fc_rport *, struct fc_seq *, struct fc_frame *); static void fc_rport_timeout(struct work_struct *); static void fc_rport_error(struct fc_rport *, struct fc_frame *); +static void fc_rport_error_retry(struct fc_rport *, struct fc_frame *); static void fc_rport_work(struct work_struct *); static const char *fc_rport_state_names[] = { @@ -410,57 +411,73 @@ static void fc_rport_timeout(struct work_struct *work) } /** - * fc_rport_error - Handler for any errors + * fc_rport_error - Error handler, called once retries have been exhausted * @rport: The fc_rport object * @fp: The frame pointer * - * If the error was caused by a resource allocation failure - * then wait for half a second and retry, otherwise retry - * immediately. - * * Locking Note: The rport lock is expected to be held before * calling this routine */ static void fc_rport_error(struct fc_rport *rport, struct fc_frame *fp) { struct fc_rport_libfc_priv *rdata = rport->dd_data; - unsigned long delay = 0; FC_DEBUG_RPORT("Error %ld in state %s, retries %d\n", PTR_ERR(fp), fc_rport_state(rport), rdata->retries); - if (!fp || PTR_ERR(fp) == -FC_EX_TIMEOUT) { - /* - * Memory allocation failure, or the exchange timed out. - * Retry after delay - */ - if (rdata->retries < rdata->local_port->max_retry_count) { - rdata->retries++; - if (!fp) - delay = msecs_to_jiffies(500); - get_device(&rport->dev); - schedule_delayed_work(&rdata->retry_work, delay); - } else { - switch (rdata->rp_state) { - case RPORT_ST_PLOGI: - case RPORT_ST_PRLI: - case RPORT_ST_LOGO: - rdata->event = RPORT_EV_FAILED; - queue_work(rport_event_queue, - &rdata->event_work); - break; - case RPORT_ST_RTV: - fc_rport_enter_ready(rport); - break; - case RPORT_ST_NONE: - case RPORT_ST_READY: - case RPORT_ST_INIT: - break; - } - } + switch (rdata->rp_state) { + case RPORT_ST_PLOGI: + case RPORT_ST_PRLI: + case RPORT_ST_LOGO: + rdata->event = RPORT_EV_FAILED; + queue_work(rport_event_queue, + &rdata->event_work); + break; + case RPORT_ST_RTV: + fc_rport_enter_ready(rport); + break; + case RPORT_ST_NONE: + case RPORT_ST_READY: + case RPORT_ST_INIT: + break; } } +/** + * fc_rport_error_retry - Error handler when retries are desired + * @rport: The fc_rport object + * @fp: The frame pointer + * + * If the error was an exchange timeout retry immediately, + * otherwise wait for E_D_TOV. + * + * Locking Note: The rport lock is expected to be held before + * calling this routine + */ +static void fc_rport_error_retry(struct fc_rport *rport, struct fc_frame *fp) +{ + struct fc_rport_libfc_priv *rdata = rport->dd_data; + unsigned long delay = FC_DEF_E_D_TOV; + + /* make sure this isn't an FC_EX_CLOSED error, never retry those */ + if (PTR_ERR(fp) == -FC_EX_CLOSED) + return fc_rport_error(rport, fp); + + if (rdata->retries < rdata->local_port->max_retry_count) { + FC_DEBUG_RPORT("Error %ld in state %s, retrying\n", + PTR_ERR(fp), fc_rport_state(rport)); + rdata->retries++; + /* no additional delay on exchange timeouts */ + if (PTR_ERR(fp) == -FC_EX_TIMEOUT) + delay = 0; + get_device(&rport->dev); + schedule_delayed_work(&rdata->retry_work, delay); + return; + } + + return fc_rport_error(rport, fp); +} + /** * fc_rport_plogi_recv_resp - Handle incoming ELS PLOGI response * @sp: current sequence in the PLOGI exchange @@ -495,7 +512,7 @@ static void fc_rport_plogi_resp(struct fc_seq *sp, struct fc_frame *fp, } if (IS_ERR(fp)) { - fc_rport_error(rport, fp); + fc_rport_error_retry(rport, fp); goto err; } @@ -527,7 +544,7 @@ static void fc_rport_plogi_resp(struct fc_seq *sp, struct fc_frame *fp, else fc_rport_enter_prli(rport); } else - fc_rport_error(rport, fp); + fc_rport_error_retry(rport, fp); out: fc_frame_free(fp); @@ -557,14 +574,14 @@ static void fc_rport_enter_plogi(struct fc_rport *rport) rport->maxframe_size = FC_MIN_MAX_PAYLOAD; fp = fc_frame_alloc(lport, sizeof(struct fc_els_flogi)); if (!fp) { - fc_rport_error(rport, fp); + fc_rport_error_retry(rport, fp); return; } rdata->e_d_tov = lport->e_d_tov; if (!lport->tt.elsct_send(lport, rport, fp, ELS_PLOGI, fc_rport_plogi_resp, rport, lport->e_d_tov)) - fc_rport_error(rport, fp); + fc_rport_error_retry(rport, fp); else get_device(&rport->dev); } @@ -604,7 +621,7 @@ static void fc_rport_prli_resp(struct fc_seq *sp, struct fc_frame *fp, } if (IS_ERR(fp)) { - fc_rport_error(rport, fp); + fc_rport_error_retry(rport, fp); goto err; } @@ -662,7 +679,7 @@ static void fc_rport_logo_resp(struct fc_seq *sp, struct fc_frame *fp, rport->port_id); if (IS_ERR(fp)) { - fc_rport_error(rport, fp); + fc_rport_error_retry(rport, fp); goto err; } @@ -712,13 +729,13 @@ static void fc_rport_enter_prli(struct fc_rport *rport) fp = fc_frame_alloc(lport, sizeof(*pp)); if (!fp) { - fc_rport_error(rport, fp); + fc_rport_error_retry(rport, fp); return; } if (!lport->tt.elsct_send(lport, rport, fp, ELS_PRLI, fc_rport_prli_resp, rport, lport->e_d_tov)) - fc_rport_error(rport, fp); + fc_rport_error_retry(rport, fp); else get_device(&rport->dev); } @@ -809,13 +826,13 @@ static void fc_rport_enter_rtv(struct fc_rport *rport) fp = fc_frame_alloc(lport, sizeof(struct fc_els_rtv)); if (!fp) { - fc_rport_error(rport, fp); + fc_rport_error_retry(rport, fp); return; } if (!lport->tt.elsct_send(lport, rport, fp, ELS_RTV, fc_rport_rtv_resp, rport, lport->e_d_tov)) - fc_rport_error(rport, fp); + fc_rport_error_retry(rport, fp); else get_device(&rport->dev); } @@ -840,13 +857,13 @@ static void fc_rport_enter_logo(struct fc_rport *rport) fp = fc_frame_alloc(lport, sizeof(struct fc_els_logo)); if (!fp) { - fc_rport_error(rport, fp); + fc_rport_error_retry(rport, fp); return; } if (!lport->tt.elsct_send(lport, rport, fp, ELS_LOGO, fc_rport_logo_resp, rport, lport->e_d_tov)) - fc_rport_error(rport, fp); + fc_rport_error_retry(rport, fp); else get_device(&rport->dev); } diff --git a/include/scsi/fc/fc_fs.h b/include/scsi/fc/fc_fs.h index 3e4801d2bdbb..1b7af3a64c7c 100644 --- a/include/scsi/fc/fc_fs.h +++ b/include/scsi/fc/fc_fs.h @@ -337,4 +337,9 @@ enum fc_pf_rjt_reason { FC_RJT_VENDOR = 0xff, /* vendor specific reject */ }; +/* default timeout values */ + +#define FC_DEF_E_D_TOV 2000UL +#define FC_DEF_R_A_TOV 10000UL + #endif /* _FC_FS_H_ */ From 26d9cab558f901051d0b69b2c445c8588931ce8d Mon Sep 17 00:00:00 2001 From: Vasu Dev Date: Fri, 27 Feb 2009 10:55:07 -0800 Subject: [PATCH 274/580] [SCSI] libfc: fixed a read IO data integrity issue when a IO data frame lost The fc_fcp_complete_locked detected data underrun in this case and set the FC_DATA_UNDRUN but that was ignored by fc_io_compl for all cases including read underrun. Added code to not to ignore FC_DATA_UNDRUN for read IO and instead suggested scsi-ml to retry cmd to recover from lost data frame. Not sure if it is okay to ignore FC_DATA_UNDRUN for other case, so let code as is for other cases but removed or-ing with zero valued fsp->cdb_status for those cases. Signed-off-by: Vasu Dev Signed-off-by: Robert Love Signed-off-by: James Bottomley --- drivers/scsi/libfc/fc_fcp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/libfc/fc_fcp.c b/drivers/scsi/libfc/fc_fcp.c index f440aaca39c2..ecc72612c385 100644 --- a/drivers/scsi/libfc/fc_fcp.c +++ b/drivers/scsi/libfc/fc_fcp.c @@ -1810,12 +1810,12 @@ static void fc_io_compl(struct fc_fcp_pkt *fsp) sc_cmd->result = DID_ERROR << 16; break; case FC_DATA_UNDRUN: - if (fsp->cdb_status == 0) { + if ((fsp->cdb_status == 0) && !(fsp->req_flags & FC_SRB_READ)) { /* * scsi status is good but transport level - * underrun. for read it should be an error?? + * underrun. */ - sc_cmd->result = (DID_OK << 16) | fsp->cdb_status; + sc_cmd->result = DID_OK << 16; } else { /* * scsi got underrun, this is an error From f7db2c150cf5082cf74555f30a1305938041de80 Mon Sep 17 00:00:00 2001 From: Steve Ma Date: Fri, 27 Feb 2009 10:55:13 -0800 Subject: [PATCH 275/580] [SCSI] libfc: exch mgr is freed while lport still retrying sequences When a sequence cannot be delivered to the target, the local port will schedule retries, While this process is in progress, if we destroy the FCoE interface, the fcoe_sw_destroy routine is entered, and the fc_exch_mgr_free(lp->emp) is called. Thus if fc_exch_alloc() is called when retrying the sequence, the mempool_alloc() will fail to allocate the exchange because the mempool of the exchange manager has already been released. This patch is to cancel any pending retry work of the local port before we start to destroy the interface. Also, when resetting the local port, we should also stop the scheduled pending retries. Signed-off-by: Steve Ma Signed-off-by: Robert Love Signed-off-by: James Bottomley --- drivers/scsi/libfc/fc_lport.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/libfc/fc_lport.c b/drivers/scsi/libfc/fc_lport.c index a6ab692f5f51..e6ea4f119530 100644 --- a/drivers/scsi/libfc/fc_lport.c +++ b/drivers/scsi/libfc/fc_lport.c @@ -619,6 +619,7 @@ int fc_fabric_logoff(struct fc_lport *lport) mutex_lock(&lport->lp_mutex); fc_lport_enter_logo(lport); mutex_unlock(&lport->lp_mutex); + cancel_delayed_work_sync(&lport->retry_work); return 0; } EXPORT_SYMBOL(fc_fabric_logoff); @@ -918,6 +919,7 @@ static void fc_lport_recv_req(struct fc_lport *lport, struct fc_seq *sp, */ int fc_lport_reset(struct fc_lport *lport) { + cancel_delayed_work_sync(&lport->retry_work); mutex_lock(&lport->lp_mutex); fc_lport_enter_reset(lport); mutex_unlock(&lport->lp_mutex); From 5101ff99f59aefb72e0c96e82aa32048ac9f8425 Mon Sep 17 00:00:00 2001 From: Robert Love Date: Fri, 27 Feb 2009 10:55:18 -0800 Subject: [PATCH 276/580] [SCSI] libfc: Don't violate transport template for rogue port creation Signed-off-by: Robert Love Signed-off-by: James Bottomley --- drivers/scsi/libfc/fc_disc.c | 6 +++--- drivers/scsi/libfc/fc_lport.c | 4 ++-- drivers/scsi/libfc/fc_rport.c | 3 +++ include/scsi/libfc.h | 5 +++++ 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/libfc/fc_disc.c b/drivers/scsi/libfc/fc_disc.c index dd1564c9e04a..15e3f840c049 100644 --- a/drivers/scsi/libfc/fc_disc.c +++ b/drivers/scsi/libfc/fc_disc.c @@ -430,7 +430,7 @@ static int fc_disc_new_target(struct fc_disc *disc, dp.ids.port_name = ids->port_name; dp.ids.node_name = ids->node_name; dp.ids.roles = ids->roles; - rport = fc_rport_rogue_create(&dp); + rport = lport->tt.rport_create(&dp); } if (!rport) error = -ENOMEM; @@ -617,7 +617,7 @@ static int fc_disc_gpn_ft_parse(struct fc_disc *disc, void *buf, size_t len) if ((dp.ids.port_id != fc_host_port_id(lport->host)) && (dp.ids.port_name != lport->wwpn)) { - rport = fc_rport_rogue_create(&dp); + rport = lport->tt.rport_create(&dp); if (rport) { rdata = rport->dd_data; rdata->ops = &fc_disc_rport_ops; @@ -769,7 +769,7 @@ static void fc_disc_single(struct fc_disc *disc, struct fc_disc_port *dp) if (rport) fc_disc_del_target(disc, rport); - new_rport = fc_rport_rogue_create(dp); + new_rport = lport->tt.rport_create(dp); if (new_rport) { rdata = new_rport->dd_data; rdata->ops = &fc_disc_rport_ops; diff --git a/drivers/scsi/libfc/fc_lport.c b/drivers/scsi/libfc/fc_lport.c index e6ea4f119530..07335ae2947c 100644 --- a/drivers/scsi/libfc/fc_lport.c +++ b/drivers/scsi/libfc/fc_lport.c @@ -232,7 +232,7 @@ static void fc_lport_ptp_setup(struct fc_lport *lport, lport->ptp_rp = NULL; } - lport->ptp_rp = fc_rport_rogue_create(&dp); + lport->ptp_rp = lport->tt.rport_create(&dp); lport->tt.rport_login(lport->ptp_rp); @@ -1282,7 +1282,7 @@ static void fc_lport_enter_dns(struct fc_lport *lport) fc_lport_state_enter(lport, LPORT_ST_DNS); - rport = fc_rport_rogue_create(&dp); + rport = lport->tt.rport_create(&dp); if (!rport) goto err; diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c index 600a8fffa940..81b3ca188789 100644 --- a/drivers/scsi/libfc/fc_rport.c +++ b/drivers/scsi/libfc/fc_rport.c @@ -1271,6 +1271,9 @@ static void fc_rport_flush_queue(void) int fc_rport_init(struct fc_lport *lport) { + if (!lport->tt.rport_create) + lport->tt.rport_create = fc_rport_rogue_create; + if (!lport->tt.rport_login) lport->tt.rport_login = fc_rport_login; diff --git a/include/scsi/libfc.h b/include/scsi/libfc.h index b9e6c1cd8914..37df48e13b97 100644 --- a/include/scsi/libfc.h +++ b/include/scsi/libfc.h @@ -489,6 +489,11 @@ struct libfc_function_template { * Remote Port interfaces */ + /* + * Create a remote port + */ + struct fc_rport *(*rport_create)(struct fc_disc_port *); + /* * Initiates the RP state machine. It is called from the LP module. * This function will issue the following commands to the N_Port From 23f11f9076fcd6ee20c56e413b11f1b5658e3f82 Mon Sep 17 00:00:00 2001 From: Robert Love Date: Fri, 27 Feb 2009 10:55:23 -0800 Subject: [PATCH 277/580] [SCSI] libfc: correct RPORT_TO_PRIV usage We only need to use this macro when assigning a value to rport->dd_data. All other accesses should just use dd_data. Signed-off-by: Robert Love Signed-off-by: James Bottomley --- drivers/scsi/libfc/fc_disc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/libfc/fc_disc.c b/drivers/scsi/libfc/fc_disc.c index 15e3f840c049..9f4e408bc7fc 100644 --- a/drivers/scsi/libfc/fc_disc.c +++ b/drivers/scsi/libfc/fc_disc.c @@ -246,7 +246,7 @@ static void fc_disc_recv_rscn_req(struct fc_seq *sp, struct fc_frame *fp, list_del(&dp->peers); rport = lport->tt.rport_lookup(lport, dp->ids.port_id); if (rport) { - rdata = RPORT_TO_PRIV(rport); + rdata = rport->dd_data; list_del(&rdata->peers); lport->tt.rport_logoff(rport); } @@ -453,7 +453,7 @@ static int fc_disc_new_target(struct fc_disc *disc, static void fc_disc_del_target(struct fc_disc *disc, struct fc_rport *rport) { struct fc_lport *lport = disc->lport; - struct fc_rport_libfc_priv *rdata = RPORT_TO_PRIV(rport); + struct fc_rport_libfc_priv *rdata = rport->dd_data; list_del(&rdata->peers); lport->tt.rport_logoff(rport); } From d3b33327cab0c8e9cae2c12d908ca79433c0d1ac Mon Sep 17 00:00:00 2001 From: Robert Love Date: Fri, 27 Feb 2009 10:55:29 -0800 Subject: [PATCH 278/580] [SCSI] libfc: rename rp to rdata in fc_disc_new_target() Just rename the variable as per our naming convention. Signed-off-by: Robert Love Signed-off-by: James Bottomley --- drivers/scsi/libfc/fc_disc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/libfc/fc_disc.c b/drivers/scsi/libfc/fc_disc.c index 9f4e408bc7fc..cfbce893941a 100644 --- a/drivers/scsi/libfc/fc_disc.c +++ b/drivers/scsi/libfc/fc_disc.c @@ -396,7 +396,7 @@ static int fc_disc_new_target(struct fc_disc *disc, struct fc_rport_identifiers *ids) { struct fc_lport *lport = disc->lport; - struct fc_rport_libfc_priv *rp; + struct fc_rport_libfc_priv *rdata; int error = 0; if (rport && ids->port_name) { @@ -436,9 +436,9 @@ static int fc_disc_new_target(struct fc_disc *disc, error = -ENOMEM; } if (rport) { - rp = rport->dd_data; - rp->ops = &fc_disc_rport_ops; - rp->rp_state = RPORT_ST_INIT; + rdata = rport->dd_data; + rdata->ops = &fc_disc_rport_ops; + rdata->rp_state = RPORT_ST_INIT; lport->tt.rport_login(rport); } } From efaf5c085dd2d31757b0ff7886970dfddd8d1808 Mon Sep 17 00:00:00 2001 From: Robert Love Date: Fri, 27 Feb 2009 10:55:34 -0800 Subject: [PATCH 279/580] [SCSI] libfc: check for err when recv and state is incorrect If we've just created an interface and the an rport is logging in we may have a request on the wire (say PRLI). If we destroy the interface, we'll go through each rport on the disc->rports list and set each rport's state to NONE. Then the lport will reset the EM. The EM reset will send a CLOSED event to the prli_resp() handler which will notice that the state != PRLI. In this case it frees the frame pointer, decrements the refcount and unlocks the rport. The problem is that there isn't a frame in this case. It's just a pointer with an embedded error code. The free causes an Oops. This patch moves the error checking to be before the state checking. Signed-off-by: Robert Love Signed-off-by: James Bottomley --- drivers/scsi/libfc/fc_lport.c | 50 +++++++++++++++++------------------ drivers/scsi/libfc/fc_rport.c | 30 ++++++++++----------- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/drivers/scsi/libfc/fc_lport.c b/drivers/scsi/libfc/fc_lport.c index 07335ae2947c..c00de2244c78 100644 --- a/drivers/scsi/libfc/fc_lport.c +++ b/drivers/scsi/libfc/fc_lport.c @@ -1031,17 +1031,17 @@ static void fc_lport_rft_id_resp(struct fc_seq *sp, struct fc_frame *fp, FC_DEBUG_LPORT("Received a RFT_ID response\n"); + if (IS_ERR(fp)) { + fc_lport_error(lport, fp); + goto err; + } + if (lport->state != LPORT_ST_RFT_ID) { FC_DBG("Received a RFT_ID response, but in state %s\n", fc_lport_state(lport)); goto out; } - if (IS_ERR(fp)) { - fc_lport_error(lport, fp); - goto err; - } - fh = fc_frame_header_get(fp); ct = fc_frame_payload_get(fp, sizeof(*ct)); @@ -1083,17 +1083,17 @@ static void fc_lport_rpn_id_resp(struct fc_seq *sp, struct fc_frame *fp, FC_DEBUG_LPORT("Received a RPN_ID response\n"); + if (IS_ERR(fp)) { + fc_lport_error(lport, fp); + goto err; + } + if (lport->state != LPORT_ST_RPN_ID) { FC_DBG("Received a RPN_ID response, but in state %s\n", fc_lport_state(lport)); goto out; } - if (IS_ERR(fp)) { - fc_lport_error(lport, fp); - goto err; - } - fh = fc_frame_header_get(fp); ct = fc_frame_payload_get(fp, sizeof(*ct)); if (fh && ct && fh->fh_type == FC_TYPE_CT && @@ -1133,17 +1133,17 @@ static void fc_lport_scr_resp(struct fc_seq *sp, struct fc_frame *fp, FC_DEBUG_LPORT("Received a SCR response\n"); + if (IS_ERR(fp)) { + fc_lport_error(lport, fp); + goto err; + } + if (lport->state != LPORT_ST_SCR) { FC_DBG("Received a SCR response, but in state %s\n", fc_lport_state(lport)); goto out; } - if (IS_ERR(fp)) { - fc_lport_error(lport, fp); - goto err; - } - op = fc_frame_payload_op(fp); if (op == ELS_LS_ACC) fc_lport_enter_ready(lport); @@ -1359,17 +1359,17 @@ static void fc_lport_logo_resp(struct fc_seq *sp, struct fc_frame *fp, FC_DEBUG_LPORT("Received a LOGO response\n"); + if (IS_ERR(fp)) { + fc_lport_error(lport, fp); + goto err; + } + if (lport->state != LPORT_ST_LOGO) { FC_DBG("Received a LOGO response, but in state %s\n", fc_lport_state(lport)); goto out; } - if (IS_ERR(fp)) { - fc_lport_error(lport, fp); - goto err; - } - op = fc_frame_payload_op(fp); if (op == ELS_LS_ACC) fc_lport_enter_reset(lport); @@ -1443,17 +1443,17 @@ static void fc_lport_flogi_resp(struct fc_seq *sp, struct fc_frame *fp, FC_DEBUG_LPORT("Received a FLOGI response\n"); + if (IS_ERR(fp)) { + fc_lport_error(lport, fp); + goto err; + } + if (lport->state != LPORT_ST_FLOGI) { FC_DBG("Received a FLOGI response, but in state %s\n", fc_lport_state(lport)); goto out; } - if (IS_ERR(fp)) { - fc_lport_error(lport, fp); - goto err; - } - fh = fc_frame_header_get(fp); did = ntoh24(fh->fh_d_id); if (fc_frame_payload_op(fp) == ELS_LS_ACC && did != 0) { diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c index 81b3ca188789..4f23a9bb15a1 100644 --- a/drivers/scsi/libfc/fc_rport.c +++ b/drivers/scsi/libfc/fc_rport.c @@ -505,17 +505,17 @@ static void fc_rport_plogi_resp(struct fc_seq *sp, struct fc_frame *fp, FC_DEBUG_RPORT("Received a PLOGI response from port (%6x)\n", rport->port_id); + if (IS_ERR(fp)) { + fc_rport_error_retry(rport, fp); + goto err; + } + if (rdata->rp_state != RPORT_ST_PLOGI) { FC_DBG("Received a PLOGI response, but in state %s\n", fc_rport_state(rport)); goto out; } - if (IS_ERR(fp)) { - fc_rport_error_retry(rport, fp); - goto err; - } - op = fc_frame_payload_op(fp); if (op == ELS_LS_ACC && (plp = fc_frame_payload_get(fp, sizeof(*plp))) != NULL) { @@ -614,17 +614,17 @@ static void fc_rport_prli_resp(struct fc_seq *sp, struct fc_frame *fp, FC_DEBUG_RPORT("Received a PRLI response from port (%6x)\n", rport->port_id); + if (IS_ERR(fp)) { + fc_rport_error_retry(rport, fp); + goto err; + } + if (rdata->rp_state != RPORT_ST_PRLI) { FC_DBG("Received a PRLI response, but in state %s\n", fc_rport_state(rport)); goto out; } - if (IS_ERR(fp)) { - fc_rport_error_retry(rport, fp); - goto err; - } - op = fc_frame_payload_op(fp); if (op == ELS_LS_ACC) { pp = fc_frame_payload_get(fp, sizeof(*pp)); @@ -764,17 +764,17 @@ static void fc_rport_rtv_resp(struct fc_seq *sp, struct fc_frame *fp, FC_DEBUG_RPORT("Received a RTV response from port (%6x)\n", rport->port_id); + if (IS_ERR(fp)) { + fc_rport_error(rport, fp); + goto err; + } + if (rdata->rp_state != RPORT_ST_RTV) { FC_DBG("Received a RTV response, but in state %s\n", fc_rport_state(rport)); goto out; } - if (IS_ERR(fp)) { - fc_rport_error(rport, fp); - goto err; - } - op = fc_frame_payload_op(fp); if (op == ELS_LS_ACC) { struct fc_els_rtv_acc *rtv; From 0ae4d4ae47d2ccbcad813b0d6d8fe12590c7d648 Mon Sep 17 00:00:00 2001 From: Robert Love Date: Fri, 27 Feb 2009 10:55:39 -0800 Subject: [PATCH 280/580] [SCSI] libfc: Cleanup libfc_function_template comments Made the comments more like the comments for struct scsi_host_template. Signed-off-by: Robert Love Signed-off-by: James Bottomley --- include/scsi/libfc.h | 107 ++++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 41 deletions(-) diff --git a/include/scsi/libfc.h b/include/scsi/libfc.h index 37df48e13b97..282829cdf352 100644 --- a/include/scsi/libfc.h +++ b/include/scsi/libfc.h @@ -336,31 +336,17 @@ struct fc_exch { struct libfc_function_template { - /** - * Mandatory Fields - * - * These handlers must be implemented by the LLD. - */ - /* * Interface to send a FC frame + * + * STATUS: REQUIRED */ int (*frame_send)(struct fc_lport *lp, struct fc_frame *fp); - /** - * Optional Fields - * - * The LLD may choose to implement any of the following handlers. - * If LLD doesn't specify hander and leaves its pointer NULL then - * the default libfc function will be used for that handler. - */ - - /** - * ELS/CT interfaces - */ - /* - * elsct_send - sends ELS/CT frame + * Interface to send ELS/CT frames + * + * STATUS: OPTIONAL */ struct fc_seq *(*elsct_send)(struct fc_lport *lport, struct fc_rport *rport, @@ -370,9 +356,6 @@ struct libfc_function_template { struct fc_frame *fp, void *arg), void *arg, u32 timer_msec); - /** - * Exhance Manager interfaces - */ /* * Send the FC frame payload using a new exchange and sequence. @@ -404,6 +387,8 @@ struct libfc_function_template { * timer_msec argument is specified. The timer is canceled when * it fires or when the exchange is done. The exchange timeout handler * is registered by EM layer. + * + * STATUS: OPTIONAL */ struct fc_seq *(*exch_seq_send)(struct fc_lport *lp, struct fc_frame *fp, @@ -415,14 +400,18 @@ struct libfc_function_template { void *arg, unsigned int timer_msec); /* - * send a frame using existing sequence and exchange. + * Send a frame using an existing sequence and exchange. + * + * STATUS: OPTIONAL */ int (*seq_send)(struct fc_lport *lp, struct fc_seq *sp, struct fc_frame *fp); /* - * Send ELS response using mainly infomation - * in exchange and sequence in EM layer. + * Send an ELS response using infomation from a previous + * exchange and sequence. + * + * STATUS: OPTIONAL */ void (*seq_els_rsp_send)(struct fc_seq *sp, enum fc_els_cmd els_cmd, struct fc_seq_els_data *els_data); @@ -434,6 +423,8 @@ struct libfc_function_template { * A timer_msec can be specified for abort timeout, if non-zero * timer_msec value is specified then exchange resp handler * will be called with timeout error if no response to abort. + * + * STATUS: OPTIONAL */ int (*seq_exch_abort)(const struct fc_seq *req_sp, unsigned int timer_msec); @@ -441,6 +432,8 @@ struct libfc_function_template { /* * Indicate that an exchange/sequence tuple is complete and the memory * allocated for the related objects may be freed. + * + * STATUS: OPTIONAL */ void (*exch_done)(struct fc_seq *sp); @@ -448,6 +441,8 @@ struct libfc_function_template { * Assigns a EM and a free XID for an new exchange and then * allocates a new exchange and sequence pair. * The fp can be used to determine free XID. + * + * STATUS: OPTIONAL */ struct fc_exch *(*exch_get)(struct fc_lport *lp, struct fc_frame *fp); @@ -455,12 +450,16 @@ struct libfc_function_template { * Release previously assigned XID by exch_get API. * The LLD may implement this if XID is assigned by LLD * in exch_get(). + * + * STATUS: OPTIONAL */ void (*exch_put)(struct fc_lport *lp, struct fc_exch_mgr *mp, u16 ex_id); /* * Start a new sequence on the same exchange/sequence tuple. + * + * STATUS: OPTIONAL */ struct fc_seq *(*seq_start_next)(struct fc_seq *sp); @@ -468,26 +467,33 @@ struct libfc_function_template { * Reset an exchange manager, completing all sequences and exchanges. * If s_id is non-zero, reset only exchanges originating from that FID. * If d_id is non-zero, reset only exchanges sending to that FID. + * + * STATUS: OPTIONAL */ void (*exch_mgr_reset)(struct fc_lport *, u32 s_id, u32 d_id); - void (*rport_flush_queue)(void); - /** - * Local Port interfaces + /* + * Flush the rport work queue. Generally used before shutdown. + * + * STATUS: OPTIONAL */ + void (*rport_flush_queue)(void); /* - * Receive a frame to a local port. + * Receive a frame for a local port. + * + * STATUS: OPTIONAL */ void (*lport_recv)(struct fc_lport *lp, struct fc_seq *sp, struct fc_frame *fp); - int (*lport_reset)(struct fc_lport *); - - /** - * Remote Port interfaces + /* + * Reset the local port. + * + * STATUS: OPTIONAL */ + int (*lport_reset)(struct fc_lport *); /* * Create a remote port @@ -502,26 +508,33 @@ struct libfc_function_template { * - PLOGI * - PRLI * - RTV + * + * STATUS: OPTIONAL */ int (*rport_login)(struct fc_rport *rport); /* * Logoff, and remove the rport from the transport if * it had been added. This will send a LOGO to the target. + * + * STATUS: OPTIONAL */ int (*rport_logoff)(struct fc_rport *rport); /* * Recieve a request from a remote port. + * + * STATUS: OPTIONAL */ void (*rport_recv_req)(struct fc_seq *, struct fc_frame *, struct fc_rport *); - struct fc_rport *(*rport_lookup)(const struct fc_lport *, u32); - - /** - * FCP interfaces + /* + * lookup an rport by it's port ID. + * + * STATUS: OPTIONAL */ + struct fc_rport *(*rport_lookup)(const struct fc_lport *, u32); /* * Send a fcp cmd from fsp pkt. @@ -529,30 +542,38 @@ struct libfc_function_template { * * The resp handler is called when FCP_RSP received. * + * STATUS: OPTIONAL */ int (*fcp_cmd_send)(struct fc_lport *lp, struct fc_fcp_pkt *fsp, void (*resp)(struct fc_seq *, struct fc_frame *fp, void *arg)); /* - * Used at least durring linkdown and reset + * Cleanup the FCP layer, used durring link down and reset + * + * STATUS: OPTIONAL */ void (*fcp_cleanup)(struct fc_lport *lp); /* * Abort all I/O on a local port + * + * STATUS: OPTIONAL */ void (*fcp_abort_io)(struct fc_lport *lp); - /** - * Discovery interfaces + /* + * Receive a request for the discovery layer. + * + * STATUS: OPTIONAL */ - void (*disc_recv_req)(struct fc_seq *, struct fc_frame *, struct fc_lport *); /* * Start discovery for a local port. + * + * STATUS: OPTIONAL */ void (*disc_start)(void (*disc_callback)(struct fc_lport *, enum fc_disc_event), @@ -561,6 +582,8 @@ struct libfc_function_template { /* * Stop discovery for a given lport. This will remove * all discovered rports + * + * STATUS: OPTIONAL */ void (*disc_stop) (struct fc_lport *); @@ -568,6 +591,8 @@ struct libfc_function_template { * Stop discovery for a given lport. This will block * until all discovered rports are deleted from the * FC transport class + * + * STATUS: OPTIONAL */ void (*disc_stop_final) (struct fc_lport *); }; From ff392c497b43ddedbab5627b53928a654cc5486e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:36 -0500 Subject: [PATCH 281/580] xfs: prevent kernel crash due to corrupted inode log format Andras Korn reported an oops on log replay causes by a corrupted xfs_inode_log_format_t passing a 0 size to kmem_zalloc. This patch handles to small or too large numbers of log regions gracefully by rejecting the log replay with a useful error message. Signed-off-by: Christoph Hellwig Reported-by: Andras Korn Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/xfs_log_recover.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b1047de2fffd..61af610d79b3 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1455,10 +1455,19 @@ xlog_recover_add_to_trans( item = item->ri_prev; if (item->ri_total == 0) { /* first region to be added */ - item->ri_total = in_f->ilf_size; - ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM); - item->ri_buf = kmem_zalloc((item->ri_total * - sizeof(xfs_log_iovec_t)), KM_SLEEP); + if (in_f->ilf_size == 0 || + in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { + xlog_warn( + "XFS: bad number of regions (%d) in inode log format", + in_f->ilf_size); + ASSERT(0); + return XFS_ERROR(EIO); + } + + item->ri_total = in_f->ilf_size; + item->ri_buf = + kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), + KM_SLEEP); } ASSERT(item->ri_total > item->ri_cnt); /* Description region is ri_buf[0] */ From 7d46be4a25fdfb503c20bad60a618adebfe2ac5c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:35 -0500 Subject: [PATCH 282/580] xfs: prevent lockdep false positive in xfs_iget_cache_miss The inode can't be locked by anyone else as we just created it a few lines above and it's not been added to any lookup data structure yet. So use a trylock that must succeed to get around the lockdep warnings. Signed-off-by: Christoph Hellwig Reported-by: Alexander Beregalov Reviewed-by: Eric Sandeen Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_iget.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index e2fb6210d4c5..478e587087fe 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -246,9 +246,6 @@ xfs_iget_cache_miss( goto out_destroy; } - if (lock_flags) - xfs_ilock(ip, lock_flags); - /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload @@ -256,7 +253,16 @@ xfs_iget_cache_miss( */ if (radix_tree_preload(GFP_KERNEL)) { error = EAGAIN; - goto out_unlock; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); } mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); @@ -284,7 +290,6 @@ xfs_iget_cache_miss( out_preload_end: write_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); -out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); out_destroy: From c141b2928fe20396a9ecdec85526e4b66ae96c90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:37 -0500 Subject: [PATCH 283/580] xfs: only issues a cache flush on unmount if barriers are enabled Currently we unconditionally issue a flush from xfs_free_buftarg, but since 2.6.29-rc1 this gives a warning in the style of end_request: I/O error, dev vdb, sector 0 Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_buf.c | 12 ++++++++++-- fs/xfs/linux-2.6/xfs_buf.h | 2 +- fs/xfs/linux-2.6/xfs_super.c | 10 +++++----- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index cb329edc925b..aa1016bb9134 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -34,6 +34,12 @@ #include #include +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_ag.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" + static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); STATIC int xfsbufd_wakeup(int, gfp_t); @@ -1435,10 +1441,12 @@ xfs_unregister_buftarg( void xfs_free_buftarg( - xfs_buftarg_t *btp) + struct xfs_mount *mp, + struct xfs_buftarg *btp) { xfs_flush_buftarg(btp, 1); - xfs_blkdev_issue_flush(btp); + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_blkdev_issue_flush(btp); xfs_free_bufhash(btp); iput(btp->bt_mapping->host); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 288ae7c4c800..9b4d666ad31f 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -413,7 +413,7 @@ static inline int XFS_bwrite(xfs_buf_t *bp) * Handling of buftargs. */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); -extern void xfs_free_buftarg(xfs_buftarg_t *); +extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index c71e226da7f5..32ae5028e96b 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -734,15 +734,15 @@ xfs_close_devices( { if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { struct block_device *logdev = mp->m_logdev_targp->bt_bdev; - xfs_free_buftarg(mp->m_logdev_targp); + xfs_free_buftarg(mp, mp->m_logdev_targp); xfs_blkdev_put(logdev); } if (mp->m_rtdev_targp) { struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; - xfs_free_buftarg(mp->m_rtdev_targp); + xfs_free_buftarg(mp, mp->m_rtdev_targp); xfs_blkdev_put(rtdev); } - xfs_free_buftarg(mp->m_ddev_targp); + xfs_free_buftarg(mp, mp->m_ddev_targp); } /* @@ -811,9 +811,9 @@ xfs_open_devices( out_free_rtdev_targ: if (mp->m_rtdev_targp) - xfs_free_buftarg(mp->m_rtdev_targp); + xfs_free_buftarg(mp, mp->m_rtdev_targp); out_free_ddev_targ: - xfs_free_buftarg(mp->m_ddev_targp); + xfs_free_buftarg(mp, mp->m_ddev_targp); out_close_rtdev: if (rtdev) xfs_blkdev_put(rtdev); From ccea34b5d0fbab081496d1860f31acee99fa8a6d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2009 00:44:13 +0900 Subject: [PATCH 284/580] percpu: finer grained locking to break deadlock and allow atomic free Impact: fix deadlock and allow atomic free Percpu allocation always uses GFP_KERNEL and whole alloc/free paths were protected by single mutex. All percpu allocations have been from GFP_KERNEL-safe context and the original allocator had this assumption too. However, by protecting both alloc and free paths with the same mutex, the new allocator creates free -> alloc -> GFP_KERNEL dependency which the original allocator didn't have. This can lead to deadlock if free is called from FS or IO paths. Also, in general, allocators are expected to allow free to be called from atomic context. This patch implements finer grained locking to break the deadlock and allow atomic free. For details, please read the "Synchronization rules" comment. While at it, also add CONTEXT: to function comments to describe which context they expect to be called from and what they do to it. This problem was reported by Thomas Gleixner and Peter Zijlstra. http://thread.gmane.org/gmane.linux.kernel/802384 Signed-off-by: Tejun Heo Reported-by: Thomas Gleixner Reported-by: Peter Zijlstra --- mm/percpu.c | 157 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 122 insertions(+), 35 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 4c8a419119da..bfe6a3afaf45 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; /* - * One mutex to rule them all. + * Synchronization rules. * - * The following mutex is grabbed in the outermost public alloc/free - * interface functions and released only when the operation is - * complete. As such, every function in this file other than the - * outermost functions are called under pcpu_mutex. + * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former + * protects allocation/reclaim paths, chunks and chunk->page arrays. + * The latter is a spinlock and protects the index data structures - + * chunk slots, rbtree, chunks and area maps in chunks. * - * It can easily be switched to use spinlock such that only the area - * allocation and page population commit are protected with it doing - * actual [de]allocation without holding any lock. However, given - * what this allocator does, I think it's better to let them run - * sequentially. + * During allocation, pcpu_alloc_mutex is kept locked all the time and + * pcpu_lock is grabbed and released as necessary. All actual memory + * allocations are done using GFP_KERNEL with pcpu_lock released. + * + * Free path accesses and alters only the index data structures, so it + * can be safely called from atomic context. When memory needs to be + * returned to the system, free path schedules reclaim_work which + * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be + * reclaimed, release both locks and frees the chunks. Note that it's + * necessary to grab both locks to remove a chunk from circulation as + * allocation path might be referencing the chunk with only + * pcpu_alloc_mutex locked. */ -static DEFINE_MUTEX(pcpu_mutex); +static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ +static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ @@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, * kzalloc() is used; otherwise, vmalloc() is used. The returned * memory is always zeroed. * + * CONTEXT: + * Does GFP_KERNEL allocation. + * * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ @@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size) * New slot according to the changed state is determined and @chunk is * moved to the slot. Note that the reserved chunk is never put on * chunk slots. + * + * CONTEXT: + * pcpu_lock. */ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { @@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr, * searchs for the chunk with the highest start address which isn't * beyond @addr. * + * CONTEXT: + * pcpu_lock. + * * RETURNS: * The address of the found chunk. */ @@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * @new: chunk to insert * * Insert @new into address rb tree. + * + * CONTEXT: + * pcpu_lock. */ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) { @@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) * A single allocation can split an area into three areas, so this * function makes sure that @chunk->map has at least two extra slots. * + * CONTEXT: + * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired + * if area map is extended. + * * RETURNS: * 0 if noop, 1 if successfully extended, -errno on failure. */ @@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) if (chunk->map_alloc >= chunk->map_used + 2) return 0; + spin_unlock_irq(&pcpu_lock); + new_alloc = PCPU_DFL_MAP_ALLOC; while (new_alloc < chunk->map_used + 2) new_alloc *= 2; new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); - if (!new) + if (!new) { + spin_lock_irq(&pcpu_lock); return -ENOMEM; + } + + /* + * Acquire pcpu_lock and switch to new area map. Only free + * could have happened inbetween, so map_used couldn't have + * grown. + */ + spin_lock_irq(&pcpu_lock); + BUG_ON(new_alloc < chunk->map_used + 2); size = chunk->map_alloc * sizeof(chunk->map[0]); memcpy(new, chunk->map, size); @@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) * is inserted after the target block. * * @chunk->map must have enough free slots to accomodate the split. + * + * CONTEXT: + * pcpu_lock. */ static void pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) @@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i, * * @chunk->map must have at least two free slots. * + * CONTEXT: + * pcpu_lock. + * * RETURNS: * Allocated offset in @chunk on success, -1 if no matching area is * found. @@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * Free area starting from @freeme to @chunk. Note that this function * only modifies the allocation map. It doesn't depopulate or unmap * the area. + * + * CONTEXT: + * pcpu_lock. */ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) { @@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. If @flush is true, vcache is flushed before unmapping * and tlb after. + * + * CONTEXT: + * pcpu_alloc_mutex. */ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, bool flush) @@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. The area is cleared on return. + * + * CONTEXT: + * pcpu_alloc_mutex, does GFP_KERNEL allocation. */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { @@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available * - * Allocate percpu area of @size bytes aligned at @align. Might - * sleep. Might trigger writeouts. + * Allocate percpu area of @size bytes aligned at @align. + * + * CONTEXT: + * Does GFP_KERNEL allocation. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ static void *pcpu_alloc(size_t size, size_t align, bool reserved) { - void *ptr = NULL; struct pcpu_chunk *chunk; int slot, off; @@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) return NULL; } - mutex_lock(&pcpu_mutex); + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irq(&pcpu_lock); /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; if (size > chunk->contig_hint || pcpu_extend_area_map(chunk) < 0) - goto out_unlock; + goto fail_unlock; off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; - goto out_unlock; + goto fail_unlock; } +restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry(chunk, &pcpu_slot[slot], list) { if (size > chunk->contig_hint) continue; - if (pcpu_extend_area_map(chunk) < 0) - goto out_unlock; + + switch (pcpu_extend_area_map(chunk)) { + case 0: + break; + case 1: + goto restart; /* pcpu_lock dropped, restart */ + default: + goto fail_unlock; + } + off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; @@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) } /* hmmm... no space left, create a new chunk */ + spin_unlock_irq(&pcpu_lock); + chunk = alloc_pcpu_chunk(); if (!chunk) - goto out_unlock; + goto fail_unlock_mutex; + + spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); pcpu_chunk_addr_insert(chunk); - - off = pcpu_alloc_area(chunk, size, align); - if (off < 0) - goto out_unlock; + goto restart; area_found: + spin_unlock_irq(&pcpu_lock); + /* populate, map and clear the area */ if (pcpu_populate_chunk(chunk, off, size)) { + spin_lock_irq(&pcpu_lock); pcpu_free_area(chunk, off); - goto out_unlock; + goto fail_unlock; } - ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off); -out_unlock: - mutex_unlock(&pcpu_mutex); - return ptr; + mutex_unlock(&pcpu_alloc_mutex); + + return __addr_to_pcpu_ptr(chunk->vm->addr + off); + +fail_unlock: + spin_unlock_irq(&pcpu_lock); +fail_unlock_mutex: + mutex_unlock(&pcpu_alloc_mutex); + return NULL; } /** @@ -825,6 +897,9 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) * Allocate percpu area of @size bytes aligned at @align. Might * sleep. Might trigger writeouts. * + * CONTEXT: + * Does GFP_KERNEL allocation. + * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ @@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); * percpu area if arch has set it up; otherwise, allocation is served * from the same dynamic area. Might sleep. Might trigger writeouts. * + * CONTEXT: + * Does GFP_KERNEL allocation. + * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ @@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align) * @work: unused * * Reclaim all fully free chunks except for the first one. + * + * CONTEXT: + * workqueue context. */ static void pcpu_reclaim(struct work_struct *work) { @@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work) struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; - mutex_lock(&pcpu_mutex); + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, head, list) { WARN_ON(chunk->immutable); @@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work) list_move(&chunk->list, &todo); } - mutex_unlock(&pcpu_mutex); + spin_unlock_irq(&pcpu_lock); + mutex_unlock(&pcpu_alloc_mutex); list_for_each_entry_safe(chunk, next, &todo, list) { pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); @@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work) * free_percpu - free percpu area * @ptr: pointer to area to free * - * Free percpu area @ptr. Might sleep. + * Free percpu area @ptr. + * + * CONTEXT: + * Can be called from atomic context. */ void free_percpu(void *ptr) { void *addr = __pcpu_ptr_to_addr(ptr); struct pcpu_chunk *chunk; + unsigned long flags; int off; if (!ptr) return; - mutex_lock(&pcpu_mutex); + spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->vm->addr; @@ -917,7 +1004,7 @@ void free_percpu(void *ptr) } } - mutex_unlock(&pcpu_mutex); + spin_unlock_irqrestore(&pcpu_lock, flags); } EXPORT_SYMBOL_GPL(free_percpu); From d15bd1067b1fcb2b7250d22bc0c7c7fea0b759f7 Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" Date: Sat, 7 Mar 2009 13:31:29 +0100 Subject: [PATCH 285/580] kbuild: fix C libary confusion in unifdef.c due to getline() This fixes an error when compiling the kernel. CHK include/linux/version.h HOSTCC scripts/unifdef scripts/unifdef.c:209: error: conflicting types for 'getline' /usr/include/stdio.h:651: note: previous declaration of 'getline' was here make[1]: *** [scripts/unifdef] Error 1 make: *** [__headers] Error 2 Signed-off-by: Justin P. Mattock Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Sam Ravnborg --- scripts/unifdef.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/unifdef.c b/scripts/unifdef.c index 552025e72acb..05a31a6c7e1b 100644 --- a/scripts/unifdef.c +++ b/scripts/unifdef.c @@ -206,7 +206,7 @@ static void done(void); static void error(const char *); static int findsym(const char *); static void flushline(bool); -static Linetype getline(void); +static Linetype get_line(void); static Linetype ifeval(const char **); static void ignoreoff(void); static void ignoreon(void); @@ -512,7 +512,7 @@ process(void) for (;;) { linenum++; - lineval = getline(); + lineval = get_line(); trans_table[ifstate[depth]][lineval](); debug("process %s -> %s depth %d", linetype_name[lineval], @@ -526,7 +526,7 @@ process(void) * help from skipcomment(). */ static Linetype -getline(void) +get_line(void) { const char *cp; int cursym; From a2ebcc7a863761b6d59a4183c600edf5af63b8d2 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Sun, 22 Feb 2009 10:54:55 -0800 Subject: [PATCH 286/580] kbuild: fix mkspec to cleanup RPM_BUILD_ROOT The contents of the %clean section in mkspec is currently commented out leaving RPM_BUILD_ROOT and its contents on the build machine. This patch removes it once the rpm build process is complete. Signed-off-by: Josh Hunt Signed-off-by: Sam Ravnborg --- scripts/package/mkspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/package/mkspec b/scripts/package/mkspec index ee448cdc6a2b..3d93f8c81252 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -96,7 +96,7 @@ echo "%endif" echo "" echo "%clean" -echo '#echo -rf $RPM_BUILD_ROOT' +echo 'rm -rf $RPM_BUILD_ROOT' echo "" echo "%files" echo '%defattr (-, root, root)' From b925dbfe3c59b637666670a60473a15d29e0a7a7 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Thu, 12 Feb 2009 10:16:05 -0800 Subject: [PATCH 287/580] kbuild: fix 'make rpm' when CONFIG_LOCALVERSION_AUTO=y and using SCM tree Running 'make rpm' fails when CONFIG_LOCALVERSION_AUTO=y and using a kernel source tree under SCM. This is due to KERNELRELEASE being different when the initial make is run and when make is run from rpmbuild. mkspec creates kernel.spec using KERNELRELEASE: echo "%files" echo '%defattr (-, root, root)' echo "%dir /lib/modules" echo "/lib/modules/$KERNELRELEASE" echo "/lib/firmware" echo "/boot/*" echo "" When CONFIG_LOCALVERSION_AUTO=y scripts/setlocalversion is called and grabs any additional version info from SCM. Next, the srctree is tarred up and SCM information is excluded. rpmbuild reruns make and in the process generates a new include/config/kernel.release and thus a new KERNELRELEASE. However this time the SCM information is gone so KERNELRELEASE no longer has the additional version information. When "make modules_install" runs, it uses the new KERNELRELEASE value to determine where to install the modules. This conflicts with where the spec file assumes they are going because of the mis-matching KERNELRELEASE versions. + INSTALL_MOD_PATH=/var/tmp/kernel-2.6.29rc4tip01479g5d85422-root + make -j16 modules_install INSTALL crypto/aead.ko INSTALL crypto/cbc.ko INSTALL crypto/chainiv.ko INSTALL crypto/crc32c.ko INSTALL crypto/crypto_algapi.ko INSTALL crypto/crypto_blkcipher.ko INSTALL crypto/crypto_hash.ko INSTALL crypto/cryptomgr.ko INSTALL crypto/ecb.ko INSTALL crypto/eseqiv.ko INSTALL crypto/krng.ko INSTALL crypto/md5.ko INSTALL crypto/pcbc.ko INSTALL crypto/rng.ko INSTALL drivers/block/cciss.ko INSTALL drivers/hid/hid-dummy.ko INSTALL drivers/scsi/iscsi_tcp.ko INSTALL drivers/scsi/libiscsi.ko INSTALL drivers/scsi/libiscsi_tcp.ko INSTALL drivers/scsi/scsi_transport_iscsi.ko INSTALL drivers/scsi/scsi_wait_scan.ko INSTALL fs/lockd/lockd.ko INSTALL fs/nfs/nfs.ko INSTALL fs/nfsd/nfsd.ko INSTALL lib/libcrc32c.ko INSTALL net/sunrpc/sunrpc.ko DEPMOD 2.6.29-rc4-tip + cp arch/x86/boot/bzImage /var/tmp/kernel-2.6.29rc4tip01479g5d85422-root/boot/vmlinuz-2.6.29-rc4-tip-01479-g5d85422 + cp System.map /var/tmp/kernel-2.6.29rc4tip01479g5d85422-root/boot/System.map-2.6.29-rc4-tip-01479-g5d85422 + cp .config /var/tmp/kernel-2.6.29rc4tip01479g5d85422-root/boot/config-2.6.29-rc4-tip-01479-g5d85422 + cp vmlinux vmlinux.orig + bzip2 -9 vmlinux + mv vmlinux.bz2 /var/tmp/kernel-2.6.29rc4tip01479g5d85422-root/boot/vmlinux-2.6.29-rc4-tip-01479-g5d85422.bz2 + mv vmlinux.orig vmlinux + /usr/lib/rpm/brp-compress Processing files: kernel-2.6.29rc4tip01479g5d85422-2 error: File not found: /var/tmp/kernel-2.6.29rc4tip01479g5d85422-root/lib/modules/2.6.29-rc4-tip-01479-g5d85422 RPM build errors: File not found: /var/tmp/kernel-2.6.29rc4tip01479g5d85422-root/lib/modules/2.6.29-rc4-tip-01479-g5d85422 make[1]: *** [rpm] Error 1 make: *** [rpm] Error 2 I have tested this patch on git -tip, Linus' git tree, and the kernel.org tar files, both with and without CONFIG_LOCALVERSION_AUTO=y. Signed-off-by: Josh Hunt Signed-off-by: Sam Ravnborg ---- --- Makefile | 16 +++++++++++----- scripts/package/Makefile | 3 ++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index d04ee0ad1dcc..55424172bea4 100644 --- a/Makefile +++ b/Makefile @@ -904,12 +904,18 @@ localver = $(subst $(space),, $(string) \ # and if the SCM is know a tag from the SCM is appended. # The appended tag is determined by the SCM used. # -# Currently, only git is supported. -# Other SCMs can edit scripts/setlocalversion and add the appropriate -# checks as needed. +# .scmversion is used when generating rpm packages so we do not loose +# the version information from the SCM when we do the build of the kernel +# from the copied source ifdef CONFIG_LOCALVERSION_AUTO - _localver-auto = $(shell $(CONFIG_SHELL) \ - $(srctree)/scripts/setlocalversion $(srctree)) + +ifeq ($(wildcard .scmversion),) + _localver-auto = $(shell $(CONFIG_SHELL) \ + $(srctree)/scripts/setlocalversion $(srctree)) +else + _localver-auto = $(shell cat .scmversion 2> /dev/null) +endif + localver-auto = $(LOCALVERSION)$(_localver-auto) endif diff --git a/scripts/package/Makefile b/scripts/package/Makefile index 8c6b7b09606a..fa4a0a17b7e0 100644 --- a/scripts/package/Makefile +++ b/scripts/package/Makefile @@ -35,9 +35,10 @@ $(objtree)/kernel.spec: $(MKSPEC) $(srctree)/Makefile rpm-pkg rpm: $(objtree)/kernel.spec FORCE $(MAKE) clean $(PREV) ln -sf $(srctree) $(KERNELPATH) + $(CONFIG_SHELL) $(srctree)/scripts/setlocalversion > $(objtree)/.scmversion $(PREV) tar -cz $(RCS_TAR_IGNORE) -f $(KERNELPATH).tar.gz $(KERNELPATH)/. $(PREV) rm $(KERNELPATH) - + rm -f $(objtree)/.scmversion set -e; \ $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version set -e; \ From 75bccd881a49d2da796ec0852158f957dc023f61 Mon Sep 17 00:00:00 2001 From: Gilles Espinasse Date: Sat, 7 Mar 2009 13:57:25 +0100 Subject: [PATCH 288/580] kbuild: remove unused -r option for module-init-tool depmod Following a thread on busybox mailing list depmod -r option is ignored by module-init-tools depmod -r option break busybox depmod. So the best solution look to remove -r from kernel Makefile Signed-off-by: Gilles Espinasse Signed-off-by: Sam Ravnborg --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 55424172bea4..5a5b82eaaec2 100644 --- a/Makefile +++ b/Makefile @@ -1543,7 +1543,7 @@ quiet_cmd_depmod = DEPMOD $(KERNELRELEASE) cmd_depmod = \ if [ -r System.map -a -x $(DEPMOD) ]; then \ $(DEPMOD) -ae -F System.map \ - $(if $(strip $(INSTALL_MOD_PATH)), -b $(INSTALL_MOD_PATH) -r) \ + $(if $(strip $(INSTALL_MOD_PATH)), -b $(INSTALL_MOD_PATH) ) \ $(KERNELRELEASE); \ fi From ab96ddec7213004b632d24dc2cdcd2df5f16f50b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 7 Mar 2009 13:39:22 -0800 Subject: [PATCH 289/580] Input: serio - fix protocol number for TouchIT213 Protocol 0x37 has been reserved for iNexio devices and Sahara was supposed to get 0x38. Reported-by: Claudio Nieder Signed-off-by: Dmitry Torokhov --- include/linux/serio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/serio.h b/include/linux/serio.h index 1bcb357a01a1..e0417e4d3f15 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -212,7 +212,7 @@ static inline void serio_unpin_driver(struct serio *serio) #define SERIO_FUJITSU 0x35 #define SERIO_ZHENHUA 0x36 #define SERIO_INEXIO 0x37 -#define SERIO_TOUCHIT213 0x37 +#define SERIO_TOUCHIT213 0x38 #define SERIO_W8001 0x39 #endif From 3a450de1365d20afde406f0d9b2931a5e4a4fd6a Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 6 Mar 2009 17:30:56 -0600 Subject: [PATCH 290/580] x86: UV: remove uv_flush_tlb_others() WARN_ON In uv_flush_tlb_others() (arch/x86/kernel/tlb_uv.c), the "WARN_ON(!in_atomic())" fails if CONFIG_PREEMPT is not enabled. And CONFIG_PREEMPT is not enabled by default in the distribution that most UV owners will use. We could #ifdef CONFIG_PREEMPT the warning, but that is not good form. And there seems to be no suitable fix to in_atomic() when CONFIG_PREMPT is not on. As Ingo commented: > and we have no proper primitive to test for atomicity. (mainly > because we dont know about atomicity on a non-preempt kernel) So we drop the WARN_ON. Signed-off-by: Cliff Wickman Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index f04549afcfe9..d038b9c45cf8 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -314,8 +314,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, int locals = 0; struct bau_desc *bau_desc; - WARN_ON(!in_atomic()); - cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); uv_cpu = uv_blade_processor_id(); From cda56ac29f2d8288d62978272856884d26e0b47b Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 10 Feb 2009 16:32:33 +0200 Subject: [PATCH 291/580] mmc: fix data timeout for SEND_EXT_CSD Commit 0d3e0460f307e84904968aad6cff97bd688583d8 "MMC: CSD and CID timeout values" inadvertently broke the timeout for the MMC command SEND_EXT_CSD. This patch puts it back again. Depending on the characteristics of the controller, this bug may prevent the use of MMC cards. Signed-off-by: Adrian Hunter Signed-off-by: Pierre Ossman --- drivers/mmc/core/mmc_ops.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index 9c50e6f1c236..34ce2703d29a 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -248,12 +248,15 @@ mmc_send_cxd_data(struct mmc_card *card, struct mmc_host *host, sg_init_one(&sg, data_buf, len); - /* - * The spec states that CSR and CID accesses have a timeout - * of 64 clock cycles. - */ - data.timeout_ns = 0; - data.timeout_clks = 64; + if (opcode == MMC_SEND_CSD || opcode == MMC_SEND_CID) { + /* + * The spec states that CSR and CID accesses have a timeout + * of 64 clock cycles. + */ + data.timeout_ns = 0; + data.timeout_clks = 64; + } else + mmc_set_data_timeout(&data, card); mmc_wait_for_req(host, &mrq); From 1f442d70c84aa798e243e721eba728a98434cd86 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 7 Mar 2009 23:46:26 -0800 Subject: [PATCH 292/580] x86: remove smp_apply_quirks()/smp_checks() Impact: cleanup and code size reduction on 64-bit This code is only applied to Intel Pentium and AMD K7 32-bit cpus. Move those checks to intel_init()/amd_init() for 32-bit so 64-bit will not build this code. Also change to use cpu_index check to see if we need to emit warning. Signed-off-by: Yinghai Lu LKML-Reference: <49B377D2.8030108@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 52 +++++++++++++++++++++++++ arch/x86/kernel/cpu/intel.c | 25 ++++++++++++ arch/x86/kernel/smpboot.c | 78 ------------------------------------- 3 files changed, 77 insertions(+), 78 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 25423a5b80ed..f47df59016c5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 # include @@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) } } +static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + /* calling is from identify_secondary_cpu() ? */ + if (c->cpu_index == boot_cpu_id) + return; + + /* + * Certain Athlons might work (for various values of 'work') in SMP + * but they are not certified as MP capable. + */ + /* Athlon 660/661 is valid. */ + if ((c->x86_model == 6) && ((c->x86_mask == 0) || + (c->x86_mask == 1))) + goto valid_k7; + + /* Duron 670 is valid */ + if ((c->x86_model == 7) && (c->x86_mask == 0)) + goto valid_k7; + + /* + * Athlon 662, Duron 671, and Athlon >model 7 have capability + * bit. It's worth noting that the A5 stepping (662) of some + * Athlon XP's have the MP bit set. + * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for + * more. + */ + if (((c->x86_model == 6) && (c->x86_mask >= 2)) || + ((c->x86_model == 7) && (c->x86_mask >= 1)) || + (c->x86_model > 7)) + if (cpu_has_mp) + goto valid_k7; + + /* If we get here, not a certified SMP capable AMD system. */ + + /* + * Don't taint if we are running SMP kernel on a single non-MP + * approved Athlon + */ + WARN_ONCE(1, "WARNING: This combination of AMD" + "processors is not suitable for SMP.\n"); + if (!test_taint(TAINT_UNSAFE_SMP)) + add_taint(TAINT_UNSAFE_SMP); + +valid_k7: + ; +#endif +} + static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) { u32 l, h; @@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) } set_cpu_cap(c, X86_FEATURE_K7); + + amd_k7_smp_check(c); } #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 25c559ba8d54..191117f1ad51 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -110,6 +111,28 @@ static void __cpuinit trap_init_f00f_bug(void) } #endif +static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + /* calling is from identify_secondary_cpu() ? */ + if (c->cpu_index == boot_cpu_id) + return; + + /* + * Mask B, Pentium, but not Pentium MMX + */ + if (c->x86 == 5 && + c->x86_mask >= 1 && c->x86_mask <= 4 && + c->x86_model <= 3) { + /* + * Remember we have B step Pentia with bugs + */ + WARN_ONCE(1, "WARNING: SMP operation may be unreliable" + "with B stepping processors.\n"); + } +#endif +} + static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) { unsigned long lo, hi; @@ -186,6 +209,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_NUMAQ numaq_tsc_disable(); #endif + + intel_smp_check(c); } #else static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 249334f5080a..ef7d10170c30 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; - -/* Set if we find a B stepping CPU */ -static int __cpuinitdata smp_b_stepping; - #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) /* which logical CPUs are on which nodes */ @@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void) cpumask_set_cpu(cpuid, cpu_callin_mask); } -static int __cpuinitdata unsafe_smp; - /* * Activate a secondary processor. */ @@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused) cpu_idle(); } -static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) -{ - /* - * Mask B, Pentium, but not Pentium MMX - */ - if (c->x86_vendor == X86_VENDOR_INTEL && - c->x86 == 5 && - c->x86_mask >= 1 && c->x86_mask <= 4 && - c->x86_model <= 3) - /* - * Remember we have B step Pentia with bugs - */ - smp_b_stepping = 1; - - /* - * Certain Athlons might work (for various values of 'work') in SMP - * but they are not certified as MP capable. - */ - if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { - - if (num_possible_cpus() == 1) - goto valid_k7; - - /* Athlon 660/661 is valid. */ - if ((c->x86_model == 6) && ((c->x86_mask == 0) || - (c->x86_mask == 1))) - goto valid_k7; - - /* Duron 670 is valid */ - if ((c->x86_model == 7) && (c->x86_mask == 0)) - goto valid_k7; - - /* - * Athlon 662, Duron 671, and Athlon >model 7 have capability - * bit. It's worth noting that the A5 stepping (662) of some - * Athlon XP's have the MP bit set. - * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for - * more. - */ - if (((c->x86_model == 6) && (c->x86_mask >= 2)) || - ((c->x86_model == 7) && (c->x86_mask >= 1)) || - (c->x86_model > 7)) - if (cpu_has_mp) - goto valid_k7; - - /* If we get here, not a certified SMP capable AMD system. */ - unsafe_smp = 1; - } - -valid_k7: - ; -} - -static void __cpuinit smp_checks(void) -{ - if (smp_b_stepping) - printk(KERN_WARNING "WARNING: SMP operation may be unreliable" - "with B stepping processors.\n"); - - /* - * Don't taint if we are running SMP kernel on a single non-MP - * approved Athlon - */ - if (unsafe_smp && num_online_cpus() > 1) { - printk(KERN_INFO "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); - add_taint(TAINT_UNSAFE_SMP); - } -} - /* * The bootstrap kernel entry code has set these up. Save them for * a given CPU @@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id) c->cpu_index = id; if (id != 0) identify_secondary_cpu(c); - smp_apply_quirks(c); } @@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) pr_debug("Boot done.\n"); impress_friends(); - smp_checks(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif From 8827247ffcc9e880cbe4705655065cf011265157 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Sat, 7 Mar 2009 13:34:19 +0800 Subject: [PATCH 293/580] x86: don't define __this_fixmap_does_not_exist() Impact: improve out-of-range fixmap index debugging Commit "1b42f51630c7eebce6fb780b480731eb81afd325" defined the __this_fixmap_does_not_exist() function with a WARN_ON(1) in it. This causes the linker to not report an error when __this_fixmap_does_not_exist() is called with a non-constant parameter. Ingo defined __this_fixmap_does_not_exist() because he wanted to get virt addresses of fix memory of nest level by non-constant index. But we can fix this and still keep the link-time check: We can get the four slot virt addresses on link time and store them to array slot_virt[]. Then we can then refer the slot_virt with non-constant index, in the ioremap-leak detection code. Signed-off-by: Wang Chen LKML-Reference: <49B2075B.4070509@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 62773abdf088..96786ef2c9a9 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -504,13 +504,19 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr) return &bm_pte[pte_index(addr)]; } +static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; + void __init early_ioremap_init(void) { pmd_t *pmd; + int i; if (early_ioremap_debug) printk(KERN_INFO "early_ioremap_init()\n"); + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + slot_virt[i] = fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); pmd_populate_kernel(&init_mm, pmd, bm_pte); @@ -577,6 +583,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx) static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; + static int __init check_early_ioremap_leak(void) { int count = 0; @@ -598,7 +605,8 @@ static int __init check_early_ioremap_leak(void) } late_initcall(check_early_ioremap_leak); -static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) +static void __init __iomem * +__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) { unsigned long offset, last_addr; unsigned int nrpages; @@ -664,9 +672,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo --nrpages; } if (early_ioremap_debug) - printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); + printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); - prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0)); + prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); return prev_map[slot]; } @@ -734,8 +742,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) } prev_map[slot] = NULL; } - -void __this_fixmap_does_not_exist(void) -{ - WARN_ON(1); -} From 4302e5d53b9166d45317e3ddf0a7a9dab3efd43b Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Thu, 5 Mar 2009 11:45:48 +0100 Subject: [PATCH 294/580] MIPS: compat: Implement is_compat_task. This is a build fix required after "x86-64: seccomp: fix 32/64 syscall hole" (commit 5b1017404aea6d2e552e991b3fd814d839e9cd67). MIPS doesn't have the issue that was fixed for x86-64 by that patch. This also doesn't solve the N32 issue which is that N32 seccomp processes will be treated as non-compat processes thus only have access to N64 syscalls. Signed-off-by: Ralf Baechle Signed-off-by: Linus Torvalds --- arch/mips/include/asm/compat.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index ac5d541368e9..6c5b40905dd6 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -3,6 +3,8 @@ /* * Architecture specific compatibility types */ +#include +#include #include #include #include @@ -218,4 +220,9 @@ struct compat_shmid64_ds { compat_ulong_t __unused2; }; +static inline int is_compat_task(void) +{ + return test_thread_flag(TIF_32BIT); +} + #endif /* _ASM_COMPAT_H */ From e954ef20c29b7af07a8cb5452f14fb69e3d9d2b2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 5 Mar 2009 12:04:57 -0800 Subject: [PATCH 295/580] x86: fix warning about nodeid Impact: cleanup Ingo found there warning about nodeid with some configs. try to use for_each_online_node for non numa too. in that case nodeid will be 0. also move out boundary checking from setup_node_bootmem(), so non-numa config will not check it. Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49B03069.80001@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2966c6b8d304..db81e9a8556b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -806,11 +806,6 @@ static unsigned long __init setup_node_bootmem(int nodeid, { unsigned long bootmap_size; - if (start_pfn > max_low_pfn) - return bootmap; - if (end_pfn > max_low_pfn) - end_pfn = max_low_pfn; - /* don't touch min_low_pfn */ bootmap_size = init_bootmem_node(NODE_DATA(nodeid), bootmap >> PAGE_SHIFT, @@ -843,13 +838,23 @@ void __init setup_bootmem_allocator(void) max_pfn_mapped< max_low_pfn) + continue; + if (end_pfn > max_low_pfn) + end_pfn = max_low_pfn; #else - bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap); + start_pfn = 0; + end_pfn = max_low_pfn; #endif + bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, + bootmap); + } after_bootmem = 1; } From d0fc63f7bd07cb779a06dc1cdd0c5a14e7f5d562 Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Sun, 8 Mar 2009 20:21:35 +0200 Subject: [PATCH 296/580] x86 mmiotrace: fix remove_kmmio_fault_pages() Impact: fix race+crash in mmiotrace The list manipulation in remove_kmmio_fault_pages() was broken. If more than one consecutive kmmio_fault_page was re-added during the grace period between unregister_kmmio_probe() and remove_kmmio_fault_pages(), the list manipulation failed to remove pages from the release list. After a second grace period the pages get into rcu_free_kmmio_fault_pages() and raise a BUG_ON() kernel crash. The list manipulation is fixed to properly remove pages from the release list. This bug has been present from the very beginning of mmiotrace in the mainline kernel. It was introduced in 0fd0e3da ("x86: mmiotrace full patch, preview 1"); An urgent fix for Linus. Tested by Stuart (on 32-bit) and Pekka (on amd and intel 64-bit systems, nouveau and nvidia proprietary). Signed-off-by: Stuart Bennett Signed-off-by: Pekka Paalanen LKML-Reference: <20090308202135.34933feb@daedalus.pq.iki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 9f205030d9aa..6a518dd08a36 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -451,23 +451,24 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head) static void remove_kmmio_fault_pages(struct rcu_head *head) { - struct kmmio_delayed_release *dr = container_of( - head, - struct kmmio_delayed_release, - rcu); + struct kmmio_delayed_release *dr = + container_of(head, struct kmmio_delayed_release, rcu); struct kmmio_fault_page *p = dr->release_list; struct kmmio_fault_page **prevp = &dr->release_list; unsigned long flags; + spin_lock_irqsave(&kmmio_lock, flags); while (p) { - if (!p->count) + if (!p->count) { list_del_rcu(&p->list); - else + prevp = &p->release_next; + } else { *prevp = p->release_next; - prevp = &p->release_next; + } p = p->release_next; } spin_unlock_irqrestore(&kmmio_lock, flags); + /* This is the real RCU destroy call. */ call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); } From 0feca851c1b3cb4ebfa3149144b3d5de0879ebaa Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 6 Mar 2009 10:09:26 -0800 Subject: [PATCH 297/580] x86-32: make sure virt_addr_valid() returns false for fixmap addresses I found that virt_addr_valid() was returning true for fixmap addresses. I'm not sure whether pfn_valid() is supposed to include this test, but there's no harm in being explicit. Signed-off-by: Jeremy Fitzhardinge Cc: Jiri Slaby Cc: Yinghai Lu LKML-Reference: <49B166D6.2080505@goop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 62773abdf088..62def5795730 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -87,6 +87,8 @@ bool __virt_addr_valid(unsigned long x) return false; if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) return false; + if (x >= FIXADDR_START) + return false; return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); } EXPORT_SYMBOL(__virt_addr_valid); From cbd88c8e6f5cdb8d4b9af01df825305200240382 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 9 Mar 2009 10:06:22 -0600 Subject: [PATCH 298/580] lguest: fix crash 'unhandled trap 13 at ' Impact: fix lguest boot crash on modern Intel machines The code in early_init_intel does: if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { u64 misc_enable; rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); And that rdmsr faults (not allowed from non-0 PL). We can get around this by mugging the family ID part of the cpuid. 5 seems like a good number. Of course, this is a hack (how very lguest!). We could just indicate that we don't support MSRs, or implement lguest_rdmst. Reported-by: Patrick McHardy Signed-off-by: Rusty Russell Tested-by: Patrick McHardy --- arch/x86/lguest/boot.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 92f1c6f3e19d..ba5c05e97f10 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -343,6 +343,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, * flush_tlb_user() for both user and kernel mappings unless * the Page Global Enable (PGE) feature bit is set. */ *dx |= 0x00002000; + /* We also lie, and say we're family id 5. 6 or greater + * leads to a rdmsr in early_init_intel which we can't handle. + * Family ID is returned as bits 8-12 in ax. */ + *ax &= 0xFFFFF0FF; + *ax |= 0x00000500; break; case 0x80000000: /* Futureproof this a little: if they ask how much extended From 6db6a5f3ae2ca6b874b0fd97ae16fdc9b5cdd6cc Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 9 Mar 2009 10:06:28 -0600 Subject: [PATCH 299/580] lguest: fix for CONFIG_SPARSE_IRQ=y Impact: remove lots of lguest boot WARN_ON() when CONFIG_SPARSE_IRQ=y We now need to call irq_to_desc_alloc_cpu() before set_irq_chip_and_handler_name(), but we can't do that from init_IRQ (no kmalloc available). So do it as we use interrupts instead. Also means we only alloc for irqs we use, which was the intent of CONFIG_SPARSE_IRQ anyway. Signed-off-by: Rusty Russell Cc: Ingo Molnar --- arch/x86/lguest/boot.c | 16 +++++++++------- drivers/lguest/lguest_device.c | 6 ++++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ba5c05e97f10..960a8d9c049c 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -594,19 +594,21 @@ static void __init lguest_init_IRQ(void) /* Some systems map "vectors" to interrupts weirdly. Lguest has * a straightforward 1 to 1 mapping, so force that here. */ __get_cpu_var(vector_irq)[vector] = i; - if (vector != SYSCALL_VECTOR) { - set_intr_gate(vector, - interrupt[vector-FIRST_EXTERNAL_VECTOR]); - set_irq_chip_and_handler_name(i, &lguest_irq_controller, - handle_level_irq, - "level"); - } + if (vector != SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); } /* This call is required to set up for 4k stacks, where we have * separate stacks for hard and soft interrupts. */ irq_ctx_init(smp_processor_id()); } +void lguest_setup_irq(unsigned int irq) +{ + irq_to_desc_alloc_cpu(irq, 0); + set_irq_chip_and_handler_name(irq, &lguest_irq_controller, + handle_level_irq, "level"); +} + /* * Time. * diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index b4d44e571d76..8132533d71f9 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -212,6 +212,9 @@ static void lg_notify(struct virtqueue *vq) hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0); } +/* An extern declaration inside a C file is bad form. Don't do it. */ +extern void lguest_setup_irq(unsigned int irq); + /* This routine finds the first virtqueue described in the configuration of * this device and sets it up. * @@ -266,6 +269,9 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, goto unmap; } + /* Make sure the interrupt is allocated. */ + lguest_setup_irq(lvq->config.irq); + /* Tell the interrupt for this virtqueue to go to the virtio_ring * interrupt handler. */ /* FIXME: We used to have a flag for the Host to tell us we could use From 0796e75503adc6b0a119493ce2e599fb5fd8f96e Mon Sep 17 00:00:00 2001 From: Friedrich Oslage Date: Sun, 8 Mar 2009 20:13:42 -0700 Subject: [PATCH 300/580] sunhme: Fix qfe parent detection. Signed-off-by: Friedrich Oslage Signed-off-by: David S. Miller --- drivers/net/sunhme.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/sunhme.c b/drivers/net/sunhme.c index cc4013be5e18..0ff837a9c804 100644 --- a/drivers/net/sunhme.c +++ b/drivers/net/sunhme.c @@ -2630,8 +2630,6 @@ static int __devinit happy_meal_sbus_probe_one(struct of_device *op, int is_qfe) int err = -ENODEV; sbus_dp = to_of_device(op->dev.parent)->node; - if (is_qfe) - sbus_dp = to_of_device(op->dev.parent->parent)->node; /* We can match PCI devices too, do not accept those here. */ if (strcmp(sbus_dp->name, "sbus")) From 6d5b5acca9e566515ef3f1ed617e7295c4f94345 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 9 Mar 2009 13:31:59 +0100 Subject: [PATCH 301/580] Fix fixpoint divide exception in acct_update_integrals Frans Pop reported the crash below when running an s390 kernel under Hercules: Kernel BUG at 000738b4 verbose debug info unavailable! fixpoint divide exception: 0009 #1! SMP Modules linked in: nfs lockd nfs_acl sunrpc ctcm fsm tape_34xx cu3088 tape ccwgroup tape_class ext3 jbd mbcache dm_mirror dm_log dm_snapshot dm_mod dasd_eckd_mod dasd_mod CPU: 0 Not tainted 2.6.27.19 #13 Process awk (pid: 2069, task: 0f9ed9b8, ksp: 0f4f7d18) Krnl PSW : 070c1000 800738b4 (acct_update_integrals+0x4c/0x118) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:0 CC:1 PM:0 Krnl GPRS: 00000000 000007d0 7fffffff fffff830 00000000 ffffffff 00000002 0f9ed9b8 00000000 00008ca0 00000000 0f9ed9b8 0f9edda4 8007386e 0f4f7ec8 0f4f7e98 Krnl Code: 800738aa: a71807d0 lhi %r1,2000 800738ae: 8c200001 srdl %r2,1 800738b2: 1d21 dr %r2,%r1 >800738b4: 5810d10e l %r1,270(%r13) 800738b8: 1823 lr %r2,%r3 800738ba: 4130f060 la %r3,96(%r15) 800738be: 0de1 basr %r14,%r1 800738c0: 5800f060 l %r0,96(%r15) Call Trace: ( <000000000004fdea>! blocking_notifier_call_chain+0x1e/0x2c) <0000000000038502>! do_exit+0x106/0x7c0 <0000000000038c36>! do_group_exit+0x7a/0xb4 <0000000000038c8e>! SyS_exit_group+0x1e/0x30 <0000000000021c28>! sysc_do_restart+0x12/0x16 <0000000077e7e924>! 0x77e7e924 Reason for this is that cpu time accounting usually only happens from interrupt context, but acct_update_integrals gets also called from process context with interrupts enabled. So in acct_update_integrals we may end up with the following scenario: Between reading tsk->stime/tsk->utime and tsk->acct_timexpd an interrupt happens which updates accouting values. This causes acct_timexpd to be greater than the former stime + utime. The subsequent calculation of dtime = cputime_sub(time, tsk->acct_timexpd); will be negative and the division performed by cputime_to_jiffies(dtime) will generate an exception since the result won't fit into a 32 bit register. In order to fix this just always disable interrupts while accessing any of the accounting values. Reported by: Frans Pop Tested by: Frans Pop Cc: stable@kernel.org Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 43f891b05a4b..00d59d048edf 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -122,8 +122,10 @@ void acct_update_integrals(struct task_struct *tsk) if (likely(tsk->mm)) { cputime_t time, dtime; struct timeval value; + unsigned long flags; u64 delta; + local_irq_save(flags); time = tsk->stime + tsk->utime; dtime = cputime_sub(time, tsk->acct_timexpd); jiffies_to_timeval(cputime_to_jiffies(dtime), &value); @@ -131,10 +133,12 @@ void acct_update_integrals(struct task_struct *tsk) delta = delta * USEC_PER_SEC + value.tv_usec; if (delta == 0) - return; + goto out; tsk->acct_timexpd = time; tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + out: + local_irq_restore(flags); } } From b9447ef80bd301b932ac4d85c9622e929de5fd62 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 9 Mar 2009 11:45:38 -0400 Subject: [PATCH 302/580] Btrfs: fix spinlock assertions on UP systems btrfs_tree_locked was being used to make sure a given extent_buffer was properly locked in a few places. But, it wasn't correct for UP compiled kernels. This switches it to using assert_spin_locked instead, and renames it to btrfs_assert_tree_locked to better reflect how it was really being used. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 10 +++++----- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/locking.c | 6 +++--- fs/btrfs/locking.h | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 42491d728e99..37f31b5529aa 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -277,7 +277,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (*cow_ret == buf) unlock_orig = 1; - WARN_ON(!btrfs_tree_locked(buf)); + btrfs_assert_tree_locked(buf); if (parent) parent_start = parent->start; @@ -2365,7 +2365,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (slot >= btrfs_header_nritems(upper) - 1) return 1; - WARN_ON(!btrfs_tree_locked(path->nodes[1])); + btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); btrfs_tree_lock(right); @@ -2562,7 +2562,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (right_nritems == 0) return 1; - WARN_ON(!btrfs_tree_locked(path->nodes[1])); + btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); btrfs_tree_lock(left); @@ -4101,7 +4101,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) next = read_node_slot(root, c, slot); if (!path->skip_locking) { - WARN_ON(!btrfs_tree_locked(c)); + btrfs_assert_tree_locked(c); btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } @@ -4126,7 +4126,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) reada_for_search(root, path, level, slot, 0); next = read_node_slot(root, next, 0); if (!path->skip_locking) { - WARN_ON(!btrfs_tree_locked(path->nodes[level])); + btrfs_assert_tree_locked(path->nodes[level]); btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index adda739a0215..3e18175248e0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -857,7 +857,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *btree_inode = root->fs_info->btree_inode; if (btrfs_header_generation(buf) == root->fs_info->running_transaction->transid) { - WARN_ON(!btrfs_tree_locked(buf)); + btrfs_assert_tree_locked(buf); /* ugh, clear_extent_buffer_dirty can be expensive */ btrfs_set_lock_blocking(buf); @@ -2361,7 +2361,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) btrfs_set_lock_blocking(buf); - WARN_ON(!btrfs_tree_locked(buf)); + btrfs_assert_tree_locked(buf); if (transid != root->fs_info->generation) { printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " "found %llu running %llu\n", diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6b5966aacf44..9abf81f71c46 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4418,13 +4418,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); - BUG_ON(!btrfs_tree_locked(parent)); + btrfs_assert_tree_locked(parent); parent_level = btrfs_header_level(parent); extent_buffer_get(parent); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); - BUG_ON(!btrfs_tree_locked(node)); + btrfs_assert_tree_locked(node); level = btrfs_header_level(node); extent_buffer_get(node); path->nodes[level] = node; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 85506c4a3af7..47b0a88c12a2 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -220,8 +220,8 @@ int btrfs_tree_unlock(struct extent_buffer *eb) return 0; } -int btrfs_tree_locked(struct extent_buffer *eb) +void btrfs_assert_tree_locked(struct extent_buffer *eb) { - return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || - spin_is_locked(&eb->lock); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + assert_spin_locked(&eb->lock); } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 6bb0afbff928..6c4ce457168c 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -21,11 +21,11 @@ int btrfs_tree_lock(struct extent_buffer *eb); int btrfs_tree_unlock(struct extent_buffer *eb); -int btrfs_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_lock(struct extent_buffer *eb); int btrfs_try_spin_lock(struct extent_buffer *eb); void btrfs_set_lock_blocking(struct extent_buffer *eb); void btrfs_clear_lock_blocking(struct extent_buffer *eb); +void btrfs_assert_tree_locked(struct extent_buffer *eb); #endif From ed75d8635abde0c43d26152014e386706475120e Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Thu, 5 Mar 2009 13:25:06 +0000 Subject: [PATCH 303/580] powerpc: fix linkstation and storcenter compilation breakage Defining flash partition table in platform code is deprecated, and due to recent changes linkstation and storcenter do not compile any more with their default configurations because of undefined references to physmap_set_partitions(). Instead of fixing them by using the correct kernel configuration macro in preprocessor conditional, remove partition table definitions altogether. Instead add support for partition definition on the command-line and in device tree to the default configurations. Signed-off-by: Guennadi Liakhovetski Signed-off-by: Kumar Gala --- arch/powerpc/configs/linkstation_defconfig | 36 +++++++++++------- arch/powerpc/configs/storcenter_defconfig | 35 ++++++++++------- .../platforms/embedded6xx/linkstation.c | 38 ------------------- .../platforms/embedded6xx/storcenter.c | 32 ---------------- 4 files changed, 44 insertions(+), 97 deletions(-) diff --git a/arch/powerpc/configs/linkstation_defconfig b/arch/powerpc/configs/linkstation_defconfig index aa5855a156de..15900dcf0bfa 100644 --- a/arch/powerpc/configs/linkstation_defconfig +++ b/arch/powerpc/configs/linkstation_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29-rc2 -# Mon Jan 26 15:35:29 2009 +# Linux kernel version: 2.6.29-rc6 +# Fri Mar 6 00:07:38 2009 # # CONFIG_PPC64 is not set @@ -71,6 +71,15 @@ CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set # CONFIG_AUDIT is not set + +# +# RCU Subsystem +# +CONFIG_CLASSIC_RCU=y +# CONFIG_TREE_RCU is not set +# CONFIG_PREEMPT_RCU is not set +# CONFIG_TREE_RCU_TRACE is not set +# CONFIG_PREEMPT_RCU_TRACE is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 @@ -88,6 +97,7 @@ CONFIG_NAMESPACES=y # CONFIG_IPC_NS is not set # CONFIG_USER_NS is not set # CONFIG_PID_NS is not set +# CONFIG_NET_NS is not set CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -153,11 +163,6 @@ CONFIG_DEFAULT_AS=y # CONFIG_DEFAULT_CFQ is not set # CONFIG_DEFAULT_NOOP is not set CONFIG_DEFAULT_IOSCHED="anticipatory" -CONFIG_CLASSIC_RCU=y -# CONFIG_TREE_RCU is not set -# CONFIG_PREEMPT_RCU is not set -# CONFIG_TREE_RCU_TRACE is not set -# CONFIG_PREEMPT_RCU_TRACE is not set # CONFIG_FREEZER is not set # @@ -294,7 +299,6 @@ CONFIG_NET=y # # Networking options # -# CONFIG_NET_NS is not set CONFIG_COMPAT_NET_DEV_OPS=y CONFIG_PACKET=y CONFIG_PACKET_MMAP=y @@ -508,8 +512,8 @@ CONFIG_MTD_CONCAT=y CONFIG_MTD_PARTITIONS=y # CONFIG_MTD_TESTS is not set # CONFIG_MTD_REDBOOT_PARTS is not set -# CONFIG_MTD_CMDLINE_PARTS is not set -# CONFIG_MTD_OF_PARTS is not set +CONFIG_MTD_CMDLINE_PARTS=y +CONFIG_MTD_OF_PARTS=y # CONFIG_MTD_AR7_PARTS is not set # @@ -587,7 +591,6 @@ CONFIG_MTD_PHYSMAP=y # LPDDR flash memory drivers # # CONFIG_MTD_LPDDR is not set -# CONFIG_MTD_QINFO_PROBE is not set # # UBI - Unsorted block images @@ -617,13 +620,19 @@ CONFIG_BLK_DEV_RAM_SIZE=8192 # CONFIG_BLK_DEV_HD is not set CONFIG_MISC_DEVICES=y # CONFIG_PHANTOM is not set -# CONFIG_EEPROM_93CX6 is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set # CONFIG_ICS932S401 is not set # CONFIG_ENCLOSURE_SERVICES is not set # CONFIG_HP_ILO is not set # CONFIG_C2PORT is not set + +# +# EEPROM support +# +# CONFIG_EEPROM_AT24 is not set +CONFIG_EEPROM_LEGACY=m +# CONFIG_EEPROM_93CX6 is not set CONFIG_HAVE_IDE=y # CONFIG_IDE is not set @@ -839,6 +848,7 @@ CONFIG_R8169=y # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set # CONFIG_ATL1E is not set +# CONFIG_ATL1C is not set # CONFIG_JME is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set @@ -1037,8 +1047,6 @@ CONFIG_I2C_MPC=y # Miscellaneous I2C Chip support # # CONFIG_DS1682 is not set -# CONFIG_EEPROM_AT24 is not set -CONFIG_EEPROM_LEGACY=m # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set # CONFIG_SENSORS_PCA9539 is not set diff --git a/arch/powerpc/configs/storcenter_defconfig b/arch/powerpc/configs/storcenter_defconfig index 86512c8790d1..94903465ea12 100644 --- a/arch/powerpc/configs/storcenter_defconfig +++ b/arch/powerpc/configs/storcenter_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29-rc2 -# Mon Jan 26 15:35:46 2009 +# Linux kernel version: 2.6.29-rc6 +# Fri Mar 6 00:09:08 2009 # # CONFIG_PPC64 is not set @@ -71,6 +71,15 @@ CONFIG_SYSVIPC_SYSCTL=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set # CONFIG_AUDIT is not set + +# +# RCU Subsystem +# +CONFIG_CLASSIC_RCU=y +# CONFIG_TREE_RCU is not set +# CONFIG_PREEMPT_RCU is not set +# CONFIG_TREE_RCU_TRACE is not set +# CONFIG_PREEMPT_RCU_TRACE is not set # CONFIG_IKCONFIG is not set CONFIG_LOG_BUF_SHIFT=14 CONFIG_GROUP_SCHED=y @@ -144,11 +153,6 @@ CONFIG_IOSCHED_CFQ=y CONFIG_DEFAULT_CFQ=y # CONFIG_DEFAULT_NOOP is not set CONFIG_DEFAULT_IOSCHED="cfq" -CONFIG_CLASSIC_RCU=y -# CONFIG_TREE_RCU is not set -# CONFIG_PREEMPT_RCU is not set -# CONFIG_TREE_RCU_TRACE is not set -# CONFIG_PREEMPT_RCU_TRACE is not set # CONFIG_FREEZER is not set # @@ -377,8 +381,8 @@ CONFIG_MTD=y CONFIG_MTD_PARTITIONS=y # CONFIG_MTD_TESTS is not set # CONFIG_MTD_REDBOOT_PARTS is not set -# CONFIG_MTD_CMDLINE_PARTS is not set -# CONFIG_MTD_OF_PARTS is not set +CONFIG_MTD_CMDLINE_PARTS=y +CONFIG_MTD_OF_PARTS=y # CONFIG_MTD_AR7_PARTS is not set # @@ -452,7 +456,6 @@ CONFIG_MTD_PHYSMAP=y # LPDDR flash memory drivers # # CONFIG_MTD_LPDDR is not set -# CONFIG_MTD_QINFO_PROBE is not set # # UBI - Unsorted block images @@ -478,13 +481,19 @@ CONFIG_BLK_DEV=y # CONFIG_BLK_DEV_HD is not set CONFIG_MISC_DEVICES=y # CONFIG_PHANTOM is not set -# CONFIG_EEPROM_93CX6 is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set # CONFIG_ICS932S401 is not set # CONFIG_ENCLOSURE_SERVICES is not set # CONFIG_HP_ILO is not set # CONFIG_C2PORT is not set + +# +# EEPROM support +# +# CONFIG_EEPROM_AT24 is not set +# CONFIG_EEPROM_LEGACY is not set +# CONFIG_EEPROM_93CX6 is not set CONFIG_HAVE_IDE=y CONFIG_IDE=y @@ -677,6 +686,7 @@ CONFIG_R8169=y # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set # CONFIG_ATL1E is not set +# CONFIG_ATL1C is not set # CONFIG_JME is not set # CONFIG_NETDEV_10000 is not set # CONFIG_TR is not set @@ -818,8 +828,6 @@ CONFIG_I2C_MPC=y # Miscellaneous I2C Chip support # # CONFIG_DS1682 is not set -# CONFIG_EEPROM_AT24 is not set -# CONFIG_EEPROM_LEGACY is not set # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set # CONFIG_SENSORS_PCA9539 is not set @@ -1159,6 +1167,7 @@ CONFIG_JFFS2_RTIME=y # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set # CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_EXPORTFS=m # # Partition Types diff --git a/arch/powerpc/platforms/embedded6xx/linkstation.c b/arch/powerpc/platforms/embedded6xx/linkstation.c index 2ca7be65c2d2..244f997de791 100644 --- a/arch/powerpc/platforms/embedded6xx/linkstation.c +++ b/arch/powerpc/platforms/embedded6xx/linkstation.c @@ -12,7 +12,6 @@ #include #include -#include #include #include @@ -22,39 +21,6 @@ #include "mpc10x.h" -static struct mtd_partition linkstation_physmap_partitions[] = { - { - .name = "mtd_firmimg", - .offset = 0x000000, - .size = 0x300000, - }, - { - .name = "mtd_bootcode", - .offset = 0x300000, - .size = 0x070000, - }, - { - .name = "mtd_status", - .offset = 0x370000, - .size = 0x010000, - }, - { - .name = "mtd_conf", - .offset = 0x380000, - .size = 0x080000, - }, - { - .name = "mtd_allflash", - .offset = 0x000000, - .size = 0x400000, - }, - { - .name = "mtd_data", - .offset = 0x310000, - .size = 0x0f0000, - }, -}; - static __initdata struct of_device_id of_bus_ids[] = { { .type = "soc", }, { .compatible = "simple-bus", }, @@ -99,10 +65,6 @@ static int __init linkstation_add_bridge(struct device_node *dev) static void __init linkstation_setup_arch(void) { struct device_node *np; -#ifdef CONFIG_MTD_PHYSMAP - physmap_set_partitions(linkstation_physmap_partitions, - ARRAY_SIZE(linkstation_physmap_partitions)); -#endif /* Lookup PCI host bridges */ for_each_compatible_node(np, "pci", "mpc10x-pci") diff --git a/arch/powerpc/platforms/embedded6xx/storcenter.c b/arch/powerpc/platforms/embedded6xx/storcenter.c index 8864e4884980..613070e9ddbe 100644 --- a/arch/powerpc/platforms/embedded6xx/storcenter.c +++ b/arch/powerpc/platforms/embedded6xx/storcenter.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -26,32 +25,6 @@ #include "mpc10x.h" -#ifdef CONFIG_MTD_PHYSMAP -static struct mtd_partition storcenter_physmap_partitions[] = { - { - .name = "kernel", - .offset = 0x000000, - .size = 0x170000, - }, - { - .name = "rootfs", - .offset = 0x170000, - .size = 0x590000, - }, - { - .name = "uboot", - .offset = 0x700000, - .size = 0x040000, - }, - { - .name = "config", - .offset = 0x740000, - .size = 0x0c0000, - }, -}; -#endif - - static __initdata struct of_device_id storcenter_of_bus[] = { { .name = "soc", }, {}, @@ -96,11 +69,6 @@ static void __init storcenter_setup_arch(void) { struct device_node *np; -#ifdef CONFIG_MTD_PHYSMAP - physmap_set_partitions(storcenter_physmap_partitions, - ARRAY_SIZE(storcenter_physmap_partitions)); -#endif - /* Lookup PCI host bridges */ for_each_compatible_node(np, "pci", "mpc10x-pci") storcenter_add_bridge(np); From 129f8ae9b1b5be94517da76009ea956e89104ce8 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 9 Mar 2009 15:07:33 -0400 Subject: [PATCH 304/580] Revert "[CPUFREQ] Disable sysfs ui for p4-clockmod." This reverts commit e088e4c9cdb618675874becb91b2fd581ee707e6. Removing the sysfs interface for p4-clockmod was flagged as a regression in bug 12826. Course of action: - Find out the remaining causes of overheating, and fix them if possible. ACPI should be doing the right thing automatically. If it isn't, we need to fix that. - mark p4-clockmod ui as deprecated - try again with the removal in six months. It's not really feasible to printk about the deprecation, because it needs to happen at all the sysfs entry points, which means adding a lot of strcmp("p4-clockmod".. calls to the core, which.. bleuch. Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 1 - drivers/cpufreq/cpufreq.c | 51 ++++++++--------------- include/linux/cpufreq.h | 1 - 3 files changed, 18 insertions(+), 35 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index b585e04cbc9e..3178c3acd97e 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -277,7 +277,6 @@ static struct cpufreq_driver p4clockmod_driver = { .name = "p4-clockmod", .owner = THIS_MODULE, .attr = p4clockmod_attr, - .hide_interface = 1, }; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b55cb67435bd..d6daf3c507d3 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -754,11 +754,6 @@ static struct kobj_type ktype_cpufreq = { .release = cpufreq_sysfs_release, }; -static struct kobj_type ktype_empty_cpufreq = { - .sysfs_ops = &sysfs_ops, - .release = cpufreq_sysfs_release, -}; - /** * cpufreq_add_dev - add a CPU device @@ -892,36 +887,26 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) memcpy(&new_policy, policy, sizeof(struct cpufreq_policy)); /* prepare interface data */ - if (!cpufreq_driver->hide_interface) { - ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, - &sys_dev->kobj, "cpufreq"); + ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq, &sys_dev->kobj, + "cpufreq"); + if (ret) + goto err_out_driver_exit; + + /* set up files for this cpu device */ + drv_attr = cpufreq_driver->attr; + while ((drv_attr) && (*drv_attr)) { + ret = sysfs_create_file(&policy->kobj, &((*drv_attr)->attr)); if (ret) goto err_out_driver_exit; - - /* set up files for this cpu device */ - drv_attr = cpufreq_driver->attr; - while ((drv_attr) && (*drv_attr)) { - ret = sysfs_create_file(&policy->kobj, - &((*drv_attr)->attr)); - if (ret) - goto err_out_driver_exit; - drv_attr++; - } - if (cpufreq_driver->get) { - ret = sysfs_create_file(&policy->kobj, - &cpuinfo_cur_freq.attr); - if (ret) - goto err_out_driver_exit; - } - if (cpufreq_driver->target) { - ret = sysfs_create_file(&policy->kobj, - &scaling_cur_freq.attr); - if (ret) - goto err_out_driver_exit; - } - } else { - ret = kobject_init_and_add(&policy->kobj, &ktype_empty_cpufreq, - &sys_dev->kobj, "cpufreq"); + drv_attr++; + } + if (cpufreq_driver->get) { + ret = sysfs_create_file(&policy->kobj, &cpuinfo_cur_freq.attr); + if (ret) + goto err_out_driver_exit; + } + if (cpufreq_driver->target) { + ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); if (ret) goto err_out_driver_exit; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 384b38d3e8e2..161042746afc 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -234,7 +234,6 @@ struct cpufreq_driver { int (*suspend) (struct cpufreq_policy *policy, pm_message_t pmsg); int (*resume) (struct cpufreq_policy *policy); struct freq_attr **attr; - bool hide_interface; }; /* flags */ From 753b7aea8e4611433c13ac157f944d8b4bf42482 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 9 Mar 2009 15:14:37 -0400 Subject: [PATCH 305/580] [CPUFREQ] Add p4-clockmod sysfs-ui removal to feature-removal schedule. Signed-off-by: Matthew Garrett Signed-off-by: Dave Jones --- Documentation/feature-removal-schedule.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 5ddbe350487a..20d3b94703a4 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -335,3 +335,12 @@ Why: In 2.6.18 the Secmark concept was introduced to replace the "compat_net" Secmark, it is time to deprecate the older mechanism and start the process of removing the old code. Who: Paul Moore +--------------------------- + +What: sysfs ui for changing p4-clockmod parameters +When: September 2009 +Why: See commits 129f8ae9b1b5be94517da76009ea956e89104ce8 and + e088e4c9cdb618675874becb91b2fd581ee707e6. + Removal is subject to fixing any remaining bugs in ACPI which may + cause the thermal throttling not to happen at the right time. +Who: Dave Jones , Matthew Garrett From 2d5516cbb9daf7d0e342a2e3b0fc6f8c39a81205 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 2 Mar 2009 22:58:45 +0100 Subject: [PATCH 306/580] copy_process: fix CLONE_PARENT && parent_exec_id interaction CLONE_PARENT can fool the ->self_exec_id/parent_exec_id logic. If we re-use the old parent, we must also re-use ->parent_exec_id to make sure exit_notify() sees the right ->xxx_exec_id's when the CLONE_PARENT'ed task exits. Also, move down the "p->parent_exec_id = p->self_exec_id" thing, to place two different cases together. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Andrew Morton Cc: David Howells Cc: Serge E. Hallyn Signed-off-by: Linus Torvalds --- kernel/fork.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index a66fbde20715..4854c2c4a82e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1179,10 +1179,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif clear_all_latency_tracing(p); - /* Our parent execution domain becomes current domain - These must match for thread signalling to apply */ - p->parent_exec_id = p->self_exec_id; - /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; @@ -1220,10 +1216,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, set_task_cpu(p, smp_processor_id()); /* CLONE_PARENT re-uses the old parent */ - if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) + if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; - else + p->parent_exec_id = current->parent_exec_id; + } else { p->real_parent = current; + p->parent_exec_id = current->self_exec_id; + } spin_lock(¤t->sighand->siglock); From ba087e6f69381de6c91d6634aa0f603a2fdc96a9 Mon Sep 17 00:00:00 2001 From: Nobuhiro Iwamatsu Date: Fri, 6 Mar 2009 02:51:14 +0000 Subject: [PATCH 307/580] sh: Add media/soc_camera.h to board setup of Renesas AP325RXA Other compilation errors were revised by commit of "sh: ap325rxa: Revert ov772x support" (08c2f5b4d76f83213e379b12df504269d21c9e7c) but other compilation errors are given. We revert this commit and need to add new header(media/soc_camera.h). This change revises new compilation error. Signed-off-by: Nobuhiro Iwamatsu Signed-off-by: Paul Mundt --- arch/sh/boards/board-ap325rxa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/boards/board-ap325rxa.c b/arch/sh/boards/board-ap325rxa.c index 72da416f6162..15b6d450fbf0 100644 --- a/arch/sh/boards/board-ap325rxa.c +++ b/arch/sh/boards/board-ap325rxa.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include