kernel-fxtec-pro1x/arch/powerpc/kernel/irq.c

556 lines
14 KiB
C
Raw Normal View History

/*
* Derived from arch/i386/kernel/irq.c
* Copyright (C) 1992 Linus Torvalds
* Adapted from arch/i386 by Gary Thomas
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
* Updated and modified by Cort Dougan <cort@fsmlabs.com>
* Copyright (C) 1996-2001 Cort Dougan
* Adapted for Power Macintosh by Paul Mackerras
* Copyright (C) 1996 Paul Mackerras (paulus@cs.anu.edu.au)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* This file contains the code used by various IRQ handling routines:
* asking for different IRQ's should be done through these routines
* instead of just grabbing them. Thus setups with different IRQ numbers
* shouldn't result in any weird surprises, and installing new handlers
* should be easier.
*
* The MPC8xx has an interrupt mask in the SIU. If a bit is set, the
* interrupt is _enabled_. As expected, IRQ0 is bit 0 in the 32-bit
* mask register (of which only 16 are defined), hence the weird shifting
* and complement of the cached_irq_mask. I want to be able to stuff
* this right into the SIU SMASK register.
* Many of the prep/chrp functions are conditional compiled on CONFIG_8xx
* to reduce code space and undefined function references.
*/
2006-07-03 21:36:01 +10:00
#undef DEBUG
#include <linux/export.h>
#include <linux/threads.h>
#include <linux/kernel_stat.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ptrace.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
#include <linux/timex.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/irq.h>
#include <linux/seq_file.h>
#include <linux/cpumask.h>
#include <linux/profile.h>
#include <linux/bitops.h>
2006-07-03 21:36:01 +10:00
#include <linux/list.h>
#include <linux/radix-tree.h>
#include <linux/mutex.h>
#include <linux/bootmem.h>
#include <linux/pci.h>
#include <linux/debugfs.h>
#include <linux/of.h>
#include <linux/of_irq.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/pgtable.h>
#include <asm/irq.h>
#include <asm/cache.h>
#include <asm/prom.h>
#include <asm/ptrace.h>
#include <asm/machdep.h>
2006-07-03 21:36:01 +10:00
#include <asm/udbg.h>
#include <asm/smp.h>
#ifdef CONFIG_PPC64
#include <asm/paca.h>
#include <asm/firmware.h>
#include <asm/lv1call.h>
#endif
#define CREATE_TRACE_POINTS
#include <asm/trace.h>
DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);
int __irq_offset_value;
#ifdef CONFIG_PPC32
EXPORT_SYMBOL(__irq_offset_value);
atomic_t ppc_n_lost_interrupts;
#ifdef CONFIG_TAU_INT
extern int tau_initialized;
extern int tau_interrupts(int);
#endif
#endif /* CONFIG_PPC32 */
#ifdef CONFIG_PPC64
#ifndef CONFIG_SPARSE_IRQ
EXPORT_SYMBOL(irq_desc);
#endif
int distribute_irqs = 1;
static inline notrace unsigned long get_hard_enabled(void)
{
unsigned long enabled;
__asm__ __volatile__("lbz %0,%1(13)"
: "=r" (enabled) : "i" (offsetof(struct paca_struct, hard_enabled)));
return enabled;
}
static inline notrace void set_soft_enabled(unsigned long enable)
{
__asm__ __volatile__("stb %0,%1(13)"
: : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
}
static inline notrace void decrementer_check_overflow(void)
{
u64 now = get_tb_or_rtc();
u64 *next_tb;
preempt_disable();
next_tb = &__get_cpu_var(decrementers_next_tb);
if (now >= *next_tb)
set_dec(1);
preempt_enable();
}
Fix IRQ flag handling naming Fix the IRQ flag handling naming. In linux/irqflags.h under one configuration, it maps: local_irq_enable() -> raw_local_irq_enable() local_irq_disable() -> raw_local_irq_disable() local_irq_save() -> raw_local_irq_save() ... and under the other configuration, it maps: raw_local_irq_enable() -> local_irq_enable() raw_local_irq_disable() -> local_irq_disable() raw_local_irq_save() -> local_irq_save() ... This is quite confusing. There should be one set of names expected of the arch, and this should be wrapped to give another set of names that are expected by users of this facility. Change this to have the arch provide: flags = arch_local_save_flags() flags = arch_local_irq_save() arch_local_irq_restore(flags) arch_local_irq_disable() arch_local_irq_enable() arch_irqs_disabled_flags(flags) arch_irqs_disabled() arch_safe_halt() Then linux/irqflags.h wraps these to provide: raw_local_save_flags(flags) raw_local_irq_save(flags) raw_local_irq_restore(flags) raw_local_irq_disable() raw_local_irq_enable() raw_irqs_disabled_flags(flags) raw_irqs_disabled() raw_safe_halt() with type checking on the flags 'arguments', and then wraps those to provide: local_save_flags(flags) local_irq_save(flags) local_irq_restore(flags) local_irq_disable() local_irq_enable() irqs_disabled_flags(flags) irqs_disabled() safe_halt() with tracing included if enabled. The arch functions can now all be inline functions rather than some of them having to be macros. Signed-off-by: David Howells <dhowells@redhat.com> [X86, FRV, MN10300] Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> [Tile] Signed-off-by: Michal Simek <monstr@monstr.eu> [Microblaze] Tested-by: Catalin Marinas <catalin.marinas@arm.com> [ARM] Acked-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com> [AVR] Acked-by: Tony Luck <tony.luck@intel.com> [IA-64] Acked-by: Hirokazu Takata <takata@linux-m32r.org> [M32R] Acked-by: Greg Ungerer <gerg@uclinux.org> [M68K/M68KNOMMU] Acked-by: Ralf Baechle <ralf@linux-mips.org> [MIPS] Acked-by: Kyle McMartin <kyle@mcmartin.ca> [PA-RISC] Acked-by: Paul Mackerras <paulus@samba.org> [PowerPC] Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com> [S390] Acked-by: Chen Liqin <liqin.chen@sunplusct.com> [Score] Acked-by: Matt Fleming <matt@console-pimps.org> [SH] Acked-by: David S. Miller <davem@davemloft.net> [Sparc] Acked-by: Chris Zankel <chris@zankel.net> [Xtensa] Reviewed-by: Richard Henderson <rth@twiddle.net> [Alpha] Reviewed-by: Yoshinori Sato <ysato@users.sourceforge.jp> [H8300] Cc: starvik@axis.com [CRIS] Cc: jesper.nilsson@axis.com [CRIS] Cc: linux-cris-kernel@axis.com
2010-10-07 14:08:55 +01:00
notrace void arch_local_irq_restore(unsigned long en)
{
/*
* get_paca()->soft_enabled = en;
* Is it ever valid to use local_irq_restore(0) when soft_enabled is 1?
* That was allowed before, and in such a case we do need to take care
* that gcc will set soft_enabled directly via r13, not choose to use
* an intermediate register, lest we're preempted to a different cpu.
*/
set_soft_enabled(en);
if (!en)
return;
#ifdef CONFIG_PPC_STD_MMU_64
if (firmware_has_feature(FW_FEATURE_ISERIES)) {
/*
* Do we need to disable preemption here? Not really: in the
* unlikely event that we're preempted to a different cpu in
* between getting r13, loading its lppaca_ptr, and loading
* its any_int, we might call iseries_handle_interrupts without
* an interrupt pending on the new cpu, but that's no disaster,
* is it? And the business of preempting us off the old cpu
* would itself involve a local_irq_restore which handles the
* interrupt to that cpu.
*
* But use "local_paca->lppaca_ptr" instead of "get_lppaca()"
* to avoid any preemption checking added into get_paca().
*/
if (local_paca->lppaca_ptr->int_dword.any_int)
iseries_handle_interrupts();
}
#endif /* CONFIG_PPC_STD_MMU_64 */
/*
* if (get_paca()->hard_enabled) return;
* But again we need to take care that gcc gets hard_enabled directly
* via r13, not choose to use an intermediate register, lest we're
* preempted to a different cpu in between the two instructions.
*/
if (get_hard_enabled())
return;
/*
* Need to hard-enable interrupts here. Since currently disabled,
* no need to take further asm precautions against preemption; but
* use local_paca instead of get_paca() to avoid preemption checking.
*/
local_paca->hard_enabled = en;
/*
* Trigger the decrementer if we have a pending event. Some processors
* only trigger on edge transitions of the sign bit. We might also
* have disabled interrupts long enough that the decrementer wrapped
* to positive.
*/
decrementer_check_overflow();
/*
* Force the delivery of pending soft-disabled interrupts on PS3.
* Any HV call will have this side effect.
*/
if (firmware_has_feature(FW_FEATURE_PS3_LV1)) {
u64 tmp, tmp2;
lv1_get_version_info(&tmp, &tmp2);
}
__hard_irq_enable();
}
Fix IRQ flag handling naming Fix the IRQ flag handling naming. In linux/irqflags.h under one configuration, it maps: local_irq_enable() -> raw_local_irq_enable() local_irq_disable() -> raw_local_irq_disable() local_irq_save() -> raw_local_irq_save() ... and under the other configuration, it maps: raw_local_irq_enable() -> local_irq_enable() raw_local_irq_disable() -> local_irq_disable() raw_local_irq_save() -> local_irq_save() ... This is quite confusing. There should be one set of names expected of the arch, and this should be wrapped to give another set of names that are expected by users of this facility. Change this to have the arch provide: flags = arch_local_save_flags() flags = arch_local_irq_save() arch_local_irq_restore(flags) arch_local_irq_disable() arch_local_irq_enable() arch_irqs_disabled_flags(flags) arch_irqs_disabled() arch_safe_halt() Then linux/irqflags.h wraps these to provide: raw_local_save_flags(flags) raw_local_irq_save(flags) raw_local_irq_restore(flags) raw_local_irq_disable() raw_local_irq_enable() raw_irqs_disabled_flags(flags) raw_irqs_disabled() raw_safe_halt() with type checking on the flags 'arguments', and then wraps those to provide: local_save_flags(flags) local_irq_save(flags) local_irq_restore(flags) local_irq_disable() local_irq_enable() irqs_disabled_flags(flags) irqs_disabled() safe_halt() with tracing included if enabled. The arch functions can now all be inline functions rather than some of them having to be macros. Signed-off-by: David Howells <dhowells@redhat.com> [X86, FRV, MN10300] Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> [Tile] Signed-off-by: Michal Simek <monstr@monstr.eu> [Microblaze] Tested-by: Catalin Marinas <catalin.marinas@arm.com> [ARM] Acked-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Haavard Skinnemoen <haavard.skinnemoen@atmel.com> [AVR] Acked-by: Tony Luck <tony.luck@intel.com> [IA-64] Acked-by: Hirokazu Takata <takata@linux-m32r.org> [M32R] Acked-by: Greg Ungerer <gerg@uclinux.org> [M68K/M68KNOMMU] Acked-by: Ralf Baechle <ralf@linux-mips.org> [MIPS] Acked-by: Kyle McMartin <kyle@mcmartin.ca> [PA-RISC] Acked-by: Paul Mackerras <paulus@samba.org> [PowerPC] Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com> [S390] Acked-by: Chen Liqin <liqin.chen@sunplusct.com> [Score] Acked-by: Matt Fleming <matt@console-pimps.org> [SH] Acked-by: David S. Miller <davem@davemloft.net> [Sparc] Acked-by: Chris Zankel <chris@zankel.net> [Xtensa] Reviewed-by: Richard Henderson <rth@twiddle.net> [Alpha] Reviewed-by: Yoshinori Sato <ysato@users.sourceforge.jp> [H8300] Cc: starvik@axis.com [CRIS] Cc: jesper.nilsson@axis.com [CRIS] Cc: linux-cris-kernel@axis.com
2010-10-07 14:08:55 +01:00
EXPORT_SYMBOL(arch_local_irq_restore);
#endif /* CONFIG_PPC64 */
int arch_show_interrupts(struct seq_file *p, int prec)
{
int j;
#if defined(CONFIG_PPC32) && defined(CONFIG_TAU_INT)
if (tau_initialized) {
seq_printf(p, "%*s: ", prec, "TAU");
for_each_online_cpu(j)
seq_printf(p, "%10u ", tau_interrupts(j));
seq_puts(p, " PowerPC Thermal Assist (cpu temp)\n");
}
#endif /* CONFIG_PPC32 && CONFIG_TAU_INT */
seq_printf(p, "%*s: ", prec, "LOC");
for_each_online_cpu(j)
seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs);
seq_printf(p, " Local timer interrupts\n");
seq_printf(p, "%*s: ", prec, "SPU");
for_each_online_cpu(j)
seq_printf(p, "%10u ", per_cpu(irq_stat, j).spurious_irqs);
seq_printf(p, " Spurious interrupts\n");
seq_printf(p, "%*s: ", prec, "CNT");
for_each_online_cpu(j)
seq_printf(p, "%10u ", per_cpu(irq_stat, j).pmu_irqs);
seq_printf(p, " Performance monitoring interrupts\n");
seq_printf(p, "%*s: ", prec, "MCE");
for_each_online_cpu(j)
seq_printf(p, "%10u ", per_cpu(irq_stat, j).mce_exceptions);
seq_printf(p, " Machine check exceptions\n");
return 0;
}
/*
* /proc/stat helpers
*/
u64 arch_irq_stat_cpu(unsigned int cpu)
{
u64 sum = per_cpu(irq_stat, cpu).timer_irqs;
sum += per_cpu(irq_stat, cpu).pmu_irqs;
sum += per_cpu(irq_stat, cpu).mce_exceptions;
sum += per_cpu(irq_stat, cpu).spurious_irqs;
return sum;
}
#ifdef CONFIG_HOTPLUG_CPU
void migrate_irqs(void)
{
struct irq_desc *desc;
unsigned int irq;
static int warned;
cpumask_var_t mask;
const struct cpumask *map = cpu_online_mask;
alloc_cpumask_var(&mask, GFP_KERNEL);
for_each_irq(irq) {
struct irq_data *data;
struct irq_chip *chip;
desc = irq_to_desc(irq);
if (!desc)
continue;
data = irq_desc_get_irq_data(desc);
if (irqd_is_per_cpu(data))
continue;
chip = irq_data_get_irq_chip(data);
cpumask_and(mask, data->affinity, map);
if (cpumask_any(mask) >= nr_cpu_ids) {
printk("Breaking affinity for irq %i\n", irq);
cpumask_copy(mask, map);
}
if (chip->irq_set_affinity)
chip->irq_set_affinity(data, mask, true);
else if (desc->action && !(warned++))
printk("Cannot set affinity for irq %i\n", irq);
}
free_cpumask_var(mask);
local_irq_enable();
mdelay(1);
local_irq_disable();
}
#endif
static inline void handle_one_irq(unsigned int irq)
{
struct thread_info *curtp, *irqtp;
unsigned long saved_sp_limit;
struct irq_desc *desc;
desc = irq_to_desc(irq);
if (!desc)
return;
/* Switch to the irq stack to handle this */
curtp = current_thread_info();
irqtp = hardirq_ctx[smp_processor_id()];
if (curtp == irqtp) {
/* We're already on the irq stack, just handle it */
desc->handle_irq(irq, desc);
return;
}
saved_sp_limit = current->thread.ksp_limit;
irqtp->task = curtp->task;
irqtp->flags = 0;
/* Copy the softirq bits in preempt_count so that the
* softirq checks work in the hardirq context. */
irqtp->preempt_count = (irqtp->preempt_count & ~SOFTIRQ_MASK) |
(curtp->preempt_count & SOFTIRQ_MASK);
current->thread.ksp_limit = (unsigned long)irqtp +
_ALIGN_UP(sizeof(struct thread_info), 16);
call_handle_irq(irq, desc, irqtp, desc->handle_irq);
current->thread.ksp_limit = saved_sp_limit;
irqtp->task = NULL;
/* Set any flag that may have been set on the
* alternate stack
*/
if (irqtp->flags)
set_bits(irqtp->flags, &curtp->flags);
}
static inline void check_stack_overflow(void)
{
#ifdef CONFIG_DEBUG_STACKOVERFLOW
long sp;
sp = __get_SP() & (THREAD_SIZE-1);
/* check for stack overflow: is there less than 2KB free? */
if (unlikely(sp < (sizeof(struct thread_info) + 2048))) {
printk("do_IRQ: stack overflow: %ld\n",
sp - sizeof(struct thread_info));
dump_stack();
}
#endif
}
void do_IRQ(struct pt_regs *regs)
{
IRQ: Maintain regs pointer globally rather than passing to IRQ handlers Maintain a per-CPU global "struct pt_regs *" variable which can be used instead of passing regs around manually through all ~1800 interrupt handlers in the Linux kernel. The regs pointer is used in few places, but it potentially costs both stack space and code to pass it around. On the FRV arch, removing the regs parameter from all the genirq function results in a 20% speed up of the IRQ exit path (ie: from leaving timer_interrupt() to leaving do_IRQ()). Where appropriate, an arch may override the generic storage facility and do something different with the variable. On FRV, for instance, the address is maintained in GR28 at all times inside the kernel as part of general exception handling. Having looked over the code, it appears that the parameter may be handed down through up to twenty or so layers of functions. Consider a USB character device attached to a USB hub, attached to a USB controller that posts its interrupts through a cascaded auxiliary interrupt controller. A character device driver may want to pass regs to the sysrq handler through the input layer which adds another few layers of parameter passing. I've build this code with allyesconfig for x86_64 and i386. I've runtested the main part of the code on FRV and i386, though I can't test most of the drivers. I've also done partial conversion for powerpc and MIPS - these at least compile with minimal configurations. This will affect all archs. Mostly the changes should be relatively easy. Take do_IRQ(), store the regs pointer at the beginning, saving the old one: struct pt_regs *old_regs = set_irq_regs(regs); And put the old one back at the end: set_irq_regs(old_regs); Don't pass regs through to generic_handle_irq() or __do_IRQ(). In timer_interrupt(), this sort of change will be necessary: - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING, regs); + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); I'd like to move update_process_times()'s use of get_irq_regs() into itself, except that i386, alone of the archs, uses something other than user_mode(). Some notes on the interrupt handling in the drivers: (*) input_dev() is now gone entirely. The regs pointer is no longer stored in the input_dev struct. (*) finish_unlinks() in drivers/usb/host/ohci-q.c needs checking. It does something different depending on whether it's been supplied with a regs pointer or not. (*) Various IRQ handler function pointers have been moved to type irq_handler_t. Signed-Off-By: David Howells <dhowells@redhat.com> (cherry picked from 1b16e7ac850969f38b375e511e3fa2f474a33867 commit)
2006-10-05 14:55:46 +01:00
struct pt_regs *old_regs = set_irq_regs(regs);
2006-07-03 21:36:01 +10:00
unsigned int irq;
trace_irq_entry(regs);
irq_enter();
check_stack_overflow();
irq = ppc_md.get_irq();
if (irq != NO_IRQ && irq != NO_IRQ_IGNORE)
handle_one_irq(irq);
else if (irq != NO_IRQ_IGNORE)
__get_cpu_var(irq_stat).spurious_irqs++;
irq_exit();
IRQ: Maintain regs pointer globally rather than passing to IRQ handlers Maintain a per-CPU global "struct pt_regs *" variable which can be used instead of passing regs around manually through all ~1800 interrupt handlers in the Linux kernel. The regs pointer is used in few places, but it potentially costs both stack space and code to pass it around. On the FRV arch, removing the regs parameter from all the genirq function results in a 20% speed up of the IRQ exit path (ie: from leaving timer_interrupt() to leaving do_IRQ()). Where appropriate, an arch may override the generic storage facility and do something different with the variable. On FRV, for instance, the address is maintained in GR28 at all times inside the kernel as part of general exception handling. Having looked over the code, it appears that the parameter may be handed down through up to twenty or so layers of functions. Consider a USB character device attached to a USB hub, attached to a USB controller that posts its interrupts through a cascaded auxiliary interrupt controller. A character device driver may want to pass regs to the sysrq handler through the input layer which adds another few layers of parameter passing. I've build this code with allyesconfig for x86_64 and i386. I've runtested the main part of the code on FRV and i386, though I can't test most of the drivers. I've also done partial conversion for powerpc and MIPS - these at least compile with minimal configurations. This will affect all archs. Mostly the changes should be relatively easy. Take do_IRQ(), store the regs pointer at the beginning, saving the old one: struct pt_regs *old_regs = set_irq_regs(regs); And put the old one back at the end: set_irq_regs(old_regs); Don't pass regs through to generic_handle_irq() or __do_IRQ(). In timer_interrupt(), this sort of change will be necessary: - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING, regs); + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); I'd like to move update_process_times()'s use of get_irq_regs() into itself, except that i386, alone of the archs, uses something other than user_mode(). Some notes on the interrupt handling in the drivers: (*) input_dev() is now gone entirely. The regs pointer is no longer stored in the input_dev struct. (*) finish_unlinks() in drivers/usb/host/ohci-q.c needs checking. It does something different depending on whether it's been supplied with a regs pointer or not. (*) Various IRQ handler function pointers have been moved to type irq_handler_t. Signed-Off-By: David Howells <dhowells@redhat.com> (cherry picked from 1b16e7ac850969f38b375e511e3fa2f474a33867 commit)
2006-10-05 14:55:46 +01:00
set_irq_regs(old_regs);
#ifdef CONFIG_PPC_ISERIES
if (firmware_has_feature(FW_FEATURE_ISERIES) &&
get_lppaca()->int_dword.fields.decr_int) {
get_lppaca()->int_dword.fields.decr_int = 0;
/* Signal a fake decrementer interrupt */
timer_interrupt(regs);
}
#endif
trace_irq_exit(regs);
}
void __init init_IRQ(void)
{
if (ppc_md.init_IRQ)
ppc_md.init_IRQ();
exc_lvl_ctx_init();
irq_ctx_init();
}
#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
struct thread_info *critirq_ctx[NR_CPUS] __read_mostly;
struct thread_info *dbgirq_ctx[NR_CPUS] __read_mostly;
struct thread_info *mcheckirq_ctx[NR_CPUS] __read_mostly;
void exc_lvl_ctx_init(void)
{
struct thread_info *tp;
int i, cpu_nr;
for_each_possible_cpu(i) {
#ifdef CONFIG_PPC64
cpu_nr = i;
#else
cpu_nr = get_hard_smp_processor_id(i);
#endif
memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE);
tp = critirq_ctx[cpu_nr];
tp->cpu = cpu_nr;
tp->preempt_count = 0;
#ifdef CONFIG_BOOKE
memset((void *)dbgirq_ctx[cpu_nr], 0, THREAD_SIZE);
tp = dbgirq_ctx[cpu_nr];
tp->cpu = cpu_nr;
tp->preempt_count = 0;
memset((void *)mcheckirq_ctx[cpu_nr], 0, THREAD_SIZE);
tp = mcheckirq_ctx[cpu_nr];
tp->cpu = cpu_nr;
tp->preempt_count = HARDIRQ_OFFSET;
#endif
}
}
#endif
struct thread_info *softirq_ctx[NR_CPUS] __read_mostly;
struct thread_info *hardirq_ctx[NR_CPUS] __read_mostly;
void irq_ctx_init(void)
{
struct thread_info *tp;
int i;
for_each_possible_cpu(i) {
memset((void *)softirq_ctx[i], 0, THREAD_SIZE);
tp = softirq_ctx[i];
tp->cpu = i;
tp->preempt_count = 0;
memset((void *)hardirq_ctx[i], 0, THREAD_SIZE);
tp = hardirq_ctx[i];
tp->cpu = i;
tp->preempt_count = HARDIRQ_OFFSET;
}
}
powerpc: Implement accurate task and CPU time accounting This implements accurate task and cpu time accounting for 64-bit powerpc kernels. Instead of accounting a whole jiffy of time to a task on a timer interrupt because that task happened to be running at the time, we now account time in units of timebase ticks according to the actual time spent by the task in user mode and kernel mode. We also count the time spent processing hardware and software interrupts accurately. This is conditional on CONFIG_VIRT_CPU_ACCOUNTING. If that is not set, we do tick-based approximate accounting as before. To get this accurate information, we read either the PURR (processor utilization of resources register) on POWER5 machines, or the timebase on other machines on * each entry to the kernel from usermode * each exit to usermode * transitions between process context, hard irq context and soft irq context in kernel mode * context switches. On POWER5 systems with shared-processor logical partitioning we also read both the PURR and the timebase at each timer interrupt and context switch in order to determine how much time has been taken by the hypervisor to run other partitions ("steal" time). Unfortunately, since we need values of the PURR on both threads at the same time to accurately calculate the steal time, and since we can only calculate steal time on a per-core basis, the apportioning of the steal time between idle time (time which we ceded to the hypervisor in the idle loop) and actual stolen time is somewhat approximate at the moment. This is all based quite heavily on what s390 does, and it uses the generic interfaces that were added by the s390 developers, i.e. account_system_time(), account_user_time(), etc. This patch doesn't add any new interfaces between the kernel and userspace, and doesn't change the units in which time is reported to userspace by things such as /proc/stat, /proc/<pid>/stat, getrusage(), times(), etc. Internally the various task and cpu times are stored in timebase units, but they are converted to USER_HZ units (1/100th of a second) when reported to userspace. Some precision is therefore lost but there should not be any accumulating error, since the internal accumulation is at full precision. Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-02-24 10:06:59 +11:00
static inline void do_softirq_onstack(void)
{
struct thread_info *curtp, *irqtp;
unsigned long saved_sp_limit = current->thread.ksp_limit;
powerpc: Implement accurate task and CPU time accounting This implements accurate task and cpu time accounting for 64-bit powerpc kernels. Instead of accounting a whole jiffy of time to a task on a timer interrupt because that task happened to be running at the time, we now account time in units of timebase ticks according to the actual time spent by the task in user mode and kernel mode. We also count the time spent processing hardware and software interrupts accurately. This is conditional on CONFIG_VIRT_CPU_ACCOUNTING. If that is not set, we do tick-based approximate accounting as before. To get this accurate information, we read either the PURR (processor utilization of resources register) on POWER5 machines, or the timebase on other machines on * each entry to the kernel from usermode * each exit to usermode * transitions between process context, hard irq context and soft irq context in kernel mode * context switches. On POWER5 systems with shared-processor logical partitioning we also read both the PURR and the timebase at each timer interrupt and context switch in order to determine how much time has been taken by the hypervisor to run other partitions ("steal" time). Unfortunately, since we need values of the PURR on both threads at the same time to accurately calculate the steal time, and since we can only calculate steal time on a per-core basis, the apportioning of the steal time between idle time (time which we ceded to the hypervisor in the idle loop) and actual stolen time is somewhat approximate at the moment. This is all based quite heavily on what s390 does, and it uses the generic interfaces that were added by the s390 developers, i.e. account_system_time(), account_user_time(), etc. This patch doesn't add any new interfaces between the kernel and userspace, and doesn't change the units in which time is reported to userspace by things such as /proc/stat, /proc/<pid>/stat, getrusage(), times(), etc. Internally the various task and cpu times are stored in timebase units, but they are converted to USER_HZ units (1/100th of a second) when reported to userspace. Some precision is therefore lost but there should not be any accumulating error, since the internal accumulation is at full precision. Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-02-24 10:06:59 +11:00
curtp = current_thread_info();
irqtp = softirq_ctx[smp_processor_id()];
irqtp->task = curtp->task;
irqtp->flags = 0;
current->thread.ksp_limit = (unsigned long)irqtp +
_ALIGN_UP(sizeof(struct thread_info), 16);
powerpc: Implement accurate task and CPU time accounting This implements accurate task and cpu time accounting for 64-bit powerpc kernels. Instead of accounting a whole jiffy of time to a task on a timer interrupt because that task happened to be running at the time, we now account time in units of timebase ticks according to the actual time spent by the task in user mode and kernel mode. We also count the time spent processing hardware and software interrupts accurately. This is conditional on CONFIG_VIRT_CPU_ACCOUNTING. If that is not set, we do tick-based approximate accounting as before. To get this accurate information, we read either the PURR (processor utilization of resources register) on POWER5 machines, or the timebase on other machines on * each entry to the kernel from usermode * each exit to usermode * transitions between process context, hard irq context and soft irq context in kernel mode * context switches. On POWER5 systems with shared-processor logical partitioning we also read both the PURR and the timebase at each timer interrupt and context switch in order to determine how much time has been taken by the hypervisor to run other partitions ("steal" time). Unfortunately, since we need values of the PURR on both threads at the same time to accurately calculate the steal time, and since we can only calculate steal time on a per-core basis, the apportioning of the steal time between idle time (time which we ceded to the hypervisor in the idle loop) and actual stolen time is somewhat approximate at the moment. This is all based quite heavily on what s390 does, and it uses the generic interfaces that were added by the s390 developers, i.e. account_system_time(), account_user_time(), etc. This patch doesn't add any new interfaces between the kernel and userspace, and doesn't change the units in which time is reported to userspace by things such as /proc/stat, /proc/<pid>/stat, getrusage(), times(), etc. Internally the various task and cpu times are stored in timebase units, but they are converted to USER_HZ units (1/100th of a second) when reported to userspace. Some precision is therefore lost but there should not be any accumulating error, since the internal accumulation is at full precision. Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-02-24 10:06:59 +11:00
call_do_softirq(irqtp);
current->thread.ksp_limit = saved_sp_limit;
powerpc: Implement accurate task and CPU time accounting This implements accurate task and cpu time accounting for 64-bit powerpc kernels. Instead of accounting a whole jiffy of time to a task on a timer interrupt because that task happened to be running at the time, we now account time in units of timebase ticks according to the actual time spent by the task in user mode and kernel mode. We also count the time spent processing hardware and software interrupts accurately. This is conditional on CONFIG_VIRT_CPU_ACCOUNTING. If that is not set, we do tick-based approximate accounting as before. To get this accurate information, we read either the PURR (processor utilization of resources register) on POWER5 machines, or the timebase on other machines on * each entry to the kernel from usermode * each exit to usermode * transitions between process context, hard irq context and soft irq context in kernel mode * context switches. On POWER5 systems with shared-processor logical partitioning we also read both the PURR and the timebase at each timer interrupt and context switch in order to determine how much time has been taken by the hypervisor to run other partitions ("steal" time). Unfortunately, since we need values of the PURR on both threads at the same time to accurately calculate the steal time, and since we can only calculate steal time on a per-core basis, the apportioning of the steal time between idle time (time which we ceded to the hypervisor in the idle loop) and actual stolen time is somewhat approximate at the moment. This is all based quite heavily on what s390 does, and it uses the generic interfaces that were added by the s390 developers, i.e. account_system_time(), account_user_time(), etc. This patch doesn't add any new interfaces between the kernel and userspace, and doesn't change the units in which time is reported to userspace by things such as /proc/stat, /proc/<pid>/stat, getrusage(), times(), etc. Internally the various task and cpu times are stored in timebase units, but they are converted to USER_HZ units (1/100th of a second) when reported to userspace. Some precision is therefore lost but there should not be any accumulating error, since the internal accumulation is at full precision. Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-02-24 10:06:59 +11:00
irqtp->task = NULL;
/* Set any flag that may have been set on the
* alternate stack
*/
if (irqtp->flags)
set_bits(irqtp->flags, &curtp->flags);
powerpc: Implement accurate task and CPU time accounting This implements accurate task and cpu time accounting for 64-bit powerpc kernels. Instead of accounting a whole jiffy of time to a task on a timer interrupt because that task happened to be running at the time, we now account time in units of timebase ticks according to the actual time spent by the task in user mode and kernel mode. We also count the time spent processing hardware and software interrupts accurately. This is conditional on CONFIG_VIRT_CPU_ACCOUNTING. If that is not set, we do tick-based approximate accounting as before. To get this accurate information, we read either the PURR (processor utilization of resources register) on POWER5 machines, or the timebase on other machines on * each entry to the kernel from usermode * each exit to usermode * transitions between process context, hard irq context and soft irq context in kernel mode * context switches. On POWER5 systems with shared-processor logical partitioning we also read both the PURR and the timebase at each timer interrupt and context switch in order to determine how much time has been taken by the hypervisor to run other partitions ("steal" time). Unfortunately, since we need values of the PURR on both threads at the same time to accurately calculate the steal time, and since we can only calculate steal time on a per-core basis, the apportioning of the steal time between idle time (time which we ceded to the hypervisor in the idle loop) and actual stolen time is somewhat approximate at the moment. This is all based quite heavily on what s390 does, and it uses the generic interfaces that were added by the s390 developers, i.e. account_system_time(), account_user_time(), etc. This patch doesn't add any new interfaces between the kernel and userspace, and doesn't change the units in which time is reported to userspace by things such as /proc/stat, /proc/<pid>/stat, getrusage(), times(), etc. Internally the various task and cpu times are stored in timebase units, but they are converted to USER_HZ units (1/100th of a second) when reported to userspace. Some precision is therefore lost but there should not be any accumulating error, since the internal accumulation is at full precision. Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-02-24 10:06:59 +11:00
}
void do_softirq(void)
{
unsigned long flags;
if (in_interrupt())
return;
local_irq_save(flags);
if (local_softirq_pending())
powerpc: Implement accurate task and CPU time accounting This implements accurate task and cpu time accounting for 64-bit powerpc kernels. Instead of accounting a whole jiffy of time to a task on a timer interrupt because that task happened to be running at the time, we now account time in units of timebase ticks according to the actual time spent by the task in user mode and kernel mode. We also count the time spent processing hardware and software interrupts accurately. This is conditional on CONFIG_VIRT_CPU_ACCOUNTING. If that is not set, we do tick-based approximate accounting as before. To get this accurate information, we read either the PURR (processor utilization of resources register) on POWER5 machines, or the timebase on other machines on * each entry to the kernel from usermode * each exit to usermode * transitions between process context, hard irq context and soft irq context in kernel mode * context switches. On POWER5 systems with shared-processor logical partitioning we also read both the PURR and the timebase at each timer interrupt and context switch in order to determine how much time has been taken by the hypervisor to run other partitions ("steal" time). Unfortunately, since we need values of the PURR on both threads at the same time to accurately calculate the steal time, and since we can only calculate steal time on a per-core basis, the apportioning of the steal time between idle time (time which we ceded to the hypervisor in the idle loop) and actual stolen time is somewhat approximate at the moment. This is all based quite heavily on what s390 does, and it uses the generic interfaces that were added by the s390 developers, i.e. account_system_time(), account_user_time(), etc. This patch doesn't add any new interfaces between the kernel and userspace, and doesn't change the units in which time is reported to userspace by things such as /proc/stat, /proc/<pid>/stat, getrusage(), times(), etc. Internally the various task and cpu times are stored in timebase units, but they are converted to USER_HZ units (1/100th of a second) when reported to userspace. Some precision is therefore lost but there should not be any accumulating error, since the internal accumulation is at full precision. Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-02-24 10:06:59 +11:00
do_softirq_onstack();
local_irq_restore(flags);
}
irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
{
irq_domain/powerpc: eliminate irq_map; use irq_alloc_desc() instead This patch drops the powerpc-specific irq_map table and replaces it with directly using the irq_alloc_desc()/irq_free_desc() interfaces for allocating and freeing irq_desc structures. This patch is a preparation step for generalizing the powerpc-specific virq infrastructure to become irq_domains. As part of this change, the irq_big_lock is changed to a mutex from a raw spinlock. There is no longer any need to use a spin lock since the irq_desc allocation code is now responsible for the critical section of finding an unused range of irq numbers. The radix lookup table is also changed to store the irq_data pointer instead of the irq_map entry since the irq_map is removed. This should end up being functionally equivalent since only allocated irq_descs are ever added to the radix tree. v5: - Really don't ever allocate virq 0. The previous version could still do it if hint == 0 - Respect irq_virq_count setting for NOMAP. Some NOMAP domains cannot use virq values above irq_virq_count. - Use numa_node_id() when allocating irq_descs. Ideally the API should obtain that value from the caller, but that touches a lot of call sites so will be deferred to a follow-on patch. - Fix irq_find_mapping() to include irq numbers lower than NUM_ISA_INTERRUPTS. With the switch to irq_alloc_desc*(), the lowest possible allocated irq is now returned by arch_probe_nr_irqs(). v4: - Fix incorrect access to irq_data structure in debugfs code - Don't ever allocate virq 0 Signed-off-by: Grant Likely <grant.likely@secretlab.ca> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Milton Miller <miltonm@bga.com> Tested-by: Olof Johansson <olof@lixom.net>
2012-02-14 14:06:51 -07:00
return d->hwirq;
}
EXPORT_SYMBOL_GPL(irqd_to_hwirq);
irq_hw_number_t virq_to_hw(unsigned int virq)
{
irq_domain/powerpc: eliminate irq_map; use irq_alloc_desc() instead This patch drops the powerpc-specific irq_map table and replaces it with directly using the irq_alloc_desc()/irq_free_desc() interfaces for allocating and freeing irq_desc structures. This patch is a preparation step for generalizing the powerpc-specific virq infrastructure to become irq_domains. As part of this change, the irq_big_lock is changed to a mutex from a raw spinlock. There is no longer any need to use a spin lock since the irq_desc allocation code is now responsible for the critical section of finding an unused range of irq numbers. The radix lookup table is also changed to store the irq_data pointer instead of the irq_map entry since the irq_map is removed. This should end up being functionally equivalent since only allocated irq_descs are ever added to the radix tree. v5: - Really don't ever allocate virq 0. The previous version could still do it if hint == 0 - Respect irq_virq_count setting for NOMAP. Some NOMAP domains cannot use virq values above irq_virq_count. - Use numa_node_id() when allocating irq_descs. Ideally the API should obtain that value from the caller, but that touches a lot of call sites so will be deferred to a follow-on patch. - Fix irq_find_mapping() to include irq numbers lower than NUM_ISA_INTERRUPTS. With the switch to irq_alloc_desc*(), the lowest possible allocated irq is now returned by arch_probe_nr_irqs(). v4: - Fix incorrect access to irq_data structure in debugfs code - Don't ever allocate virq 0 Signed-off-by: Grant Likely <grant.likely@secretlab.ca> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Milton Miller <miltonm@bga.com> Tested-by: Olof Johansson <olof@lixom.net>
2012-02-14 14:06:51 -07:00
struct irq_data *irq_data = irq_get_irq_data(virq);
return WARN_ON(!irq_data) ? 0 : irq_data->hwirq;
}
EXPORT_SYMBOL_GPL(virq_to_hw);
#ifdef CONFIG_SMP
int irq_choose_cpu(const struct cpumask *mask)
{
int cpuid;
if (cpumask_equal(mask, cpu_all_mask)) {
static int irq_rover;
static DEFINE_RAW_SPINLOCK(irq_rover_lock);
unsigned long flags;
/* Round-robin distribution... */
do_round_robin:
raw_spin_lock_irqsave(&irq_rover_lock, flags);
irq_rover = cpumask_next(irq_rover, cpu_online_mask);
if (irq_rover >= nr_cpu_ids)
irq_rover = cpumask_first(cpu_online_mask);
cpuid = irq_rover;
raw_spin_unlock_irqrestore(&irq_rover_lock, flags);
} else {
cpuid = cpumask_first_and(mask, cpu_online_mask);
if (cpuid >= nr_cpu_ids)
goto do_round_robin;
}
return get_hard_smp_processor_id(cpuid);
}
#else
int irq_choose_cpu(const struct cpumask *mask)
{
return hard_smp_processor_id();
}
#endif
2006-07-03 21:36:01 +10:00
int arch_early_irq_init(void)
2006-07-03 21:36:01 +10:00
{
return 0;
2006-07-03 21:36:01 +10:00
}
powerpc: Implement accurate task and CPU time accounting This implements accurate task and cpu time accounting for 64-bit powerpc kernels. Instead of accounting a whole jiffy of time to a task on a timer interrupt because that task happened to be running at the time, we now account time in units of timebase ticks according to the actual time spent by the task in user mode and kernel mode. We also count the time spent processing hardware and software interrupts accurately. This is conditional on CONFIG_VIRT_CPU_ACCOUNTING. If that is not set, we do tick-based approximate accounting as before. To get this accurate information, we read either the PURR (processor utilization of resources register) on POWER5 machines, or the timebase on other machines on * each entry to the kernel from usermode * each exit to usermode * transitions between process context, hard irq context and soft irq context in kernel mode * context switches. On POWER5 systems with shared-processor logical partitioning we also read both the PURR and the timebase at each timer interrupt and context switch in order to determine how much time has been taken by the hypervisor to run other partitions ("steal" time). Unfortunately, since we need values of the PURR on both threads at the same time to accurately calculate the steal time, and since we can only calculate steal time on a per-core basis, the apportioning of the steal time between idle time (time which we ceded to the hypervisor in the idle loop) and actual stolen time is somewhat approximate at the moment. This is all based quite heavily on what s390 does, and it uses the generic interfaces that were added by the s390 developers, i.e. account_system_time(), account_user_time(), etc. This patch doesn't add any new interfaces between the kernel and userspace, and doesn't change the units in which time is reported to userspace by things such as /proc/stat, /proc/<pid>/stat, getrusage(), times(), etc. Internally the various task and cpu times are stored in timebase units, but they are converted to USER_HZ units (1/100th of a second) when reported to userspace. Some precision is therefore lost but there should not be any accumulating error, since the internal accumulation is at full precision. Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-02-24 10:06:59 +11:00
#ifdef CONFIG_PPC64
static int __init setup_noirqdistrib(char *str)
{
distribute_irqs = 0;
return 1;
}
__setup("noirqdistrib", setup_noirqdistrib);
#endif /* CONFIG_PPC64 */