From b64f34cdfe5bef9dfed1304c513220b0f2862eca Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 29 Jan 2008 14:27:30 +0900 Subject: [PATCH 01/27] [IA64] VIRT_CPU_ACCOUNTING (accurate cpu time accounting) This patch implements VIRT_CPU_ACCOUNTING for ia64, which enables us to use more accurate cpu time accounting. VIRT_CPU_ACCOUNTING is a kernel config item that the s390 and powerpc archs already have. By turning this config on, these archs change the mechanism of cpu time accounting from a tick-sampling based one to a state-transition based one. The state-transition based accounting is done by checking the time (cycle counter in processor) at every state-transition point, such as entrance/exit of kernel, interrupt, softirq etc. The difference between two consecutive points is the actual time consumed in that state. There is no doubt that this value is more accurate than that of tick-sampling based accounting. Signed-off-by: Hidetoshi Seto Signed-off-by: Tony Luck --- arch/ia64/Kconfig | 11 ++++ arch/ia64/ia32/elfcore32.h | 14 ++++- arch/ia64/kernel/asm-offsets.c | 6 ++ arch/ia64/kernel/entry.S | 65 +++++++++++++++++++++ arch/ia64/kernel/fsys.S | 26 +++++++++ arch/ia64/kernel/head.S | 20 +++++++ arch/ia64/kernel/ivt.S | 69 ++++++++++++++++++++++ arch/ia64/kernel/minstate.h | 14 +++++ arch/ia64/kernel/time.c | 78 +++++++++++++++++++++++++ include/asm-ia64/cputime.h | 104 +++++++++++++++++++++++++++++++++ include/asm-ia64/system.h | 12 ++++ include/asm-ia64/thread_info.h | 14 +++++ 12 files changed, 432 insertions(+), 1 deletion(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index dff9edfc7465..c3567727c13c 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -280,6 +280,17 @@ config FORCE_MAX_ZONEORDER default "17" if HUGETLB_PAGE default "11" +config VIRT_CPU_ACCOUNTING + bool "Deterministic task and CPU time accounting" + default n + help + Select this option to enable more accurate task and CPU time + accounting. 
This is done by reading a CPU counter on each + kernel entry and exit and on transitions within the kernel + between system, softirq and hardirq state, so there is a + small performance impact. + If in doubt, say N here. + config SMP bool "Symmetric multi-processing support" help diff --git a/arch/ia64/ia32/elfcore32.h b/arch/ia64/ia32/elfcore32.h index 446c9aac924d..9a3abf58cea3 100644 --- a/arch/ia64/ia32/elfcore32.h +++ b/arch/ia64/ia32/elfcore32.h @@ -30,7 +30,19 @@ struct elf_siginfo int si_errno; /* errno */ }; -#define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0) +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +/* + * Hacks are here since types between compat_timeval (= pair of s32) and + * ia64-native timeval (= pair of s64) are not compatible, at least a file + * arch/ia64/ia32/../../../fs/binfmt_elf.c will get warnings from compiler on + * use of cputime_to_timeval(), which usually an alias of jiffies_to_timeval(). + */ +#define cputime_to_timeval(a,b) \ + do { (b)->tv_usec = 0; (b)->tv_sec = (a)/NSEC_PER_SEC; } while(0) +#else +#define jiffies_to_timeval(a,b) \ + do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; } while(0) +#endif struct elf_prstatus { diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index 0aebc6f79e95..5865130b0a92 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -39,6 +39,12 @@ void foo(void) DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + DEFINE(TI_AC_STAMP, offsetof(struct thread_info, ac_stamp)); + DEFINE(TI_AC_LEAVE, offsetof(struct thread_info, ac_leave)); + DEFINE(TI_AC_STIME, offsetof(struct thread_info, ac_stime)); + DEFINE(TI_AC_UTIME, offsetof(struct thread_info, ac_utime)); +#endif BLANK(); diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 3c331c464b40..b0be4a280174 
100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -710,6 +710,16 @@ ENTRY(ia64_leave_syscall) (pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk #endif .work_processed_syscall: +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + adds r2=PT(LOADRS)+16,r12 +(pUStk) mov.m r22=ar.itc // fetch time at leave + adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 + ;; +(p6) ld4 r31=[r18] // load current_thread_info()->flags + ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs" + adds r3=PT(AR_BSPSTORE)+16,r12 // deferred + ;; +#else adds r2=PT(LOADRS)+16,r12 adds r3=PT(AR_BSPSTORE)+16,r12 adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 @@ -718,6 +728,7 @@ ENTRY(ia64_leave_syscall) ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs" nop.i 0 ;; +#endif mov r16=ar.bsp // M2 get existing backing store pointer ld8 r18=[r2],PT(R9)-PT(B6) // load b6 (p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE? @@ -737,12 +748,21 @@ ENTRY(ia64_leave_syscall) ld8 r29=[r2],16 // M0|1 load cr.ipsr ld8 r28=[r3],16 // M0|1 load cr.iip +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +(pUStk) add r14=TI_AC_LEAVE+IA64_TASK_SIZE,r13 + ;; + ld8 r30=[r2],16 // M0|1 load cr.ifs + ld8 r25=[r3],16 // M0|1 load ar.unat +(pUStk) add r15=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 + ;; +#else mov r22=r0 // A clear r22 ;; ld8 r30=[r2],16 // M0|1 load cr.ifs ld8 r25=[r3],16 // M0|1 load ar.unat (pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 ;; +#endif ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs (pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled nop 0 @@ -759,7 +779,11 @@ ENTRY(ia64_leave_syscall) ld8.fill r1=[r3],16 // M0|1 load r1 (pUStk) mov r17=1 // A ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +(pUStk) st1 [r15]=r17 // M2|3 +#else (pUStk) st1 [r14]=r17 // M2|3 +#endif ld8.fill r13=[r3],16 // M0|1 mov f8=f0 // F clear f8 ;; @@ -775,12 +799,22 @@ ENTRY(ia64_leave_syscall) shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition cover // B add current frame into dirty 
partition & set cr.ifs ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + mov r19=ar.bsp // M2 get new backing store pointer + st8 [r14]=r22 // M save time at leave + mov f10=f0 // F clear f10 + + mov r22=r0 // A clear r22 + movl r14=__kernel_syscall_via_epc // X + ;; +#else mov r19=ar.bsp // M2 get new backing store pointer mov f10=f0 // F clear f10 nop.m 0 movl r14=__kernel_syscall_via_epc // X ;; +#endif mov.m ar.csd=r0 // M2 clear ar.csd mov.m ar.ccv=r0 // M2 clear ar.ccv mov b7=r14 // I0 clear b7 (hint with __kernel_syscall_via_epc) @@ -913,10 +947,18 @@ GLOBAL_ENTRY(ia64_leave_kernel) adds r16=PT(CR_IPSR)+16,r12 adds r17=PT(CR_IIP)+16,r12 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + .pred.rel.mutex pUStk,pKStk +(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled +(pUStk) mov.m r22=ar.itc // M fetch time at leave + nop.i 0 + ;; +#else (pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled nop.i 0 nop.i 0 ;; +#endif ld8 r29=[r16],16 // load cr.ipsr ld8 r28=[r17],16 // load cr.iip ;; @@ -938,15 +980,37 @@ GLOBAL_ENTRY(ia64_leave_kernel) ;; ld8.fill r12=[r16],16 ld8.fill r13=[r17],16 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +(pUStk) adds r3=TI_AC_LEAVE+IA64_TASK_SIZE,r18 +#else (pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18 +#endif ;; ld8 r20=[r16],16 // ar.fpsr ld8.fill r15=[r17],16 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +(pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18 // deferred +#endif ;; ld8.fill r14=[r16],16 ld8.fill r2=[r17] (pUStk) mov r17=1 ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + // mmi_ : ld8 st1 shr;; mmi_ : st8 st1 shr;; + // mib : mov add br -> mib : ld8 add br + // bbb_ : br nop cover;; mbb_ : mov br cover;; + // + // no one require bsp in r16 if (pKStk) branch is selected. 
+(pUStk) st8 [r3]=r22 // save time at leave +(pUStk) st1 [r18]=r17 // restore current->thread.on_ustack + shr.u r18=r19,16 // get byte size of existing "dirty" partition + ;; + ld8.fill r3=[r16] // deferred + LOAD_PHYS_STACK_REG_SIZE(r17) +(pKStk) br.cond.dpnt skip_rbs_switch + mov r16=ar.bsp // get existing backing store pointer +#else ld8.fill r3=[r16] (pUStk) st1 [r18]=r17 // restore current->thread.on_ustack shr.u r18=r19,16 // get byte size of existing "dirty" partition @@ -954,6 +1018,7 @@ GLOBAL_ENTRY(ia64_leave_kernel) mov r16=ar.bsp // get existing backing store pointer LOAD_PHYS_STACK_REG_SIZE(r17) (pKStk) br.cond.dpnt skip_rbs_switch +#endif /* * Restore user backing store. diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 44841971f077..c932d86e2d81 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -660,7 +660,11 @@ GLOBAL_ENTRY(fsys_bubble_down) nop.i 0 ;; mov ar.rsc=0 // M2 set enforced lazy mode, pl 0, LE, loadrs=0 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + mov.m r30=ar.itc // M get cycle for accounting +#else nop.m 0 +#endif nop.i 0 ;; mov r23=ar.bspstore // M2 (12 cyc) save ar.bspstore @@ -682,6 +686,28 @@ GLOBAL_ENTRY(fsys_bubble_down) cmp.ne pKStk,pUStk=r0,r0 // A set pKStk <- 0, pUStk <- 1 br.call.sptk.many b7=ia64_syscall_setup // B ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + // mov.m r30=ar.itc is called in advance + add r16=TI_AC_STAMP+IA64_TASK_SIZE,r2 + add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r2 + ;; + ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // time at last check in kernel + ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // time at leave kernel + ;; + ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME // cumulated stime + ld8 r21=[r17] // cumulated utime + sub r22=r19,r18 // stime before leave kernel + ;; + st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP // update stamp + sub r18=r30,r19 // elapsed time in user mode + ;; + add r20=r20,r22 // sum stime + add r21=r21,r18 // sum utime + ;; + st8 [r16]=r20 // update stime + st8 [r17]=r21 // update utime + 
;; +#endif mov ar.rsc=0x3 // M2 set eager mode, pl 0, LE, loadrs=0 mov rp=r14 // I0 set the real return addr and r3=_TIF_SYSCALL_TRACEAUDIT,r3 // A diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S index d3a41d5f8d12..ddeab4e36fd5 100644 --- a/arch/ia64/kernel/head.S +++ b/arch/ia64/kernel/head.S @@ -1002,6 +1002,26 @@ GLOBAL_ENTRY(sched_clock) br.ret.sptk.many rp END(sched_clock) +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +GLOBAL_ENTRY(cycle_to_cputime) + alloc r16=ar.pfs,1,0,0,0 + addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 + ;; + ldf8 f8=[r8] + ;; + setf.sig f9=r32 + ;; + xmpy.lu f10=f9,f8 // calculate low 64 bits of 128-bit product (4 cyc) + xmpy.hu f11=f9,f8 // calculate high 64 bits of 128-bit product + ;; + getf.sig r8=f10 // (5 cyc) + getf.sig r9=f11 + ;; + shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT + br.ret.sptk.many rp +END(cycle_to_cputime) +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + GLOBAL_ENTRY(start_kernel_thread) .prologue .save rp, r0 // this is the end of the call-chain diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S index 34f44d8be00d..6678c49daba3 100644 --- a/arch/ia64/kernel/ivt.S +++ b/arch/ia64/kernel/ivt.S @@ -805,8 +805,13 @@ ENTRY(break_fault) (p8) adds r28=16,r28 // A switch cr.iip to next bundle (p9) adds r8=1,r8 // A increment ei to next slot +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + ;; + mov b6=r30 // I0 setup syscall handler branch reg early +#else nop.i 0 ;; +#endif mov.m r25=ar.unat // M2 (5 cyc) dep r29=r8,r29,41,2 // I0 insert new ei into cr.ipsr @@ -817,7 +822,11 @@ ENTRY(break_fault) // /////////////////////////////////////////////////////////////////////// st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + mov.m r30=ar.itc // M get cycle for accounting +#else mov b6=r30 // I0 setup syscall handler branch reg early +#endif cmp.eq pKStk,pUStk=r0,r17 // A were we on kernel stacks already? 
and r9=_TIF_SYSCALL_TRACEAUDIT,r9 // A mask trace or audit @@ -829,6 +838,30 @@ ENTRY(break_fault) cmp.eq p14,p0=r9,r0 // A are syscalls being traced/audited? br.call.sptk.many b7=ia64_syscall_setup // B 1: +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + // mov.m r30=ar.itc is called in advance, and r13 is current + add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13 // A + add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13 // A +(pKStk) br.cond.spnt .skip_accounting // B unlikely skip + ;; + ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // M get last stamp + ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // M time at leave + ;; + ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME // M cumulated stime + ld8 r21=[r17] // M cumulated utime + sub r22=r19,r18 // A stime before leave + ;; + st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP // M update stamp + sub r18=r30,r19 // A elapsed time in user + ;; + add r20=r20,r22 // A sum stime + add r21=r21,r18 // A sum utime + ;; + st8 [r16]=r20 // M update stime + st8 [r17]=r21 // M update utime + ;; +.skip_accounting: +#endif mov ar.rsc=0x3 // M2 set eager mode, pl 0, LE, loadrs=0 nop 0 bsw.1 // B (6 cyc) regs are saved, switch to bank 1 @@ -928,6 +961,7 @@ END(interrupt) * - r27: saved ar.rsc * - r28: saved cr.iip * - r29: saved cr.ipsr + * - r30: ar.itc for accounting (don't touch) * - r31: saved pr * - b0: original contents (to be saved) * On exit: @@ -1090,6 +1124,41 @@ END(dispatch_illegal_op_fault) DBG_FAULT(16) FAULT(16) +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + /* + * There is no particular reason for this code to be here, other than + * that there happens to be space here that would go unused otherwise. + * If this fault ever gets "unreserved", simply move the following + * code to a more suitable spot... + * + * account_sys_enter is called from SAVE_MIN* macros if accounting is + * enabled and if the macro is entered from user mode. 
+ */ +ENTRY(account_sys_enter) + // mov.m r20=ar.itc is called in advance, and r13 is current + add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13 + add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13 + ;; + ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // time at last check in kernel + ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // time at left from kernel + ;; + ld8 r23=[r16],TI_AC_STAMP-TI_AC_STIME // cumulated stime + ld8 r21=[r17] // cumulated utime + sub r22=r19,r18 // stime before leave kernel + ;; + st8 [r16]=r20,TI_AC_STIME-TI_AC_STAMP // update stamp + sub r18=r20,r19 // elapsed time in user mode + ;; + add r23=r23,r22 // sum stime + add r21=r21,r18 // sum utime + ;; + st8 [r16]=r23 // update stime + st8 [r17]=r21 // update utime + ;; + br.ret.sptk.many rp +END(account_sys_enter) +#endif + .org ia64_ivt+0x4400 ///////////////////////////////////////////////////////////////////////////////////////// // 0x4400 Entry 17 (size 64 bundles) Reserved diff --git a/arch/ia64/kernel/minstate.h b/arch/ia64/kernel/minstate.h index c9ac8bada786..7c548ac52bbc 100644 --- a/arch/ia64/kernel/minstate.h +++ b/arch/ia64/kernel/minstate.h @@ -3,6 +3,18 @@ #include "entry.h" +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +/* read ar.itc in advance, and use it before leaving bank 0 */ +#define ACCOUNT_GET_STAMP \ +(pUStk) mov.m r20=ar.itc; +#define ACCOUNT_SYS_ENTER \ +(pUStk) br.call.spnt rp=account_sys_enter \ + ;; +#else +#define ACCOUNT_GET_STAMP +#define ACCOUNT_SYS_ENTER +#endif + /* * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves * the minimum state necessary that allows us to turn psr.ic back @@ -122,11 +134,13 @@ ;; \ .mem.offset 0,0; st8.spill [r16]=r2,16; \ .mem.offset 8,0; st8.spill [r17]=r3,16; \ + ACCOUNT_GET_STAMP \ adds r2=IA64_PT_REGS_R16_OFFSET,r1; \ ;; \ EXTRA; \ movl r1=__gp; /* establish kernel global pointer */ \ ;; \ + ACCOUNT_SYS_ENTER \ bsw.1; /* switch back to bank 1 (must be last in insn group) */ \ ;; diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 
17fda5293c67..48e15a51782f 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -59,6 +59,84 @@ static struct clocksource clocksource_itc = { }; static struct clocksource *itc_clocksource; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + +#include + +extern cputime_t cycle_to_cputime(u64 cyc); + +/* + * Called from the context switch with interrupts disabled, to charge all + * accumulated times to the current process, and to prepare accounting on + * the next process. + */ +void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next) +{ + struct thread_info *pi = task_thread_info(prev); + struct thread_info *ni = task_thread_info(next); + cputime_t delta_stime, delta_utime; + __u64 now; + + now = ia64_get_itc(); + + delta_stime = cycle_to_cputime(pi->ac_stime + (now - pi->ac_stamp)); + account_system_time(prev, 0, delta_stime); + account_system_time_scaled(prev, delta_stime); + + if (pi->ac_utime) { + delta_utime = cycle_to_cputime(pi->ac_utime); + account_user_time(prev, delta_utime); + account_user_time_scaled(prev, delta_utime); + } + + pi->ac_stamp = ni->ac_stamp = now; + ni->ac_stime = ni->ac_utime = 0; +} + +/* + * Account time for a transition between system, hard irq or soft irq state. + * Note that this function is called with interrupts enabled. + */ +void account_system_vtime(struct task_struct *tsk) +{ + struct thread_info *ti = task_thread_info(tsk); + unsigned long flags; + cputime_t delta_stime; + __u64 now; + + local_irq_save(flags); + + now = ia64_get_itc(); + + delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); + account_system_time(tsk, 0, delta_stime); + account_system_time_scaled(tsk, delta_stime); + ti->ac_stime = 0; + + ti->ac_stamp = now; + + local_irq_restore(flags); +} + +/* + * Called from the timer interrupt handler to charge accumulated user time + * to the current process. Must be called with interrupts disabled. 
+ */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + struct thread_info *ti = task_thread_info(p); + cputime_t delta_utime; + + if (ti->ac_utime) { + delta_utime = cycle_to_cputime(ti->ac_utime); + account_user_time(p, delta_utime); + account_user_time_scaled(p, delta_utime); + ti->ac_utime = 0; + } +} + +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + static irqreturn_t timer_interrupt (int irq, void *dev_id) { diff --git a/include/asm-ia64/cputime.h b/include/asm-ia64/cputime.h index 72400a78002a..f9abdec6577a 100644 --- a/include/asm-ia64/cputime.h +++ b/include/asm-ia64/cputime.h @@ -1,6 +1,110 @@ +/* + * include/asm-ia64/cputime.h: + * Definitions for measuring cputime on ia64 machines. + * + * Based on . + * + * Copyright (C) 2007 FUJITSU LIMITED + * Copyright (C) 2007 Hidetoshi Seto + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * If we have CONFIG_VIRT_CPU_ACCOUNTING, we measure cpu time in nsec. + * Otherwise we measure cpu time in jiffies using the generic definitions. 
+ */ + #ifndef __IA64_CPUTIME_H #define __IA64_CPUTIME_H +#ifndef CONFIG_VIRT_CPU_ACCOUNTING #include +#else +#include +#include +#include + +typedef u64 cputime_t; +typedef u64 cputime64_t; + +#define cputime_zero ((cputime_t)0) +#define cputime_max ((~((cputime_t)0) >> 1) - 1) +#define cputime_add(__a, __b) ((__a) + (__b)) +#define cputime_sub(__a, __b) ((__a) - (__b)) +#define cputime_div(__a, __n) ((__a) / (__n)) +#define cputime_halve(__a) ((__a) >> 1) +#define cputime_eq(__a, __b) ((__a) == (__b)) +#define cputime_gt(__a, __b) ((__a) > (__b)) +#define cputime_ge(__a, __b) ((__a) >= (__b)) +#define cputime_lt(__a, __b) ((__a) < (__b)) +#define cputime_le(__a, __b) ((__a) <= (__b)) + +#define cputime64_zero ((cputime64_t)0) +#define cputime64_add(__a, __b) ((__a) + (__b)) +#define cputime64_sub(__a, __b) ((__a) - (__b)) +#define cputime_to_cputime64(__ct) (__ct) + +/* + * Convert cputime <-> jiffies (HZ) + */ +#define cputime_to_jiffies(__ct) ((__ct) / (NSEC_PER_SEC / HZ)) +#define jiffies_to_cputime(__jif) ((__jif) * (NSEC_PER_SEC / HZ)) +#define cputime64_to_jiffies64(__ct) ((__ct) / (NSEC_PER_SEC / HZ)) +#define jiffies64_to_cputime64(__jif) ((__jif) * (NSEC_PER_SEC / HZ)) + +/* + * Convert cputime <-> milliseconds + */ +#define cputime_to_msecs(__ct) ((__ct) / NSEC_PER_MSEC) +#define msecs_to_cputime(__msecs) ((__msecs) * NSEC_PER_MSEC) + +/* + * Convert cputime <-> seconds + */ +#define cputime_to_secs(__ct) ((__ct) / NSEC_PER_SEC) +#define secs_to_cputime(__secs) ((__secs) * NSEC_PER_SEC) + +/* + * Convert cputime <-> timespec (nsec) + */ +static inline cputime_t timespec_to_cputime(const struct timespec *val) +{ + cputime_t ret = val->tv_sec * NSEC_PER_SEC; + return (ret + val->tv_nsec); +} +static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val) +{ + val->tv_sec = ct / NSEC_PER_SEC; + val->tv_nsec = ct % NSEC_PER_SEC; +} + +/* + * Convert cputime <-> timeval (usec) + */ +static inline cputime_t timeval_to_cputime(struct timeval 
*val) +{ + cputime_t ret = val->tv_sec * NSEC_PER_SEC; + return (ret + val->tv_usec * NSEC_PER_USEC); +} +static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val) +{ + val->tv_sec = ct / NSEC_PER_SEC; + val->tv_usec = (ct % NSEC_PER_SEC) / NSEC_PER_USEC; +} + +/* + * Convert cputime <-> clock (USER_HZ) + */ +#define cputime_to_clock_t(__ct) ((__ct) / (NSEC_PER_SEC / USER_HZ)) +#define clock_t_to_cputime(__x) ((__x) * (NSEC_PER_SEC / USER_HZ)) + +/* + * Convert cputime64 to clock. + */ +#define cputime64_to_clock_t(__ct) cputime_to_clock_t((cputime_t)__ct) + +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ #endif /* __IA64_CPUTIME_H */ diff --git a/include/asm-ia64/system.h b/include/asm-ia64/system.h index 595112bca3cc..dff8128fa58e 100644 --- a/include/asm-ia64/system.h +++ b/include/asm-ia64/system.h @@ -210,6 +210,13 @@ struct task_struct; extern void ia64_save_extra (struct task_struct *task); extern void ia64_load_extra (struct task_struct *task); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct *next); +# define IA64_ACCOUNT_ON_SWITCH(p,n) ia64_account_on_switch(p,n) +#else +# define IA64_ACCOUNT_ON_SWITCH(p,n) +#endif + #ifdef CONFIG_PERFMON DECLARE_PER_CPU(unsigned long, pfm_syst_info); # define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1) @@ -222,6 +229,7 @@ extern void ia64_load_extra (struct task_struct *task); || IS_IA32_PROCESS(task_pt_regs(t)) || PERFMON_IS_SYSWIDE()) #define __switch_to(prev,next,last) do { \ + IA64_ACCOUNT_ON_SWITCH(prev, next); \ if (IA64_HAS_EXTRA_STATE(prev)) \ ia64_save_extra(prev); \ if (IA64_HAS_EXTRA_STATE(next)) \ @@ -266,6 +274,10 @@ void cpu_idle_wait(void); void default_idle(void); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +extern void account_system_vtime(struct task_struct *); +#endif + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h index 
93d83cbe0c8c..6da8069a0f77 100644 --- a/include/asm-ia64/thread_info.h +++ b/include/asm-ia64/thread_info.h @@ -31,6 +31,12 @@ struct thread_info { mm_segment_t addr_limit; /* user-level address space limit */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ struct restart_block restart_block; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + __u64 ac_stamp; + __u64 ac_leave; + __u64 ac_stime; + __u64 ac_utime; +#endif }; #define THREAD_SIZE KERNEL_STACK_SIZE @@ -62,9 +68,17 @@ struct thread_info { #define task_stack_page(tsk) ((void *)(tsk)) #define __HAVE_THREAD_FUNCTIONS +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +#define setup_thread_stack(p, org) \ + *task_thread_info(p) = *task_thread_info(org); \ + task_thread_info(p)->ac_stime = 0; \ + task_thread_info(p)->ac_utime = 0; \ + task_thread_info(p)->task = (p); +#else #define setup_thread_stack(p, org) \ *task_thread_info(p) = *task_thread_info(org); \ task_thread_info(p)->task = (p); +#endif #define end_of_stack(p) (unsigned long *)((void *)(p) + IA64_RBS_OFFSET) #define __HAVE_ARCH_TASK_STRUCT_ALLOCATOR From 4fe01c68eba53c3f324807faff71535218c41e9c Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 29 Jan 2008 14:39:33 +0900 Subject: [PATCH 02/27] [IA64] cleanup and improve fsys_gettimeofday This patch does: - Remove outdated comments (which I once marked with "?"). - Reassemble instructions to fit them in fewer bundles. - If the McKinley Errata 9 workaround is not needed, the workaround bundles will be patched out with NOPs. However, it is also unnecessary to have an all-NOP bundle (nop * 3) before the branch. As a result, this makes the code path 3 (or 2) bundles shorter (and removes 1 unnecessary stop bit). It seems to be 1% faster. 
(10sec loop test, with nojitter @ Madison 1.5GHz x 4) Before: CPU 0: 0.14 (usecs) (0 errors / 69598875 iterations) CPU 1: 0.14 (usecs) (0 errors / 69630721 iterations) CPU 2: 0.14 (usecs) (0 errors / 69607850 iterations) CPU 3: 0.14 (usecs) (0 errors / 69619832 iterations) After: CPU 0: 0.14 (usecs) (0 errors / 70257728 iterations) CPU 1: 0.14 (usecs) (0 errors / 70309498 iterations) CPU 2: 0.14 (usecs) (0 errors / 70280639 iterations) CPU 3: 0.14 (usecs) (0 errors / 70260682 iterations) Signed-off-by: Hidetoshi Seto Signed-off-by: Tony Luck --- arch/ia64/kernel/fsys.S | 30 +++++++++++++----------------- arch/ia64/kernel/patch.c | 8 ++++---- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 44841971f077..6a72db7ddecc 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -210,27 +210,25 @@ ENTRY(fsys_gettimeofday) // Note that instructions are optimized for McKinley. McKinley can // process two bundles simultaneously and therefore we continuously // try to feed the CPU two bundles and then a stop. - // - // Additional note that code has changed a lot. Optimization is TBD. - // Comments begin with "?" are maybe outdated. - tnat.nz p6,p0 = r31 // ? branch deferred to fit later bundle - mov pr = r30,0xc000 // Set predicates according to function + add r2 = TI_FLAGS+IA64_TASK_SIZE,r16 + tnat.nz p6,p0 = r31 // guard against Nat argument +(p6) br.cond.spnt.few .fail_einval movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address ;; + ld4 r2 = [r2] // process work pending flags movl r29 = itc_jitter_data // itc_jitter add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20 // wall_time - ld4 r2 = [r2] // process work pending flags - ;; -(p15) add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20 // monotonic_time add r21 = IA64_CLKSRC_MMIO_OFFSET,r20 - add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29 - and r2 = TIF_ALLWORK_MASK,r2 -(p6) br.cond.spnt.few .fail_einval // ? 
deferred branch + mov pr = r30,0xc000 // Set predicates according to function ;; - add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last + and r2 = TIF_ALLWORK_MASK,r2 + add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29 +(p15) add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20 // monotonic_time + ;; + add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled -(p6) br.cond.spnt.many fsys_fallback_syscall +(p6) br.cond.spnt.many fsys_fallback_syscall ;; // Begin critical section .time_redo: @@ -258,7 +256,6 @@ ENTRY(fsys_gettimeofday) (p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! (p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues.. (p13) ld8 r25 = [r19] // get itc_lastcycle value - ;; // ? could be removed by moving the last add upward ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec ;; ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET // tv_nsec @@ -285,13 +282,12 @@ ENTRY(fsys_gettimeofday) EX(.fail_efault, probe.w.fault r31, 3) xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter) ;; - // ? simulate tbit.nz.or p7,p0 = r28,0 getf.sig r2 = f8 mf ;; ld4 r10 = [r20] // gtod_lock.sequence shr.u r2 = r2,r23 // shift by factor - ;; // ? overloaded 3 bundles! 
+ ;; add r8 = r8,r2 // Add xtime.nsecs cmp4.ne p7,p0 = r28,r10 (p7) br.cond.dpnt.few .time_redo // sequence number changed, redo @@ -319,9 +315,9 @@ EX(.fail_efault, probe.w.fault r31, 3) EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles (p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it ;; - mov r8 = r0 (p14) getf.sig r2 = f8 ;; + mov r8 = r0 (p14) shr.u r21 = r2, 4 ;; EX(.fail_efault, st8 [r31] = r9) diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c index 2cb9425e0421..e0dca8743dbb 100644 --- a/arch/ia64/kernel/patch.c +++ b/arch/ia64/kernel/patch.c @@ -135,10 +135,10 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end) while (offp < (s32 *) end) { wp = (u64 *) ia64_imva((char *) offp + *offp); - wp[0] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */ - wp[1] = 0x0004000000000200UL; - wp[2] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */ - wp[3] = 0x0084006880000200UL; + wp[0] = 0x0000000100000011UL; /* nop.m 0; nop.i 0; br.ret.sptk.many b6 */ + wp[1] = 0x0084006880000200UL; + wp[2] = 0x0000000100000000UL; /* nop.m 0; nop.i 0; nop.i 0 */ + wp[3] = 0x0004000000000200UL; ia64_fc(wp); ia64_fc(wp + 2); ++offp; } From c70f8f68676866d778564de337bec6b8734c3850 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 28 Feb 2008 16:47:50 +0800 Subject: [PATCH 03/27] [IA64] regset: 64-bit support This is the 64-bit regset implementation under IA64. Basically register read/write, which is derived from current ptrace register read/write. 
Signed-off-by: Shaohua Li Signed-off-by: Tony Luck --- arch/ia64/kernel/ptrace.c | 690 ++++++++++++++++++++++++++++++++++++++ include/asm-ia64/elf.h | 24 ++ 2 files changed, 714 insertions(+) diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index ab784ec4319d..7136c7811efc 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -3,6 +3,9 @@ * * Copyright (C) 1999-2005 Hewlett-Packard Co * David Mosberger-Tang + * Copyright (C) 2006 Intel Co + * 2006-08-12 - IA64 Native Utrace implementation support added by + * Anil S Keshavamurthy * * Derived from the x86 and Alpha versions. */ @@ -17,6 +20,8 @@ #include #include #include +#include +#include #include #include @@ -1626,3 +1631,688 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3, if (test_thread_flag(TIF_RESTORE_RSE)) ia64_sync_krbs(); } + +/* Utrace implementation starts here */ +struct regset_get { + void *kbuf; + void __user *ubuf; +}; + +struct regset_set { + const void *kbuf; + const void __user *ubuf; +}; + +struct regset_getset { + struct task_struct *target; + const struct user_regset *regset; + union { + struct regset_get get; + struct regset_set set; + } u; + unsigned int pos; + unsigned int count; + int ret; +}; + +static int +access_elf_gpreg(struct task_struct *target, struct unw_frame_info *info, + unsigned long addr, unsigned long *data, int write_access) +{ + struct pt_regs *pt; + unsigned long *ptr = NULL; + int ret; + char nat = 0; + + pt = task_pt_regs(target); + switch (addr) { + case ELF_GR_OFFSET(1): + ptr = &pt->r1; + break; + case ELF_GR_OFFSET(2): + case ELF_GR_OFFSET(3): + ptr = (void *)&pt->r2 + (addr - ELF_GR_OFFSET(2)); + break; + case ELF_GR_OFFSET(4) ... ELF_GR_OFFSET(7): + if (write_access) { + /* read NaT bit first: */ + unsigned long dummy; + + ret = unw_get_gr(info, addr/8, &dummy, &nat); + if (ret < 0) + return ret; + } + return unw_access_gr(info, addr/8, data, &nat, write_access); + case ELF_GR_OFFSET(8) ... 
ELF_GR_OFFSET(11): + ptr = (void *)&pt->r8 + addr - ELF_GR_OFFSET(8); + break; + case ELF_GR_OFFSET(12): + case ELF_GR_OFFSET(13): + ptr = (void *)&pt->r12 + addr - ELF_GR_OFFSET(12); + break; + case ELF_GR_OFFSET(14): + ptr = &pt->r14; + break; + case ELF_GR_OFFSET(15): + ptr = &pt->r15; + } + if (write_access) + *ptr = *data; + else + *data = *ptr; + return 0; +} + +static int +access_elf_breg(struct task_struct *target, struct unw_frame_info *info, + unsigned long addr, unsigned long *data, int write_access) +{ + struct pt_regs *pt; + unsigned long *ptr = NULL; + + pt = task_pt_regs(target); + switch (addr) { + case ELF_BR_OFFSET(0): + ptr = &pt->b0; + break; + case ELF_BR_OFFSET(1) ... ELF_BR_OFFSET(5): + return unw_access_br(info, (addr - ELF_BR_OFFSET(0))/8, + data, write_access); + case ELF_BR_OFFSET(6): + ptr = &pt->b6; + break; + case ELF_BR_OFFSET(7): + ptr = &pt->b7; + } + if (write_access) + *ptr = *data; + else + *data = *ptr; + return 0; +} + +static int +access_elf_areg(struct task_struct *target, struct unw_frame_info *info, + unsigned long addr, unsigned long *data, int write_access) +{ + struct pt_regs *pt; + unsigned long cfm, urbs_end; + unsigned long *ptr = NULL; + + pt = task_pt_regs(target); + if (addr >= ELF_AR_RSC_OFFSET && addr <= ELF_AR_SSD_OFFSET) { + switch (addr) { + case ELF_AR_RSC_OFFSET: + /* force PL3 */ + if (write_access) + pt->ar_rsc = *data | (3 << 2); + else + *data = pt->ar_rsc; + return 0; + case ELF_AR_BSP_OFFSET: + /* + * By convention, we use PT_AR_BSP to refer to + * the end of the user-level backing store. + * Use ia64_rse_skip_regs(PT_AR_BSP, -CFM.sof) + * to get the real value of ar.bsp at the time + * the kernel was entered. + * + * Furthermore, when changing the contents of + * PT_AR_BSP (or PT_CFM) while the task is + * blocked in a system call, convert the state + * so that the non-system-call exit + * path is used. This ensures that the proper + * state will be picked up when resuming + * execution. 
However, it *also* means that + * once we write PT_AR_BSP/PT_CFM, it won't be + * possible to modify the syscall arguments of + * the pending system call any longer. This + * shouldn't be an issue because modifying + * PT_AR_BSP/PT_CFM generally implies that + * we're either abandoning the pending system + * call or that we defer its re-execution + * (e.g., due to GDB doing an inferior + * function call). + */ + urbs_end = ia64_get_user_rbs_end(target, pt, &cfm); + if (write_access) { + if (*data != urbs_end) { + if (in_syscall(pt)) + convert_to_non_syscall(target, + pt, + cfm); + /* + * Simulate user-level write + * of ar.bsp: + */ + pt->loadrs = 0; + pt->ar_bspstore = *data; + } + } else + *data = urbs_end; + return 0; + case ELF_AR_BSPSTORE_OFFSET: + ptr = &pt->ar_bspstore; + break; + case ELF_AR_RNAT_OFFSET: + ptr = &pt->ar_rnat; + break; + case ELF_AR_CCV_OFFSET: + ptr = &pt->ar_ccv; + break; + case ELF_AR_UNAT_OFFSET: + ptr = &pt->ar_unat; + break; + case ELF_AR_FPSR_OFFSET: + ptr = &pt->ar_fpsr; + break; + case ELF_AR_PFS_OFFSET: + ptr = &pt->ar_pfs; + break; + case ELF_AR_LC_OFFSET: + return unw_access_ar(info, UNW_AR_LC, data, + write_access); + case ELF_AR_EC_OFFSET: + return unw_access_ar(info, UNW_AR_EC, data, + write_access); + case ELF_AR_CSD_OFFSET: + ptr = &pt->ar_csd; + break; + case ELF_AR_SSD_OFFSET: + ptr = &pt->ar_ssd; + } + } else if (addr >= ELF_CR_IIP_OFFSET && addr <= ELF_CR_IPSR_OFFSET) { + switch (addr) { + case ELF_CR_IIP_OFFSET: + ptr = &pt->cr_iip; + break; + case ELF_CFM_OFFSET: + urbs_end = ia64_get_user_rbs_end(target, pt, &cfm); + if (write_access) { + if (((cfm ^ *data) & PFM_MASK) != 0) { + if (in_syscall(pt)) + convert_to_non_syscall(target, + pt, + cfm); + pt->cr_ifs = ((pt->cr_ifs & ~PFM_MASK) + | (*data & PFM_MASK)); + } + } else + *data = cfm; + return 0; + case ELF_CR_IPSR_OFFSET: + if (write_access) { + unsigned long tmp = *data; + /* psr.ri==3 is a reserved value: SDM 2:25 */ + if ((tmp & IA64_PSR_RI) == IA64_PSR_RI) +
tmp &= ~IA64_PSR_RI; + pt->cr_ipsr = ((tmp & IPSR_MASK) + | (pt->cr_ipsr & ~IPSR_MASK)); + } else + *data = (pt->cr_ipsr & IPSR_MASK); + return 0; + } + } else if (addr == ELF_NAT_OFFSET) + return access_nat_bits(target, pt, info, + data, write_access); + else if (addr == ELF_PR_OFFSET) + ptr = &pt->pr; + else + return -1; + + if (write_access) + *ptr = *data; + else + *data = *ptr; + + return 0; +} + +static int +access_elf_reg(struct task_struct *target, struct unw_frame_info *info, + unsigned long addr, unsigned long *data, int write_access) +{ + if (addr >= ELF_GR_OFFSET(1) && addr <= ELF_GR_OFFSET(15)) + return access_elf_gpreg(target, info, addr, data, write_access); + else if (addr >= ELF_BR_OFFSET(0) && addr <= ELF_BR_OFFSET(7)) + return access_elf_breg(target, info, addr, data, write_access); + else + return access_elf_areg(target, info, addr, data, write_access); +} + +void do_gpregs_get(struct unw_frame_info *info, void *arg) +{ + struct pt_regs *pt; + struct regset_getset *dst = arg; + elf_greg_t tmp[16]; + unsigned int i, index, min_copy; + + if (unw_unwind_to_user(info) < 0) + return; + + /* + * coredump format: + * r0-r31 + * NaT bits (for r0-r31; bit N == 1 iff rN is a NaT) + * predicate registers (p0-p63) + * b0-b7 + * ip cfm user-mask + * ar.rsc ar.bsp ar.bspstore ar.rnat + * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec + */ + + + /* Skip r0 */ + if (dst->count > 0 && dst->pos < ELF_GR_OFFSET(1)) { + dst->ret = user_regset_copyout_zero(&dst->pos, &dst->count, + &dst->u.get.kbuf, + &dst->u.get.ubuf, + 0, ELF_GR_OFFSET(1)); + if (dst->ret || dst->count == 0) + return; + } + + /* gr1 - gr15 */ + if (dst->count > 0 && dst->pos < ELF_GR_OFFSET(16)) { + index = (dst->pos - ELF_GR_OFFSET(1)) / sizeof(elf_greg_t); + min_copy = ELF_GR_OFFSET(16) > (dst->pos + dst->count) ? 
+ (dst->pos + dst->count) : ELF_GR_OFFSET(16); + for (i = dst->pos; i < min_copy; i += sizeof(elf_greg_t), + index++) + if (access_elf_reg(dst->target, info, i, + &tmp[index], 0) < 0) { + dst->ret = -EIO; + return; + } + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, tmp, + ELF_GR_OFFSET(1), ELF_GR_OFFSET(16)); + if (dst->ret || dst->count == 0) + return; + } + + /* r16-r31 */ + if (dst->count > 0 && dst->pos < ELF_NAT_OFFSET) { + pt = task_pt_regs(dst->target); + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, &pt->r16, + ELF_GR_OFFSET(16), ELF_NAT_OFFSET); + if (dst->ret || dst->count == 0) + return; + } + + /* nat, pr, b0 - b7 */ + if (dst->count > 0 && dst->pos < ELF_CR_IIP_OFFSET) { + index = (dst->pos - ELF_NAT_OFFSET) / sizeof(elf_greg_t); + min_copy = ELF_CR_IIP_OFFSET > (dst->pos + dst->count) ? + (dst->pos + dst->count) : ELF_CR_IIP_OFFSET; + for (i = dst->pos; i < min_copy; i += sizeof(elf_greg_t), + index++) + if (access_elf_reg(dst->target, info, i, + &tmp[index], 0) < 0) { + dst->ret = -EIO; + return; + } + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, tmp, + ELF_NAT_OFFSET, ELF_CR_IIP_OFFSET); + if (dst->ret || dst->count == 0) + return; + } + + /* ip cfm psr ar.rsc ar.bsp ar.bspstore ar.rnat + * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec ar.csd ar.ssd + */ + if (dst->count > 0 && dst->pos < (ELF_AR_END_OFFSET)) { + index = (dst->pos - ELF_CR_IIP_OFFSET) / sizeof(elf_greg_t); + min_copy = ELF_AR_END_OFFSET > (dst->pos + dst->count) ? 
+ (dst->pos + dst->count) : ELF_AR_END_OFFSET; + for (i = dst->pos; i < min_copy; i += sizeof(elf_greg_t), + index++) + if (access_elf_reg(dst->target, info, i, + &tmp[index], 0) < 0) { + dst->ret = -EIO; + return; + } + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, tmp, + ELF_CR_IIP_OFFSET, ELF_AR_END_OFFSET); + } +} + +void do_gpregs_set(struct unw_frame_info *info, void *arg) +{ + struct pt_regs *pt; + struct regset_getset *dst = arg; + elf_greg_t tmp[16]; + unsigned int i, index; + + if (unw_unwind_to_user(info) < 0) + return; + + /* Skip r0 */ + if (dst->count > 0 && dst->pos < ELF_GR_OFFSET(1)) { + dst->ret = user_regset_copyin_ignore(&dst->pos, &dst->count, + &dst->u.set.kbuf, + &dst->u.set.ubuf, + 0, ELF_GR_OFFSET(1)); + if (dst->ret || dst->count == 0) + return; + } + + /* gr1-gr15 */ + if (dst->count > 0 && dst->pos < ELF_GR_OFFSET(16)) { + i = dst->pos; + index = (dst->pos - ELF_GR_OFFSET(1)) / sizeof(elf_greg_t); + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, tmp, + ELF_GR_OFFSET(1), ELF_GR_OFFSET(16)); + if (dst->ret) + return; + for ( ; i < dst->pos; i += sizeof(elf_greg_t), index++) + if (access_elf_reg(dst->target, info, i, + &tmp[index], 1) < 0) { + dst->ret = -EIO; + return; + } + if (dst->count == 0) + return; + } + + /* gr16-gr31 */ + if (dst->count > 0 && dst->pos < ELF_NAT_OFFSET) { + pt = task_pt_regs(dst->target); + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, &pt->r16, + ELF_GR_OFFSET(16), ELF_NAT_OFFSET); + if (dst->ret || dst->count == 0) + return; + } + + /* nat, pr, b0 - b7 */ + if (dst->count > 0 && dst->pos < ELF_CR_IIP_OFFSET) { + i = dst->pos; + index = (dst->pos - ELF_NAT_OFFSET) / sizeof(elf_greg_t); + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, tmp, + ELF_NAT_OFFSET, ELF_CR_IIP_OFFSET); + if (dst->ret) + return; + for (; i < dst->pos; i += 
sizeof(elf_greg_t), index++) + if (access_elf_reg(dst->target, info, i, + &tmp[index], 1) < 0) { + dst->ret = -EIO; + return; + } + if (dst->count == 0) + return; + } + + /* ip cfm psr ar.rsc ar.bsp ar.bspstore ar.rnat + * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec ar.csd ar.ssd + */ + if (dst->count > 0 && dst->pos < (ELF_AR_END_OFFSET)) { + i = dst->pos; + index = (dst->pos - ELF_CR_IIP_OFFSET) / sizeof(elf_greg_t); + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, tmp, + ELF_CR_IIP_OFFSET, ELF_AR_END_OFFSET); + if (dst->ret) + return; + for ( ; i < dst->pos; i += sizeof(elf_greg_t), index++) + if (access_elf_reg(dst->target, info, i, + &tmp[index], 1) < 0) { + dst->ret = -EIO; + return; + } + } +} + +#define ELF_FP_OFFSET(i) (i * sizeof(elf_fpreg_t)) + +void do_fpregs_get(struct unw_frame_info *info, void *arg) +{ + struct regset_getset *dst = arg; + struct task_struct *task = dst->target; + elf_fpreg_t tmp[30]; + int index, min_copy, i; + + if (unw_unwind_to_user(info) < 0) + return; + + /* Skip pos 0 and 1 */ + if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(2)) { + dst->ret = user_regset_copyout_zero(&dst->pos, &dst->count, + &dst->u.get.kbuf, + &dst->u.get.ubuf, + 0, ELF_FP_OFFSET(2)); + if (dst->count == 0 || dst->ret) + return; + } + + /* fr2-fr31 */ + if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(32)) { + index = (dst->pos - ELF_FP_OFFSET(2)) / sizeof(elf_fpreg_t); + + min_copy = min(((unsigned int)ELF_FP_OFFSET(32)), + dst->pos + dst->count); + for (i = dst->pos; i < min_copy; i += sizeof(elf_fpreg_t), + index++) + if (unw_get_fr(info, i / sizeof(elf_fpreg_t), + &tmp[index])) { + dst->ret = -EIO; + return; + } + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, tmp, + ELF_FP_OFFSET(2), ELF_FP_OFFSET(32)); + if (dst->count == 0 || dst->ret) + return; + } + + /* fph */ + if (dst->count > 0) { + ia64_flush_fph(dst->target); + if (task->thread.flags & IA64_THREAD_FPH_VALID) 
+ dst->ret = user_regset_copyout( + &dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, + &dst->target->thread.fph, + ELF_FP_OFFSET(32), -1); + else + /* Zero fill instead. */ + dst->ret = user_regset_copyout_zero( + &dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, + ELF_FP_OFFSET(32), -1); + } +} + +void do_fpregs_set(struct unw_frame_info *info, void *arg) +{ + struct regset_getset *dst = arg; + elf_fpreg_t fpreg, tmp[30]; + int index, start, end; + + if (unw_unwind_to_user(info) < 0) + return; + + /* Skip pos 0 and 1 */ + if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(2)) { + dst->ret = user_regset_copyin_ignore(&dst->pos, &dst->count, + &dst->u.set.kbuf, + &dst->u.set.ubuf, + 0, ELF_FP_OFFSET(2)); + if (dst->count == 0 || dst->ret) + return; + } + + /* fr2-fr31 */ + if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(32)) { + start = dst->pos; + end = min(((unsigned int)ELF_FP_OFFSET(32)), + dst->pos + dst->count); + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, tmp, + ELF_FP_OFFSET(2), ELF_FP_OFFSET(32)); + if (dst->ret) + return; + + if (start & 0xF) { /* only write high part */ + if (unw_get_fr(info, start / sizeof(elf_fpreg_t), + &fpreg)) { + dst->ret = -EIO; + return; + } + tmp[start / sizeof(elf_fpreg_t) - 2].u.bits[0] + = fpreg.u.bits[0]; + start &= ~0xFUL; + } + if (end & 0xF) { /* only write low part */ + if (unw_get_fr(info, end / sizeof(elf_fpreg_t), + &fpreg)) { + dst->ret = -EIO; + return; + } + tmp[end / sizeof(elf_fpreg_t) - 2].u.bits[1] + = fpreg.u.bits[1]; + end = (end + 0xF) & ~0xFUL; + } + + for ( ; start < end ; start += sizeof(elf_fpreg_t)) { + index = start / sizeof(elf_fpreg_t); + if (unw_set_fr(info, index, tmp[index - 2])) { + dst->ret = -EIO; + return; + } + } + if (dst->ret || dst->count == 0) + return; + } + + /* fph */ + if (dst->count > 0 && dst->pos < ELF_FP_OFFSET(128)) { + ia64_sync_fph(dst->target); + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + 
&dst->u.set.kbuf, + &dst->u.set.ubuf, + &dst->target->thread.fph, + ELF_FP_OFFSET(32), -1); + } +} + +static int +do_regset_call(void (*call)(struct unw_frame_info *, void *), + struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct regset_getset info = { .target = target, .regset = regset, + .pos = pos, .count = count, + .u.set = { .kbuf = kbuf, .ubuf = ubuf }, + .ret = 0 }; + + if (target == current) + unw_init_running(call, &info); + else { + struct unw_frame_info ufi; + memset(&ufi, 0, sizeof(ufi)); + unw_init_from_blocked_task(&ufi, target); + (*call)(&ufi, &info); + } + + return info.ret; +} + +static int +gpregs_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + return do_regset_call(do_gpregs_get, target, regset, pos, count, + kbuf, ubuf); +} + +static int gpregs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return do_regset_call(do_gpregs_set, target, regset, pos, count, + kbuf, ubuf); +} + +static void do_gpregs_writeback(struct unw_frame_info *info, void *arg) +{ + do_sync_rbs(info, ia64_sync_user_rbs); +} + +/* + * This is called to write back the register backing store. + * ptrace does this before it stops, so that a tracer reading the user + * memory after the thread stops will get the current register data. 
+ */ +static int +gpregs_writeback(struct task_struct *target, + const struct user_regset *regset, + int now) +{ + if (test_and_set_tsk_thread_flag(target, TIF_RESTORE_RSE)) + return 0; + tsk_set_notify_resume(target); + return do_regset_call(do_gpregs_writeback, target, regset, 0, 0, + NULL, NULL); +} + +static int +fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return (target->thread.flags & IA64_THREAD_FPH_VALID) ? 128 : 32; +} + +static int fpregs_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + return do_regset_call(do_fpregs_get, target, regset, pos, count, + kbuf, ubuf); +} + +static int fpregs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return do_regset_call(do_fpregs_set, target, regset, pos, count, + kbuf, ubuf); +} + +static const struct user_regset native_regsets[] = { + { + .core_note_type = NT_PRSTATUS, + .n = ELF_NGREG, + .size = sizeof(elf_greg_t), .align = sizeof(elf_greg_t), + .get = gpregs_get, .set = gpregs_set, + .writeback = gpregs_writeback + }, + { + .core_note_type = NT_PRFPREG, + .n = ELF_NFPREG, + .size = sizeof(elf_fpreg_t), .align = sizeof(elf_fpreg_t), + .get = fpregs_get, .set = fpregs_set, .active = fpregs_active + }, +}; + +static const struct user_regset_view user_ia64_view = { + .name = "ia64", + .e_machine = EM_IA_64, + .regsets = native_regsets, .n = ARRAY_SIZE(native_regsets) +}; + +const struct user_regset_view *task_user_regset_view(struct task_struct *tsk) +{ + return &user_ia64_view; +} diff --git a/include/asm-ia64/elf.h b/include/asm-ia64/elf.h index f8e83eca67a2..064cf7dcea8e 100644 --- a/include/asm-ia64/elf.h +++ b/include/asm-ia64/elf.h @@ -154,6 +154,30 @@ extern void ia64_init_addr_space (void); #define ELF_NGREG 128 /* we really need just 72 but let's leave some headroom... 
*/ #define ELF_NFPREG 128 /* f0 and f1 could be omitted, but so what... */ +/* elf_gregset_t register offsets */ +#define ELF_GR_0_OFFSET 0 +#define ELF_NAT_OFFSET (32 * sizeof(elf_greg_t)) +#define ELF_PR_OFFSET (33 * sizeof(elf_greg_t)) +#define ELF_BR_0_OFFSET (34 * sizeof(elf_greg_t)) +#define ELF_CR_IIP_OFFSET (42 * sizeof(elf_greg_t)) +#define ELF_CFM_OFFSET (43 * sizeof(elf_greg_t)) +#define ELF_CR_IPSR_OFFSET (44 * sizeof(elf_greg_t)) +#define ELF_GR_OFFSET(i) (ELF_GR_0_OFFSET + i * sizeof(elf_greg_t)) +#define ELF_BR_OFFSET(i) (ELF_BR_0_OFFSET + i * sizeof(elf_greg_t)) +#define ELF_AR_RSC_OFFSET (45 * sizeof(elf_greg_t)) +#define ELF_AR_BSP_OFFSET (46 * sizeof(elf_greg_t)) +#define ELF_AR_BSPSTORE_OFFSET (47 * sizeof(elf_greg_t)) +#define ELF_AR_RNAT_OFFSET (48 * sizeof(elf_greg_t)) +#define ELF_AR_CCV_OFFSET (49 * sizeof(elf_greg_t)) +#define ELF_AR_UNAT_OFFSET (50 * sizeof(elf_greg_t)) +#define ELF_AR_FPSR_OFFSET (51 * sizeof(elf_greg_t)) +#define ELF_AR_PFS_OFFSET (52 * sizeof(elf_greg_t)) +#define ELF_AR_LC_OFFSET (53 * sizeof(elf_greg_t)) +#define ELF_AR_EC_OFFSET (54 * sizeof(elf_greg_t)) +#define ELF_AR_CSD_OFFSET (55 * sizeof(elf_greg_t)) +#define ELF_AR_SSD_OFFSET (56 * sizeof(elf_greg_t)) +#define ELF_AR_END_OFFSET (57 * sizeof(elf_greg_t)) + typedef unsigned long elf_fpxregset_t; typedef unsigned long elf_greg_t; From 75529219373e53042fc46c86d991125e616f42dd Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 28 Feb 2008 16:09:33 +0800 Subject: [PATCH 04/27] [IA64] regset: 32-bit support This is the 32-bit regset implementation under IA64. Basically register read/write, which is derived from current ptrace register read/write. This version added TLS support. 
Signed-off-by: Shaohua Li Signed-off-by: Tony Luck --- arch/ia64/ia32/sys_ia32.c | 649 ++++++++++++++++++++++++++++++++++++-- arch/ia64/kernel/ptrace.c | 5 + 2 files changed, 629 insertions(+), 25 deletions(-) diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index b1bf51fe97b4..7e028ceb93ba 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -2387,16 +2388,45 @@ get_free_idx (void) return -ESRCH; } +static void set_tls_desc(struct task_struct *p, int idx, + const struct ia32_user_desc *info, int n) +{ + struct thread_struct *t = &p->thread; + struct desc_struct *desc = &t->tls_array[idx - GDT_ENTRY_TLS_MIN]; + int cpu; + + /* + * We must not get preempted while modifying the TLS. + */ + cpu = get_cpu(); + + while (n-- > 0) { + if (LDT_empty(info)) { + desc->a = 0; + desc->b = 0; + } else { + desc->a = LDT_entry_a(info); + desc->b = LDT_entry_b(info); + } + + ++info; + ++desc; + } + + if (t == &current->thread) + load_TLS(t, cpu); + + put_cpu(); +} + /* * Set a given TLS descriptor: */ asmlinkage int sys32_set_thread_area (struct ia32_user_desc __user *u_info) { - struct thread_struct *t = &current->thread; struct ia32_user_desc info; - struct desc_struct *desc; - int cpu, idx; + int idx; if (copy_from_user(&info, u_info, sizeof(info))) return -EFAULT; @@ -2416,18 +2446,7 @@ sys32_set_thread_area (struct ia32_user_desc __user *u_info) if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) return -EINVAL; - desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; - - cpu = smp_processor_id(); - - if (LDT_empty(&info)) { - desc->a = 0; - desc->b = 0; - } else { - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - load_TLS(t, cpu); + set_tls_desc(current, idx, &info, 1); return 0; } @@ -2451,6 +2470,20 @@ sys32_set_thread_area (struct ia32_user_desc __user *u_info) #define GET_PRESENT(desc) (((desc)->b >> 15) & 1) #define GET_USEABLE(desc) (((desc)->b
>> 20) & 1) +static void fill_user_desc(struct ia32_user_desc *info, int idx, + const struct desc_struct *desc) +{ + info->entry_number = idx; + info->base_addr = GET_BASE(desc); + info->limit = GET_LIMIT(desc); + info->seg_32bit = GET_32BIT(desc); + info->contents = GET_CONTENTS(desc); + info->read_exec_only = !GET_WRITABLE(desc); + info->limit_in_pages = GET_LIMIT_PAGES(desc); + info->seg_not_present = !GET_PRESENT(desc); + info->useable = GET_USEABLE(desc); +} + asmlinkage int sys32_get_thread_area (struct ia32_user_desc __user *u_info) { @@ -2464,22 +2497,588 @@ sys32_get_thread_area (struct ia32_user_desc __user *u_info) return -EINVAL; desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - - info.entry_number = idx; - info.base_addr = GET_BASE(desc); - info.limit = GET_LIMIT(desc); - info.seg_32bit = GET_32BIT(desc); - info.contents = GET_CONTENTS(desc); - info.read_exec_only = !GET_WRITABLE(desc); - info.limit_in_pages = GET_LIMIT_PAGES(desc); - info.seg_not_present = !GET_PRESENT(desc); - info.useable = GET_USEABLE(desc); + fill_user_desc(&info, idx, desc); if (copy_to_user(u_info, &info, sizeof(info))) return -EFAULT; return 0; } +struct regset_get { + void *kbuf; + void __user *ubuf; +}; + +struct regset_set { + const void *kbuf; + const void __user *ubuf; +}; + +struct regset_getset { + struct task_struct *target; + const struct user_regset *regset; + union { + struct regset_get get; + struct regset_set set; + } u; + unsigned int pos; + unsigned int count; + int ret; +}; + +static void getfpreg(struct task_struct *task, int regno, int *val) +{ + switch (regno / sizeof(int)) { + case 0: + *val = task->thread.fcr & 0xffff; + break; + case 1: + *val = task->thread.fsr & 0xffff; + break; + case 2: + *val = (task->thread.fsr>>16) & 0xffff; + break; + case 3: + *val = task->thread.fir; + break; + case 4: + *val = (task->thread.fir>>32) & 0xffff; + break; + case 5: + *val = task->thread.fdr; + break; + case 6: + *val = (task->thread.fdr >> 32) & 0xffff; 
+ break; + } +} + +static void setfpreg(struct task_struct *task, int regno, int val) +{ + switch (regno / sizeof(int)) { + case 0: + task->thread.fcr = (task->thread.fcr & (~0x1f3f)) + | (val & 0x1f3f); + break; + case 1: + task->thread.fsr = (task->thread.fsr & (~0xffff)) | val; + break; + case 2: + task->thread.fsr = (task->thread.fsr & (~0xffff0000)) + | (val << 16); + break; + case 3: + task->thread.fir = (task->thread.fir & (~0xffffffff)) | val; + break; + case 5: + task->thread.fdr = (task->thread.fdr & (~0xffffffff)) | val; + break; + } +} + +static void access_fpreg_ia32(int regno, void *reg, + struct pt_regs *pt, struct switch_stack *sw, + int tos, int write) +{ + void *f; + + if ((regno += tos) >= 8) + regno -= 8; + if (regno < 4) + f = &pt->f8 + regno; + else if (regno <= 7) + f = &sw->f12 + (regno - 4); + else { + printk(KERN_ERR "regno must be less than 7 \n"); + return; + } + + if (write) + memcpy(f, reg, sizeof(struct _fpreg_ia32)); + else + memcpy(reg, f, sizeof(struct _fpreg_ia32)); +} + +static void do_fpregs_get(struct unw_frame_info *info, void *arg) +{ + struct regset_getset *dst = arg; + struct task_struct *task = dst->target; + struct pt_regs *pt; + int start, end, tos; + char buf[80]; + + if (dst->count == 0 || unw_unwind_to_user(info) < 0) + return; + if (dst->pos < 7 * sizeof(int)) { + end = min((dst->pos + dst->count), + (unsigned int)(7 * sizeof(int))); + for (start = dst->pos; start < end; start += sizeof(int)) + getfpreg(task, start, (int *)(buf + start)); + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, buf, + 0, 7 * sizeof(int)); + if (dst->ret || dst->count == 0) + return; + } + if (dst->pos < sizeof(struct ia32_user_i387_struct)) { + pt = task_pt_regs(task); + tos = (task->thread.fsr >> 11) & 7; + end = min(dst->pos + dst->count, + (unsigned int)(sizeof(struct ia32_user_i387_struct))); + start = (dst->pos - 7 * sizeof(int)) / + sizeof(struct _fpreg_ia32); + end = (end - 7 * 
sizeof(int)) / sizeof(struct _fpreg_ia32); + for (; start < end; start++) + access_fpreg_ia32(start, + (struct _fpreg_ia32 *)buf + start, + pt, info->sw, tos, 0); + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, + buf, 7 * sizeof(int), + sizeof(struct ia32_user_i387_struct)); + if (dst->ret || dst->count == 0) + return; + } +} + +static void do_fpregs_set(struct unw_frame_info *info, void *arg) +{ + struct regset_getset *dst = arg; + struct task_struct *task = dst->target; + struct pt_regs *pt; + char buf[80]; + int end, start, tos; + + if (dst->count == 0 || unw_unwind_to_user(info) < 0) + return; + + if (dst->pos < 7 * sizeof(int)) { + start = dst->pos; + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, buf, + 0, 7 * sizeof(int)); + if (dst->ret) + return; + for (; start < dst->pos; start += sizeof(int)) + setfpreg(task, start, *((int *)(buf + start))); + if (dst->count == 0) + return; + } + if (dst->pos < sizeof(struct ia32_user_i387_struct)) { + start = (dst->pos - 7 * sizeof(int)) / + sizeof(struct _fpreg_ia32); + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, + buf, 7 * sizeof(int), + sizeof(struct ia32_user_i387_struct)); + if (dst->ret) + return; + pt = task_pt_regs(task); + tos = (task->thread.fsr >> 11) & 7; + end = (dst->pos - 7 * sizeof(int)) / sizeof(struct _fpreg_ia32); + for (; start < end; start++) + access_fpreg_ia32(start, + (struct _fpreg_ia32 *)buf + start, + pt, info->sw, tos, 1); + if (dst->count == 0) + return; + } +} + +#define OFFSET(member) ((int)(offsetof(struct ia32_user_fxsr_struct, member))) +static void getfpxreg(struct task_struct *task, int start, int end, char *buf) +{ + int min_val; + + min_val = min(end, OFFSET(fop)); + while (start < min_val) { + if (start == OFFSET(cwd)) + *((short *)buf) = task->thread.fcr & 0xffff; + else if (start == OFFSET(swd)) + *((short *)buf) = task->thread.fsr & 0xffff; 
+ else if (start == OFFSET(twd)) + *((short *)buf) = (task->thread.fsr>>16) & 0xffff; + buf += 2; + start += 2; + } + /* skip fop element */ + if (start == OFFSET(fop)) { + start += 2; + buf += 2; + } + while (start < end) { + if (start == OFFSET(fip)) + *((int *)buf) = task->thread.fir; + else if (start == OFFSET(fcs)) + *((int *)buf) = (task->thread.fir>>32) & 0xffff; + else if (start == OFFSET(foo)) + *((int *)buf) = task->thread.fdr; + else if (start == OFFSET(fos)) + *((int *)buf) = (task->thread.fdr>>32) & 0xffff; + else if (start == OFFSET(mxcsr)) + *((int *)buf) = ((task->thread.fcr>>32) & 0xff80) + | ((task->thread.fsr>>32) & 0x3f); + buf += 4; + start += 4; + } +} + +static void setfpxreg(struct task_struct *task, int start, int end, char *buf) +{ + int min_val, num32; + short num; + unsigned long num64; + + min_val = min(end, OFFSET(fop)); + while (start < min_val) { + num = *((short *)buf); + if (start == OFFSET(cwd)) { + task->thread.fcr = (task->thread.fcr & (~0x1f3f)) + | (num & 0x1f3f); + } else if (start == OFFSET(swd)) { + task->thread.fsr = (task->thread.fsr & (~0xffff)) | num; + } else if (start == OFFSET(twd)) { + task->thread.fsr = (task->thread.fsr & (~0xffff0000)) + | (((int)num) << 16); + } + buf += 2; + start += 2; + } + /* skip fop element */ + if (start == OFFSET(fop)) { + start += 2; + buf += 2; + } + while (start < end) { + num32 = *((int *)buf); + if (start == OFFSET(fip)) + task->thread.fir = (task->thread.fir & (~0xffffffff)) + | num32; + else if (start == OFFSET(foo)) + task->thread.fdr = (task->thread.fdr & (~0xffffffff)) + | num32; + else if (start == OFFSET(mxcsr)) { + num64 = num32 & 0xff10; + task->thread.fcr = (task->thread.fcr & + (~0xff1000000000UL)) | (num64<<32); + num64 = num32 & 0x3f; + task->thread.fsr = (task->thread.fsr & + (~0x3f00000000UL)) | (num64<<32); + } + buf += 4; + start += 4; + } +} + +static void do_fpxregs_get(struct unw_frame_info *info, void *arg) +{ + struct regset_getset *dst = arg; + struct 
task_struct *task = dst->target; + struct pt_regs *pt; + char buf[128]; + int start, end, tos; + + if (dst->count == 0 || unw_unwind_to_user(info) < 0) + return; + if (dst->pos < OFFSET(st_space[0])) { + end = min(dst->pos + dst->count, (unsigned int)32); + getfpxreg(task, dst->pos, end, buf); + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, buf, + 0, OFFSET(st_space[0])); + if (dst->ret || dst->count == 0) + return; + } + if (dst->pos < OFFSET(xmm_space[0])) { + pt = task_pt_regs(task); + tos = (task->thread.fsr >> 11) & 7; + end = min(dst->pos + dst->count, + (unsigned int)OFFSET(xmm_space[0])); + start = (dst->pos - OFFSET(st_space[0])) / 16; + end = (end - OFFSET(st_space[0])) / 16; + for (; start < end; start++) + access_fpreg_ia32(start, buf + 16 * start, pt, + info->sw, tos, 0); + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, + buf, OFFSET(st_space[0]), OFFSET(xmm_space[0])); + if (dst->ret || dst->count == 0) + return; + } + if (dst->pos < OFFSET(padding[0])) + dst->ret = user_regset_copyout(&dst->pos, &dst->count, + &dst->u.get.kbuf, &dst->u.get.ubuf, + &info->sw->f16, OFFSET(xmm_space[0]), + OFFSET(padding[0])); +} + +static void do_fpxregs_set(struct unw_frame_info *info, void *arg) +{ + struct regset_getset *dst = arg; + struct task_struct *task = dst->target; + char buf[128]; + int start, end; + + if (dst->count == 0 || unw_unwind_to_user(info) < 0) + return; + + if (dst->pos < OFFSET(st_space[0])) { + start = dst->pos; + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, + buf, 0, OFFSET(st_space[0])); + if (dst->ret) + return; + setfpxreg(task, start, dst->pos, buf); + if (dst->count == 0) + return; + } + if (dst->pos < OFFSET(xmm_space[0])) { + struct pt_regs *pt; + int tos; + pt = task_pt_regs(task); + tos = (task->thread.fsr >> 11) & 7; + start = (dst->pos - OFFSET(st_space[0])) / 16; + dst->ret = 
user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, + buf, OFFSET(st_space[0]), OFFSET(xmm_space[0])); + if (dst->ret) + return; + end = (dst->pos - OFFSET(st_space[0])) / 16; + for (; start < end; start++) + access_fpreg_ia32(start, buf + 16 * start, pt, info->sw, + tos, 1); + if (dst->count == 0) + return; + } + if (dst->pos < OFFSET(padding[0])) + dst->ret = user_regset_copyin(&dst->pos, &dst->count, + &dst->u.set.kbuf, &dst->u.set.ubuf, + &info->sw->f16, OFFSET(xmm_space[0]), + OFFSET(padding[0])); +} +#undef OFFSET + +static int do_regset_call(void (*call)(struct unw_frame_info *, void *), + struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct regset_getset info = { .target = target, .regset = regset, + .pos = pos, .count = count, + .u.set = { .kbuf = kbuf, .ubuf = ubuf }, + .ret = 0 }; + + if (target == current) + unw_init_running(call, &info); + else { + struct unw_frame_info ufi; + memset(&ufi, 0, sizeof(ufi)); + unw_init_from_blocked_task(&ufi, target); + (*call)(&ufi, &info); + } + + return info.ret; +} + +static int ia32_fpregs_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + return do_regset_call(do_fpregs_get, target, regset, pos, count, + kbuf, ubuf); +} + +static int ia32_fpregs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return do_regset_call(do_fpregs_set, target, regset, pos, count, + kbuf, ubuf); +} + +static int ia32_fpxregs_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + return do_regset_call(do_fpxregs_get, target, regset, pos, count, + kbuf, ubuf); +} + +static int ia32_fpxregs_set(struct task_struct 
*target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return do_regset_call(do_fpxregs_set, target, regset, pos, count, + kbuf, ubuf); +} + +static int ia32_genregs_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (kbuf) { + u32 *kp = kbuf; + while (count > 0) { + *kp++ = getreg(target, pos); + pos += 4; + count -= 4; + } + } else { + u32 __user *up = ubuf; + while (count > 0) { + if (__put_user(getreg(target, pos), up++)) + return -EFAULT; + pos += 4; + count -= 4; + } + } + return 0; +} + +static int ia32_genregs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + + if (kbuf) { + const u32 *kp = kbuf; + while (!ret && count > 0) { + putreg(target, pos, *kp++); + pos += 4; + count -= 4; + } + } else { + const u32 __user *up = ubuf; + u32 val; + while (!ret && count > 0) { + ret = __get_user(val, up++); + if (!ret) + putreg(target, pos, val); + pos += 4; + count -= 4; + } + } + return ret; +} + +static int ia32_tls_active(struct task_struct *target, + const struct user_regset *regset) +{ + struct thread_struct *t = &target->thread; + int n = GDT_ENTRY_TLS_ENTRIES; + while (n > 0 && desc_empty(&t->tls_array[n -1])) + --n; + return n; +} + +static int ia32_tls_get(struct task_struct *target, + const struct user_regset *regset, unsigned int pos, + unsigned int count, void *kbuf, void __user *ubuf) +{ + const struct desc_struct *tls; + + if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct ia32_user_desc) || + (pos % sizeof(struct ia32_user_desc)) != 0 || + (count % sizeof(struct ia32_user_desc)) != 0) + return -EINVAL; + + pos /= sizeof(struct ia32_user_desc); + count /= sizeof(struct ia32_user_desc); + + tls = &target->thread.tls_array[pos]; + + if (kbuf) { + struct 
ia32_user_desc *info = kbuf; + while (count-- > 0) + fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++, + tls++); + } else { + struct ia32_user_desc __user *u_info = ubuf; + while (count-- > 0) { + struct ia32_user_desc info; + fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++); + if (__copy_to_user(u_info++, &info, sizeof(info))) + return -EFAULT; + } + } + + return 0; +} + +static int ia32_tls_set(struct task_struct *target, + const struct user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const void __user *ubuf) +{ + struct ia32_user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; + const struct ia32_user_desc *info; + + if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct ia32_user_desc) || + (pos % sizeof(struct ia32_user_desc)) != 0 || + (count % sizeof(struct ia32_user_desc)) != 0) + return -EINVAL; + + if (kbuf) + info = kbuf; + else if (__copy_from_user(infobuf, ubuf, count)) + return -EFAULT; + else + info = infobuf; + + set_tls_desc(target, + GDT_ENTRY_TLS_MIN + (pos / sizeof(struct ia32_user_desc)), + info, count / sizeof(struct ia32_user_desc)); + + return 0; +} + +/* + * This should match arch/i386/kernel/ptrace.c:native_regsets. + * XXX ioperm? vm86? 
+ */ +static const struct user_regset ia32_regsets[] = { + { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct32)/4, + .size = 4, .align = 4, + .get = ia32_genregs_get, .set = ia32_genregs_set + }, + { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct ia32_user_i387_struct) / 4, + .size = 4, .align = 4, + .get = ia32_fpregs_get, .set = ia32_fpregs_set + }, + { + .core_note_type = NT_PRXFPREG, + .n = sizeof(struct ia32_user_fxsr_struct) / 4, + .size = 4, .align = 4, + .get = ia32_fpxregs_get, .set = ia32_fpxregs_set + }, + { + .core_note_type = NT_386_TLS, + .n = GDT_ENTRY_TLS_ENTRIES, + .bias = GDT_ENTRY_TLS_MIN, + .size = sizeof(struct ia32_user_desc), + .align = sizeof(struct ia32_user_desc), + .active = ia32_tls_active, + .get = ia32_tls_get, .set = ia32_tls_set, + }, +}; + +const struct user_regset_view user_ia32_view = { + .name = "i386", .e_machine = EM_386, + .regsets = ia32_regsets, .n = ARRAY_SIZE(ia32_regsets) +}; + long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, __u32 len_low, __u32 len_high, int advice) { diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 7136c7811efc..4c104170ca4d 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -2314,5 +2314,10 @@ static const struct user_regset_view user_ia64_view = { const struct user_regset_view *task_user_regset_view(struct task_struct *tsk) { +#ifdef CONFIG_IA32_SUPPORT + extern const struct user_regset_view user_ia32_view; + if (IS_IA32_PROCESS(task_pt_regs(tsk))) + return &user_ia32_view; +#endif return &user_ia64_view; } From 6cb53d7a6f40858181facde0f52587731d2e621f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 28 Feb 2008 16:09:38 +0800 Subject: [PATCH 05/27] [IA64] use CORE_DUMP_USE_REGSET After we have regset support, we can use CORE_DUMP_USE_REGSET. 
Signed-off-by: Shaohua Li Signed-off-by: Tony Luck --- arch/ia64/kernel/process.c | 30 ------------------------------ include/asm-ia64/elf.h | 7 +------ 2 files changed, 1 insertion(+), 36 deletions(-) diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 49937a383b23..a5ea817cbcbf 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -625,42 +625,12 @@ do_dump_fpu (struct unw_frame_info *info, void *arg) do_dump_task_fpu(current, info, arg); } -int -dump_task_regs(struct task_struct *task, elf_gregset_t *regs) -{ - struct unw_frame_info tcore_info; - - if (current == task) { - unw_init_running(do_copy_regs, regs); - } else { - memset(&tcore_info, 0, sizeof(tcore_info)); - unw_init_from_blocked_task(&tcore_info, task); - do_copy_task_regs(task, &tcore_info, regs); - } - return 1; -} - void ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst) { unw_init_running(do_copy_regs, dst); } -int -dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst) -{ - struct unw_frame_info tcore_info; - - if (current == task) { - unw_init_running(do_dump_fpu, dst); - } else { - memset(&tcore_info, 0, sizeof(tcore_info)); - unw_init_from_blocked_task(&tcore_info, task); - do_dump_task_fpu(task, &tcore_info, dst); - } - return 1; -} - int dump_fpu (struct pt_regs *pt, elf_fpregset_t dst) { diff --git a/include/asm-ia64/elf.h b/include/asm-ia64/elf.h index 064cf7dcea8e..5e0c1a6bce8d 100644 --- a/include/asm-ia64/elf.h +++ b/include/asm-ia64/elf.h @@ -26,6 +26,7 @@ #define ELF_ARCH EM_IA_64 #define USE_ELF_CORE_DUMP +#define CORE_DUMP_USE_REGSET /* Least-significant four bits of ELF header's e_flags are OS-specific. 
The bits are interpreted as follows by Linux: */ @@ -207,12 +208,6 @@ extern void ia64_elf_core_copy_regs (struct pt_regs *src, elf_gregset_t dst); struct task_struct; -extern int dump_task_regs(struct task_struct *, elf_gregset_t *); -extern int dump_task_fpu (struct task_struct *, elf_fpregset_t *); - -#define ELF_CORE_COPY_TASK_REGS(tsk, elf_gregs) dump_task_regs(tsk, elf_gregs) -#define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs) - #define GATE_EHDR ((const struct elfhdr *) GATE_ADDR) /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ From 4cd8dc83581906948ff4cfa65007e64496b5a7c8 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 28 Feb 2008 16:09:42 +0800 Subject: [PATCH 06/27] [IA64] remove duplicate code for register access We have duplicate code to access registers (access_uarea and regset way). They just have different layout, so remove duplicate code. Signed-off-by: Shaohua Li Signed-off-by: Tony Luck --- arch/ia64/kernel/ptrace.c | 522 +++++++++++++++----------------------- 1 file changed, 200 insertions(+), 322 deletions(-) diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 4c104170ca4d..2a9943b5947f 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -745,25 +745,6 @@ ia64_sync_fph (struct task_struct *task) psr->dfh = 1; } -static int -access_fr (struct unw_frame_info *info, int regnum, int hi, - unsigned long *data, int write_access) -{ - struct ia64_fpreg fpval; - int ret; - - ret = unw_get_fr(info, regnum, &fpval); - if (ret < 0) - return ret; - - if (write_access) { - fpval.u.bits[hi] = *data; - ret = unw_set_fr(info, regnum, fpval); - } else - *data = fpval.u.bits[hi]; - return ret; -} - /* * Change the machine-state of CHILD such that it will return via the normal * kernel exit-path, rather than the syscall-exit path. 
@@ -865,309 +846,7 @@ access_nat_bits (struct task_struct *child, struct pt_regs *pt, static int access_uarea (struct task_struct *child, unsigned long addr, - unsigned long *data, int write_access) -{ - unsigned long *ptr, regnum, urbs_end, cfm; - struct switch_stack *sw; - struct pt_regs *pt; -# define pt_reg_addr(pt, reg) ((void *) \ - ((unsigned long) (pt) \ - + offsetof(struct pt_regs, reg))) - - - pt = task_pt_regs(child); - sw = (struct switch_stack *) (child->thread.ksp + 16); - - if ((addr & 0x7) != 0) { - dprintk("ptrace: unaligned register address 0x%lx\n", addr); - return -1; - } - - if (addr < PT_F127 + 16) { - /* accessing fph */ - if (write_access) - ia64_sync_fph(child); - else - ia64_flush_fph(child); - ptr = (unsigned long *) - ((unsigned long) &child->thread.fph + addr); - } else if ((addr >= PT_F10) && (addr < PT_F11 + 16)) { - /* scratch registers untouched by kernel (saved in pt_regs) */ - ptr = pt_reg_addr(pt, f10) + (addr - PT_F10); - } else if (addr >= PT_F12 && addr < PT_F15 + 16) { - /* - * Scratch registers untouched by kernel (saved in - * switch_stack). 
- */ - ptr = (unsigned long *) ((long) sw - + (addr - PT_NAT_BITS - 32)); - } else if (addr < PT_AR_LC + 8) { - /* preserved state: */ - struct unw_frame_info info; - char nat = 0; - int ret; - - unw_init_from_blocked_task(&info, child); - if (unw_unwind_to_user(&info) < 0) - return -1; - - switch (addr) { - case PT_NAT_BITS: - return access_nat_bits(child, pt, &info, - data, write_access); - - case PT_R4: case PT_R5: case PT_R6: case PT_R7: - if (write_access) { - /* read NaT bit first: */ - unsigned long dummy; - - ret = unw_get_gr(&info, (addr - PT_R4)/8 + 4, - &dummy, &nat); - if (ret < 0) - return ret; - } - return unw_access_gr(&info, (addr - PT_R4)/8 + 4, data, - &nat, write_access); - - case PT_B1: case PT_B2: case PT_B3: - case PT_B4: case PT_B5: - return unw_access_br(&info, (addr - PT_B1)/8 + 1, data, - write_access); - - case PT_AR_EC: - return unw_access_ar(&info, UNW_AR_EC, data, - write_access); - - case PT_AR_LC: - return unw_access_ar(&info, UNW_AR_LC, data, - write_access); - - default: - if (addr >= PT_F2 && addr < PT_F5 + 16) - return access_fr(&info, (addr - PT_F2)/16 + 2, - (addr & 8) != 0, data, - write_access); - else if (addr >= PT_F16 && addr < PT_F31 + 16) - return access_fr(&info, - (addr - PT_F16)/16 + 16, - (addr & 8) != 0, - data, write_access); - else { - dprintk("ptrace: rejecting access to register " - "address 0x%lx\n", addr); - return -1; - } - } - } else if (addr < PT_F9+16) { - /* scratch state */ - switch (addr) { - case PT_AR_BSP: - /* - * By convention, we use PT_AR_BSP to refer to - * the end of the user-level backing store. - * Use ia64_rse_skip_regs(PT_AR_BSP, -CFM.sof) - * to get the real value of ar.bsp at the time - * the kernel was entered. - * - * Furthermore, when changing the contents of - * PT_AR_BSP (or PT_CFM) while the task is - * blocked in a system call, convert the state - * so that the non-system-call exit - * path is used. 
This ensures that the proper - * state will be picked up when resuming - * execution. However, it *also* means that - * once we write PT_AR_BSP/PT_CFM, it won't be - * possible to modify the syscall arguments of - * the pending system call any longer. This - * shouldn't be an issue because modifying - * PT_AR_BSP/PT_CFM generally implies that - * we're either abandoning the pending system - * call or that we defer it's re-execution - * (e.g., due to GDB doing an inferior - * function call). - */ - urbs_end = ia64_get_user_rbs_end(child, pt, &cfm); - if (write_access) { - if (*data != urbs_end) { - if (in_syscall(pt)) - convert_to_non_syscall(child, - pt, - cfm); - /* - * Simulate user-level write - * of ar.bsp: - */ - pt->loadrs = 0; - pt->ar_bspstore = *data; - } - } else - *data = urbs_end; - return 0; - - case PT_CFM: - urbs_end = ia64_get_user_rbs_end(child, pt, &cfm); - if (write_access) { - if (((cfm ^ *data) & PFM_MASK) != 0) { - if (in_syscall(pt)) - convert_to_non_syscall(child, - pt, - cfm); - pt->cr_ifs = ((pt->cr_ifs & ~PFM_MASK) - | (*data & PFM_MASK)); - } - } else - *data = cfm; - return 0; - - case PT_CR_IPSR: - if (write_access) { - unsigned long tmp = *data; - /* psr.ri==3 is a reserved value: SDM 2:25 */ - if ((tmp & IA64_PSR_RI) == IA64_PSR_RI) - tmp &= ~IA64_PSR_RI; - pt->cr_ipsr = ((tmp & IPSR_MASK) - | (pt->cr_ipsr & ~IPSR_MASK)); - } else - *data = (pt->cr_ipsr & IPSR_MASK); - return 0; - - case PT_AR_RSC: - if (write_access) - pt->ar_rsc = *data | (3 << 2); /* force PL3 */ - else - *data = pt->ar_rsc; - return 0; - - case PT_AR_RNAT: - ptr = pt_reg_addr(pt, ar_rnat); - break; - case PT_R1: - ptr = pt_reg_addr(pt, r1); - break; - case PT_R2: case PT_R3: - ptr = pt_reg_addr(pt, r2) + (addr - PT_R2); - break; - case PT_R8: case PT_R9: case PT_R10: case PT_R11: - ptr = pt_reg_addr(pt, r8) + (addr - PT_R8); - break; - case PT_R12: case PT_R13: - ptr = pt_reg_addr(pt, r12) + (addr - PT_R12); - break; - case PT_R14: - ptr = pt_reg_addr(pt, r14); - 
break; - case PT_R15: - ptr = pt_reg_addr(pt, r15); - break; - case PT_R16: case PT_R17: case PT_R18: case PT_R19: - case PT_R20: case PT_R21: case PT_R22: case PT_R23: - case PT_R24: case PT_R25: case PT_R26: case PT_R27: - case PT_R28: case PT_R29: case PT_R30: case PT_R31: - ptr = pt_reg_addr(pt, r16) + (addr - PT_R16); - break; - case PT_B0: - ptr = pt_reg_addr(pt, b0); - break; - case PT_B6: - ptr = pt_reg_addr(pt, b6); - break; - case PT_B7: - ptr = pt_reg_addr(pt, b7); - break; - case PT_F6: case PT_F6+8: case PT_F7: case PT_F7+8: - case PT_F8: case PT_F8+8: case PT_F9: case PT_F9+8: - ptr = pt_reg_addr(pt, f6) + (addr - PT_F6); - break; - case PT_AR_BSPSTORE: - ptr = pt_reg_addr(pt, ar_bspstore); - break; - case PT_AR_UNAT: - ptr = pt_reg_addr(pt, ar_unat); - break; - case PT_AR_PFS: - ptr = pt_reg_addr(pt, ar_pfs); - break; - case PT_AR_CCV: - ptr = pt_reg_addr(pt, ar_ccv); - break; - case PT_AR_FPSR: - ptr = pt_reg_addr(pt, ar_fpsr); - break; - case PT_CR_IIP: - ptr = pt_reg_addr(pt, cr_iip); - break; - case PT_PR: - ptr = pt_reg_addr(pt, pr); - break; - /* scratch register */ - - default: - /* disallow accessing anything else... */ - dprintk("ptrace: rejecting access to register " - "address 0x%lx\n", addr); - return -1; - } - } else if (addr <= PT_AR_SSD) { - ptr = pt_reg_addr(pt, ar_csd) + (addr - PT_AR_CSD); - } else { - /* access debug registers */ - - if (addr >= PT_IBR) { - regnum = (addr - PT_IBR) >> 3; - ptr = &child->thread.ibr[0]; - } else { - regnum = (addr - PT_DBR) >> 3; - ptr = &child->thread.dbr[0]; - } - - if (regnum >= 8) { - dprintk("ptrace: rejecting access to register " - "address 0x%lx\n", addr); - return -1; - } -#ifdef CONFIG_PERFMON - /* - * Check if debug registers are used by perfmon. This - * test must be done once we know that we can do the - * operation, i.e. the arguments are all valid, but - * before we start modifying the state. 
- * - * Perfmon needs to keep a count of how many processes - * are trying to modify the debug registers for system - * wide monitoring sessions. - * - * We also include read access here, because they may - * cause the PMU-installed debug register state - * (dbr[], ibr[]) to be reset. The two arrays are also - * used by perfmon, but we do not use - * IA64_THREAD_DBG_VALID. The registers are restored - * by the PMU context switch code. - */ - if (pfm_use_debug_registers(child)) return -1; -#endif - - if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) { - child->thread.flags |= IA64_THREAD_DBG_VALID; - memset(child->thread.dbr, 0, - sizeof(child->thread.dbr)); - memset(child->thread.ibr, 0, - sizeof(child->thread.ibr)); - } - - ptr += regnum; - - if ((regnum & 1) && write_access) { - /* don't let the user set kernel-level breakpoints: */ - *ptr = *data & ~(7UL << 56); - return 0; - } - } - if (write_access) - *ptr = *data; - else - *data = *ptr; - return 0; -} + unsigned long *data, int write_access); static long ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) @@ -2290,6 +1969,205 @@ static int fpregs_set(struct task_struct *target, kbuf, ubuf); } +static int +access_uarea(struct task_struct *child, unsigned long addr, + unsigned long *data, int write_access) +{ + unsigned int pos = -1; /* an invalid value */ + int ret; + unsigned long *ptr, regnum; + + if ((addr & 0x7) != 0) { + dprintk("ptrace: unaligned register address 0x%lx\n", addr); + return -1; + } + if ((addr >= PT_NAT_BITS + 8 && addr < PT_F2) || + (addr >= PT_R7 + 8 && addr < PT_B1) || + (addr >= PT_AR_LC + 8 && addr < PT_CR_IPSR) || + (addr >= PT_AR_SSD + 8 && addr < PT_DBR)) { + dprintk("ptrace: rejecting access to register " + "address 0x%lx\n", addr); + return -1; + } + + switch (addr) { + case PT_F32 ... (PT_F127 + 15): + pos = addr - PT_F32 + ELF_FP_OFFSET(32); + break; + case PT_F2 ... (PT_F5 + 15): + pos = addr - PT_F2 + ELF_FP_OFFSET(2); + break; + case PT_F10 ... 
(PT_F31 + 15): + pos = addr - PT_F10 + ELF_FP_OFFSET(10); + break; + case PT_F6 ... (PT_F9 + 15): + pos = addr - PT_F6 + ELF_FP_OFFSET(6); + break; + } + + if (pos != -1) { + if (write_access) + ret = fpregs_set(child, NULL, pos, + sizeof(unsigned long), data, NULL); + else + ret = fpregs_get(child, NULL, pos, + sizeof(unsigned long), data, NULL); + if (ret != 0) + return -1; + return 0; + } + + switch (addr) { + case PT_NAT_BITS: + pos = ELF_NAT_OFFSET; + break; + case PT_R4 ... PT_R7: + pos = addr - PT_R4 + ELF_GR_OFFSET(4); + break; + case PT_B1 ... PT_B5: + pos = addr - PT_B1 + ELF_BR_OFFSET(1); + break; + case PT_AR_EC: + pos = ELF_AR_EC_OFFSET; + break; + case PT_AR_LC: + pos = ELF_AR_LC_OFFSET; + break; + case PT_CR_IPSR: + pos = ELF_CR_IPSR_OFFSET; + break; + case PT_CR_IIP: + pos = ELF_CR_IIP_OFFSET; + break; + case PT_CFM: + pos = ELF_CFM_OFFSET; + break; + case PT_AR_UNAT: + pos = ELF_AR_UNAT_OFFSET; + break; + case PT_AR_PFS: + pos = ELF_AR_PFS_OFFSET; + break; + case PT_AR_RSC: + pos = ELF_AR_RSC_OFFSET; + break; + case PT_AR_RNAT: + pos = ELF_AR_RNAT_OFFSET; + break; + case PT_AR_BSPSTORE: + pos = ELF_AR_BSPSTORE_OFFSET; + break; + case PT_PR: + pos = ELF_PR_OFFSET; + break; + case PT_B6: + pos = ELF_BR_OFFSET(6); + break; + case PT_AR_BSP: + pos = ELF_AR_BSP_OFFSET; + break; + case PT_R1 ... PT_R3: + pos = addr - PT_R1 + ELF_GR_OFFSET(1); + break; + case PT_R12 ... PT_R15: + pos = addr - PT_R12 + ELF_GR_OFFSET(12); + break; + case PT_R8 ... PT_R11: + pos = addr - PT_R8 + ELF_GR_OFFSET(8); + break; + case PT_R16 ... 
PT_R31: + pos = addr - PT_R16 + ELF_GR_OFFSET(16); + break; + case PT_AR_CCV: + pos = ELF_AR_CCV_OFFSET; + break; + case PT_AR_FPSR: + pos = ELF_AR_FPSR_OFFSET; + break; + case PT_B0: + pos = ELF_BR_OFFSET(0); + break; + case PT_B7: + pos = ELF_BR_OFFSET(7); + break; + case PT_AR_CSD: + pos = ELF_AR_CSD_OFFSET; + break; + case PT_AR_SSD: + pos = ELF_AR_SSD_OFFSET; + break; + } + + if (pos != -1) { + if (write_access) + ret = gpregs_set(child, NULL, pos, + sizeof(unsigned long), data, NULL); + else + ret = gpregs_get(child, NULL, pos, + sizeof(unsigned long), data, NULL); + if (ret != 0) + return -1; + return 0; + } + + /* access debug registers */ + if (addr >= PT_IBR) { + regnum = (addr - PT_IBR) >> 3; + ptr = &child->thread.ibr[0]; + } else { + regnum = (addr - PT_DBR) >> 3; + ptr = &child->thread.dbr[0]; + } + + if (regnum >= 8) { + dprintk("ptrace: rejecting access to register " + "address 0x%lx\n", addr); + return -1; + } +#ifdef CONFIG_PERFMON + /* + * Check if debug registers are used by perfmon. This + * test must be done once we know that we can do the + * operation, i.e. the arguments are all valid, but + * before we start modifying the state. + * + * Perfmon needs to keep a count of how many processes + * are trying to modify the debug registers for system + * wide monitoring sessions. + * + * We also include read access here, because they may + * cause the PMU-installed debug register state + * (dbr[], ibr[]) to be reset. The two arrays are also + * used by perfmon, but we do not use + * IA64_THREAD_DBG_VALID. The registers are restored + * by the PMU context switch code. 
+ */ + if (pfm_use_debug_registers(child)) + return -1; +#endif + + if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) { + child->thread.flags |= IA64_THREAD_DBG_VALID; + memset(child->thread.dbr, 0, + sizeof(child->thread.dbr)); + memset(child->thread.ibr, 0, + sizeof(child->thread.ibr)); + } + + ptr += regnum; + + if ((regnum & 1) && write_access) { + /* don't let the user set kernel-level breakpoints: */ + *ptr = *data & ~(7UL << 56); + return 0; + } + if (write_access) + *ptr = *data; + else + *data = *ptr; + return 0; +} + static const struct user_regset native_regsets[] = { { .core_note_type = NT_PRSTATUS, From 96651896b8d9ad4244a1c3ed9691faa3e382f503 Mon Sep 17 00:00:00 2001 From: Xiantao Zhang Date: Thu, 3 Apr 2008 11:02:58 -0700 Subject: [PATCH 07/27] [IA64] Add API for allocating Dynamic TR resource. Dynamic TR resource should be managed in the uniform way. Add two interfaces for kernel: ia64_itr_entry: Allocate a (pair of) TR for caller. ia64_ptr_entry: Purge a (pair of ) TR by caller. 
Signed-off-by: Xiantao Zhang Signed-off-by: Anthony Xu Signed-off-by: Tony Luck --- arch/ia64/kernel/mca.c | 49 ++++++++++ arch/ia64/kernel/mca_asm.S | 5 + arch/ia64/mm/tlb.c | 196 +++++++++++++++++++++++++++++++++++++ include/asm-ia64/kregs.h | 3 + include/asm-ia64/tlb.h | 26 +++++ 5 files changed, 279 insertions(+) diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 6c18221dba36..607006a6a976 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -97,6 +97,7 @@ #include #include +#include #include "mca_drv.h" #include "entry.h" @@ -112,6 +113,7 @@ DEFINE_PER_CPU(u64, ia64_mca_data); /* == __per_cpu_mca[smp_processor_id()] */ DEFINE_PER_CPU(u64, ia64_mca_per_cpu_pte); /* PTE to map per-CPU area */ DEFINE_PER_CPU(u64, ia64_mca_pal_pte); /* PTE to map PAL code */ DEFINE_PER_CPU(u64, ia64_mca_pal_base); /* vaddr PAL code granule */ +DEFINE_PER_CPU(u64, ia64_mca_tr_reload); /* Flag for TR reload */ unsigned long __per_cpu_mca[NR_CPUS]; @@ -1182,6 +1184,49 @@ ia64_wait_for_slaves(int monarch, const char *type) return; } +/* mca_insert_tr + * + * Switch rid when TR reload and needed! 
+ * iord: 1: itr, 2: dtr; + * +*/ +static void mca_insert_tr(u64 iord) +{ + + int i; + u64 old_rr; + struct ia64_tr_entry *p; + unsigned long psr; + int cpu = smp_processor_id(); + + psr = ia64_clear_ic(); + for (i = IA64_TR_ALLOC_BASE; i < IA64_TR_ALLOC_MAX; i++) { + p = &__per_cpu_idtrs[cpu][iord-1][i]; + if (p->pte & 0x1) { + old_rr = ia64_get_rr(p->ifa); + if (old_rr != p->rr) { + ia64_set_rr(p->ifa, p->rr); + ia64_srlz_d(); + } + ia64_ptr(iord, p->ifa, p->itir >> 2); + ia64_srlz_i(); + if (iord & 0x1) { + ia64_itr(0x1, i, p->ifa, p->pte, p->itir >> 2); + ia64_srlz_i(); + } + if (iord & 0x2) { + ia64_itr(0x2, i, p->ifa, p->pte, p->itir >> 2); + ia64_srlz_i(); + } + if (old_rr != p->rr) { + ia64_set_rr(p->ifa, old_rr); + ia64_srlz_d(); + } + } + } + ia64_set_psr(psr); +} + /* * ia64_mca_handler * @@ -1271,6 +1316,10 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, monarch_cpu = -1; #endif } + if (__get_cpu_var(ia64_mca_tr_reload)) { + mca_insert_tr(0x1); /*Reload dynamic itrs*/ + mca_insert_tr(0x2); /*Reload dynamic dtrs*/ + } if (notify_die(DIE_MCA_MONARCH_LEAVE, "MCA", regs, (long)&nd, 0, recover) == NOTIFY_STOP) ia64_mca_spin(__func__); diff --git a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S index 8bc7d259e0c6..a06d46548ff9 100644 --- a/arch/ia64/kernel/mca_asm.S +++ b/arch/ia64/kernel/mca_asm.S @@ -219,8 +219,13 @@ ia64_reload_tr: mov r20=IA64_TR_CURRENT_STACK ;; itr.d dtr[r20]=r16 + GET_THIS_PADDR(r2, ia64_mca_tr_reload) + mov r18 = 1 ;; srlz.d + ;; + st8 [r2] =r18 + ;; done_tlb_purge_and_reload: diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index 655da240d13c..3d8903f936a5 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include static struct { unsigned long mask; /* mask of supported purge page-sizes */ @@ -39,6 +41,10 @@ struct ia64_ctx ia64_ctx = { }; DEFINE_PER_CPU(u8, ia64_need_tlb_flush); +DEFINE_PER_CPU(u8, ia64_tr_num); /*Number of TR slots in 
current processor*/ +DEFINE_PER_CPU(u8, ia64_tr_used); /*Max Slot number used by kernel*/ + +struct ia64_tr_entry __per_cpu_idtrs[NR_CPUS][2][IA64_TR_ALLOC_MAX]; /* * Initializes the ia64_ctx.bitmap array based on max_ctx+1. @@ -190,6 +196,9 @@ ia64_tlb_init (void) ia64_ptce_info_t uninitialized_var(ptce_info); /* GCC be quiet */ unsigned long tr_pgbits; long status; + pal_vm_info_1_u_t vm_info_1; + pal_vm_info_2_u_t vm_info_2; + int cpu = smp_processor_id(); if ((status = ia64_pal_vm_page_size(&tr_pgbits, &purge.mask)) != 0) { printk(KERN_ERR "PAL_VM_PAGE_SIZE failed with status=%ld; " @@ -206,4 +215,191 @@ ia64_tlb_init (void) local_cpu_data->ptce_stride[1] = ptce_info.stride[1]; local_flush_tlb_all(); /* nuke left overs from bootstrapping... */ + status = ia64_pal_vm_summary(&vm_info_1, &vm_info_2); + + if (status) { + printk(KERN_ERR "ia64_pal_vm_summary=%ld\n", status); + per_cpu(ia64_tr_num, cpu) = 8; + return; + } + per_cpu(ia64_tr_num, cpu) = vm_info_1.pal_vm_info_1_s.max_itr_entry+1; + if (per_cpu(ia64_tr_num, cpu) > + (vm_info_1.pal_vm_info_1_s.max_dtr_entry+1)) + per_cpu(ia64_tr_num, cpu) = + vm_info_1.pal_vm_info_1_s.max_dtr_entry+1; + if (per_cpu(ia64_tr_num, cpu) > IA64_TR_ALLOC_MAX) { + per_cpu(ia64_tr_num, cpu) = IA64_TR_ALLOC_MAX; + printk(KERN_DEBUG "TR register number exceeds IA64_TR_ALLOC_MAX!" + "IA64_TR_ALLOC_MAX should be extended\n"); + } } + +/* + * is_tr_overlap + * + * Check overlap with inserted TRs. + */ +static int is_tr_overlap(struct ia64_tr_entry *p, u64 va, u64 log_size) +{ + u64 tr_log_size; + u64 tr_end; + u64 va_rr = ia64_get_rr(va); + u64 va_rid = RR_TO_RID(va_rr); + u64 va_end = va + (1<<log_size) - 1; + + if (va_rid != RR_TO_RID(p->rr)) + return 0; + tr_log_size = (p->itir & 0xff) >> 2; + tr_end = p->ifa + (1<<tr_log_size) - 1; + + if (va > tr_end || p->ifa > va_end) + return 0; + return 1; + +} + +/* + * ia64_insert_tr in virtual mode. 
Allocate a TR slot + * + * target_mask : 0x1 : itr, 0x2 : dtr, 0x3 : idtr + * + * va : virtual address. + * pte : pte entries inserted. 
+ * log_size: range to be covered. + * + * Return value: <0 : error No. + * + * >=0 : slot number allocated for TR. + * Must be called with preemption disabled. + */ +int ia64_itr_entry(u64 target_mask, u64 va, u64 pte, u64 log_size) +{ + int i, r; + unsigned long psr; + struct ia64_tr_entry *p; + int cpu = smp_processor_id(); + + r = -EINVAL; + /*Check overlap with existing TR entries*/ + if (target_mask & 0x1) { + p = &__per_cpu_idtrs[cpu][0][0]; + for (i = IA64_TR_ALLOC_BASE; i <= per_cpu(ia64_tr_used, cpu); + i++, p++) { + if (p->pte & 0x1) + if (is_tr_overlap(p, va, log_size)) { + printk(KERN_DEBUG "Overlapped Entry" + "Inserted for TR Reigster!!\n"); + goto out; + } + } + } + if (target_mask & 0x2) { + p = &__per_cpu_idtrs[cpu][1][0]; + for (i = IA64_TR_ALLOC_BASE; i <= per_cpu(ia64_tr_used, cpu); + i++, p++) { + if (p->pte & 0x1) + if (is_tr_overlap(p, va, log_size)) { + printk(KERN_DEBUG "Overlapped Entry" + "Inserted for TR Reigster!!\n"); + goto out; + } + } + } + + for (i = IA64_TR_ALLOC_BASE; i < per_cpu(ia64_tr_num, cpu); i++) { + switch (target_mask & 0x3) { + case 1: + if (!(__per_cpu_idtrs[cpu][0][i].pte & 0x1)) + goto found; + continue; + case 2: + if (!(__per_cpu_idtrs[cpu][1][i].pte & 0x1)) + goto found; + continue; + case 3: + if (!(__per_cpu_idtrs[cpu][0][i].pte & 0x1) && + !(__per_cpu_idtrs[cpu][1][i].pte & 0x1)) + goto found; + continue; + default: + r = -EINVAL; + goto out; + } + } +found: + if (i >= per_cpu(ia64_tr_num, cpu)) + return -EBUSY; + + /*Record tr info for mca hander use!*/ + if (i > per_cpu(ia64_tr_used, cpu)) + per_cpu(ia64_tr_used, cpu) = i; + + psr = ia64_clear_ic(); + if (target_mask & 0x1) { + ia64_itr(0x1, i, va, pte, log_size); + ia64_srlz_i(); + p = &__per_cpu_idtrs[cpu][0][i]; + p->ifa = va; + p->pte = pte; + p->itir = log_size << 2; + p->rr = ia64_get_rr(va); + } + if (target_mask & 0x2) { + ia64_itr(0x2, i, va, pte, log_size); + ia64_srlz_i(); + p = &__per_cpu_idtrs[cpu][1][i]; + p->ifa = va; + p->pte = pte; + p->itir 
= log_size << 2; + p->rr = ia64_get_rr(va); + } + ia64_set_psr(psr); + r = i; +out: + return r; +} +EXPORT_SYMBOL_GPL(ia64_itr_entry); + +/* + * ia64_purge_tr + * + * target_mask: 0x1: purge itr, 0x2 : purge dtr, 0x3 purge idtr. + * slot: slot number to be freed. + * + * Must be called with preemption disabled. + */ +void ia64_ptr_entry(u64 target_mask, int slot) +{ + int cpu = smp_processor_id(); + int i; + struct ia64_tr_entry *p; + + if (slot < IA64_TR_ALLOC_BASE || slot >= per_cpu(ia64_tr_num, cpu)) + return; + + if (target_mask & 0x1) { + p = &__per_cpu_idtrs[cpu][0][slot]; + if ((p->pte&0x1) && is_tr_overlap(p, p->ifa, p->itir>>2)) { + p->pte = 0; + ia64_ptr(0x1, p->ifa, p->itir>>2); + ia64_srlz_i(); + } + } + + if (target_mask & 0x2) { + p = &__per_cpu_idtrs[cpu][1][slot]; + if ((p->pte & 0x1) && is_tr_overlap(p, p->ifa, p->itir>>2)) { + p->pte = 0; + ia64_ptr(0x2, p->ifa, p->itir>>2); + ia64_srlz_i(); + } + } + + for (i = per_cpu(ia64_tr_used, cpu); i >= IA64_TR_ALLOC_BASE; i--) { + if ((__per_cpu_idtrs[cpu][0][i].pte & 0x1) || + (__per_cpu_idtrs[cpu][1][i].pte & 0x1)) + break; + } + per_cpu(ia64_tr_used, cpu) = i; +} +EXPORT_SYMBOL_GPL(ia64_ptr_entry); diff --git a/include/asm-ia64/kregs.h b/include/asm-ia64/kregs.h index 7e55a584975c..aefcdfee7f23 100644 --- a/include/asm-ia64/kregs.h +++ b/include/asm-ia64/kregs.h @@ -31,6 +31,9 @@ #define IA64_TR_PALCODE 1 /* itr1: maps PALcode as required by EFI */ #define IA64_TR_CURRENT_STACK 1 /* dtr1: maps kernel's memory- & register-stacks */ +#define IA64_TR_ALLOC_BASE 2 /* itr&dtr: Base of dynamic TR resource*/ +#define IA64_TR_ALLOC_MAX 32 /* Max number for dynamic use*/ + /* Processor status register bits: */ #define IA64_PSR_BE_BIT 1 #define IA64_PSR_UP_BIT 2 diff --git a/include/asm-ia64/tlb.h b/include/asm-ia64/tlb.h index 26edcb750f9f..20d8a39680c2 100644 --- a/include/asm-ia64/tlb.h +++ b/include/asm-ia64/tlb.h @@ -64,6 +64,32 @@ struct mmu_gather { struct page *pages[FREE_PTE_NR]; }; +struct 
ia64_tr_entry { + u64 ifa; + u64 itir; + u64 pte; + u64 rr; +}; /*Record for tr entry!*/ + +extern int ia64_itr_entry(u64 target_mask, u64 va, u64 pte, u64 log_size); +extern void ia64_ptr_entry(u64 target_mask, int slot); + +extern struct ia64_tr_entry __per_cpu_idtrs[NR_CPUS][2][IA64_TR_ALLOC_MAX]; + +/* + region register macros +*/ +#define RR_TO_VE(val) (((val) >> 0) & 0x0000000000000001) +#define RR_VE(val) (((val) & 0x0000000000000001) << 0) +#define RR_VE_MASK 0x0000000000000001L +#define RR_VE_SHIFT 0 +#define RR_TO_PS(val) (((val) >> 2) & 0x000000000000003f) +#define RR_PS(val) (((val) & 0x000000000000003f) << 2) +#define RR_PS_MASK 0x00000000000000fcL +#define RR_PS_SHIFT 2 +#define RR_RID_MASK 0x00000000ffffff00L +#define RR_TO_RID(val) ((val >> 8) & 0xffffff) + /* Users of the generic TLB shootdown code must declare this storage space. */ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); From 31a6b11fed6ceec07ec4bdfefae56b8252d450cf Mon Sep 17 00:00:00 2001 From: Xiantao Zhang Date: Thu, 3 Apr 2008 11:39:43 -0700 Subject: [PATCH 08/27] [IA64] Implement smp_call_function_mask for ia64 This interface provides more flexible functionality for smp infrastructure ... e.g. KVM frequently needs to operate on a subset of cpus. Signed-off-by: Xiantao Zhang Signed-off-by: Tony Luck --- arch/ia64/kernel/smp.c | 82 ++++++++++++++++++++++++++++++++++++++++++ include/asm-ia64/smp.h | 3 ++ 2 files changed, 85 insertions(+) diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 4e446aa5f4ac..9a9d4c489330 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -209,6 +209,19 @@ send_IPI_allbutself (int op) } } +/* + * Called with preemption disabled. + */ +static inline void +send_IPI_mask(cpumask_t mask, int op) +{ + unsigned int cpu; + + for_each_cpu_mask(cpu, mask) { + send_IPI_single(cpu, op); + } +} + /* * Called with preemption disabled. 
*/ @@ -401,6 +414,75 @@ smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int } EXPORT_SYMBOL(smp_call_function_single); +/** + * smp_call_function_mask(): Run a function on a set of other CPUs. + * <mask> The set of cpus to run on. Must not include the current cpu. + * <func> The function to run. This must be fast and non-blocking. + * <info> An arbitrary pointer to pass to the function. + * <wait> If true, wait (atomically) until function + * has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) +{ + struct call_data_struct data; + cpumask_t allbutself; + int cpus; + + spin_lock(&call_lock); + allbutself = cpu_online_map; + cpu_clear(smp_processor_id(), allbutself); + + cpus_and(mask, mask, allbutself); + cpus = cpus_weight(mask); + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); /* ensure store to call_data precedes setting of IPI_CALL_FUNC*/ + + /* Send a message to other CPUs */ + if (cpus_equal(mask, allbutself)) + send_IPI_allbutself(IPI_CALL_FUNC); + else + send_IPI_mask(mask, IPI_CALL_FUNC); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (wait) + while (atomic_read(&data.finished) != cpus) + cpu_relax(); + call_data = NULL; + + spin_unlock(&call_lock); + return 0; + +} +EXPORT_SYMBOL(smp_call_function_mask); + /* * this function sends a 
'generic call function' 
IPI to all other CPUs * in the system. diff --git a/include/asm-ia64/smp.h b/include/asm-ia64/smp.h index 4fa733dd417a..ec5f355fb7e3 100644 --- a/include/asm-ia64/smp.h +++ b/include/asm-ia64/smp.h @@ -38,6 +38,9 @@ ia64_get_lid (void) return lid.f.id << 8 | lid.f.eid; } +extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *), + void *info, int wait); + #define hard_smp_processor_id() ia64_get_lid() #ifdef CONFIG_SMP From 2046b94e7c4fce92eb8165c2c36c6478f4927178 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 4 Apr 2008 11:05:59 -0700 Subject: [PATCH 09/27] [IA64] Multiple outstanding ptc.g instruction support According to SDM2.2, Itanium supports multiple outstanding ptc.g instructions. But current kernel function ia64_global_tlb_purge() uses a spinlock to serialize ptc.g instructions issued by multiple processors. This serialization might have scalability issue on a big SMP machine where many processors could purge TLB in parallel. The patch fixes this problem by issuing multiple ptc.g instructions in ia64_global_tlb_purge(). It also adds support for the "PALO" table to get a platform view of the max number of outstanding ptc.g instructions (which may be different from the processor view found from PAL_VM_SUMMARY). PALO specification can be found at: http://www.dig64.org/home/DIG64_PALO_R1_0.pdf spinaphore implementation by Matthew Wilcox. 
Signed-off-by: Fenghua Yu Signed-off-by: Tony Luck --- arch/ia64/kernel/efi.c | 46 +++++++++++++ arch/ia64/kernel/setup.c | 6 +- arch/ia64/mm/tlb.c | 125 +++++++++++++++++++++++++++++++----- include/asm-ia64/sal.h | 17 +++++ include/asm-ia64/tlbflush.h | 1 + 5 files changed, 178 insertions(+), 17 deletions(-) diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 728d7247a1a6..003cd09b0732 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -37,6 +37,7 @@ #include #include #include +#include #define EFI_DEBUG 0 @@ -403,6 +404,41 @@ efi_get_pal_addr (void) return NULL; } + +static u8 __init palo_checksum(u8 *buffer, u32 length) +{ + u8 sum = 0; + u8 *end = buffer + length; + + while (buffer < end) + sum = (u8) (sum + *(buffer++)); + + return sum; +} + +/* + * Parse and handle PALO table which is published at: + * http://www.dig64.org/home/DIG64_PALO_R1_0.pdf + */ +static void __init handle_palo(unsigned long palo_phys) +{ + struct palo_table *palo = __va(palo_phys); + u8 checksum; + + if (strncmp(palo->signature, PALO_SIG, sizeof(PALO_SIG) - 1)) { + printk(KERN_INFO "PALO signature incorrect.\n"); + return; + } + + checksum = palo_checksum((u8 *)palo, palo->length); + if (checksum) { + printk(KERN_INFO "PALO checksum incorrect.\n"); + return; + } + + setup_ptcg_sem(palo->max_tlb_purges, 1); +} + void efi_map_pal_code (void) { @@ -432,6 +468,7 @@ efi_init (void) u64 efi_desc_size; char *cp, vendor[100] = "unknown"; int i; + unsigned long palo_phys; /* * It's too early to be able to use the standard kernel command line @@ -496,6 +533,8 @@ efi_init (void) efi.hcdp = EFI_INVALID_TABLE_ADDR; efi.uga = EFI_INVALID_TABLE_ADDR; + palo_phys = EFI_INVALID_TABLE_ADDR; + for (i = 0; i < (int) efi.systab->nr_tables; i++) { if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) { efi.mps = config_tables[i].table; @@ -515,10 +554,17 @@ efi_init (void) } else if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) { efi.hcdp = 
config_tables[i].table; printk(" HCDP=0x%lx", config_tables[i].table); + } else if (efi_guidcmp(config_tables[i].guid, + PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID) == 0) { + palo_phys = config_tables[i].table; + printk(" PALO=0x%lx", config_tables[i].table); } } printk("\n"); + if (palo_phys != EFI_INVALID_TABLE_ADDR) + handle_palo(palo_phys); + runtime = __va(efi.systab->runtime); efi.get_time = phys_get_time; efi.set_time = phys_set_time; diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 4aa9eaea76c3..1cbd26340d87 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -946,9 +947,10 @@ cpu_init (void) #endif /* set ia64_ctx.max_rid to the maximum RID that is supported by all CPUs: */ - if (ia64_pal_vm_summary(NULL, &vmi) == 0) + if (ia64_pal_vm_summary(NULL, &vmi) == 0) { max_ctx = (1U << (vmi.pal_vm_info_2_s.rid_size - 3)) - 1; - else { + setup_ptcg_sem(vmi.pal_vm_info_2_s.max_purges, 0); + } else { printk(KERN_WARNING "cpu_init: PAL VM summary failed, assuming 18 RID bits\n"); max_ctx = (1U << 15) - 1; /* use architected minimum */ } diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index 655da240d13c..d41d6076ed03 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -11,6 +11,9 @@ * Rohit Seth * Ken Chen * Christophe de Dinechin : Avoid ptc.e on memory allocation + * Copyright (C) 2007 Intel Corp + * Fenghua Yu + * Add multiple ptc.g/ptc.ga instruction support in global tlb purge. */ #include #include @@ -26,6 +29,7 @@ #include #include #include +#include static struct { unsigned long mask; /* mask of supported purge page-sizes */ @@ -84,14 +88,104 @@ wrap_mmu_context (struct mm_struct *mm) local_flush_tlb_all(); } +/* + * Implement "spinaphores" ... like counting semaphores, but they + * spin instead of sleeping. If there are ever any other users for + * this primitive it can be moved up to a spinaphore.h header. 
+ */ +struct spinaphore { + atomic_t cur; +}; + +static inline void spinaphore_init(struct spinaphore *ss, int val) +{ + atomic_set(&ss->cur, val); +} + +static inline void down_spin(struct spinaphore *ss) +{ + while (unlikely(!atomic_add_unless(&ss->cur, -1, 0))) + while (atomic_read(&ss->cur) == 0) + cpu_relax(); +} + +static inline void up_spin(struct spinaphore *ss) +{ + atomic_add(1, &ss->cur); +} + +static struct spinaphore ptcg_sem; +static u16 nptcg = 1; +static int need_ptcg_sem = 1; +static int toolatetochangeptcgsem = 0; + +/* + * Maximum number of simultaneous ptc.g purges in the system can + * be defined by PAL_VM_SUMMARY (in which case we should take + * the smallest value for any cpu in the system) or by the PAL + * override table (in which case we should ignore the value from + * PAL_VM_SUMMARY). + * + * Complicating the logic here is the fact that num_possible_cpus() + * isn't fully setup until we start bringing cpus online. + */ +void +setup_ptcg_sem(int max_purges, int from_palo) +{ + static int have_palo; + static int firstcpu = 1; + + if (toolatetochangeptcgsem) { + BUG_ON(max_purges < nptcg); + return; + } + + if (from_palo) { + have_palo = 1; + + /* In PALO max_purges == 0 really means it! */ + if (max_purges == 0) + panic("Whoa! 
Platform does not support global TLB purges.\n"); + nptcg = max_purges; + if (nptcg == PALO_MAX_TLB_PURGES) { + need_ptcg_sem = 0; + return; + } + goto resetsema; + } + if (have_palo) { + if (nptcg != PALO_MAX_TLB_PURGES) + need_ptcg_sem = (num_possible_cpus() > nptcg); + return; + } + + /* In PAL_VM_SUMMARY max_purges == 0 actually means 1 */ + if (max_purges == 0) max_purges = 1; + + if (firstcpu) { + nptcg = max_purges; + firstcpu = 0; + } + if (max_purges < nptcg) + nptcg = max_purges; + if (nptcg == PAL_MAX_PURGES) { + need_ptcg_sem = 0; + return; + } else + need_ptcg_sem = (num_possible_cpus() > nptcg); + +resetsema: + spinaphore_init(&ptcg_sem, max_purges); +} + void ia64_global_tlb_purge (struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long nbits) { - static DEFINE_SPINLOCK(ptcg_lock); - struct mm_struct *active_mm = current->active_mm; + toolatetochangeptcgsem = 1; + if (mm != active_mm) { /* Restore region IDs for mm */ if (mm && active_mm) { @@ -102,19 +196,20 @@ ia64_global_tlb_purge (struct mm_struct *mm, unsigned long start, } } - /* HW requires global serialization of ptc.ga. */ - spin_lock(&ptcg_lock); - { - do { - /* - * Flush ALAT entries also. - */ - ia64_ptcga(start, (nbits<<2)); - ia64_srlz_i(); - start += (1UL << nbits); - } while (start < end); - } - spin_unlock(&ptcg_lock); + if (need_ptcg_sem) + down_spin(&ptcg_sem); + + do { + /* + * Flush ALAT entries also. 
+ */ + ia64_ptcga(start, (nbits << 2)); + ia64_srlz_i(); + start += (1UL << nbits); + } while (start < end); + + if (need_ptcg_sem) + up_spin(&ptcg_sem); if (mm != active_mm) { activate_context(active_mm); diff --git a/include/asm-ia64/sal.h b/include/asm-ia64/sal.h index f4904db3b057..3cd637a2c051 100644 --- a/include/asm-ia64/sal.h +++ b/include/asm-ia64/sal.h @@ -296,6 +296,9 @@ enum { EFI_GUID(0xe429faf8, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81) #define SAL_PLAT_BUS_ERR_SECT_GUID \ EFI_GUID(0xe429faf9, 0x3cb7, 0x11d4, 0xbc, 0xa7, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81) +#define PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID \ + EFI_GUID(0x6cb0a200, 0x893a, 0x11da, 0x96, 0xd2, 0x0, 0x10, 0x83, 0xff, \ + 0xca, 0x4d) #define MAX_CACHE_ERRORS 6 #define MAX_TLB_ERRORS 6 @@ -879,6 +882,20 @@ extern void ia64_jump_to_sal(struct sal_to_os_boot *); extern void ia64_sal_handler_init(void *entry_point, void *gpval); +#define PALO_MAX_TLB_PURGES 0xFFFF +#define PALO_SIG "PALO" + +struct palo_table { + u8 signature[4]; /* Should be "PALO" */ + u32 length; + u8 minor_revision; + u8 major_revision; + u8 checksum; + u8 reserved1[5]; + u16 max_tlb_purges; + u8 reserved2[6]; +}; + #endif /* __ASSEMBLY__ */ #endif /* _ASM_IA64_SAL_H */ diff --git a/include/asm-ia64/tlbflush.h b/include/asm-ia64/tlbflush.h index 7774a1cac0cc..3be25dfed164 100644 --- a/include/asm-ia64/tlbflush.h +++ b/include/asm-ia64/tlbflush.h @@ -17,6 +17,7 @@ * Now for some TLB flushing routines. This is the kind of stuff that * can be very expensive, so try to avoid them whenever possible. */ +extern void setup_ptcg_sem(int max_purges, int from_palo); /* * Flush everything (kernel mapping may also have changed due to From a6c75b86ce9f01db4ea9912877b526c2dc4d2f0a Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 14 Mar 2008 13:57:08 -0700 Subject: [PATCH 10/27] [IA64] Kernel parameter for max number of concurrent global TLB purges The patch defines kernel parameter "nptcg=". 
The parameter overrides max number of concurrent global TLB purges which is reported from either PAL_VM_SUMMARY or SAL PALO. Signed-off-by: Fenghua Yu Signed-off-by: Tony Luck --- Documentation/kernel-parameters.txt | 4 +++ arch/ia64/kernel/efi.c | 2 +- arch/ia64/kernel/setup.c | 2 +- arch/ia64/mm/tlb.c | 46 +++++++++++++++++++++++++---- include/asm-ia64/sal.h | 4 +++ 5 files changed, 51 insertions(+), 7 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4cd1a5da80a4..be92e6165722 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1335,6 +1335,10 @@ and is between 256 and 4096 characters. It is defined in the file nowb [ARM] + nptcg= [IA64] Override max number of concurrent global TLB + purges which is reported from either PAL_VM_SUMMARY or + SAL PALO. + numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. one of ['zone', 'node', 'default'] can be specified This can be set from sysctl after boot. 
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 003cd09b0732..d45f215bc8fc 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -436,7 +436,7 @@ static void __init handle_palo(unsigned long palo_phys) return; } - setup_ptcg_sem(palo->max_tlb_purges, 1); + setup_ptcg_sem(palo->max_tlb_purges, NPTCG_FROM_PALO); } void diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 1cbd26340d87..f798c0769d36 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -949,7 +949,7 @@ cpu_init (void) /* set ia64_ctx.max_rid to the maximum RID that is supported by all CPUs: */ if (ia64_pal_vm_summary(NULL, &vmi) == 0) { max_ctx = (1U << (vmi.pal_vm_info_2_s.rid_size - 3)) - 1; - setup_ptcg_sem(vmi.pal_vm_info_2_s.max_purges, 0); + setup_ptcg_sem(vmi.pal_vm_info_2_s.max_purges, NPTCG_FROM_PAL); } else { printk(KERN_WARNING "cpu_init: PAL VM summary failed, assuming 18 RID bits\n"); max_ctx = (1U << 15) - 1; /* use architected minimum */ diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index d41d6076ed03..1a8948fd0029 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -119,6 +119,27 @@ static u16 nptcg = 1; static int need_ptcg_sem = 1; static int toolatetochangeptcgsem = 0; +/* + * Kernel parameter "nptcg=" overrides max number of concurrent global TLB + * purges which is reported from either PAL or SAL PALO. + * + * We don't have sanity checking for nptcg value. It's the user's responsibility + * for valid nptcg value on the platform. Otherwise, kernel may hang in some + * cases. 
+ */ +static int __init +set_nptcg(char *str) +{ + int value = 0; + + get_option(&str, &value); + setup_ptcg_sem(value, NPTCG_FROM_KERNEL_PARAMETER); + + return 1; +} + +__setup("nptcg=", set_nptcg); + /* * Maximum number of simultaneous ptc.g purges in the system can * be defined by PAL_VM_SUMMARY (in which case we should take @@ -126,13 +147,18 @@ static int toolatetochangeptcgsem = 0; * override table (in which case we should ignore the value from * PAL_VM_SUMMARY). * + * Kernel parameter "nptcg=" overrides maximum number of simultanesous ptc.g + * purges defined in either PAL_VM_SUMMARY or PAL override table. In this case, + * we should ignore the value from either PAL_VM_SUMMARY or PAL override table. + * * Complicating the logic here is the fact that num_possible_cpus() * isn't fully setup until we start bringing cpus online. */ void -setup_ptcg_sem(int max_purges, int from_palo) +setup_ptcg_sem(int max_purges, int nptcg_from) { - static int have_palo; + static int kp_override; + static int palo_override; static int firstcpu = 1; if (toolatetochangeptcgsem) { @@ -140,8 +166,18 @@ setup_ptcg_sem(int max_purges, int from_palo) return; } - if (from_palo) { - have_palo = 1; + if (nptcg_from == NPTCG_FROM_KERNEL_PARAMETER) { + kp_override = 1; + nptcg = max_purges; + goto resetsema; + } + if (kp_override) { + need_ptcg_sem = num_possible_cpus() > nptcg; + return; + } + + if (nptcg_from == NPTCG_FROM_PALO) { + palo_override = 1; /* In PALO max_purges == 0 really means it! 
*/ if (max_purges == 0) @@ -153,7 +189,7 @@ setup_ptcg_sem(int max_purges, int from_palo) } goto resetsema; } - if (have_palo) { + if (palo_override) { if (nptcg != PALO_MAX_TLB_PURGES) need_ptcg_sem = (num_possible_cpus() > nptcg); return; diff --git a/include/asm-ia64/sal.h b/include/asm-ia64/sal.h index 3cd637a2c051..89594b442f83 100644 --- a/include/asm-ia64/sal.h +++ b/include/asm-ia64/sal.h @@ -896,6 +896,10 @@ struct palo_table { u8 reserved2[6]; }; +#define NPTCG_FROM_PAL 0 +#define NPTCG_FROM_PALO 1 +#define NPTCG_FROM_KERNEL_PARAMETER 2 + #endif /* __ASSEMBLY__ */ #endif /* _ASM_IA64_SAL_H */ From 41bd26d67c41e325c6b9e56aadfe9dad8af9a565 Mon Sep 17 00:00:00 2001 From: "holt@sgi.com" Date: Thu, 3 Apr 2008 15:17:12 -0500 Subject: [PATCH 11/27] [IA64] Correct pernodesize calculation. A simple fix. The existing pernodesize reservation is not taking into account a second array of pg_data_t structures. This is normally not important because the PAGE_ALIGN macro reserves adequate space. I made the compute_pernodesize steps in the same order as the fill_pernode steps to make the correlation more clear. Signed-off-by: Robin Holt Signed-off-by: Tony Luck --- arch/ia64/mm/discontig.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index ee5e68b2af94..06c540a29467 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -124,6 +124,7 @@ static unsigned long __meminit compute_pernodesize(int node) pernodesize += node * L1_CACHE_BYTES; pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t)); pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data)); + pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t)); pernodesize = PAGE_ALIGN(pernodesize); return pernodesize; } From 2c6e6db41f01b6b4eb98809350827c9678996698 Mon Sep 17 00:00:00 2001 From: "holt@sgi.com" Date: Thu, 3 Apr 2008 15:17:13 -0500 Subject: [PATCH 12/27] [IA64] Minimize per_cpu reservations. 
This attached patch significantly shrinks boot memory allocation on ia64. It does this by not allocating per_cpu areas for cpus that can never exist. In the case where acpi does not have any numa node description of the cpus, I defaulted to assigning the first 32 round-robin on the known nodes.. For the !CONFIG_ACPI I used for_each_possible_cpu(). Signed-off-by: Robin Holt Signed-off-by: Tony Luck --- arch/ia64/kernel/acpi.c | 4 +++- arch/ia64/kernel/numa.c | 2 +- arch/ia64/kernel/setup.c | 2 ++ arch/ia64/mm/discontig.c | 12 ++++-------- arch/ia64/mm/numa.c | 4 +++- include/asm-ia64/acpi.h | 33 +++++++++++++++++++++++++++++++++ include/asm-ia64/numa.h | 2 ++ 7 files changed, 48 insertions(+), 11 deletions(-) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 78f28d825f30..c7467f863c7a 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -423,6 +423,7 @@ static u32 __devinitdata pxm_flag[PXM_FLAG_LEN]; #define pxm_bit_set(bit) (set_bit(bit,(void *)pxm_flag)) #define pxm_bit_test(bit) (test_bit(bit,(void *)pxm_flag)) static struct acpi_table_slit __initdata *slit_table; +cpumask_t early_cpu_possible_map = CPU_MASK_NONE; static int get_processor_proximity_domain(struct acpi_srat_cpu_affinity *pa) { @@ -482,6 +483,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) (pa->apic_id << 8) | (pa->local_sapic_eid); /* nid should be overridden as logical node id later */ node_cpuid[srat_num_cpus].nid = pxm; + cpu_set(srat_num_cpus, early_cpu_possible_map); srat_num_cpus++; } @@ -559,7 +561,7 @@ void __init acpi_numa_arch_fixup(void) } /* set logical node id in cpu structure */ - for (i = 0; i < srat_num_cpus; i++) + for_each_possible_early_cpu(i) node_cpuid[i].nid = pxm_to_node(node_cpuid[i].nid); printk(KERN_INFO "Number of logical nodes in system = %d\n", diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c index a78b45f5fe2f..c93420c97409 100644 --- a/arch/ia64/kernel/numa.c +++ b/arch/ia64/kernel/numa.c @@ 
-73,7 +73,7 @@ void __init build_cpu_to_node_map(void) for(node=0; node < MAX_NUMNODES; node++) cpus_clear(node_to_cpu_mask[node]); - for(cpu = 0; cpu < NR_CPUS; ++cpu) { + for_each_possible_early_cpu(cpu) { node = -1; for (i = 0; i < NR_CPUS; ++i) if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) { diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 4aa9eaea76c3..6206541f9e87 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -493,6 +493,8 @@ setup_arch (char **cmdline_p) acpi_table_init(); # ifdef CONFIG_ACPI_NUMA acpi_numa_init(); + per_cpu_scan_finalize((cpus_weight(early_cpu_possible_map) == 0 ? + 32 : cpus_weight(early_cpu_possible_map)), additional_cpus); # endif #else # ifdef CONFIG_SMP diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 06c540a29467..6136a4c6df11 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -104,7 +104,7 @@ static int __meminit early_nr_cpus_node(int node) { int cpu, n = 0; - for (cpu = 0; cpu < NR_CPUS; cpu++) + for_each_possible_early_cpu(cpu) if (node == node_cpuid[cpu].nid) n++; @@ -143,7 +143,7 @@ static void *per_cpu_node_setup(void *cpu_data, int node) #ifdef CONFIG_SMP int cpu; - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_early_cpu(cpu) { if (node == node_cpuid[cpu].nid) { memcpy(__va(cpu_data), __phys_per_cpu_start, __per_cpu_end - __per_cpu_start); @@ -346,7 +346,7 @@ static void __init initialize_pernode_data(void) #ifdef CONFIG_SMP /* Set the node_data pointer for each per-cpu struct */ - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_early_cpu(cpu) { node = node_cpuid[cpu].nid; per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data; } @@ -494,13 +494,9 @@ void __cpuinit *per_cpu_init(void) int cpu; static int first_time = 1; - - if (smp_processor_id() != 0) - return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; - if (first_time) { first_time = 0; - for (cpu = 0; cpu < NR_CPUS; cpu++) + 
for_each_possible_early_cpu(cpu) per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; } diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c index 7807fc5c0422..b73bf1838e57 100644 --- a/arch/ia64/mm/numa.c +++ b/arch/ia64/mm/numa.c @@ -27,7 +27,9 @@ */ int num_node_memblks; struct node_memblk_s node_memblk[NR_NODE_MEMBLKS]; -struct node_cpuid_s node_cpuid[NR_CPUS]; +struct node_cpuid_s node_cpuid[NR_CPUS] = + { [0 ... NR_CPUS-1] = { .phys_id = 0, .nid = NUMA_NO_NODE } }; + /* * This is a matrix with "distances" between nodes, they should be * proportional to the memory access latency ratios. diff --git a/include/asm-ia64/acpi.h b/include/asm-ia64/acpi.h index cd1cc39b5599..fcfad326f4c7 100644 --- a/include/asm-ia64/acpi.h +++ b/include/asm-ia64/acpi.h @@ -35,6 +35,7 @@ #include #include #include +#include #define COMPILER_DEPENDENT_INT64 long #define COMPILER_DEPENDENT_UINT64 unsigned long @@ -115,7 +116,11 @@ extern unsigned int is_cpu_cpei_target(unsigned int cpu); extern void set_cpei_target_cpu(unsigned int cpu); extern unsigned int get_cpei_target_cpu(void); extern void prefill_possible_map(void); +#ifdef CONFIG_ACPI_HOTPLUG_CPU extern int additional_cpus; +#else +#define additional_cpus 0 +#endif #ifdef CONFIG_ACPI_NUMA #if MAX_NUMNODES > 256 @@ -129,6 +134,34 @@ extern int __initdata nid_to_pxm_map[MAX_NUMNODES]; #define acpi_unlazy_tlb(x) +#ifdef CONFIG_ACPI_NUMA +extern cpumask_t early_cpu_possible_map; +#define for_each_possible_early_cpu(cpu) \ + for_each_cpu_mask((cpu), early_cpu_possible_map) + +static inline void per_cpu_scan_finalize(int min_cpus, int reserve_cpus) +{ + int low_cpu, high_cpu; + int cpu; + int next_nid = 0; + + low_cpu = cpus_weight(early_cpu_possible_map); + + high_cpu = max(low_cpu, min_cpus); + high_cpu = min(high_cpu + reserve_cpus, NR_CPUS); + + for (cpu = low_cpu; cpu < high_cpu; cpu++) { + cpu_set(cpu, early_cpu_possible_map); + if (node_cpuid[cpu].nid == NUMA_NO_NODE) { + node_cpuid[cpu].nid = next_nid; + next_nid++; 
+ if (next_nid >= num_online_nodes()) + next_nid = 0; + } + } +} +#endif /* CONFIG_ACPI_NUMA */ + #endif /*__KERNEL__*/ #endif /*_ASM_ACPI_H*/ diff --git a/include/asm-ia64/numa.h b/include/asm-ia64/numa.h index 6a8a27cfae3e..3499ff57bf42 100644 --- a/include/asm-ia64/numa.h +++ b/include/asm-ia64/numa.h @@ -22,6 +22,8 @@ #include +#define NUMA_NO_NODE -1 + extern u16 cpu_to_node_map[NR_CPUS] __cacheline_aligned; extern cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; extern pg_data_t *pgdat_list[MAX_NUMNODES]; From b0247a55f4cdd7a270e938aa39f9edb5b005a88c Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 8 Apr 2008 13:31:47 +0900 Subject: [PATCH 13/27] [IA64] kdump: add kdump_on_fatal_mca While it is convenient that we can invoke kdump by asserting INIT via button on chassis etc., there are some situations where rebooting fast without a dump is preferred over invoking kdump on a fatal MCA. This patch adds a new flag 'kdump_on_fatal_mca' that is independent of the currently available 'kdump_on_init'. Adding this flag enables us to turn kdump on or off depending on the event, INIT and/or fatal MCA. The default for this flag is to take the dump.
Signed-off-by: Hidetoshi Seto Signed-off-by: Tony Luck --- arch/ia64/kernel/crash.c | 31 +++++++++++++++++++++++-------- arch/ia64/kernel/mca.c | 6 +----- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index fbe742ad2fde..2b01e5a1f3ce 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -24,6 +24,7 @@ int kdump_status[NR_CPUS]; static atomic_t kdump_cpu_frozen; atomic_t kdump_in_progress; static int kdump_on_init = 1; +static int kdump_on_fatal_mca = 1; static inline Elf64_Word *append_elf_note(Elf64_Word *buf, char *name, unsigned type, void *data, @@ -148,7 +149,7 @@ kdump_init_notifier(struct notifier_block *self, unsigned long val, void *data) struct ia64_mca_notify_die *nd; struct die_args *args = data; - if (!kdump_on_init) + if (!kdump_on_init && !kdump_on_fatal_mca) return NOTIFY_DONE; if (!ia64_kimage) { @@ -174,11 +175,14 @@ kdump_init_notifier(struct notifier_block *self, unsigned long val, void *data) switch (val) { case DIE_INIT_MONARCH_PROCESS: - atomic_set(&kdump_in_progress, 1); - *(nd->monarch_cpu) = -1; + if (kdump_on_init) { + atomic_set(&kdump_in_progress, 1); + *(nd->monarch_cpu) = -1; + } break; case DIE_INIT_MONARCH_LEAVE: - machine_kdump_on_init(); + if (kdump_on_init) + machine_kdump_on_init(); break; case DIE_INIT_SLAVE_LEAVE: if (atomic_read(&kdump_in_progress)) @@ -189,16 +193,19 @@ kdump_init_notifier(struct notifier_block *self, unsigned long val, void *data) unw_init_running(kdump_cpu_freeze, NULL); break; case DIE_MCA_MONARCH_LEAVE: - /* die_register->signr indicate if MCA is recoverable */ - if (!args->signr) + /* die_register->signr indicate if MCA is recoverable */ + if (kdump_on_fatal_mca && !args->signr) { + atomic_set(&kdump_in_progress, 1); + *(nd->monarch_cpu) = -1; machine_kdump_on_init(); + } break; } return NOTIFY_DONE; } #ifdef CONFIG_SYSCTL -static ctl_table kdump_on_init_table[] = { +static ctl_table kdump_ctl_table[] = { { .ctl_name 
= CTL_UNNUMBERED, .procname = "kdump_on_init", @@ -207,6 +214,14 @@ static ctl_table kdump_on_init_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "kdump_on_fatal_mca", + .data = &kdump_on_fatal_mca, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; @@ -215,7 +230,7 @@ static ctl_table sys_table[] = { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, - .child = kdump_on_init_table, + .child = kdump_ctl_table, }, { .ctl_name = 0 } }; diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 6c18221dba36..338dbb8c2cfc 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1266,16 +1266,12 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, } else { /* Dump buffered message to console */ ia64_mlogbuf_finish(1); -#ifdef CONFIG_KEXEC - atomic_set(&kdump_in_progress, 1); - monarch_cpu = -1; -#endif } + if (notify_die(DIE_MCA_MONARCH_LEAVE, "MCA", regs, (long)&nd, 0, recover) == NOTIFY_STOP) ia64_mca_spin(__func__); - if (atomic_dec_return(&mca_count) > 0) { int i; From 3975afffd32b84c0ad6797debe5abd179f44a698 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 8 Apr 2008 13:33:08 +0900 Subject: [PATCH 14/27] [IA64] kdump: crash.c coding style fix Fix indenting of switch statement to follow Documentation/CodingStyle. 
Signed-off-by: Hidetoshi Seto Signed-off-by: Tony Luck --- arch/ia64/kernel/crash.c | 52 ++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index 2b01e5a1f3ce..e74e15a08892 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -174,32 +174,32 @@ kdump_init_notifier(struct notifier_block *self, unsigned long val, void *data) return NOTIFY_DONE; switch (val) { - case DIE_INIT_MONARCH_PROCESS: - if (kdump_on_init) { - atomic_set(&kdump_in_progress, 1); - *(nd->monarch_cpu) = -1; - } - break; - case DIE_INIT_MONARCH_LEAVE: - if (kdump_on_init) - machine_kdump_on_init(); - break; - case DIE_INIT_SLAVE_LEAVE: - if (atomic_read(&kdump_in_progress)) - unw_init_running(kdump_cpu_freeze, NULL); - break; - case DIE_MCA_RENDZVOUS_LEAVE: - if (atomic_read(&kdump_in_progress)) - unw_init_running(kdump_cpu_freeze, NULL); - break; - case DIE_MCA_MONARCH_LEAVE: - /* die_register->signr indicate if MCA is recoverable */ - if (kdump_on_fatal_mca && !args->signr) { - atomic_set(&kdump_in_progress, 1); - *(nd->monarch_cpu) = -1; - machine_kdump_on_init(); - } - break; + case DIE_INIT_MONARCH_PROCESS: + if (kdump_on_init) { + atomic_set(&kdump_in_progress, 1); + *(nd->monarch_cpu) = -1; + } + break; + case DIE_INIT_MONARCH_LEAVE: + if (kdump_on_init) + machine_kdump_on_init(); + break; + case DIE_INIT_SLAVE_LEAVE: + if (atomic_read(&kdump_in_progress)) + unw_init_running(kdump_cpu_freeze, NULL); + break; + case DIE_MCA_RENDZVOUS_LEAVE: + if (atomic_read(&kdump_in_progress)) + unw_init_running(kdump_cpu_freeze, NULL); + break; + case DIE_MCA_MONARCH_LEAVE: + /* die_register->signr indicate if MCA is recoverable */ + if (kdump_on_fatal_mca && !args->signr) { + atomic_set(&kdump_in_progress, 1); + *(nd->monarch_cpu) = -1; + machine_kdump_on_init(); + } + break; } return NOTIFY_DONE; } From e4b05d4097eb6dab08bda86a72f6fdfdd9816395 Mon Sep 17 00:00:00 2001 From: KOSAKI 
Motohiro Date: Wed, 9 Apr 2008 12:26:10 +0900 Subject: [PATCH 15/27] [IA64] pgd_offset() constfication. When compiling 2.6.25-rc8-mm1, the warning below happened, because walk_page_range passes its argument as "const struct mm*", but pgd_offset() receives it as "struct mm*". CC mm/pagewalk.o mm/pagewalk.c: In function 'walk_page_range': mm/pagewalk.c:111: warning: passing argument 1 of 'pgd_offset' discards qualifiers from pointer target type Signed-off-by: KOSAKI Motohiro Signed-off-by: Tony Luck --- include/asm-ia64/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h index e6204f14f614..ed70862ea247 100644 --- a/include/asm-ia64/pgtable.h +++ b/include/asm-ia64/pgtable.h @@ -371,7 +371,7 @@ pgd_index (unsigned long address) /* The offset in the 1-level directory is given by the 3 region bits (61..63) and the level-1 bits. */ static inline pgd_t* -pgd_offset (struct mm_struct *mm, unsigned long address) +pgd_offset (const struct mm_struct *mm, unsigned long address) { return mm->pgd + pgd_index(address); } From e91450161186a926d16d8fdc8669aa1998bce148 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Thu, 3 Apr 2008 14:30:36 -0400 Subject: [PATCH 16/27] [IA64] Fix unlock ordering in smp_callin One should normally unlock in the reverse order of the lock calls, and in this case there certainly is no reason not to. Signed-off-by: Alan D.
Brunelle Signed-off-by: Tony Luck --- arch/ia64/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 32ee5979a042..16483be18c0b 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -400,9 +400,9 @@ smp_callin (void) /* Setup the per cpu irq handling data structures */ __setup_vector_irq(cpuid); cpu_set(cpuid, cpu_online_map); - unlock_ipi_calllock(); per_cpu(cpu_state, cpuid) = CPU_ONLINE; spin_unlock(&vector_lock); + unlock_ipi_calllock(); smp_setup_percpu_timer(); From 6794c7526651160a75e90322cb750dcceb310d34 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 1 Apr 2008 12:29:34 +0800 Subject: [PATCH 17/27] [IA64] use goto to jump out do/while_each_thread do_each_thread/while_each_thread is a double loop, so should use 'goto' rather than 'break' to break out the loop. Signed-off-by: Li Zefan Signed-off-by: Tony Luck --- arch/ia64/kernel/perfmon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index a2aabfdc80d9..d1d24f4598da 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -4204,10 +4204,10 @@ pfm_check_task_exist(pfm_context_t *ctx) do_each_thread (g, t) { if (t->thread.pfm_context == ctx) { ret = 0; - break; + goto out; } } while_each_thread (g, t); - +out: read_unlock(&tasklist_lock); DPRINT(("pfm_check_task_exist: ret=%d ctx=%p\n", ret, ctx)); From d167cb85150bd473a27df71e3116a9cc0008f5dd Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Sat, 29 Mar 2008 10:05:30 -0400 Subject: [PATCH 18/27] [IA64] Replace explicit jiffies tests with time_* macros. In arch/ia64/sn/kernel/xpc_{main,partition}.c Signed-off-by: Robert P. J. 
Day Signed-off-by: Tony Luck --- arch/ia64/sn/kernel/xpc_main.c | 8 ++++---- arch/ia64/sn/kernel/xpc_partition.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/ia64/sn/kernel/xpc_main.c b/arch/ia64/sn/kernel/xpc_main.c index 81785b78bc1e..9e0b164da9c2 100644 --- a/arch/ia64/sn/kernel/xpc_main.c +++ b/arch/ia64/sn/kernel/xpc_main.c @@ -199,7 +199,7 @@ xpc_timeout_partition_disengage_request(unsigned long data) struct xpc_partition *part = (struct xpc_partition *) data; - DBUG_ON(jiffies < part->disengage_request_timeout); + DBUG_ON(time_before(jiffies, part->disengage_request_timeout)); (void) xpc_partition_disengaged(part); @@ -230,7 +230,7 @@ xpc_hb_beater(unsigned long dummy) { xpc_vars->heartbeat++; - if (jiffies >= xpc_hb_check_timeout) { + if (time_after_eq(jiffies, xpc_hb_check_timeout)) { wake_up_interruptible(&xpc_act_IRQ_wq); } @@ -270,7 +270,7 @@ xpc_hb_checker(void *ignore) /* checking of remote heartbeats is skewed by IRQ handling */ - if (jiffies >= xpc_hb_check_timeout) { + if (time_after_eq(jiffies, xpc_hb_check_timeout)) { dev_dbg(xpc_part, "checking remote heartbeats\n"); xpc_check_remote_hb(); @@ -305,7 +305,7 @@ xpc_hb_checker(void *ignore) /* wait for IRQ or timeout */ (void) wait_event_interruptible(xpc_act_IRQ_wq, (last_IRQ_count < atomic_read(&xpc_act_IRQ_rcvd) || - jiffies >= xpc_hb_check_timeout || + time_after_eq(jiffies, xpc_hb_check_timeout) || (volatile int) xpc_exiting)); } diff --git a/arch/ia64/sn/kernel/xpc_partition.c b/arch/ia64/sn/kernel/xpc_partition.c index 7ba403232cb8..9e97c2684832 100644 --- a/arch/ia64/sn/kernel/xpc_partition.c +++ b/arch/ia64/sn/kernel/xpc_partition.c @@ -877,7 +877,7 @@ xpc_partition_disengaged(struct xpc_partition *part) disengaged = (xpc_partition_engaged(1UL << partid) == 0); if (part->disengage_request_timeout) { if (!disengaged) { - if (jiffies < part->disengage_request_timeout) { + if (time_before(jiffies, part->disengage_request_timeout)) { /* timelimit hasn't been 
reached yet */ return 0; } From 96ded9dadde397a9e372a650534a9ffbba97194a Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 28 Mar 2008 14:27:00 -0700 Subject: [PATCH 19/27] [IA64] fix getpid and set_tid_address fast system calls for pid namespaces The sys_getpid() and sys_set_tid_address() behavior changed from return current->tgid to struct pid *pid; pid = current->pids[PIDTYPE_PID].pid; return pid->numbers[pid->level].nr; But the fast system calls on ia64 still operate the old way. Patch them appropriately to let ia64 work with pid namespaces. Besides, this is one more step in deprecating pid and tgid on task_struct. The fsys_getppid() is to be patched as well, but its logic is much more complex now, so I will do that later. One thing I'm not 100% sure about is the trick with the IA64_UPID_SHIFT. In order to access the pid->level's element of an array I have to perform the following calculations pid + sizeof(struct upid) * pid->level The problem is that ia64 can only multiply floating-point registers, while all the offsets I have in code are in rXX ones. Fortunately, the sizeof(struct upid) is 32 bytes on ia64 (and is very unlikely to ever change), so the calculations get simpler: pid + (pid->level << 5) So, I introduce the IA64_UPID_SHIFT and use the shl instruction. I also looked at how gcc compiles the similar place and found that it makes it with shift as well. Is this OK to do so? Tested with ski emulator with 2.6.24 kernel, but fits 2.6.25-rc4 and 2.6.25-rc4-mm1 as well.
Signed-off-by: Pavel Emelyanov Cc: David Mosberger-Tang Cc: Hidetoshi Seto Cc: Fenghua Yu Cc: Amy Griffis Signed-off-by: Andrew Morton Signed-off-by: Tony Luck --- arch/ia64/kernel/asm-offsets.c | 7 +++++++ arch/ia64/kernel/fsys.S | 34 ++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index 0aebc6f79e95..f7bc40dee43d 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -7,6 +7,7 @@ #define ASM_OFFSETS_C 1 #include +#include #include #include @@ -34,6 +35,9 @@ void foo(void) DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe)); DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info)); + BUILD_BUG_ON(sizeof(struct upid) != 32); + DEFINE(IA64_UPID_SHIFT, 5); + BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); @@ -45,6 +49,9 @@ void foo(void) DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked)); DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader)); + DEFINE(IA64_TASK_TGIDLINK_OFFSET, offsetof (struct task_struct, pids[PIDTYPE_PID].pid)); + DEFINE(IA64_PID_LEVEL_OFFSET, offsetof (struct pid, level)); + DEFINE(IA64_PID_UPID_OFFSET, offsetof (struct pid, numbers[0])); DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending)); DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid)); DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent)); diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 44841971f077..3f570e6fcd9c 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -61,13 +61,29 @@ ENTRY(fsys_getpid) .prologue .altrp b6 .body + add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16 + ;; + ld8 r17=[r17] // r17 = current->group_leader add r9=TI_FLAGS+IA64_TASK_SIZE,r16 ;; ld4 r9=[r9] - add 
r8=IA64_TASK_TGID_OFFSET,r16 + add r17=IA64_TASK_TGIDLINK_OFFSET,r17 ;; and r9=TIF_ALLWORK_MASK,r9 - ld4 r8=[r8] // r8 = current->tgid + ld8 r17=[r17] // r17 = current->group_leader->pids[PIDTYPE_PID].pid + ;; + add r8=IA64_PID_LEVEL_OFFSET,r17 + ;; + ld4 r8=[r8] // r8 = pid->level + add r17=IA64_PID_UPID_OFFSET,r17 // r17 = &pid->numbers[0] + ;; + shl r8=r8,IA64_UPID_SHIFT + ;; + add r17=r17,r8 // r17 = &pid->numbers[pid->level] + ;; + ld4 r8=[r17] // r8 = pid->numbers[pid->level].nr + ;; + mov r17=0 ;; cmp.ne p8,p0=0,r9 (p8) br.spnt.many fsys_fallback_syscall @@ -126,15 +142,25 @@ ENTRY(fsys_set_tid_address) .altrp b6 .body add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + add r17=IA64_TASK_TGIDLINK_OFFSET,r16 ;; ld4 r9=[r9] tnat.z p6,p7=r32 // check argument register for being NaT + ld8 r17=[r17] // r17 = current->pids[PIDTYPE_PID].pid ;; and r9=TIF_ALLWORK_MASK,r9 - add r8=IA64_TASK_PID_OFFSET,r16 + add r8=IA64_PID_LEVEL_OFFSET,r17 add r18=IA64_TASK_CLEAR_CHILD_TID_OFFSET,r16 ;; - ld4 r8=[r8] + ld4 r8=[r8] // r8 = pid->level + add r17=IA64_PID_UPID_OFFSET,r17 // r17 = &pid->numbers[0] + ;; + shl r8=r8,IA64_UPID_SHIFT + ;; + add r17=r17,r8 // r17 = &pid->numbers[pid->level] + ;; + ld4 r8=[r17] // r8 = pid->numbers[pid->level].nr + ;; cmp.ne p8,p0=0,r9 mov r17=-1 ;; From 34e1ceb1881ec895ad9b1b52d073f414f3aa87a9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 28 Mar 2008 14:27:02 -0700 Subject: [PATCH 20/27] [IA64] kprobes: kprobe-booster for ia64 Add kprobe-booster support on ia64. Kprobe-booster improves the performance of kprobes by eliminating single-step, where possible. Currently, kprobe-booster is implemented on x86 and x86-64. This is an ia64 port. On ia64, kprobe-booster executes a copied bundle directly, instead of single stepping. Bundles which have B or X unit and which may cause an exception (including break) are not executed directly. 
And also, to prevent hitting break exceptions on the copied bundle, only the hindmost kprobe is executed directly if several kprobes share a bundle and are placed in different slots. Note: set_brl_inst() is used for preparing an instruction buffer(it does not modify any active code), so it does not need any atomic operation. Signed-off-by: Masami Hiramatsu Cc: Anil S Keshavamurthy Cc: Ananth N Mavinakayanahalli Cc: bibo,mao Cc: Rusty Lynch Cc: Prasanna S Panchamukhi Cc: Jim Keniston Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Tony Luck --- arch/ia64/kernel/kprobes.c | 133 +++++++++++++++++++++++++++++-------- include/asm-ia64/kprobes.h | 7 +- 2 files changed, 113 insertions(+), 27 deletions(-) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 8d9a446a0d17..233434f4f88f 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -78,6 +78,20 @@ static enum instruction_type bundle_encoding[32][3] = { { u, u, u }, /* 1F */ }; +/* Insert a long branch code */ +static void __kprobes set_brl_inst(void *from, void *to) +{ + s64 rel = ((s64) to - (s64) from) >> 4; + bundle_t *brl; + brl = (bundle_t *) ((u64) from & ~0xf); + brl->quad0.template = 0x05; /* [MLX](stop) */ + brl->quad0.slot0 = NOP_M_INST; /* nop.m 0x0 */ + brl->quad0.slot1_p0 = ((rel >> 20) & 0x7fffffffff) << 2; + brl->quad1.slot1_p1 = (((rel >> 20) & 0x7fffffffff) << 2) >> (64 - 46); + /* brl.cond.sptk.many.clr rel<<4 (qp=0) */ + brl->quad1.slot2 = BRL_INST(rel >> 59, rel & 0xfffff); +} + /* * In this function we check to see if the instruction * is IP relative instruction and update the kprobe @@ -496,6 +510,77 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip; } +/* Check the instruction in the slot is break */ +static int __kprobes __is_ia64_break_inst(bundle_t *bundle, uint slot) +{ + unsigned int major_opcode; + unsigned int template = bundle->quad0.template; + unsigned long 
kprobe_inst; + + /* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */ + if (slot == 1 && bundle_encoding[template][1] == L) + slot++; + + /* Get Kprobe probe instruction at given slot*/ + get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode); + + /* For break instruction, + * Bits 37:40 Major opcode to be zero + * Bits 27:32 X6 to be zero + * Bits 32:35 X3 to be zero + */ + if (major_opcode || ((kprobe_inst >> 27) & 0x1FF)) { + /* Not a break instruction */ + return 0; + } + + /* Is a break instruction */ + return 1; +} + +/* + * In this function, we check whether the target bundle modifies IP or + * it triggers an exception. If so, it cannot be boostable. + */ +static int __kprobes can_boost(bundle_t *bundle, uint slot, + unsigned long bundle_addr) +{ + unsigned int template = bundle->quad0.template; + + do { + if (search_exception_tables(bundle_addr + slot) || + __is_ia64_break_inst(bundle, slot)) + return 0; /* exception may occur in this bundle*/ + } while ((++slot) < 3); + template &= 0x1e; + if (template >= 0x10 /* including B unit */ || + template == 0x04 /* including X unit */ || + template == 0x06) /* undefined */ + return 0; + + return 1; +} + +/* Prepare long jump bundle and disables other boosters if need */ +static void __kprobes prepare_booster(struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr & ~0xFULL; + unsigned int slot = (unsigned long)p->addr & 0xf; + struct kprobe *other_kp; + + if (can_boost(&p->ainsn.insn[0].bundle, slot, addr)) { + set_brl_inst(&p->ainsn.insn[1].bundle, (bundle_t *)addr + 1); + p->ainsn.inst_flag |= INST_FLAG_BOOSTABLE; + } + + /* disables boosters in previous slots */ + for (; addr < (unsigned long)p->addr; addr++) { + other_kp = get_kprobe((void *)addr); + if (other_kp) + other_kp->ainsn.inst_flag &= ~INST_FLAG_BOOSTABLE; + } +} + int __kprobes arch_prepare_kprobe(struct kprobe *p) { unsigned long addr = (unsigned long) p->addr; @@ -530,6 +615,8 @@ int __kprobes arch_prepare_kprobe(struct 
kprobe *p) prepare_break_inst(template, slot, major_opcode, kprobe_inst, p, qp); + prepare_booster(p); + return 0; } @@ -543,7 +630,9 @@ void __kprobes arch_arm_kprobe(struct kprobe *p) src = &p->opcode.bundle; flush_icache_range((unsigned long)p->ainsn.insn, - (unsigned long)p->ainsn.insn + sizeof(kprobe_opcode_t)); + (unsigned long)p->ainsn.insn + + sizeof(kprobe_opcode_t) * MAX_INSN_SIZE); + switch (p->ainsn.slot) { case 0: dest->quad0.slot0 = src->quad0.slot0; @@ -584,13 +673,13 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn, 0); + free_insn_slot(p->ainsn.insn, p->ainsn.inst_flag & INST_FLAG_BOOSTABLE); mutex_unlock(&kprobe_mutex); } /* * We are resuming execution after a single step fault, so the pt_regs * structure reflects the register state after we executed the instruction - * located in the kprobe (p->ainsn.insn.bundle). We still need to adjust + * located in the kprobe (p->ainsn.insn->bundle). We still need to adjust * the ip to point back to the original stack address. To set the IP address * to original stack address, handle the case where we need to fixup the * relative IP address and/or fixup branch register. 
@@ -607,7 +696,7 @@ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs) if (slot == 1 && bundle_encoding[template][1] == L) slot = 2; - if (p->ainsn.inst_flag) { + if (p->ainsn.inst_flag & ~INST_FLAG_BOOSTABLE) { if (p->ainsn.inst_flag & INST_FLAG_FIX_RELATIVE_IP_ADDR) { /* Fix relative IP address */ @@ -686,33 +775,12 @@ static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs) static int __kprobes is_ia64_break_inst(struct pt_regs *regs) { unsigned int slot = ia64_psr(regs)->ri; - unsigned int template, major_opcode; - unsigned long kprobe_inst; unsigned long *kprobe_addr = (unsigned long *)regs->cr_iip; bundle_t bundle; memcpy(&bundle, kprobe_addr, sizeof(bundle_t)); - template = bundle.quad0.template; - /* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */ - if (slot == 1 && bundle_encoding[template][1] == L) - slot++; - - /* Get Kprobe probe instruction at given slot*/ - get_kprobe_inst(&bundle, slot, &kprobe_inst, &major_opcode); - - /* For break instruction, - * Bits 37:40 Major opcode to be zero - * Bits 27:32 X6 to be zero - * Bits 32:35 X3 to be zero - */ - if (major_opcode || ((kprobe_inst >> 27) & 0x1FF) ) { - /* Not a break instruction */ - return 0; - } - - /* Is a break instruction */ - return 1; + return __is_ia64_break_inst(&bundle, slot); } static int __kprobes pre_kprobes_handler(struct die_args *args) @@ -802,6 +870,19 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) return 1; ss_probe: +#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) + if (p->ainsn.inst_flag == INST_FLAG_BOOSTABLE && !p->post_handler) { + /* Boost up -- we can execute copied instructions directly */ + ia64_psr(regs)->ri = p->ainsn.slot; + regs->cr_iip = (unsigned long)&p->ainsn.insn->bundle & ~0xFULL; + /* turn single stepping off */ + ia64_psr(regs)->ss = 0; + + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } +#endif prepare_ss(p, regs); kcb->kprobe_status = KPROBE_HIT_SS; return 
1; diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index d03bf9ff68e3..ef71b57fc2f4 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -30,8 +30,12 @@ #include #define __ARCH_WANT_KPROBES_INSN_SLOT -#define MAX_INSN_SIZE 1 +#define MAX_INSN_SIZE 2 /* last half is for kprobe-booster */ #define BREAK_INST (long)(__IA64_BREAK_KPROBE << 6) +#define NOP_M_INST (long)(1<<27) +#define BRL_INST(i1, i2) ((long)((0xcL << 37) | /* brl */ \ + (0x1L << 12) | /* many */ \ + (((i1) & 1) << 36) | ((i2) << 13))) /* imm */ typedef union cmp_inst { struct { @@ -112,6 +116,7 @@ struct arch_specific_insn { #define INST_FLAG_FIX_RELATIVE_IP_ADDR 1 #define INST_FLAG_FIX_BRANCH_REG 2 #define INST_FLAG_BREAK_INST 4 + #define INST_FLAG_BOOSTABLE 8 unsigned long inst_flag; unsigned short target_br_reg; unsigned short slot; From b34eb53cdcb4f49fd31d78d0e385240820ed9063 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 28 Mar 2008 14:27:03 -0700 Subject: [PATCH 21/27] [IA64] make IOMMU respect the segment boundary limits IA64's IOMMU implementation allocates memory areas spanning LLD's segment boundary limit. It forces low level drivers to have a workaround to adjust scatter lists that the IOMMU builds. We are in the process of making all the IOMMUs respect the segment boundary limits to remove such work around in LLDs. This patch is for IA64's IOMMU. 
Signed-off-by: FUJITA Tomonori Signed-off-by: Andrew Morton Signed-off-by: Tony Luck --- arch/ia64/Kconfig | 3 ++ arch/ia64/hp/common/sba_iommu.c | 56 +++++++++++++++++++++++---------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 8fa3faf5ef1b..1b73ffe746d9 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -611,6 +611,9 @@ config IRQ_PER_CPU bool default y +config IOMMU_HELPER + def_bool (IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB || IA64_GENERIC) + source "arch/ia64/hp/sim/Kconfig" source "arch/ia64/Kconfig.debug" diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 523eae6d3e49..9409de5c9441 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -35,6 +35,7 @@ #include #include /* hweight64() */ #include +#include #include /* ia64_get_itc() */ #include @@ -460,6 +461,13 @@ get_iovp_order (unsigned long size) return order; } +static unsigned long ptr_to_pide(struct ioc *ioc, unsigned long *res_ptr, + unsigned int bitshiftcnt) +{ + return (((unsigned long)res_ptr - (unsigned long)ioc->res_map) << 3) + + bitshiftcnt; +} + /** * sba_search_bitmap - find free space in IO PDIR resource bitmap * @ioc: IO MMU structure which owns the pdir we are interested in. @@ -471,15 +479,25 @@ get_iovp_order (unsigned long size) * Cool perf optimization: search for log2(size) bits at a time. 
*/ static SBA_INLINE unsigned long -sba_search_bitmap(struct ioc *ioc, unsigned long bits_wanted, int use_hint) +sba_search_bitmap(struct ioc *ioc, struct device *dev, + unsigned long bits_wanted, int use_hint) { unsigned long *res_ptr; unsigned long *res_end = (unsigned long *) &(ioc->res_map[ioc->res_size]); - unsigned long flags, pide = ~0UL; + unsigned long flags, pide = ~0UL, tpide; + unsigned long boundary_size; + unsigned long shift; + int ret; ASSERT(((unsigned long) ioc->res_hint & (sizeof(unsigned long) - 1UL)) == 0); ASSERT(res_ptr < res_end); + boundary_size = (unsigned long long)dma_get_seg_boundary(dev) + 1; + boundary_size = ALIGN(boundary_size, 1ULL << iovp_shift) >> iovp_shift; + + BUG_ON(ioc->ibase & ~iovp_mask); + shift = ioc->ibase >> iovp_shift; + spin_lock_irqsave(&ioc->res_lock, flags); /* Allow caller to force a search through the entire resource space */ @@ -504,9 +522,7 @@ sba_search_bitmap(struct ioc *ioc, unsigned long bits_wanted, int use_hint) if (likely(*res_ptr != ~0UL)) { bitshiftcnt = ffz(*res_ptr); *res_ptr |= (1UL << bitshiftcnt); - pide = ((unsigned long)res_ptr - (unsigned long)ioc->res_map); - pide <<= 3; /* convert to bit address */ - pide += bitshiftcnt; + pide = ptr_to_pide(ioc, res_ptr, bitshiftcnt); ioc->res_bitshift = bitshiftcnt + bits_wanted; goto found_it; } @@ -535,11 +551,13 @@ sba_search_bitmap(struct ioc *ioc, unsigned long bits_wanted, int use_hint) DBG_RES(" %p %lx %lx\n", res_ptr, mask, *res_ptr); ASSERT(0 != mask); for (; mask ; mask <<= o, bitshiftcnt += o) { - if(0 == ((*res_ptr) & mask)) { + tpide = ptr_to_pide(ioc, res_ptr, bitshiftcnt); + ret = iommu_is_span_boundary(tpide, bits_wanted, + shift, + boundary_size); + if ((0 == ((*res_ptr) & mask)) && !ret) { *res_ptr |= mask; /* mark resources busy! 
*/ - pide = ((unsigned long)res_ptr - (unsigned long)ioc->res_map); - pide <<= 3; /* convert to bit address */ - pide += bitshiftcnt; + pide = tpide; ioc->res_bitshift = bitshiftcnt + bits_wanted; goto found_it; } @@ -560,6 +578,11 @@ sba_search_bitmap(struct ioc *ioc, unsigned long bits_wanted, int use_hint) end = res_end - qwords; for (; res_ptr < end; res_ptr++) { + tpide = ptr_to_pide(ioc, res_ptr, 0); + ret = iommu_is_span_boundary(tpide, bits_wanted, + shift, boundary_size); + if (ret) + goto next_ptr; for (i = 0 ; i < qwords ; i++) { if (res_ptr[i] != 0) goto next_ptr; @@ -572,8 +595,7 @@ sba_search_bitmap(struct ioc *ioc, unsigned long bits_wanted, int use_hint) res_ptr[i] = ~0UL; res_ptr[i] |= RESMAP_MASK(bits); - pide = ((unsigned long)res_ptr - (unsigned long)ioc->res_map); - pide <<= 3; /* convert to bit address */ + pide = tpide; res_ptr += qwords; ioc->res_bitshift = bits; goto found_it; @@ -605,7 +627,7 @@ sba_search_bitmap(struct ioc *ioc, unsigned long bits_wanted, int use_hint) * resource bit map. */ static int -sba_alloc_range(struct ioc *ioc, size_t size) +sba_alloc_range(struct ioc *ioc, struct device *dev, size_t size) { unsigned int pages_needed = size >> iovp_shift; #ifdef PDIR_SEARCH_TIMING @@ -622,9 +644,9 @@ sba_alloc_range(struct ioc *ioc, size_t size) /* ** "seek and ye shall find"...praying never hurts either... 
*/ - pide = sba_search_bitmap(ioc, pages_needed, 1); + pide = sba_search_bitmap(ioc, dev, pages_needed, 1); if (unlikely(pide >= (ioc->res_size << 3))) { - pide = sba_search_bitmap(ioc, pages_needed, 0); + pide = sba_search_bitmap(ioc, dev, pages_needed, 0); if (unlikely(pide >= (ioc->res_size << 3))) { #if DELAYED_RESOURCE_CNT > 0 unsigned long flags; @@ -653,7 +675,7 @@ sba_alloc_range(struct ioc *ioc, size_t size) } spin_unlock_irqrestore(&ioc->saved_lock, flags); - pide = sba_search_bitmap(ioc, pages_needed, 0); + pide = sba_search_bitmap(ioc, dev, pages_needed, 0); if (unlikely(pide >= (ioc->res_size << 3))) panic(__FILE__ ": I/O MMU @ %p is out of mapping resources\n", ioc->ioc_hpa); @@ -936,7 +958,7 @@ sba_map_single(struct device *dev, void *addr, size_t size, int dir) spin_unlock_irqrestore(&ioc->res_lock, flags); #endif - pide = sba_alloc_range(ioc, size); + pide = sba_alloc_range(ioc, dev, size); iovp = (dma_addr_t) pide << iovp_shift; @@ -1373,7 +1395,7 @@ sba_coalesce_chunks(struct ioc *ioc, struct device *dev, dma_len = (dma_len + dma_offset + ~iovp_mask) & iovp_mask; ASSERT(dma_len <= DMA_CHUNK_SIZE); dma_sg->dma_address = (dma_addr_t) (PIDE_FLAG - | (sba_alloc_range(ioc, dma_len) << iovp_shift) + | (sba_alloc_range(ioc, dev, dma_len) << iovp_shift) | dma_offset); n_mappings++; } From 734bc367b4830a4c80502a3f9ded7428b1c652e3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 28 Mar 2008 14:27:04 -0700 Subject: [PATCH 22/27] [IA64] remove redundant display of free swap space in show_mem() show_mem() has no need to print the amount of free swap space manually because show_free_areas() does this already and is called by the former. 
The two outputs only differ in text formatting: printk("Free swap = %lukB\n", ...); printk("Free swap: %6ldkB\n", ...); Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Tony Luck --- arch/ia64/mm/contig.c | 2 -- arch/ia64/mm/discontig.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 344f64eca7a9..0479661fa41a 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -45,8 +45,6 @@ void show_mem(void) printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", - nr_swap_pages<<(PAGE_SHIFT-10)); printk(KERN_INFO "Node memory in pages:\n"); for_each_online_pgdat(pgdat) { unsigned long present; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index ee5e68b2af94..ffee1ea00bb0 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -522,8 +522,6 @@ void show_mem(void) printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", - nr_swap_pages<<(PAGE_SHIFT-10)); printk(KERN_INFO "Node memory in pages:\n"); for_each_online_pgdat(pgdat) { unsigned long present; From 5cf1f7cef1c67b5c81736f00e81a2890e07041b9 Mon Sep 17 00:00:00 2001 From: "S.Caglar Onur" Date: Fri, 28 Mar 2008 14:27:05 -0700 Subject: [PATCH 23/27] [IA64] arch/ia64/kernel/: use time_* macros The functions time_before, time_before_eq, time_after, and time_after_eq are more robust for comparing jiffies against other values. 
So use the time_after() & time_before() macros, defined at linux/jiffies.h, which deal with wrapping correctly [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: S.Caglar Onur Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Tony Luck --- arch/ia64/kernel/irq_ia64.c | 2 +- arch/ia64/kernel/mca.c | 4 +++- arch/ia64/kernel/unaligned.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index d8be23fbe6bc..5538471e8d68 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -472,7 +472,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) static unsigned char count; static long last_time; - if (jiffies - last_time > 5*HZ) + if (time_after(jiffies, last_time + 5 * HZ)) count = 0; if (++count < 5) { last_time = jiffies; diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 338dbb8c2cfc..1ae512910870 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -69,6 +69,7 @@ * 2007-04-27 Russ Anderson * Support multiple cpus going through OS_MCA in the same event. */ +#include #include #include #include @@ -293,7 +294,8 @@ static void ia64_mlogbuf_dump_from_init(void) if (mlogbuf_finished) return; - if (mlogbuf_timestamp && (mlogbuf_timestamp + 30*HZ > jiffies)) { + if (mlogbuf_timestamp && + time_before(jiffies, mlogbuf_timestamp + 30 * HZ)) { printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT " " and the system seems to be messed up.\n"); ia64_mlogbuf_finish(0); diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c index 6903361d11a5..ff0e7c10faa7 100644 --- a/arch/ia64/kernel/unaligned.c +++ b/arch/ia64/kernel/unaligned.c @@ -13,6 +13,7 @@ * 2001/08/13 Correct size of extended floats (float_fsz) from 16 to 10 bytes. * 2001/01/17 Add support emulation of unaligned kernel accesses. 
*/ +#include #include #include #include @@ -1290,7 +1291,7 @@ within_logging_rate_limit (void) { static unsigned long count, last_time; - if (jiffies - last_time > 5*HZ) + if (time_after(jiffies, last_time + 5 * HZ)) count = 0; if (count < 5) { last_time = jiffies; From 273988fa4dffd1b1e6deb3de18b979a44e9d8732 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 9 Apr 2008 13:05:41 -0700 Subject: [PATCH 24/27] [IA64] Untangle sync_icache_dcache() page size determination Untangle the chaos of page size determination in this function by simply using PAGE_SIZE << compound_order(). Signed-off-by: Christoph Lameter Signed-off-by: Tony Luck --- arch/ia64/mm/init.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index a4ca657c72c6..da05893294b5 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -58,7 +58,6 @@ __ia64_sync_icache_dcache (pte_t pte) { unsigned long addr; struct page *page; - unsigned long order; page = pte_page(pte); addr = (unsigned long) page_address(page); @@ -66,12 +65,7 @@ __ia64_sync_icache_dcache (pte_t pte) if (test_bit(PG_arch_1, &page->flags)) return; /* i-cache is already coherent with d-cache */ - if (PageCompound(page)) { - order = compound_order(page); - flush_icache_range(addr, addr + (1UL << order << PAGE_SHIFT)); - } - else - flush_icache_range(addr, addr + PAGE_SIZE); + flush_icache_range(addr, addr + (PAGE_SIZE << compound_order(page))); set_bit(PG_arch_1, &page->flags); /* mark page as clean */ } From c19b2930df0621500913c005c06978bd8933110b Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 29 Feb 2008 17:14:44 -0600 Subject: [PATCH 25/27] [IA64] Itanium Spec updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates based on the "Intel® Itanium® Architecture Software Developer's Manual Specification Update October 2007".
http://download.intel.com/design/itanium/specupdt/24869911.pdf Signed-off-by: Russ Anderson Signed-off-by: Tony Luck --- include/asm-ia64/pal.h | 72 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 9 deletions(-) diff --git a/include/asm-ia64/pal.h b/include/asm-ia64/pal.h index 8a695d3407d2..67b02901ead4 100644 --- a/include/asm-ia64/pal.h +++ b/include/asm-ia64/pal.h @@ -13,6 +13,7 @@ * Copyright (C) 1999 VA Linux Systems * Copyright (C) 1999 Walt Drummond * Copyright (C) 1999 Srinivasa Prasad Thirumalachar + * Copyright (C) 2008 Silicon Graphics, Inc. (SGI) * * 99/10/01 davidm Make sure we pass zero for reserved parameters. * 00/03/07 davidm Updated pal_cache_flush() to be in sync with PAL v2.6. @@ -73,6 +74,8 @@ #define PAL_CACHE_SHARED_INFO 43 /* returns information on caches shared by logical processor */ #define PAL_GET_HW_POLICY 48 /* Get current hardware resource sharing policy */ #define PAL_SET_HW_POLICY 49 /* Set current hardware resource sharing policy */ +#define PAL_VP_INFO 50 /* Information about virtual processor features */ +#define PAL_MC_HW_TRACKING 51 /* Hardware tracking status */ #define PAL_COPY_PAL 256 /* relocate PAL procedures and PAL PMI */ #define PAL_HALT_INFO 257 /* return the low power capabilities of processor */ @@ -504,7 +507,8 @@ typedef struct pal_cache_check_info_s { wiv : 1, /* Way field valid */ reserved2 : 1, dp : 1, /* Data poisoned on MBE */ - reserved3 : 8, + reserved3 : 6, + hlth : 2, /* Health indicator */ index : 20, /* Cache line index */ reserved4 : 2, @@ -542,7 +546,9 @@ typedef struct pal_tlb_check_info_s { dtc : 1, /* Fail in data TC */ itc : 1, /* Fail in inst. 
TC */ op : 4, /* Cache operation */ - reserved3 : 30, + reserved3 : 6, + hlth : 2, /* Health indicator */ + reserved4 : 22, is : 1, /* instruction set (1 == ia32) */ iv : 1, /* instruction set field valid */ @@ -633,7 +639,8 @@ typedef struct pal_uarch_check_info_s { way : 6, /* Way of structure */ wv : 1, /* way valid */ xv : 1, /* index valid */ - reserved1 : 8, + reserved1 : 6, + hlth : 2, /* Health indicator */ index : 8, /* Index or set of the uarch * structure that failed. */ @@ -1213,14 +1220,12 @@ ia64_pal_mc_drain (void) /* Return the machine check dynamic processor state */ static inline s64 -ia64_pal_mc_dynamic_state (u64 offset, u64 *size, u64 *pds) +ia64_pal_mc_dynamic_state (u64 info_type, u64 dy_buffer, u64 *size) { struct ia64_pal_retval iprv; - PAL_CALL(iprv, PAL_MC_DYNAMIC_STATE, offset, 0, 0); + PAL_CALL(iprv, PAL_MC_DYNAMIC_STATE, info_type, dy_buffer, 0); if (size) *size = iprv.v0; - if (pds) - *pds = iprv.v1; return iprv.status; } @@ -1281,15 +1286,41 @@ ia64_pal_mc_expected (u64 expected, u64 *previous) return iprv.status; } +typedef union pal_hw_tracking_u { + u64 pht_data; + struct { + u64 itc :4, /* Instruction cache tracking */ + dct :4, /* Date cache tracking */ + itt :4, /* Instruction TLB tracking */ + ddt :4, /* Data TLB tracking */ + reserved:48; + } pal_hw_tracking_s; +} pal_hw_tracking_u_t; + +/* + * Hardware tracking status. + */ +static inline s64 +ia64_pal_mc_hw_tracking (u64 *status) +{ + struct ia64_pal_retval iprv; + PAL_CALL(iprv, PAL_MC_HW_TRACKING, 0, 0, 0); + if (status) + *status = iprv.v0; + return iprv.status; +} + /* Register a platform dependent location with PAL to which it can save * minimal processor state in the event of a machine check or initialization * event. 
*/ static inline s64 -ia64_pal_mc_register_mem (u64 physical_addr) +ia64_pal_mc_register_mem (u64 physical_addr, u64 size, u64 *req_size) { struct ia64_pal_retval iprv; - PAL_CALL(iprv, PAL_MC_REGISTER_MEM, physical_addr, 0, 0); + PAL_CALL(iprv, PAL_MC_REGISTER_MEM, physical_addr, size, 0); + if (req_size) + *req_size = iprv.v0; return iprv.status; } @@ -1631,6 +1662,29 @@ ia64_pal_vm_summary (pal_vm_info_1_u_t *vm_info_1, pal_vm_info_2_u_t *vm_info_2) return iprv.status; } +typedef union pal_vp_info_u { + u64 pvi_val; + struct { + u64 index: 48, /* virtual feature set info */ + vmm_id: 16; /* feature set id */ + } pal_vp_info_s; +} pal_vp_info_u_t; + +/* + * Returns infomation about virtual processor features + */ +static inline s64 +ia64_pal_vp_info (u64 feature_set, u64 vp_buffer, u64 *vp_info, u64 *vmm_id) +{ + struct ia64_pal_retval iprv; + PAL_CALL(iprv, PAL_VP_INFO, feature_set, vp_buffer, 0); + if (vp_info) + *vp_info = iprv.v0; + if (vmm_id) + *vmm_id = iprv.v1; + return iprv.status; +} + typedef union pal_itr_valid_u { u64 piv_val; struct { From 98075d245a5bc4aeebc2e9f16fa8b089a5c200ac Mon Sep 17 00:00:00 2001 From: Zoltan Menyhart Date: Fri, 11 Apr 2008 15:21:35 -0700 Subject: [PATCH 26/27] [IA64] Fix NUMA configuration issue There is a NUMA memory configuration issue in 2.6.24: A 2-node machine of ours has got the following memory layout: Node 0: 0 - 2 Gbytes Node 0: 4 - 8 Gbytes Node 1: 8 - 16 Gbytes Node 0: 16 - 18 Gbytes "efi_memmap_init()" merges the three last ranges into one. "register_active_ranges()" is called as follows: efi_memmap_walk(register_active_ranges, NULL); i.e. once for the 4 - 18 Gbytes range. It picks up the node number from the start address, and registers all the memory for the node #0. 
"register_active_ranges()" should be called as follows to make sure there is no merged address range at its entry: efi_memmap_walk(filter_memory, register_active_ranges); "filter_memory()" is similar to "filter_rsvd_memory()", but the reserved memory ranges are not filtered out. Signed-off-by: Zoltan Menyhart Signed-off-by: Tony Luck --- arch/ia64/kernel/setup.c | 23 +++++++++++++++++++++++ arch/ia64/mm/contig.c | 2 +- arch/ia64/mm/discontig.c | 2 +- arch/ia64/mm/init.c | 6 ++---- include/asm-ia64/meminit.h | 3 ++- 5 files changed, 29 insertions(+), 7 deletions(-) diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 4aa9eaea76c3..c85b7dd6ef33 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -176,6 +176,29 @@ filter_rsvd_memory (unsigned long start, unsigned long end, void *arg) return 0; } +/* + * Similar to "filter_rsvd_memory()", but the reserved memory ranges + * are not filtered out. + */ +int __init +filter_memory(unsigned long start, unsigned long end, void *arg) +{ + void (*func)(unsigned long, unsigned long, int); + +#if IGNORE_PFN0 + if (start == PAGE_OFFSET) { + printk(KERN_WARNING "warning: skipping physical page 0\n"); + start += PAGE_SIZE; + if (start >= end) + return 0; + } +#endif + func = arg; + if (start < end) + call_pernode_memory(__pa(start), end - start, func); + return 0; +} + static void __init sort_regions (struct rsvd_region *rsvd_region, int max) { diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 0479661fa41a..798bf9835a51 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -253,7 +253,7 @@ paging_init (void) max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_VIRTUAL_MEM_MAP - efi_memmap_walk(register_active_ranges, NULL); + efi_memmap_walk(filter_memory, register_active_ranges); efi_memmap_walk(find_largest_hole, (u64 *)&max_gap); if (max_gap < LARGE_GAP) { vmem_map = (struct page *) 0; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 
ffee1ea00bb0..96d5fbfa44a4 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -444,7 +444,7 @@ void __init find_memory(void) mem_data[node].min_pfn = ~0UL; } - efi_memmap_walk(register_active_ranges, NULL); + efi_memmap_walk(filter_memory, register_active_ranges); /* * Initialize the boot memory maps in reverse order since that's diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index da05893294b5..5c1de53c8c1c 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -547,12 +547,10 @@ find_largest_hole (u64 start, u64 end, void *arg) #endif /* CONFIG_VIRTUAL_MEM_MAP */ int __init -register_active_ranges(u64 start, u64 end, void *arg) +register_active_ranges(u64 start, u64 len, int nid) { - int nid = paddr_to_nid(__pa(start)); + u64 end = start + len; - if (nid < 0) - nid = 0; #ifdef CONFIG_KEXEC if (start > crashk_res.start && start < crashk_res.end) start = crashk_res.end; diff --git a/include/asm-ia64/meminit.h b/include/asm-ia64/meminit.h index f93308f54b61..7245a5781594 100644 --- a/include/asm-ia64/meminit.h +++ b/include/asm-ia64/meminit.h @@ -35,6 +35,7 @@ extern void find_memory (void); extern void reserve_memory (void); extern void find_initrd (void); extern int filter_rsvd_memory (unsigned long start, unsigned long end, void *arg); +extern int filter_memory (unsigned long start, unsigned long end, void *arg); extern unsigned long efi_memmap_init(unsigned long *s, unsigned long *e); extern int find_max_min_low_pfn (unsigned long , unsigned long, void *); @@ -56,7 +57,7 @@ extern int reserve_elfcorehdr(unsigned long *start, unsigned long *end); #define IGNORE_PFN0 1 /* XXX fix me: ignore pfn 0 until TLB miss handler is updated... 
*/ -extern int register_active_ranges(u64 start, u64 end, void *arg); +extern int register_active_ranges(u64 start, u64 len, int nid); #ifdef CONFIG_VIRTUAL_MEM_MAP # define LARGE_GAP 0x40000000 /* Use virtual mem map if hole is > than this */ From 072f042df335d7e0da2027637bcf720d7ff1589b Mon Sep 17 00:00:00 2001 From: Takao Indoh Date: Tue, 15 Apr 2008 05:59:54 -0400 Subject: [PATCH 27/27] [IA64] kdump: Add crash_save_vmcoreinfo for INIT This patch fixes the problem that kdump by INIT does not work if we use makedumpfile. The problem is that after INIT is issued, the 2nd kernel starts and makedumpfile fails with the following error message. /proc/vmcore doesn't contain vmcoreinfo. '-x' or '-i' must be specified. makedumpfile Failed. The cause of this problem is that the kernel does not call crash_save_vmcoreinfo. When kdump starts by panic or sysrq-trigger, crash_save_vmcoreinfo is called by crash_kexec. But this function is not called when kdump starts by INIT. The attached patch fixes this. This patch just adds crash_save_vmcoreinfo into machine_kdump_on_init so that crash_save_vmcoreinfo can be called when kdump starts by INIT. I tested this patch with linux-2.6.25-rc9 and I confirmed it worked. Signed-off-by: Takao Indoh Signed-off-by: Tony Luck --- arch/ia64/kernel/crash.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index e74e15a08892..90ef338cf46f 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -119,6 +119,7 @@ machine_crash_shutdown(struct pt_regs *pt) static void machine_kdump_on_init(void) { + crash_save_vmcoreinfo(); local_irq_disable(); kexec_disable_iosapic(); machine_kexec(ia64_kimage);